#from Modules_manager.PCA_ import pca_maker
from Packages import *
st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")

from Class_Mod.DATA_HANDLING import *
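# NOTE: the two star-imports above are assumed to provide the names used throughout this script
# (streamlit as st, pandas as pd, numpy as np, plotly.express as px, PIL.Image, joblib, os,
# plus the chemometric helpers such as LinearPCA, Umap, Sk_Kmeans, PinardPlsr, TpeIpls,
# find_delimiter, find_col_index, prediction and download_results).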
# load images for web interface
img_sselect = Image.open("images/sselect.JPG")
img_general = Image.open("images/general.JPG")
img_predict = Image.open("images/predict.JPG")
with st.sidebar:
st.markdown("[Sample Selection](#sample-selection)")
st.markdown("[Model Development](#create-a-model)")
st.markdown("[Predictions Making](#predict)")
st.subheader("Plateforme d'Analyses Chimiques pour l'Ecologie-PACE :goat:")

st.write("Samples selection (PCA, [UMAP](https://umap-learn.readthedocs.io/en/latest/how_umap_works.html), ...), Predictive Modelling ([Pinard](https://github.com/GBeurier/pinard), [LWPLSR](https://doi.org/10.1002/cem.3209), ...), and Predictions using your data (CSV or DX files) and/or PACE NIRS Database.")
################################### Data Loading and Visualization ########################################
container1 = st.container(border=True)
col2, col1 = st.columns([3, 1])
container2 = st.container(border=True)
container2.header("Exploratory Data Analysis-Multivariable Data Analysis", divider='blue')
scores, loadings, pc = st.columns([2, 2, 0.5])
influence, hotelling, qexp = st.columns([2, 2, 1])
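# Layout: container1 hosts the data loading widgets (narrow right column col1) and the spectra
# plot (wide left column col2); container2 hosts the exploratory analysis, with scores/loadings/pc
# for the projection plots and influence/hotelling/qexp for the diagnostic plots.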
with container1:
    col1.header("NIRS Data Loading", divider='blue')
    col2.header("Spectral Data Visualization", divider='blue')
    with col1:
        # uploader for the csv file containing the NIRS spectra
        sselectx_csv = st.file_uploader("Load NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5)
        if sselectx_csv is not None:
            # select list for the CSV delimiter
            psep = st.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+sselectx_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+sselectx_csv.name))), key=9)
            # select list for the CSV index column (yes/no)
            phdr = st.selectbox("Index column in csv? - _detected_: " + str(find_col_index('data/'+sselectx_csv.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+sselectx_csv.name))), key=31)
            if phdr == 'yes':
                col = 0
            else:
                col = False
            data_import = pd.read_csv(sselectx_csv, sep=psep, index_col=col)
            st.success("The data have been loaded successfully", icon="✅")
    ## Visualize spectra
    if sselectx_csv is not None:
        with col2:
            fig, ax = plt.subplots(figsize=(30, 7))
            data_import.T.plot(legend=False, ax=ax, color='blue')
            ax.set_xlabel('Wavelength/Wavenumber', fontsize=18)
            ax.set_ylabel('Signal', fontsize=18)
            plt.margins(x=0)
            st.pyplot(fig)
            st.write("Summary")
            info = pd.DataFrame({'N': [data_import.shape[0]],
                                 'Min': [np.min(data_import)],
                                 'Max': [np.max(data_import)]}, index=['Values']).T
            info = info.rename_axis('information')
            st.table(data=info)
######################################################################################
############################## Exploratory data analysis ###############################
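# The block below lets the user pick a dimensionality reduction technique (PCA or UMAP) and an
# optional clustering method, then draws the scree, scores, loadings and influence plots.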
with container2:
    plot_type = ['', 'PCA', 'UMAP', 'NMF']
    cluster_methods = ['', 'Kmeans', 'UMAP', 'AP']
    with pc:
        type_plot = st.selectbox("Dimensionality reduction techniques: ", options=plot_type, key=37)
        type_cluster = st.selectbox("Clustering techniques: ", options=cluster_methods, key=38)
    # fit the selected dimensionality reduction model
    if type_plot == 'PCA':
        model = LinearPCA(data_import, Ncomp=5)
    elif type_plot == 'UMAP':
        model = Umap(x=data_import, n_components=5, n_neighbors=20, min_dist=0)
    if type_plot in ['PCA', 'UMAP']:
        # add 3 select lists to choose which components to plot
        axis1 = pc.selectbox("x-axis", options=model.scores_.columns, index=0)
        axis2 = pc.selectbox("y-axis", options=model.scores_.columns, index=1)
        axis3 = pc.selectbox("z-axis", options=model.scores_.columns, index=2)
        if type_cluster == 'Kmeans':
            cl = Sk_Kmeans(pd.concat([model.scores_.loc[:, axis1], model.scores_.loc[:, axis2], model.scores_.loc[:, axis3]], axis=1), max_clusters=30)
        with scores:
            t = model.scores_
            if type_cluster in ['Kmeans', 'UMAP', 'AP']:
                st.write('Scree plot')
                fig2 = px.scatter(cl.inertia_.T, y='inertia')
                st.plotly_chart(fig2)
                ncluster = st.number_input(min_value=2, max_value=30, value=3, label='Select the desired number of clusters')
                data, colors = cl.fit_optimal(nclusters=ncluster)
                #fig = px.scatter(data, x=axis1, y=axis2, color=colors)
                st.write('Scores plot')
                fig = px.scatter_3d(data, x=axis1, y=axis2, z=axis3, color=colors)
            else:
                fig = px.scatter_3d(t, x=axis1, y=axis2, z=axis3)
            st.plotly_chart(fig)
        if type_plot == 'PCA':
            with loadings:
                st.write('Loadings plot')
                p = model.loadings_
                pp = pd.concat([p, pd.DataFrame(np.arange(p.shape[0]), index=p.index, columns=['wl'])], axis=1)
                df1 = pp.melt(id_vars="wl")
                fig = px.line(df1, x='wl', y='value', color='variable')
                fig.update_layout(
                    legend=dict(x=1, y=0,
                                font=dict(family="Courier", size=12, color="black"),
                                bordercolor="Black", borderwidth=2))
                st.plotly_chart(fig)
            with influence:
                st.write('Influence plot')
                ax1 = st.selectbox("Component", options=model.scores_.columns, index=3)
                leverage = model.leverage_
                residuals = model.residuals
                fig = px.scatter(x=leverage[ax1], y=residuals[ax1], color=leverage[ax1] * residuals[ax1])
                st.plotly_chart(fig)
                st.write('T²-Hotelling vs Q residuals plot')
                ax2 = st.selectbox("Component", options=model.scores_.columns, index=4)
                t = model.scores_
                fig = px.scatter(t, x=axis1, y=t.columns[1])
                st.plotly_chart(fig)
    else:
        st.markdown('Select a dimensionality reduction technique from the dropdown list')
########################################################################################
container2 = st.container(border=True)
M1, M2, M3 = st.columns([2,2,2])
M4, M5 = st.columns([6,2])
container3 = st.container(border=True)
M7, M8 = st.columns([2,2])
available_regression_algo = ["","SciKitLearn PLSR", "Jchemo Local Weighted PLSR", "Intervalle Selection PLSR"]
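# Each label in available_regression_algo maps to one branch below: PinardPlsr (SciKitLearn PLSR),
# model_LWPLSR (Jchemo Local Weighted PLSR) and TpeIpls (Intervalle Selection PLSR).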
with container2:
    st.header("Calibration Model Development", divider='blue')
    st.write("Create a predictive model, then use it for predicting your target variable (chemical values) from NIRS spectra")
    xcal_csv = M3.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
    ycal_csv = M3.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
    # select list for the CSV delimiter
    sep = M3.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+xcal_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+xcal_csv.name))), key=0)
    hdr = M3.selectbox("Index column in csv? - _detected_: " + str(find_col_index('data/'+xcal_csv.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+xcal_csv.name))), key=1)
    if hdr == 'yes':
        col = 0
    else:
        col = False
    rd_seed = M1.slider("Choose seed", min_value=1, max_value=1212, value=42, format="%i")
    x, y = utils.load_csv(xcal_csv, ycal_csv, autoremove_na=True, sep=sep, x_hdr=0, y_hdr=0, x_index_col=col, y_index_col=col)
    # split the data into training and test sets with the Kennard-Stone method and the correlation metric; 25% of the data is kept for testing
    train_index, test_index = train_test_split_idx(x, y=y, method="kennard_stone", metric="correlation", test_size=0.25, random_state=rd_seed)
    # assign data to the training and test sets
    X_train, y_train, X_test, y_test = pd.DataFrame(x[train_index]), pd.DataFrame(y[train_index]), pd.DataFrame(x[test_index]), pd.DataFrame(y[test_index])
    #############################
    regression_algo = M1.selectbox("Choose the algorithm for regression", options=available_regression_algo, key=12)
    if regression_algo == 'SciKitLearn PLSR':
        # train a PLSR model on the calibration split
        Reg = PinardPlsr(x_train=X_train, x_test=X_test, y_train=y_train, y_test=y_test)
        reg_model = Reg.model_
        #M2.dataframe(Pin.pred_data_)
    elif regression_algo == 'Jchemo Local Weighted PLSR':
        reg_model = model_LWPLSR(xcal_csv, ycal_csv, sep, hdr)
    elif regression_algo == "Intervalle Selection PLSR":
        s = M2.number_input(label='Enter the maximum number of intervals', min_value=1, max_value=6, value="min")
        reg_model = TpeIpls(x_train=X_train, y_train=y_train, x_test=X_test, y_test=y_test, Kfold=3, scale=True, n_intervall=3)
        reg_model.tune(n_iter=10)
    if regression_algo in ["SciKitLearn PLSR", "Jchemo Local Weighted PLSR", "Intervalle Selection PLSR"]:
        with container3:
            st.header("Model Diagnosis", divider='blue')
            yc = Reg.pred_data_[0]
            ycv = Reg.pred_data_[1]
            yt = Reg.pred_data_[2]
            M7.write('Predicted vs Measured values')
            M7.pyplot(reg_plot([y_train, y_train, y_test], [yc, ycv, yt]))
            M8.write('Residuals plot')
            M8.pyplot(resid_plot([y_train, y_train, y_test], [yc, ycv, yt]))
    # export the model with pickle or joblib
    if regression_algo != '':
        M1.write("-- Performance metrics --")
        M1.dataframe(Reg.metrics_)
        M1.write("-- Save the model --")
        #model_export = M1.selectbox("Choose way to export", options=["pickle", "joblib"], key=20)
        model_name = M1.text_input('Give it a name')
        if M1.button('Export Model'):
            #export_package = __import__(model_export)
            with open('data/models/model_' + model_name + '_on_' + xcal_csv.name + '_and_' + ycal_csv.name + '_data_' + '.pkl', 'wb') as f:
                joblib.dump(reg_model, f)
            # create a report with information on the model
            ## see https://stackoverflow.com/a/59578663
    #M4.pyplot(reg_plot(meas=[ycal_csv, ycal_csv, ycal_csv], pred=[ycal_csv, ycal_csv, ycal_csv]))
# Prediction module - TO BE DONE !!!!!
st.write("---")
st.write("Predict chemical values from NIRS")
NIRS_csv = file_column.file_uploader("Select NIRS Data to predict", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")

export_folder = './data/predictions/'
export_name = 'Predictions_of_'
if NIRS_csv:
    export_name += str(NIRS_csv.name[:-4])
    qsep = file_column.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+NIRS_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+NIRS_csv.name))), key=2)
    qhdr = file_column.selectbox("Index column in csv? - _detected_: " + str(find_col_index('data/'+NIRS_csv.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+NIRS_csv.name))), key=3)

    # load the saved model with joblib
    model_column.write("Load your saved predictive model")
    model_name_import = model_column.selectbox('Choose file:', options=os.listdir('data/models/'), key=21)
    if model_name_import != ' ':
        export_name += '_with_' + str(model_name_import[:-4])
        with open('data/models/' + model_name_import, 'rb') as f:
            # deserialize the estimator that was saved above with joblib.dump
            model_loaded = joblib.load(f)
            model_column.success("The model has been loaded successfully", icon="✅")
        # use the prediction function from application_functions.py to predict chemical values
        result = prediction(NIRS_csv, qsep, qhdr, model_loaded)
        st.write('Predicted values are: ')

        pd.DataFrame(result).to_csv(export_folder + export_name + '.csv')
        # export to local drive - Download
        download_results(export_folder + export_name + '.csv', export_name + '.csv')
        # create a report with information on the prediction
        ## see https://stackoverflow.com/a/59578663