diff --git a/app.py b/app.py
index 654599da5b0c1dd146da53bc7046dd5d57c7422f..a7dc627f6d504a0598c133d09fe0974fd29b1096 100644
--- a/app.py
+++ b/app.py
@@ -3,8 +3,13 @@ from Packages import *
 st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
 from Modules import *
+from Packages import *
 from Class_Mod.DATA_HANDLING import *
 
+# graphical delimiter
+st.write("---")
+
+
 # load images for web interface
 img_sselect = Image.open("images\sselect.JPG")
 img_general = Image.open("images\general.JPG")
 img_predict = Image.open("images\predict.JPG")
@@ -21,199 +26,278 @@ with st.sidebar:
 with st.container():
     st.subheader("Plateforme d'Analyses Chimiques pour l'Ecologie-PACE :goat:")
     st.title("NIRS Utils")
-    st.write("Sample selection, Predictive Modelling & Predictions making using [Pinard](https://github.com/GBeurier/pinard) and PACE NIRS Database.")
-    st.image(img_general)
+    st.write("Sample selection, Predictive Modelling and Prediction using [Pinard](https://github.com/GBeurier/pinard) and the PACE NIRS Database.")
+    #st.image(img_general)
 
-# graphical delimiter
-st.write("---")
 
-# Sample Selection module
-with st.container():
-    st.header("Sample Selection")
-    st.image(img_sselect)
-    st.write("Sample selection using PCA and K-Means algorithms")
-    # split 2 columns 4:1 ratio
-    scatter_column, settings_column = st.columns((4, 1))
-    scatter_column.write("**Multi-Dimensional Analysis**")
-    settings_column.write("**Settings**")
-    # loader for csv file containing NIRS spectra
-    sselectx_csv = settings_column.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5)
+################################### Data Loading and Visualization ########################################
+container1 = st.container(border=True)
+col2, col1 = st.columns([3, 1])
+
+
+container2 = st.container(border=True)
+container2.header("Exploratory Data Analysis - Multivariate Data Analysis", divider='blue')
+scores, loadings, pc = st.columns([2, 2, 0.5])
+influence, hotelling, qexp = st.columns([2, 2, 1])
+
+
+with container1:
+    col1.header("NIRS Data Loading", divider='blue')
+    col2.header("Spectral Data Visualization", divider='blue')
+
+    with col1:
+        # loader for csv file containing NIRS spectra
+        sselectx_csv = st.file_uploader("Load NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5)
+        if sselectx_csv is not None:
+            # Select list for CSV delimiter
+            psep = st.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+sselectx_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+sselectx_csv.name))), key=9)
+            # Select list for CSV header True / False
+            phdr = st.selectbox("indexes column in csv? - _detected_: " + str(find_col_index('data/'+sselectx_csv.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+sselectx_csv.name))), key=31)
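Note: the loader above leans on `find_delimiter` and `find_col_index` from `Class_Mod.DATA_HANDLING`, which are not part of this diff. A minimal sketch of what such helpers could look like, assuming they only need to sniff the separator and guess whether the first column holds sample identifiers (the heuristics here are illustrative, not the project's actual implementation):

```python
import csv

def find_delimiter(path: str) -> str:
    """Guess the field separator of a csv file from a sample of its content."""
    with open(path, newline="") as f:
        sample = f.read(4096)
    return csv.Sniffer().sniff(sample, delimiters=";,").delimiter

def find_col_index(path: str) -> str:
    """Return 'yes' if the first column looks like an index column rather than a wavelength."""
    with open(path, newline="") as f:
        first_cell = f.readline().split(find_delimiter(path))[0].strip()
    try:
        float(first_cell)   # spectral columns usually start with a numeric wavelength
        return "no"
    except ValueError:
        return "yes"
```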
- _detected_: " + str(find_col_index('data/'+sselectx_csv.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+sselectx_csv.name))), key=31) + if phdr == 'yes': + col = 0 + else: + col = False + data_import = pd.read_csv(sselectx_csv, sep=psep, index_col=col) + st.success("The data have been loaded successfully", icon="✅") + ## Visualize spectra + + if sselectx_csv is not None: + with col2: + fig, ax = plt.subplots(figsize = (30,7)) + data_import.T.plot(legend=False, ax = ax, color = 'blue') + ax.set_xlabel('Wavelength/Wavenumber', fontsize=18) + ax.set_ylabel('Signal', fontsize=18) + plt.margins(x = 0) + st.pyplot(fig) + + st.write("Summary") + info = pd.DataFrame({'N':[data_import.shape[0]], + 'Min': [np.min(data_import)], + 'Max':[np.max(data_import)],}, index = ['Values']).T + info.rename_axis('information') + st.table(data=info) +###################################################################################### + +############################## Exploratory data analysis ############################### +with container2: if sselectx_csv is not None: - # Select list for CSV delimiter - psep = settings_column.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+sselectx_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+sselectx_csv.name))), key=9) - # Select list for CSV header True / False - phdr = settings_column.selectbox("indexes column in csv? - _detected_: " + str(find_col_index('data/'+sselectx_csv.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+sselectx_csv.name))), key=31) - if phdr == 'yes': - col = 0 - else: - col = False - data_import = pd.read_csv(sselectx_csv, sep=psep, index_col=col) - # Select type of plot - plot_type=['', 'pca','umap'] - type_plot = settings_column.selectbox("Dimensional reduction: ", options=plot_type, key=37) - # compute UMAP - umap_maker in application_functions.py - if type_plot == 'umap': - pc_data, cat_cols, pc_cols = umap_maker(data_import) - # compute PCA - pca_maker function in application_functions.py - if type_plot == 'pca': - pc_data, cat_cols, pc_cols = pca_maker(data_import) - if type_plot == 'umap' or type_plot == 'pca': + plot_type=['', 'PCA','UMAP', 'NMF'] + cluster_methods = ['', 'Kmeans','UMAP', 'AP'] + + with pc: + type_plot = st.selectbox("Dimensionality reduction techniques: ", options=plot_type, key=37) + type_cluster = st.selectbox("Clustering techniques: ", options=cluster_methods, key=38) + # compute UMAP - umap_maker in application_functions.py + if type_plot == 'PCA': + model = LinearPCA(data_import, Ncomp=5) + elif type_plot =='UMAP': + pass + if type_plot in ['PCA', 'UMAP']: # add 2 select lists to choose which component to plot - pc_1 = settings_column.selectbox("First Principle Component", options=pc_cols, index=0) - pc_2 = settings_column.selectbox("Second Principle Component", options=pc_cols, index=1) - # if categorical variables exist, add 2 select lists to choose the categorical variables to color the PCA - if cat_cols[0] == "no categories": - plot_pc = scatter_column.plotly_chart(px.scatter(data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800, hover_name=pc_data.index, title="PC plot of sample spectra")) - else: - categorical_variable = settings_column.selectbox("Variable Select", options = cat_cols) - categorical_variable_2 = settings_column.selectbox("Second Variable Select (hover data)", options = cat_cols) - plot_pc = 
-            # Clustering method
-            cluster_type = ['', 'k-means', 'umap']
-            # cluster_type = ['k-means', 'umap'] # uncomment if more clustering algorithms available
-            type_cluster = settings_column.selectbox("Clustering method: ", options=cluster_type, key=38)
-            # clustering via K-Means
-            if type_cluster == 'k-means':
-                #K-Means
-                ## K-Means choose number of clusters
-                wcss_samples = []
-                cluster_max = settings_column.slider("Max clusters (K-Means)", min_value=2, max_value=100, value=50, format="%i")
-                clusters_sample = np.arange(2, cluster_max)
-                for i in clusters_sample:
-                    kmeans_samples = km(n_clusters = i, init = 'k-means++', random_state = 42)
-                    kmeans_samples.fit(pc_data.loc[:,[pc_1,pc_2]])
-                    wcss_samples.append(kmeans_samples.inertia_)
-                settings_column.plotly_chart(px.line(x=clusters_sample, y=wcss_samples, title="K-Means clusters nb sel", width=200))
-                ## Draw clustering
-                nb_select = settings_column.slider("Choose cluster number (K-Means)", min_value=2, max_value=cluster_max, value=5, format="%i")
-                kmeans_samples = km(n_clusters=nb_select, random_state=42)
-                kmeans_samples.fit(pc_data.loc[:,[pc_1,pc_2]])
-                # choose between cluster centered sample and n-random samples
-                selection = settings_column.select_slider('Centered samples or random ones', options=['center','random'])
-                export = []
-                scatter_column.write("Selected samples for chemical analysis:")
-                if selection == 'center':
-                    # list samples at clusters centers - Use sklearn.metrics.pairwise_distances_argmin if you want more than 1 sample per cluster
-                    closest, _ = pairwise_distances_argmin_min(kmeans_samples.cluster_centers_, pc_data.loc[:,[pc_1,pc_2]])
-                    scatter_column.dataframe(pc_data.loc[pc_data.index[closest],[pc_1,pc_2]].index, use_container_width=False)
-                    export.append(pc_data.loc[pc_data.index[closest],[pc_1,pc_2]].index.T)
-                    # list indexes of selected samples for colored plot
-                    te = pc_data.loc[pc_data.index[closest],[pc_1,pc_2]].index.values.tolist()
-                elif selection == 'random':
-                    selection_number = settings_column.number_input('How many samples per cluster?', step=1, value = 3)
-                    for i in np.unique(kmeans_samples.labels_):
-                        if len(pd.DataFrame(pc_data.loc[pc_data.index[kmeans_samples.labels_==i],[pc_1,pc_2]])) >= selection_number:
-                            # another k-means to cluster in 'selection_number' clusters and random to ensure the selected samples are far from each other in each cluster
-                            kmeans_selected_samples = km(n_clusters=selection_number, random_state=42)
-                            kmeans_selected_samples.fit(pc_data.loc[pc_data.index[kmeans_samples.labels_==i],[pc_1,pc_2]])
-                            closest_selected_samples, _ = pairwise_distances_argmin_min(kmeans_selected_samples.cluster_centers_, pc_data.loc[:,[pc_1,pc_2]])
-                            export.append(pc_data.loc[pc_data.index[closest_selected_samples],[pc_1,pc_2]].index)
-                        else:
-                            export.append(pc_data.loc[pc_data.index[kmeans_samples.labels_==i]].index)
-                    # list indexes of selected samples for colored plot
-                    te = []
-                    for sublist in export:
-                        for item in sublist:
-                            te.append(item)
-                # display a matrix of selected samples
-                scatter_column.write(pd.DataFrame(export).T)
-                # convert cluster number to text for optimized coloring
-                kmeans_samples.labels_ = kmeans_samples.labels_.astype(str)
-                for j in te:
-                    kmeans_samples.labels_[pc_data.index.get_loc(j)] = 'selected'
-                # plot de pc with colored clusters and selected samples
-                graph_selected = px.scatter(data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800, color=kmeans_samples.labels_, hover_name=pc_data.index, title="PC projection with K-Means Clusters and selected samples")
-                plot = scatter_column.plotly_chart(graph_selected)
-                # button to export the names of selected samples - by cluster if random - in a csv
-                if scatter_column.button('Export'):
-                    pd.DataFrame(export).T.to_csv('./data/sample_selections/Samples_from_' + sselectx_csv.name + '_for_Chemical_Analysis.csv')
-    else:
-        scatter_column.write("_Please Choose a file_")
-    # clustering via UMAP / HDBSCAN -- TO BE DONE !!!
-    if type_cluster == 'hdbscan':
-        import hdbscan
-        # plot de pc with colored clusters and selected samples
-        # graph_selected = px.scatter(data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800, color=kmeans_samples.labels_, hover_name=pc_data.index, title="PC projection with K-Means Clusters and selected samples")
-        # plot = scatter_column.plotly_chart(graph_selected)
-        scatter_column.dataframe(pc_data)
-        labels = hdbscan.HDBSCAN(min_samples=4, min_cluster_size=10,).fit_predict(pc_data.loc[:,[pc_1,pc_2]])
-        clustered = (labels >= 0)
-        graph_clustered = plt.scatter(standard_embedding[clustered, 0], standard_embedding[clustered, 1], c=labels[clustered], s=0.1, cmap='Spectral')
-        plot = scatter_column.plotly_chart(graph_selected)
-# graphical delimiter
-st.write("---")
+            axis1 = pc.selectbox("x-axis", options = model.scores_.columns, index=0)
+            axis2 = pc.selectbox("y-axis", options = model.scores_.columns, index=1)
+            axis3 = pc.selectbox("z-axis", options = model.scores_.columns, index=2)
+
+            if type_cluster == 'Kmeans':
+                cl = Sk_Kmeans(pd.concat([model.scores_.loc[:,axis1], model.scores_.loc[:,axis2], model.scores_.loc[:,axis3]], axis = 1), max_clusters = 30)
+
+            with scores:
+                t = model.scores_
+                if type_cluster in ['Kmeans', 'UMAP', 'AP']:
+                    st.write('Scree plot')
+                    fig2 = px.scatter(cl.inertia_.T, y = 'inertia')
+                    st.plotly_chart(fig2)
+
+                    ncluster = st.number_input(min_value=2, max_value=30, value=3, label = 'Select the desired number of clusters')
+                    data, colors = cl.fit_optimal(nclusters=ncluster)
+                    #fig = px.scatter(data, x=axis1, y=axis2, color= colors)
+                    st.write('Scores plot')
+                    fig = px.scatter_3d(data, x=axis1, y=axis2, z = axis3, color=colors)
+                else:
+                    fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3)
+
+                st.plotly_chart(fig)
+
+            with loadings:
+                st.write('Loadings plot')
+                p = model.loadings_
+                pp = pd.concat([p, pd.DataFrame(np.arange(p.shape[0]), index=p.index, columns=['wl'])], axis =1)
+                df1 = pp.melt(id_vars="wl")
+                fig = px.line(df1, x = 'wl', y = 'value', color='variable')
+                fig.update_layout(
+                    legend=dict(x=1, y=0,
+                                font=dict(family="Courier", size=12, color="black"),
+                                bordercolor="Black", borderwidth=2))
+                st.plotly_chart(fig)
+
+            with influence:
+                st.write('Influence plot')
+                ax1 = st.selectbox("Component", options=model.scores_.columns, index=3, key=35)
+                leverage = model.leverage_
+                residuals = model.residuals
+                fig = px.scatter(x=leverage[ax1], y=residuals[ax1], color = leverage[ax1]*residuals[ax1])
+                st.plotly_chart(fig)
+
+            with hotelling:
+                st.write('T²-Hotelling vs Q residuals plot')
+                ax2 = st.selectbox("Component", options=model.scores_.columns, index=4, key=36)
+                # TODO: plot Hotelling's T² against Q residuals; the scores scatter below is a placeholder
+                t = model.scores_
+                fig = px.scatter(t, x=axis1, y=t.columns[1])
+                st.plotly_chart(fig)
+
+            with qexp:
+                pass
+
+        else:
+            st.markdown('Select a dimensionality reduction technique from the dropdown list')
+
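Note: `Sk_Kmeans`, with its `inertia_` table and `fit_optimal()` method, is also defined outside this diff. Assuming it is a thin wrapper around scikit-learn's `KMeans` that collects the inertia curve for the scree plot and then refits with the chosen cluster count, a sketch could look like this (names copied from the usage above, implementation hypothetical):

```python
import pandas as pd
from sklearn.cluster import KMeans

class SkKmeansSketch:
    """Collect K-means inertia for k = 2..max_clusters, then fit the chosen k."""

    def __init__(self, scores: pd.DataFrame, max_clusters: int = 30):
        self.scores = scores
        ks = list(range(2, max_clusters + 1))
        inertias = [KMeans(n_clusters=k, n_init=10, random_state=42).fit(scores).inertia_ for k in ks]
        # One row named 'inertia' so the caller can plot inertia_.T against 'inertia'
        self.inertia_ = pd.DataFrame([inertias], index=["inertia"], columns=ks)

    def fit_optimal(self, nclusters: int):
        labels = KMeans(n_clusters=nclusters, n_init=10, random_state=42).fit_predict(self.scores)
        return self.scores, labels.astype(str)   # scores + cluster labels used as plot colours
```

The string-typed labels simply make plotly treat the clusters as discrete colours, which is what the 3-D scores plot above expects.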
+
+
+########################################################################################
 # Model creation module
-with st.container():
-    st.header("Create a model")
-    st.image(img_predict)
-    st.write("Create a model to then predict chemical values from NIRS spectra")
-    available_regression_algo = ["","SciKitLearn PLSR", "Jchemo Local Weighted PLSR"]
+container2 = st.container(border=True)
+
+M1, M2, M3 = st.columns([2,2,2])
+M4, M5 = st.columns([6,2])
+container3 = st.container(border=True)
+M7, M8 = st.columns([2,2])
+
+available_regression_algo = ["", "SciKitLearn PLSR", "Jchemo Local Weighted PLSR", "Interval Selection PLSR"]
+with container2:
+    st.header("Calibration Model Development", divider='blue')
+    st.write("Create a predictive model, then use it to predict your target variable (chemical values) from NIRS spectra")
     # CSV files loader
-    xcal_csv = st.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
-    ycal_csv = st.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
+    xcal_csv = M3.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
+    ycal_csv = M3.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
+
     if xcal_csv is not None and ycal_csv is not None:
         # Select list for CSV delimiter
-        sep = st.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+xcal_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+xcal_csv.name))), key=0)
+        sep = M3.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+xcal_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+xcal_csv.name))), key=0)
         # Select list for CSV header True / False
-        hdr = st.selectbox("indexes column in csv? - _detected_: " + str(find_col_index('data/'+xcal_csv.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+xcal_csv.name))), key=1)
-        regression_algo = st.selectbox("Choose the algorithm for regression", options=available_regression_algo, key = 12)
+        hdr = M3.selectbox("indexes column in csv? - _detected_: " + str(find_col_index('data/'+xcal_csv.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+xcal_csv.name))), key=1)
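Note: the added lines just below load the calibration matrices and split them with Pinard's Kennard-Stone sampler. As a standalone illustration of that call pattern (the keyword arguments mirror the diff; the file names and exact import paths are assumptions, not verified against the pinard package layout):

```python
import pandas as pd
from pinard import utils                                  # assumed import path
from pinard.model_selection import train_test_split_idx   # assumed import path

# Hypothetical file names; any spectra/chemistry csv pair with matching rows would do.
x, y = utils.load_csv("Xcal.csv", "Ycal.csv", autoremove_na=True, sep=";",
                      x_hdr=0, y_hdr=0, x_index_col=0, y_index_col=0)

# Kennard-Stone keeps the most "spread out" samples for calibration;
# 25 % of the rows are reserved for the test set, as in the app.
train_index, test_index = train_test_split_idx(
    x, y=y, method="kennard_stone", metric="correlation",
    test_size=0.25, random_state=42)

X_train, X_test = pd.DataFrame(x[train_index]), pd.DataFrame(x[test_index])
y_train, y_test = pd.DataFrame(y[train_index]), pd.DataFrame(y[test_index])
```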
- _detected_: " + str(find_col_index('data/'+xcal_csv.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+xcal_csv.name))), key=1) + if hdr == 'yes': + col = 0 + else: + col = False + rd_seed = M1.slider("Choose seed", min_value=1, max_value=1212, value=42, format="%i") + x, y = utils.load_csv(xcal_csv, ycal_csv, autoremove_na=True, sep=sep, x_hdr=0, y_hdr=0, x_index_col=col, y_index_col=col) + # Split data into training and test sets using the kennard_stone method and correlation metric, 25% of data is used for testing + train_index, test_index = train_test_split_idx(x, y=y, method="kennard_stone", metric="correlation", test_size=0.25, random_state=rd_seed) + # Assign data to training and test sets + X_train, y_train, X_test, y_test = pd.DataFrame(x[train_index]), pd.DataFrame(y[train_index]), pd.DataFrame(x[test_index]), pd.DataFrame(y[test_index]) + ############################# + + regression_algo = M1.selectbox("Choose the algorithm for regression", options=available_regression_algo, key = 12) + if regression_algo == 'SciKitLearn PLSR': - rd_seed = st.slider("Choose seed", min_value=1, max_value=1212, value=42, format="%i") # Train model with model function from application_functions.py - trained_model = model_PLSR(xcal_csv, ycal_csv, sep, hdr, rd_seed) + Reg = PinardPlsr(x_train=X_train, x_test=X_test,y_train=y_train, y_test=y_test) + reg_model = Reg.model_ + + #M2.dataframe(Pin.pred_data_) + elif regression_algo == 'Jchemo Local Weighted PLSR': - trained_model = model_LWPLSR(xcal_csv, ycal_csv, sep, hdr) + reg_model = model_LWPLSR(xcal_csv, ycal_csv, sep, hdr) + + elif regression_algo == "Intervalle Selection PLSR": + s = M2.number_input(label='Enter the maximum number of intervalls', min_value=1, max_value=6, value="min") + reg_model = TpeIpls(x_train= X_train, y_train= y_train, x_test=X_test, y_test= y_test,Kfold= 3,scale= True, n_intervall = 3) + reg_model.tune(n_iter=10) + + if regression_algo in ["SciKitLearn PLSR", "Jchemo Local Weighted PLSR", "Intervalle Selection PLSR"]: + with container3: + st.header("Model Diagnosis", divider='blue') + yc = Reg.pred_data_[0] + ycv = Reg.pred_data_[1] + yt = Reg.pred_data_[2] + M7.write('Predicted vs Measured values') + M7.pyplot(reg_plot([y_train, y_train, y_test],[yc, ycv, yt])) + M8.write('Residuals plot') + M8.pyplot(resid_plot([y_train, y_train, y_test],[yc, ycv, yt])) + + + + + + + # Export the model with pickle or joblib if regression_algo != '': - st.write("-- Save the model --") - model_export = st.selectbox("Choose way to export", options=["pickle", "joblib"], key=20) - model_name = st.text_input('Give it a name') - if st.button('Export Model'): - export_package = __import__(model_export) - with open('data/models/model_' + model_name + '_on_' + xcal_csv.name + '_and_' + ycal_csv.name + '_data_' + model_export + '.pkl','wb') as f: - export_package.dump(trained_model,f) + M1.write("-- Performance metrics --") + M1.dataframe(Reg.metrics_) + M1.write("-- Save the model --") + #model_export = M1.selectbox("Choose way to export", options=["pickle", "joblib"], key=20) + model_name = M1.text_input('Give it a name') + if M1.button('Export Model'): + #export_package = __import__(model_export) + with open('data/models/model_' + model_name + '_on_' + xcal_csv.name + '_and_' + ycal_csv.name + '_data_' + '.pkl','wb') as f: + joblib.dump(reg_model,f) st.write('Model Exported') + # create a report with information on the model ## see https://stackoverflow.com/a/59578663 + + 
+    #M4.pyplot(reg_plot(meas=[ycal_csv, ycal_csv, ycal_csv], pred=[ycal_csv, ycal_csv, ycal_csv]))
+
+
 # graphical delimiter
 st.write("---")
+
+
+
+#M9, M10, M11 = st.columns([2,2,2])
 # Prediction module - TO BE DONE !!!!!
 with st.container():
-    st.header("Predict")
+    st.header("Predictions")
     st.write("---")
     st.write("Predict chemical values from NIRS")
-    file_column, space, model_column = st.columns((3, 1, 3))
+    model_column, space, file_column = st.columns((2, 1, 1))
     NIRS_csv = file_column.file_uploader("Select NIRS Data to predict", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
     export_name = './data/predictions/Predictions_of_'
     if NIRS_csv:
         export_name += str(NIRS_csv.name[:-4])
         qsep = file_column.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+NIRS_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+NIRS_csv.name))), key=2)
         qhdr = file_column.selectbox("indexes column in csv? - _detected_: " + str(find_col_index('data/'+NIRS_csv.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+NIRS_csv.name))), key=3)
+
         # Load the model with pickle or joblib
-        model_column.write("Load a saved model")
-        model_import = model_column.selectbox("Choose a way to import", options=["pickle", "joblib"], key=22)
-        model_name_import = model_column.selectbox('Choose file:', options=[' '] + list_files('data/models/', model_import), key = 21)
+        model_column.write("Load your saved predictive model")
+        model_name_import = model_column.selectbox('Choose file:', options=os.listdir('data/models/'), key = 21)
         if model_name_import != ' ':
             export_name += '_with_' + str(model_name_import[:-4])
-            export_package = __import__(model_import)
             with open('data/models/'+ model_name_import,'rb') as f:
-                model_loaded = export_package.load(f)
+                model_loaded = joblib.load(f)
             if model_loaded:
-                model_column.write('Model Imported')
+                model_column.success("The model has been loaded successfully", icon="✅")
         result = ''
+
     if st.button("Predict"):
         # use prediction function from application_functions.py to predict chemical values
         result = prediction(NIRS_csv, qsep, qhdr, model_loaded)
         st.write('Predicted values are: ')
-        st.dataframe(result)
+        st.dataframe(result.T)
         pd.DataFrame(result).to_csv(export_name + '.csv')
         st.write('Predictions exported to ' + export_name + '.csv')
         # export to local drive
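Note: `prediction()` lives in application_functions.py and is unchanged by this diff. For context, a plausible shape for the round trip it performs, given how it is called here (a csv path, separator, header flag, and a model reloaded with `joblib.load`); the real function may differ, in particular in how it preprocesses the spectra before calling the model:

```python
import joblib
import pandas as pd

def predict_from_saved_model(nirs_csv: str, qsep: str, qhdr: str, model_path: str) -> pd.DataFrame:
    """Reload an exported model and apply it to a new spectra matrix."""
    with open(model_path, "rb") as f:
        model_loaded = joblib.load(f)
    index_col = 0 if qhdr == "yes" else None
    spectra = pd.read_csv(nirs_csv, sep=qsep, index_col=index_col)
    y_pred = model_loaded.predict(spectra)        # shape (n_samples, 1) for PLS-type models
    return pd.DataFrame(y_pred, index=spectra.index)
```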