"""NIRS Utils - Streamlit front end.

Three page sections are rendered top to bottom:

* Sample Selection - dimensional reduction (PCA / UMAP) of uploaded spectra,
  clustering, and export of the samples picked for chemical analysis.
* Create a model  - train a regression model on spectra + chemical values
  and save it with pickle or joblib.
* Predict         - load a saved model and predict values for new spectra.
"""
import importlib
from pathlib import Path

import streamlit as st
# help on streamlit input https://docs.streamlit.io/library/api-reference/widgets
from PIL import Image
# emojis code here : https://www.webfx.com/tools/emoji-cheat-sheet/
# NOTE: kept ahead of the remaining imports, as in the original script, so it
# stays the first Streamlit command that is executed.
st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.cluster import KMeans as km
from sklearn.metrics import pairwise_distances_argmin_min
from application_functions import pca_maker, model_PLSR, model_LWPLSR, prediction, find_delimiter, umap_maker, find_col_index, list_files, CV_model

# Choices offered by the csv-settings select boxes.
CSV_SEPARATORS = [";", ","]
INDEX_COLUMN_CHOICES = ["no", "yes"]


def _detected_index(options, detected):
    """Return the position of *detected* in *options*, or 0 when it is absent."""
    try:
        return options.index(str(detected))
    except ValueError:
        # The detected value is not one of the offered choices: fall back to
        # the first option instead of crashing the page.
        return 0


def _csv_settings(container, csv_name, sep_key, hdr_key):
    """Render the separator / index-column select boxes for an uploaded csv.

    Args:
        container: object exposing ``selectbox`` (``st`` or a column).
        csv_name: name of the uploaded file.
        sep_key: widget key of the separator select box.
        hdr_key: widget key of the index-column select box.

    Returns:
        Tuple ``(separator, index_column_choice)`` as selected by the user.
    """
    # NOTE(review): detection reads 'data/<name>', i.e. it presumably expects a
    # copy of the uploaded file to exist in the local data/ folder - confirm.
    detected_sep = str(find_delimiter('data/' + csv_name))
    separator = container.selectbox(
        "Select csv separator - _detected_: " + detected_sep,
        options=CSV_SEPARATORS,
        index=_detected_index(CSV_SEPARATORS, detected_sep),
        key=sep_key,
    )
    detected_hdr = str(find_col_index('data/' + csv_name))
    index_choice = container.selectbox(
        "indexes column in csv? - _detected_: " + detected_hdr,
        options=INDEX_COLUMN_CHOICES,
        index=_detected_index(INDEX_COLUMN_CHOICES, detected_hdr),
        key=hdr_key,
    )
    return separator, index_choice


# load images for web interface
# Forward slashes: "\s", "\g" and "\p" are invalid escape sequences in a normal
# string literal, and backslash paths only resolve on Windows.
img_sselect = Image.open("images/sselect.JPG")
img_general = Image.open("images/general.JPG")
img_predict = Image.open("images/predict.JPG")

# TOC menu on the left
with st.sidebar:
    st.markdown("[Sample Selection](#sample-selection)")
    st.markdown("[Model Creation](#create-a-model)")
    st.markdown("[Prediction](#predict)")

# Page header
with st.container():
    st.subheader("Plateforme d'Analyses Chimiques pour l'Ecologie :goat:")
    st.title("NIRS Utils")
    st.write("Sample selections, Modelisations & Predictions using [Pinard](https://github.com/GBeurier/pinard) and PACE NIRS Database.")
    st.image(img_general)

# graphical delimiter
st.write("---")

# Sample Selection module
with st.container():
    st.header("Sample Selection")
    st.image(img_sselect)
    st.write("Sample selection using PCA and K-Means algorythms")
    # split 2 columns 4:1 ratio
    scatter_column, settings_column = st.columns((4, 1))
    scatter_column.write("**Multi-Dimensional Analysis**")
    settings_column.write("**Settings**")
    # loader for csv file containing NIRS spectra
    sselectx_csv = settings_column.file_uploader(
        "Select NIRS Data",
        type="csv",
        help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns",
        key=5,
    )
    if sselectx_csv is not None:
        # Select lists for CSV delimiter and for index column yes / no
        psep, phdr = _csv_settings(settings_column, sselectx_csv.name, sep_key=9, hdr_key=31)
        # pandas: index_col=False means "do not use the first column as index"
        col = 0 if phdr == 'yes' else False
        data_import = pd.read_csv(sselectx_csv, sep=psep, index_col=col)

        # Select type of plot
        plot_type = ['', 'pca', 'umap']
        type_plot = settings_column.selectbox("Dimensional reduction: ", options=plot_type, key=37)
        # compute UMAP - umap_maker in application_functions.py
        if type_plot == 'umap':
            pc_data, cat_cols, pc_cols = umap_maker(data_import)
        # compute PCA - pca_maker function in application_functions.py
        if type_plot == 'pca':
            pc_data, cat_cols, pc_cols = pca_maker(data_import)

        if type_plot in ('umap', 'pca'):
            # add 2 select lists to choose which component to plot
            pc_1 = settings_column.selectbox("First Principle Component", options=pc_cols, index=0)
            pc_2 = settings_column.selectbox("Second Principle Component", options=pc_cols, index=1)
            # if categorical variables exist, add 2 select lists to choose the
            # categorical variables used to color / annotate the plot
            if cat_cols[0] == "no categories":
                scatter_column.plotly_chart(px.scatter(
                    data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800,
                    hover_name=pc_data.index, title="PC plot of sample spectra"))
            else:
                categorical_variable = settings_column.selectbox("Variable Select", options=cat_cols)
                categorical_variable_2 = settings_column.selectbox("Second Variable Select (hover data)", options=cat_cols)
                scatter_column.plotly_chart(px.scatter(
                    data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800,
                    color=categorical_variable, hover_data=[categorical_variable_2],
                    hover_name=pc_data.index, title="PC plot of sample spectra"))

            # Clustering method
            cluster_type = ['', 'k-means', 'umap']
            type_cluster = settings_column.selectbox("Clustering method: ", options=cluster_type, key=38)
            # the two plotted components, used by every clustering method below
            projection = pc_data.loc[:, [pc_1, pc_2]]

            # clustering via K-Means
            if type_cluster == 'k-means':
                ## K-Means choose number of clusters (elbow plot of inertia)
                cluster_max = settings_column.slider("Max clusters (K-Means)", min_value=2, max_value=100, value=50, format="%i")
                clusters_sample = np.arange(2, cluster_max)
                wcss_samples = [
                    km(n_clusters=k, init='k-means++', random_state=42).fit(projection).inertia_
                    for k in clusters_sample
                ]
                settings_column.plotly_chart(px.line(x=clusters_sample, y=wcss_samples, title="K-Means clusters nb sel", width=200))

                ## Draw clustering
                # The default must not exceed max_value, which happens when
                # "Max clusters" is set below 5.
                nb_select = settings_column.slider(
                    "Choose cluster number (K-Means)",
                    min_value=2, max_value=cluster_max, value=min(5, cluster_max), format="%i")
                kmeans_samples = km(n_clusters=nb_select, random_state=42)
                kmeans_samples.fit(projection)

                # choose between cluster centered sample and n-random samples
                selection = settings_column.select_slider('Centered samples or random ones', options=['center', 'random'])
                export = []
                scatter_column.write("Selected samples for chemical analysis:")
                if selection == 'center':
                    # list samples at clusters centers - Use sklearn.metrics.pairwise_distances_argmin
                    # if you want more than 1 sample per cluster
                    closest, _ = pairwise_distances_argmin_min(kmeans_samples.cluster_centers_, projection)
                    center_samples = pc_data.index[closest]
                    scatter_column.dataframe(center_samples, use_container_width=False)
                    export.append(center_samples)
                elif selection == 'random':
                    # at least one sample per cluster: 0 or a negative value
                    # would be passed to KMeans as n_clusters
                    selection_number = settings_column.number_input('How many samples per cluster?', min_value=1, step=1, value=3)
                    for cluster_id in np.unique(kmeans_samples.labels_):
                        in_cluster = kmeans_samples.labels_ == cluster_id
                        cluster_points = pc_data.loc[in_cluster, [pc_1, pc_2]]
                        if len(cluster_points) >= selection_number:
                            # another k-means to cluster in 'selection_number' clusters so that the
                            # selected samples are far from each other in each cluster
                            kmeans_selected_samples = km(n_clusters=selection_number, random_state=42)
                            kmeans_selected_samples.fit(cluster_points)
                            closest_selected_samples, _ = pairwise_distances_argmin_min(
                                kmeans_selected_samples.cluster_centers_, projection)
                            export.append(pc_data.index[closest_selected_samples])
                        else:
                            # cluster smaller than the request: keep all of it
                            export.append(pc_data.index[in_cluster])
                    # display a matrix of selected samples
                    scatter_column.write(pd.DataFrame(export).T)

                # list indexes of selected samples for colored plot
                te = [item for sublist in export for item in sublist]
                # convert cluster number to text for optimized coloring; work
                # on a copy so the fitted estimator is left untouched
                point_labels = kmeans_samples.labels_.astype(str)
                for j in te:
                    point_labels[pc_data.index.get_loc(j)] = 'selected'
                # plot the PCs with colored clusters and selected samples
                graph_selected = px.scatter(
                    data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800,
                    color=point_labels, hover_name=pc_data.index,
                    title="PC projection with K-Means Clusters and selected samples")
                scatter_column.plotly_chart(graph_selected)
                # button to export the names of selected samples - by cluster if random - in a csv
                if scatter_column.button('Export'):
                    selection_path = './data/sample_selections/Samples_from_' + sselectx_csv.name + '_for_Chemical_Analysis.csv'
                    Path(selection_path).parent.mkdir(parents=True, exist_ok=True)
                    pd.DataFrame(export).T.to_csv(selection_path)

            # clustering via UMAP / HDBSCAN -- still experimental
            elif type_cluster == 'umap':
                import hdbscan  # only needed for this branch
                scatter_column.dataframe(pc_data)
                labels = hdbscan.HDBSCAN(min_samples=4, min_cluster_size=10).fit_predict(projection)
                # HDBSCAN labels noise points -1: plot clustered points only
                clustered = labels >= 0
                graph_clustered = px.scatter(
                    data_frame=pc_data.loc[clustered], x=pc_1, y=pc_2, template="simple_white", height=800,
                    color=labels[clustered].astype(str), hover_name=pc_data.index[clustered],
                    title="PC projection with HDBSCAN clusters")
                scatter_column.plotly_chart(graph_clustered)
    else:
        scatter_column.write("_Please Choose a file_")

# graphical delimiter
st.write("---")

# Model creation module
with st.container():
    st.header("Create a model")
    st.image(img_predict)
    st.write("Create a model to then predict chemical values from NIRS spectra")
    available_regression_algo = ["", "SciKitLearn PLSR", "Jchemo Local Weighted PLSR"]
    # CSV files loader
    xcal_csv = st.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
    ycal_csv = st.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
    if xcal_csv is not None and ycal_csv is not None:
        # Select lists for CSV delimiter and for index column yes / no
        sep, hdr = _csv_settings(st, xcal_csv.name, sep_key=0, hdr_key=1)
        regression_algo = st.selectbox("Choose the algorithm for regression", options=available_regression_algo, key=12)
        if regression_algo == 'SciKitLearn PLSR':
            rd_seed = st.slider("Choose seed", min_value=1, max_value=1212, value=42, format="%i")
            # Train model with model function from application_functions.py
            trained_model = model_PLSR(xcal_csv, ycal_csv, sep, hdr, rd_seed)
        elif regression_algo == 'Jchemo Local Weighted PLSR':
            trained_model = model_LWPLSR(xcal_csv, ycal_csv, sep, hdr)

        # Export the model with pickle or joblib
        if regression_algo != '':
            st.write("-- Save the model --")
            model_export = st.selectbox("Choose way to export", options=["pickle", "joblib"], key=20)
            model_name = st.text_input('Give it a name')
            if st.button('Export Model'):
                # both modules expose the same dump(obj, file) entry point
                export_package = importlib.import_module(model_export)
                model_path = ('data/models/model_' + model_name + '_on_' + xcal_csv.name
                              + '_and_' + ycal_csv.name + '_data_' + model_export + '.pkl')
                Path(model_path).parent.mkdir(parents=True, exist_ok=True)
                with open(model_path, 'wb') as f:
                    export_package.dump(trained_model, f)
                st.write('Model Exported')
                # create a report with information on the model
                ## see https://stackoverflow.com/a/59578663

# graphical delimiter
st.write("---")

# Prediction module
with st.container():
    st.header("Predict")
    st.write("---")
    st.write("Predict chemical values from NIRS")
    file_column, space, model_column = st.columns((3, 1, 3))
    NIRS_csv = file_column.file_uploader("Select NIRS Data to predict", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
    export_name = './data/predictions/Predictions_of_'
    # Defaults so that pressing "Predict" too early is reported instead of
    # failing on undefined names.
    qsep = qhdr = None
    model_loaded = None
    if NIRS_csv is not None:
        export_name += Path(NIRS_csv.name).stem
        qsep, qhdr = _csv_settings(file_column, NIRS_csv.name, sep_key=2, hdr_key=3)

    # Load the model with pickle or joblib
    model_column.write("Load a saved model")
    model_import = model_column.selectbox("Choose a way to import", options=["pickle", "joblib"], key=22)
    model_name_import = model_column.selectbox('Choose file:', options=[' '] + list_files('data/models/', model_import), key=21)
    if model_name_import != ' ':
        export_name += '_with_' + Path(model_name_import).stem
        export_package = importlib.import_module(model_import)
        # SECURITY: pickle / joblib loading executes code stored in the file;
        # only model files produced by this application should live in data/models/.
        with open('data/models/' + model_name_import, 'rb') as f:
            model_loaded = export_package.load(f)
        if model_loaded is not None:
            model_column.write('Model Imported')

    result = ''
    if st.button("Predict"):
        if NIRS_csv is None or model_loaded is None:
            st.warning("Please select NIRS data and a saved model before predicting.")
        else:
            # use prediction function from application_functions.py to predict chemical values
            result = prediction(NIRS_csv, qsep, qhdr, model_loaded)
            st.write('Predicted values are: ')
            st.dataframe(result)
            predictions = pd.DataFrame(result)
            predictions_path = export_name + '.csv'
            Path(predictions_path).parent.mkdir(parents=True, exist_ok=True)
            predictions.to_csv(predictions_path)
            st.write('Predictions exported to ' + predictions_path)
            # export to the user's local drive: the file above is written on the
            # machine running the app, so offer it as a browser download
            st.download_button(
                "Download predictions",
                data=predictions.to_csv(),
                file_name=Path(predictions_path).name,
                mime="text/csv",
            )
            # create a report with information on the prediction
            ## see https://stackoverflow.com/a/59578663
    if isinstance(result, list):
        st.write(result)