Newer
Older
import streamlit as st
# help on streamlit input https://docs.streamlit.io/library/api-reference/widgets
from PIL import Image
# emojis code here : https://www.webfx.com/tools/emoji-cheat-sheet/
st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.cluster import KMeans as km
from sklearn.metrics import pairwise_distances_argmin_min
from application_functions import pca_maker, model_PLSR, model_LWPLSR, prediction, find_delimiter, umap_maker, find_col_index, list_files, CV_model
# Load the banner images displayed by the three sections of the web interface.
# Forward slashes are used on purpose: the previous "images\sselect.JPG" style
# relied on the invalid escape sequences \s, \g and \p (a SyntaxWarning on
# recent Python versions) and only resolved on Windows.
img_sselect = Image.open("images/sselect.JPG")
img_general = Image.open("images/general.JPG")
img_predict = Image.open("images/predict.JPG")
# Sidebar: one markdown anchor link per page section, rendered in order.
with st.sidebar:
    for section_link in (
        "[Sample Selection](#sample-selection)",
        "[Model Creation](#create-a-model)",
        "[Prediction](#predict)",
    ):
        st.markdown(section_link)
# Page banner: platform name, application title, short description and the
# general illustration, all rendered inside a single container.
banner = st.container()
banner.subheader("Plateforme d'Analyses Chimiques pour l'Ecologie :goat:")
banner.title("NIRS Utils")
banner.write("Sample selections, Modelisations & Predictions using [Pinard](https://github.com/GBeurier/pinard) and PACE NIRS Database.")
banner.image(img_general)
# --- Sample Selection section -------------------------------------------------
# Workflow: upload a spectra CSV, project it with PCA or UMAP, cluster the 2-D
# projection, pick representative samples per cluster and optionally export
# their names to a CSV file.
with st.container():
    st.header("Sample Selection")
    st.image(img_sselect)
    st.write("Sample selection using PCA and K-Means algorythms")
    # split 2 columns 4:1 ratio
    scatter_column, settings_column = st.columns((4, 1))
    scatter_column.write("**Multi-Dimensional Analysis**")
    settings_column.write("**Settings**")
    # loader for csv file containing NIRS spectra
    sselectx_csv = settings_column.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5)
    if sselectx_csv is not None:
        # The detection helpers are given a path under 'data/' built from the
        # uploaded file name; each is called once and its result reused.
        source_path = 'data/' + sselectx_csv.name

        # Select list for CSV delimiter. Fall back to the first option when the
        # detected delimiter is not one of the proposed choices (list.index
        # would otherwise raise ValueError).
        sep_options = [";", ","]
        detected_sep = str(find_delimiter(source_path))
        psep = settings_column.selectbox(
            "Select csv separator - _detected_: " + detected_sep,
            options=sep_options,
            index=sep_options.index(detected_sep) if detected_sep in sep_options else 0,
            key=9,
        )

        # Select list for CSV index column yes / no (same fallback as above).
        hdr_options = ["no", "yes"]
        detected_hdr = str(find_col_index(source_path))
        phdr = settings_column.selectbox(
            "indexes column in csv? - _detected_: " + detected_hdr,
            options=hdr_options,
            index=hdr_options.index(detected_hdr) if detected_hdr in hdr_options else 0,
            key=31,
        )
        # index_col=False tells pandas not to use the first column as index.
        col = 0 if phdr == 'yes' else False
        data_import = pd.read_csv(sselectx_csv, sep=psep, index_col=col)

        plot_type = ['', 'pca', 'umap']
        type_plot = settings_column.selectbox("Dimensional reduction: ", options=plot_type, key=37)
        if type_plot == 'umap':
            # compute UMAP - umap_maker in application_functions.py
            pc_data, cat_cols, pc_cols = umap_maker(data_import)
        elif type_plot == 'pca':
            # compute PCA - pca_maker function in application_functions.py
            pc_data, cat_cols, pc_cols = pca_maker(data_import)

        if type_plot in ('umap', 'pca'):
            # add 2 select lists to choose which component to plot
            pc_1 = settings_column.selectbox("First Principle Component", options=pc_cols, index=0)
            pc_2 = settings_column.selectbox("Second Principle Component", options=pc_cols, index=1)
            # if categorical variables exist, add 2 select lists to choose the
            # categorical variables used to color / annotate the projection
            if cat_cols[0] == "no categories":
                scatter_column.plotly_chart(px.scatter(data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800, hover_name=pc_data.index, title="PC plot of sample spectra"))
            else:
                categorical_variable = settings_column.selectbox("Variable Select", options=cat_cols)
                categorical_variable_2 = settings_column.selectbox("Second Variable Select (hover data)", options=cat_cols)
                scatter_column.plotly_chart(px.scatter(data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800, color=categorical_variable, hover_data=[categorical_variable_2], hover_name=pc_data.index, title="PC plot of sample spectra"))

            # 2-D coordinates every clustering step below works on.
            points_2d = pc_data.loc[:, [pc_1, pc_2]]

            # Clustering method. The list must be defined before the selectbox
            # uses it (it was previously only present as a commented-out line).
            cluster_type = ['k-means', 'umap']
            type_cluster = settings_column.selectbox("Clustering method: ", options=cluster_type, key=38)

            # clustering via K-Means
            if type_cluster == 'k-means':
                ## K-Means: elbow curve (inertia per cluster count) to help
                ## choose the number of clusters
                wcss_samples = []
                cluster_max = settings_column.slider("Max clusters (K-Means)", min_value=2, max_value=100, value=50, format="%i")
                clusters_sample = np.arange(2, cluster_max)
                for i in clusters_sample:
                    kmeans_samples = km(n_clusters=i, init='k-means++', random_state=42)
                    kmeans_samples.fit(points_2d)
                    wcss_samples.append(kmeans_samples.inertia_)
                settings_column.plotly_chart(px.line(x=clusters_sample, y=wcss_samples, title="K-Means clusters nb sel", width=200))

                ## Draw clustering. The default is clamped so that it never
                ## exceeds the slider maximum chosen above.
                nb_select = settings_column.slider("Choose cluster number (K-Means)", min_value=2, max_value=cluster_max, value=min(5, cluster_max), format="%i")
                kmeans_samples = km(n_clusters=nb_select, random_state=42)
                kmeans_samples.fit(points_2d)

                # choose between cluster centered sample and n-random samples
                selection = settings_column.select_slider('Centered samples or random ones', options=['center', 'random'])
                export = []
                scatter_column.write("Selected samples for chemical analysis:")
                if selection == 'center':
                    # list samples at clusters centers - Use
                    # sklearn.metrics.pairwise_distances_argmin if you want
                    # more than 1 sample per cluster
                    closest, _ = pairwise_distances_argmin_min(kmeans_samples.cluster_centers_, points_2d)
                    center_samples = pc_data.index[closest]
                    scatter_column.dataframe(center_samples, use_container_width=False)
                    export.append(center_samples.T)
                    # list indexes of selected samples for colored plot
                    te = center_samples.values.tolist()
                elif selection == 'random':
                    # min_value=1: K-Means cannot be asked for zero or a
                    # negative number of clusters.
                    selection_number = settings_column.number_input('How many samples per cluster?', min_value=1, step=1, value=3)
                    for i in np.unique(kmeans_samples.labels_):
                        cluster_points = points_2d.loc[kmeans_samples.labels_ == i]
                        if len(cluster_points) >= selection_number:
                            # another k-means inside the cluster so that the
                            # selected samples are far from each other
                            kmeans_selected_samples = km(n_clusters=selection_number, random_state=42)
                            kmeans_selected_samples.fit(cluster_points)
                            # Search the nearest samples among the members of
                            # this cluster only, so a pick can never belong to
                            # a neighbouring cluster.
                            closest_selected_samples, _ = pairwise_distances_argmin_min(kmeans_selected_samples.cluster_centers_, cluster_points)
                            export.append(cluster_points.index[closest_selected_samples])
                        else:
                            # fewer members than requested: keep them all
                            export.append(cluster_points.index)
                    # list indexes of selected samples for colored plot
                    te = [item for sublist in export for item in sublist]
                    # display a matrix of selected samples
                    scatter_column.write(pd.DataFrame(export).T)

                # convert cluster number to text for discrete coloring; work on
                # a copy instead of overwriting the estimator's labels_
                cluster_labels = kmeans_samples.labels_.astype(str)
                cluster_labels[pc_data.index.isin(te)] = 'selected'
                # plot the projection with colored clusters and selected samples
                graph_selected = px.scatter(data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800, color=cluster_labels, hover_name=pc_data.index, title="PC projection with K-Means Clusters and selected samples")
                scatter_column.plotly_chart(graph_selected)
                # button to export the names of selected samples - by cluster
                # if random - in a csv
                if scatter_column.button('Export'):
                    pd.DataFrame(export).T.to_csv('./data/sample_selections/Samples_from_' + sselectx_csv.name + '_for_Chemical_Analysis.csv')

            # clustering via HDBSCAN -- work in progress: clusters are only
            # displayed, no sample selection / export yet
            elif type_cluster == 'umap':
                import hdbscan
                scatter_column.dataframe(pc_data)
                labels = hdbscan.HDBSCAN(min_samples=4, min_cluster_size=10).fit_predict(points_2d)
                # negative labels are excluded from the plot
                clustered = labels >= 0
                graph_clustered = px.scatter(
                    data_frame=pc_data.loc[clustered],
                    x=pc_1,
                    y=pc_2,
                    template="simple_white",
                    height=800,
                    color=labels[clustered].astype(str),
                    hover_name=pc_data.index[clustered],
                    title="PC projection with HDBSCAN clusters",
                )
                scatter_column.plotly_chart(graph_clustered)
    else:
        scatter_column.write("_Please Choose a file_")
# --- Model creation section ---------------------------------------------------
# Workflow: upload spectra (X) and chemical values (Y), choose a regression
# algorithm, train it through the helpers of application_functions.py and
# optionally serialize the trained model under data/models/.
with st.container():
    st.header("Create a model")
    st.image(img_predict)
    st.write("Create a model to then predict chemical values from NIRS spectra")
    available_regression_algo = ["", "SciKitLearn PLSR", "Jchemo Local Weighted PLSR"]
    xcal_csv = st.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
    ycal_csv = st.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
    if xcal_csv is not None and ycal_csv is not None:
        # The detection helpers are given a path under 'data/' built from the
        # uploaded file name; each is called once and its result reused.
        xcal_path = 'data/' + xcal_csv.name

        # Select list for CSV delimiter. Fall back to the first option when the
        # detected delimiter is not one of the proposed choices (list.index
        # would otherwise raise ValueError).
        sep_options = [";", ","]
        detected_sep = str(find_delimiter(xcal_path))
        sep = st.selectbox(
            "Select csv separator - _detected_: " + detected_sep,
            options=sep_options,
            index=sep_options.index(detected_sep) if detected_sep in sep_options else 0,
            key=0,
        )

        # Select list for CSV index column yes / no (same fallback as above).
        hdr_options = ["no", "yes"]
        detected_hdr = str(find_col_index(xcal_path))
        hdr = st.selectbox(
            "indexes column in csv? - _detected_: " + detected_hdr,
            options=hdr_options,
            index=hdr_options.index(detected_hdr) if detected_hdr in hdr_options else 0,
            key=1,
        )

        regression_algo = st.selectbox("Choose the algorithm for regression", options=available_regression_algo, key=12)
        if regression_algo == 'SciKitLearn PLSR':
            rd_seed = st.slider("Choose seed", min_value=1, max_value=1212, value=42, format="%i")
            # Train model with model function from application_functions.py
            trained_model = model_PLSR(xcal_csv, ycal_csv, sep, hdr, rd_seed)
        elif regression_algo == 'Jchemo Local Weighted PLSR':
            trained_model = model_LWPLSR(xcal_csv, ycal_csv, sep, hdr)

        # Export the model with pickle or joblib. Only offered once an
        # algorithm has been chosen, i.e. once trained_model exists.
        if regression_algo != '':
            st.write("-- Save the model --")
            model_export = st.selectbox("Choose way to export", options=["pickle", "joblib"], key=20)
            model_name = st.text_input('Give it a name')
            if st.button('Export Model'):
                # The serializer module is imported by name from the selectbox
                # value; both offered modules expose dump(obj, file).
                export_package = __import__(model_export)
                with open('data/models/model_' + model_name + '_on_' + xcal_csv.name + '_and_' + ycal_csv.name + '_data_' + model_export + '.pkl', 'wb') as f:
                    export_package.dump(trained_model, f)
                st.write('Model Exported')
                # create a report with information on the model
                ## see https://stackoverflow.com/a/59578663
# --- Prediction section (work in progress) -----------------------------------
# Workflow: upload spectra to predict, load a previously saved model, run the
# prediction helper of application_functions.py, write the result to
# data/predictions/ and offer it for download.
# NOTE(review): the nesting of this section was reconstructed; the model loader
# is rendered whether or not a spectra file has been uploaded - confirm.
with st.container():
    st.header("Predict")
    st.write("---")
    st.write("Predict chemical values from NIRS")
    file_column, space, model_column = st.columns((3, 1, 3))
    NIRS_csv = file_column.file_uploader("Select NIRS Data to predict", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
    export_name = './data/predictions/Predictions_of_'
    if NIRS_csv:
        # drop the 4-character extension (".csv") from the uploaded file name
        export_name += str(NIRS_csv.name[:-4])
        nirs_path = 'data/' + NIRS_csv.name

        # Separator / index-column select lists. Fall back to the first option
        # when the detected value is not one of the proposed choices
        # (list.index would otherwise raise ValueError).
        sep_options = [";", ","]
        detected_sep = str(find_delimiter(nirs_path))
        qsep = file_column.selectbox(
            "Select csv separator - _detected_: " + detected_sep,
            options=sep_options,
            index=sep_options.index(detected_sep) if detected_sep in sep_options else 0,
            key=2,
        )
        hdr_options = ["no", "yes"]
        detected_hdr = str(find_col_index(nirs_path))
        qhdr = file_column.selectbox(
            "indexes column in csv? - _detected_: " + detected_hdr,
            options=hdr_options,
            index=hdr_options.index(detected_hdr) if detected_hdr in hdr_options else 0,
            key=3,
        )

    # Load the model with pickle or joblib
    model_column.write("Load a saved model")
    model_import = model_column.selectbox("Choose a way to import", options=["pickle", "joblib"], key=22)
    model_name_import = model_column.selectbox('Choose file:', options=[' '] + list_files('data/models/', model_import), key=21)
    # Stays None until a model file has been chosen and loaded, so the Predict
    # handler below can check it instead of hitting an undefined name.
    model_loaded = None
    if model_name_import != ' ':
        export_name += '_with_' + str(model_name_import[:-4])
        export_package = __import__(model_import)
        # SECURITY: pickle / joblib loading executes code embedded in the file;
        # only model files from the trusted local data/models/ folder must be
        # offered here.
        with open('data/models/' + model_name_import, 'rb') as f:
            model_loaded = export_package.load(f)
        if model_loaded is not None:
            model_column.write('Model Imported')

    result = ''
    if st.button("Predict"):
        if not NIRS_csv or model_loaded is None:
            # Both inputs are required; say so instead of crashing.
            st.warning("Please select NIRS data and a saved model before predicting")
        else:
            # use prediction function from application_functions.py to predict
            # chemical values
            result = prediction(NIRS_csv, qsep, qhdr, model_loaded)
            st.write('Predicted values are: ')
            st.dataframe(result)
            predictions_table = pd.DataFrame(result)
            predictions_table.to_csv(export_name + '.csv')
            st.write('Predictions exported to ' + export_name + '.csv')
            # export to local drive: serve the CSV through a download button.
            # The earlier urlretrieve() call fetched a URL from the app's own
            # port and wrote the response over the file saved just above.
            st.download_button(
                "Download predictions",
                data=predictions_table.to_csv().encode('utf-8'),
                file_name=export_name.rsplit('/', 1)[-1] + '.csv',
                mime='text/csv',
            )
            # create a report with information on the prediction
            ## see https://stackoverflow.com/a/59578663
    if isinstance(result, list):
        st.write(result)