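# NOTE: the "Newer" / "Older" labels that preceded this script were page-navigation
# residue from the repository web view, not part of the program.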
import streamlit as st
# help on streamlit input https://docs.streamlit.io/library/api-reference/widgets
from PIL import Image
# emojis code here : https://www.webfx.com/tools/emoji-cheat-sheet/
# Page-level settings (browser tab title, tab icon, wide layout). This call is
# placed ahead of every other st.* command in the script, including the
# remaining imports below.
st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.cluster import KMeans as km
from sklearn.metrics import pairwise_distances_argmin_min
from application_functions import pca_maker, model, predict, find_delimiter, umap_maker
# Load the banner images displayed in the web interface.
# Forward slashes are used because the original "images\..." literals contained
# the invalid escape sequences "\s", "\g" and "\p" and only resolved on Windows;
# "/" is accepted as a path separator on every platform.
img_sselect = Image.open("images/sselect.JPG")
img_general = Image.open("images/general.JPG")
img_predict = Image.open("images/predict.JPG")
# Sidebar navigation: one markdown anchor link per section header of the page.
for nav_link in (
    "[Sample Selection](#sample-selection)",
    "[Model Creation](#create-a-model)",
    "[Prediction](#predict)",
):
    st.sidebar.markdown(nav_link)
# Page banner: organisation name, application title, short description and
# the general illustration.
banner = st.container()
banner.subheader("Plateforme d'Analyses Chimiques pour l'Ecologie :goat:")
banner.title("NIRS Utils")
banner.write("Sample selections, Modelisations & Predictions using [Pinard](https://github.com/GBeurier/pinard) and PACE NIRS Database.")
banner.image(img_general)
# ---------------------------------------------------------------------------
# Sample Selection section
#
# Workflow: upload a spectra CSV -> project it with PCA or UMAP -> cluster the
# 2-D projection with K-Means -> pick samples per cluster (closest to the
# centre, or several spread-out samples) -> optionally export the selection.
#
# NOTE(review): the indentation of this section was reconstructed from the
# statement order; confirm the nesting against the repository version.
# ---------------------------------------------------------------------------
with st.container():
    st.header("Sample Selection")
    st.image(img_sselect)
    st.write("Sample selection using PCA and K-Means algorythms")

    # split 2 columns 4:1 ratio
    scatter_column, settings_column = st.columns((4, 1))
    scatter_column.write("**Multi-Dimensional Analysis**")
    settings_column.write("**Settings**")

    # loader for csv file containing NIRS spectra
    sselectx_csv = settings_column.file_uploader(
        "Select NIRS Data",
        type="csv",
        help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns",
        key=5,
    )

    if sselectx_csv is not None:
        # Select list for CSV delimiter. The delimiter is detected once and
        # only pre-selected when it is one of the offered options; otherwise
        # the first option is used instead of raising ValueError.
        # NOTE(review): find_delimiter is given a path under 'data/', so a copy
        # of the uploaded file is presumably expected there - confirm.
        psep_options = [";", ","]
        detected_psep = str(find_delimiter('data/' + sselectx_csv.name))
        psep_default = psep_options.index(detected_psep) if detected_psep in psep_options else 0
        psep = settings_column.selectbox(
            "Select csv separator - _detected_: " + detected_psep,
            options=psep_options,
            index=psep_default,
            key=9,
        )

        # Select list for CSV header True / False
        phdr = settings_column.selectbox("indexes column in csv?", options=["no", "yes"], key=31)
        # index_col=False tells pandas not to use the first column as index.
        col = 0 if phdr == 'yes' else False
        data_import = pd.read_csv(sselectx_csv, sep=psep, index_col=col)

        # Select type of plot
        plot_type = ['pca', 'umap']
        type_plot = settings_column.selectbox("Dimensional reduction: ", options=plot_type, key=37)
        if type_plot == 'umap':
            # compute UMAP - umap_maker in application_functions.py
            pc_data, cat_cols, pc_cols = umap_maker(data_import)
        elif type_plot == 'pca':
            # compute PCA - pca_maker function in application_functions.py
            pc_data, cat_cols, pc_cols = pca_maker(data_import)

        # add 2 select lists to choose which component to plot
        pc_1 = settings_column.selectbox("First Principle Component", options=pc_cols, index=0)
        pc_2 = settings_column.selectbox("Second Principle Component", options=pc_cols, index=1)

        # NOTE(review): both the plain and the coloured projection are drawn,
        # one after the other. The wording "if categorical variables exist"
        # suggests they may have been meant as alternatives - confirm.
        plot_pc = scatter_column.plotly_chart(
            px.scatter(
                data_frame=pc_data,
                x=pc_1,
                y=pc_2,
                template="simple_white",
                height=800,
                hover_name=pc_data.index,
                title="PC plot of sample spectra",
            )
        )
        # select lists to choose the categorical variables used to colour the plot
        categorical_variable = settings_column.selectbox("Variable Select", options=cat_cols)
        categorical_variable_2 = settings_column.selectbox("Second Variable Select (hover data)", options=cat_cols)
        plot_pc = scatter_column.plotly_chart(
            px.scatter(
                data_frame=pc_data,
                x=pc_1,
                y=pc_2,
                template="simple_white",
                height=800,
                color=categorical_variable,
                hover_data=[categorical_variable_2],
                hover_name=pc_data.index,
                title="PC plot of sample spectra",
            )
        )

        # Clustering method
        cluster_type = ['k-means', 'umap']
        type_cluster = settings_column.selectbox("Clustering method: ", options=cluster_type, key=38)

        if type_cluster == 'k-means':
            # The two plotted components are the only features used for clustering.
            projection = pc_data.loc[:, [pc_1, pc_2]]

            ## K-Means choose number of clusters (elbow curve of inertia)
            cluster_max = settings_column.slider("Max clusters (K-Means)", min_value=2, max_value=100, value=50, format="%i")
            # K-Means cannot fit more clusters than there are samples, so the
            # requested maximum is capped (never below the slider minimum of 2).
            cluster_max = max(2, min(cluster_max, len(projection)))
            # The range is inclusive so the curve covers every selectable value.
            clusters_sample = np.arange(2, cluster_max + 1)
            wcss_samples = [
                km(n_clusters=int(k), init='k-means++', random_state=42).fit(projection).inertia_
                for k in clusters_sample
            ]
            settings_column.plotly_chart(px.line(x=clusters_sample, y=wcss_samples, title="K-Means clusters nb sel", width=200))

            ## Draw clustering
            nb_select = settings_column.slider(
                "Choose cluster number (K-Means)",
                min_value=2,
                max_value=cluster_max,
                value=min(5, cluster_max),  # default must stay inside the slider bounds
                format="%i",
            )
            kmeans_samples = km(n_clusters=nb_select, random_state=42)
            kmeans_samples.fit(projection)

            # choose between cluster centered sample and random samples
            selection = settings_column.select_slider('Centered samples or random ones', options=['center', 'random'])
            export = []
            scatter_column.write("Selected samples for chemical analysis:")

            if selection == 'center':
                # one sample per cluster: the sample closest to each cluster centre
                closest, _ = pairwise_distances_argmin_min(kmeans_samples.cluster_centers_, projection)
                centre_samples = pc_data.index[closest]
                scatter_column.dataframe(centre_samples, use_container_width=False)
                export.append(centre_samples.T)
                # list indexes of selected samples for colored plot
                te = centre_samples.values.tolist()
            elif selection == 'random':
                selection_number = settings_column.number_input('How many samples per cluster?', min_value=1, step=1, value=3)
                for i in np.unique(kmeans_samples.labels_):
                    cluster_points = projection[kmeans_samples.labels_ == i]
                    if len(cluster_points) >= selection_number:
                        # another k-means splits the cluster in 'selection_number'
                        # sub-clusters so the selected samples are far from each other
                        kmeans_selected_samples = km(n_clusters=selection_number, random_state=42)
                        kmeans_selected_samples.fit(cluster_points)
                        # The nearest sample is searched inside the current cluster
                        # only, so a neighbouring cluster's sample cannot be picked.
                        closest_selected_samples, _ = pairwise_distances_argmin_min(
                            kmeans_selected_samples.cluster_centers_, cluster_points
                        )
                        export.append(cluster_points.index[closest_selected_samples])
                    else:
                        # cluster smaller than the requested number: keep all of it
                        export.append(cluster_points.index)
                # list indexes of selected samples for colored plot
                te = [item for sublist in export for item in sublist]
                # display a matrix of selected samples
                scatter_column.write(pd.DataFrame(export).T)

            # convert cluster number to text for optimized coloring; a separate
            # array is used so the fitted estimator's labels_ stay untouched
            cluster_labels = kmeans_samples.labels_.astype(str).astype(object)
            cluster_labels[pc_data.index.isin(te)] = 'selected'

            # plot the pc with colored clusters and selected samples
            graph_selected = px.scatter(
                data_frame=pc_data,
                x=pc_1,
                y=pc_2,
                template="simple_white",
                height=800,
                color=cluster_labels,
                hover_name=pc_data.index,
                title="PC projection with K-Means Clusters and selected samples",
            )
            plot = scatter_column.plotly_chart(graph_selected)

            # button to export the names of selected samples - by cluster if random - in a csv
            if scatter_column.button('Export'):
                pd.DataFrame(export).T.to_csv('./data/Samples_for_Chemical_Analysis.csv')
        elif type_cluster == 'umap':
            # UMAP-based clustering is not implemented yet.
            pass
    else:
        scatter_column.write("_Please Choose a file_")
# ---------------------------------------------------------------------------
# Model creation section: upload spectra (X) and chemical values (Y), choose
# the CSV options and a seed, then train through model() from
# application_functions.py.
# ---------------------------------------------------------------------------
with st.container():
    st.header("Create a model")
    st.image(img_predict)
    st.write("Create a model to then predict chemical values from NIRS spectra")
    xcal_csv = st.file_uploader(
        "Select NIRS Data",
        type="csv",
        help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns",
    )
    ycal_csv = st.file_uploader(
        "Select corresponding Chemical Data",
        type="csv",
        help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column",
    )
    if xcal_csv is not None and ycal_csv is not None:
        # Select list for CSV delimiter. The delimiter is detected once and
        # only pre-selected when it is one of the offered options; otherwise
        # the first option is used instead of raising ValueError.
        # NOTE(review): find_delimiter is given a path under 'data/', so a copy
        # of the uploaded file is presumably expected there - confirm.
        sep_options = [";", ","]
        detected_sep = str(find_delimiter('data/' + xcal_csv.name))
        sep_default = sep_options.index(detected_sep) if detected_sep in sep_options else 0
        sep = st.selectbox(
            "Select csv separator - CSV Detected separator: " + detected_sep,
            options=sep_options,
            index=sep_default,
            key=0,
        )
        # Select list for column indexes True / False
        hdr = st.selectbox("column indexes in csv?", options=["yes", "no"], key=1)
        rd_seed = st.slider("Choose seed", min_value=1, max_value=1212, value=42, format="%i")
        # Train model with model function from application_functions.py
        trained_model = model(xcal_csv, ycal_csv, sep, hdr, rd_seed)
# Prediction module - still to be implemented. For now it only collects the
# inputs and hands the click over to predict() from application_functions.py.
predict_section = st.container()
predict_section.header("Predict")
predict_section.write("---")
predict_section.write("Predict chemical values from NIRS")
NIRS_csv = predict_section.file_uploader(
    "Select NIRS Data to predict",
    type="csv",
    help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns",
)
# CSV reading options for the file to predict
psep = predict_section.selectbox("Select csv separator", options=[";", ","], key=2)
phdr = predict_section.selectbox("indexes column in csv?", options=["yes", "no"], key=3)
predict_section.button("Predict", on_click=predict)