# (Viewer navigation residue "Newer" / "Older" from a web code viewer; not part of the program.)
import streamlit as st
# help on streamlit input https://docs.streamlit.io/library/api-reference/widgets
from PIL import Image
# Emoji shortcodes (such as ":goat:") are listed at: https://www.webfx.com/tools/emoji-cheat-sheet/
# Configure the browser tab title, the page icon and a wide page layout.
# NOTE(review): this call sits between the imports, ahead of every other
# Streamlit call in the file — presumably on purpose; confirm before moving it.
st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.cluster import KMeans as km
from sklearn.metrics import pairwise_distances_argmin_min
from application_functions import pca_maker, model, predict, find_delimiter
# Load the images shown in the web interface.
# Forward slashes are used on purpose: "\s", "\g" and "\p" are not valid
# escape sequences in a normal string literal (Python warns about them), and a
# backslash is only a path separator on Windows, whereas "/" is accepted on
# every platform.
img_sselect = Image.open("images/sselect.JPG")
img_general = Image.open("images/general.JPG")
img_predict = Image.open("images/predict.JPG")
# Sidebar navigation: one markdown anchor link per page section.
st.sidebar.markdown("[Sample Selection](#sample-selection)")
st.sidebar.markdown("[Model Creation](#create-a-model)")
st.sidebar.markdown("[Prediction](#predict)")
# Page banner: subheader, title, one-line description and the general image,
# all rendered into a single container.
banner = st.container()
banner.subheader("Plateforme d'Analyses Chimiques pour l'Ecologie :goat:")
banner.title("NIRS Utils")
banner.write("Sample selections, Modelisations & Predictions using [Pinard](https://github.com/GBeurier/pinard) and PACE NIRS Database.")
banner.image(img_general)
# --- Sample Selection section: upload, import settings, PCA component choice ---
# NOTE(review): indentation was lost in this copy of the file, so the nesting
# of the `with` / `if` bodies below cannot be read from here — confirm the
# structure against the original source before editing.
with st.container():
st.header("Sample Selection")
st.image(img_sselect)
st.write("Sample selection using PCA and K-Means algorythms")
# Split the section into 2 columns with a 4:1 width ratio: plots on the left,
# settings widgets on the right.
scatter_column, settings_column = st.columns((4, 1))
scatter_column.write("**Multi-Dimensional Analysis**")
settings_column.write("**Settings**")
# Uploader for the CSV file containing the NIRS spectra
sselectx_csv = settings_column.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5)
if sselectx_csv is not None:
# Select list for the CSV delimiter, preselected on the value returned by
# find_delimiter for the path 'data/<uploaded file name>'.
# NOTE(review): find_delimiter is called twice with the same argument, and
# `[";", ","].index(...)` raises ValueError when the detected delimiter is
# neither ";" nor ",". It is also given a path under data/ rather than the
# uploaded object — confirm the file is expected to exist there.
psep = settings_column.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+sselectx_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+sselectx_csv.name))), key=9)
# Select list: does the CSV contain an index column? (yes / no)
phdr = settings_column.selectbox("indexes column in csv?", options=["no", "yes"], key=31)
# Translate the answer into the `index_col` argument passed to read_csv below.
if phdr == 'yes':
col = 0
else:
col = False
import_button = settings_column.button('Import')
if import_button:
data_import = pd.read_csv(sselectx_csv, sep=psep, index_col=col)
# compute PCA - pca_maker function in application_functions.py
# NOTE(review): no pca_maker call is visible in this copy, yet pca_data,
# pca_cols and cat_cols are used below without being assigned — presumably a
# line calling pca_maker on data_import is missing here; confirm.
# add 2 select lists to choose which component to plot
pca_1 = settings_column.selectbox("First Principle Component", options=pca_cols, index=0)
pca_2 = settings_column.selectbox("Second Principle Component", options=pca_cols, index=1)
# When the first entry of cat_cols is the "no categories" marker, draw a plain
# scatter plot; otherwise add 2 select lists to choose the categorical
# variables used for colouring and hover data.
if cat_cols[0] == "no categories":
scatter_column.plotly_chart(px.scatter(data_frame=pca_data, x=pca_1, y=pca_2, template="simple_white", height=800, hover_name=pca_data.index, title="PCA plot of sample spectra"))
else:
categorical_variable = settings_column.selectbox("Variable Select", options = cat_cols)
categorical_variable_2 = settings_column.selectbox("Second Variable Select (hover data)", options = cat_cols)
# (Line-number gutter residue "65" .. "111" from a web code viewer stood here; it is not program code.)
# Scatter plot of the two selected components, coloured by the first chosen
# categorical variable, with the second one added to the hover data.
# NOTE(review): presumably the last statement of the `else:` branch that
# creates categorical_variable / categorical_variable_2 — indentation was lost
# in this copy; confirm.
scatter_column.plotly_chart(px.scatter(data_frame=pca_data, x=pca_1, y=pca_2, template="simple_white", height=800, color=categorical_variable, hover_data = [categorical_variable_2], hover_name=pca_data.index, title="PCA plot of sample spectra"))
# K-Means
## Choose the number of clusters: fit K-Means for every candidate cluster count
## on the two selected components and plot the inertia (within-cluster sum of
## squares) against the cluster count.
cluster_max = settings_column.slider("Max clusters (K-Means)", min_value=2, max_value=100, value=50, format="%i")
pca_plane = pca_data.loc[:, [pca_1, pca_2]]
# K-Means cannot be fitted with more clusters than samples, so cap the upper
# bound by the number of rows.
cluster_limit = min(cluster_max, len(pca_plane))
# The upper bound is inclusive: the value chosen as "Max clusters" is itself
# evaluated, so the curve covers every count selectable in the slider below.
clusters_sample = np.arange(2, cluster_limit + 1)
wcss_samples = []
for n_clusters in clusters_sample:
    kmeans_samples = km(n_clusters=n_clusters, init='k-means++', random_state=42)
    kmeans_samples.fit(pca_plane)
    wcss_samples.append(kmeans_samples.inertia_)
settings_column.plotly_chart(px.line(x=clusters_sample, y=wcss_samples, title="K-Means clusters number selection", width=200))
## Draw clustering
# Keep the default inside the selectable range when fewer than 5 clusters are
# allowed.
nb_select = settings_column.slider("Choose cluster number (K-Means)", min_value=2, max_value=cluster_limit, value=min(5, cluster_limit), format="%i")
kmeans_samples = km(n_clusters=nb_select, random_state=42)
kmeans_samples.fit(pca_plane)
# Same projection as before, coloured by the K-Means label of each sample.
plot = scatter_column.plotly_chart(px.scatter(data_frame=pca_data, x=pca_1, y=pca_2, template="simple_white", height=800, color=kmeans_samples.labels_, hover_name=pca_data.index, title="PCA projection with K-Means Clusters"))
# Choose between the sample closest to each cluster centre and random samples
# drawn from each cluster, show the chosen sample identifiers and optionally
# export them to CSV.
selection = settings_column.select_slider('Centered samples or random ones', options=['center','random'])
export = []
scatter_column.write("Selected samples for chemical analysis:")
if selection == 'center':
    # One sample per cluster: the point closest to each cluster centre.
    # Use sklearn.metrics.pairwise_distances_argmin if you want more than
    # 1 sample per cluster.
    closest, _ = pairwise_distances_argmin_min(kmeans_samples.cluster_centers_, pca_data.loc[:,[pca_1,pca_2]])
    centre_samples = pca_data.loc[pca_data.index[closest],[pca_1,pca_2]]
    scatter_column.dataframe(centre_samples, use_container_width=True)
    export.append(centre_samples.index)
elif selection == 'random':
    # min_value=1 rejects zero or negative counts, which would otherwise
    # select nothing or make DataFrame.sample raise.
    selection_number = settings_column.number_input('How many samples per cluster?', min_value=1, step=1, value=3)
    for cluster_id in np.unique(kmeans_samples.labels_):
        # Rows belonging to the current cluster (looked up once per cluster).
        cluster_rows = pca_data.loc[pca_data.index[kmeans_samples.labels_ == cluster_id]]
        if len(cluster_rows) >= selection_number:
            export.append(cluster_rows.sample(n=selection_number).index)
        else:
            # Not enough samples in this cluster: keep the whole cluster.
            export.append(cluster_rows.index)
# One column per entry of `export` (a single column in 'center' mode, one per
# cluster in 'random' mode).
export_table = pd.DataFrame(export).T
scatter_column.write(export_table)
if scatter_column.button('Export'):
    export_table.to_csv('./data/Samples_for_Chemical_Analysis.csv')
# Fallback message asking the user to choose a file.
# NOTE(review): presumably the `else` of `if sselectx_csv is not None:` —
# indentation was lost in this copy; confirm.
else:
scatter_column.write("_Please Choose a file_")
# Model creation section: upload the NIRS spectra and the matching chemical
# data, choose the CSV parsing options and a seed, then train a model with the
# `model` function from application_functions.py.
with st.container():
    st.header("Create a model")
    st.image(img_predict)
    st.write("Create a model to then predict chemical values from NIRS spectra")
    xcal_csv = st.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
    ycal_csv = st.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
    if xcal_csv is not None and ycal_csv is not None:
        # Select list for the CSV delimiter. The delimiter is detected once and
        # reused for both the label and the preselected entry.
        separators = [";", ","]
        detected_sep = str(find_delimiter('data/' + xcal_csv.name))
        # list.index raises ValueError for a delimiter that is not offered
        # (anything other than ";" or ","), so fall back to the first entry.
        default_sep_index = separators.index(detected_sep) if detected_sep in separators else 0
        sep = st.selectbox("Select csv separator - CSV Detected separator: " + detected_sep, options=separators, index=default_sep_index, key=0)
        # Select list: does the CSV contain column indexes? (yes / no)
        hdr = st.selectbox("column indexes in csv?", options=["yes", "no"], key=1)
        rd_seed = st.slider("Choose seed", min_value=1, max_value=1212, value=42, format="%i")
        # Train the model with the uploaded files and the chosen options.
        trained_model = model(xcal_csv, ycal_csv, sep, hdr, rd_seed)
# Prediction module - still to be done: the widgets below collect the file and
# the CSV options, but the visible code does not pass them to `predict`, which
# is only registered as the button's click callback.
predict_section = st.container()
predict_section.header("Predict")
predict_section.write("---")
predict_section.write("Predict chemical values from NIRS")
NIRS_csv = predict_section.file_uploader("Select NIRS Data to predict", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
psep = predict_section.selectbox("Select csv separator", options=[";", ","], key=2)
phdr = predict_section.selectbox("indexes column in csv?", options=["yes", "no"], key=3)
predict_section.button("Predict", on_click=predict)