Newer
Older
from Packages import *
st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
from Modules import *
from Class_Mod.DATA_HANDLING import *
# HTML markup for the "CEFE - CNRS" banner displayed at the top of the page.
bandeau_html = (
    '\n'
    '<div style="width: 100%; background-color: #4682B4; padding: 10px; margin-bottom: 10px;">\n'
    '<h1 style="text-align: center; color: white;">CEFE - CNRS</h1>\n'
    '</div>\n'
)
# Inject the banner; raw HTML has to be allowed explicitly.
st.markdown(bandeau_html, unsafe_allow_html=True)

# Make sure the "interface" key exists in the session state (None when it was
# never set), then hide the "Predictions" page when the simple interface is on.
interface_mode = st.session_state.get('interface')
st.session_state["interface"] = interface_mode
if interface_mode == 'simple':
    hide_pages("Predictions")
################################### Data Loading and Visualization ########################################
# Two columns, width ratio 3:1. col1 (narrow) is headed "Data Loading",
# col2 (wide) is headed "Spectral Data Visualization".
col2, col1 = st.columns([3, 1])
col1.header("Data Loading", divider='blue')
col2.header("Spectral Data Visualization", divider='blue')
# Empty placeholders so that later `.empty` checks are valid before any file is
# loaded. They must be instances: binding the bare `pd.DataFrame` class makes
# `.empty` a (always truthy) property object instead of a boolean.
meta_data = pd.DataFrame()
selected_samples = pd.DataFrame()
# loader for datafile: accepts a .csv or .dx upload in the "Data Loading" column
data_file = col1.file_uploader("Load NIRS Data", type=["csv","dx"], help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5)
if data_file:
# Retrieve the extension of the file
# NOTE(review): the slice starts at the FIRST '.', so a name such as "a.b.csv" yields ".b.csv" and matches neither branch below — confirm uploaded names contain a single dot
test = data_file.name[data_file.name.find('.'):]
if test== '.csv':
with col1:
# Select list for CSV delimiter
# The detection helper is run on 'data/' + the uploaded file name and its result pre-selects the radio option.
# NOTE(review): list.index raises ValueError when the detected value is neither ";" nor ","
psep = st.radio("Select csv separator - _detected_: " + str(find_delimiter('data/'+data_file.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+data_file.name))), key=9)
# Same pattern for the index-column question, pre-selected from find_col_index ("no"/"yes")
phdr = st.radio("indexes column in csv? - _detected_: " + str(find_col_index('data/'+data_file.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+data_file.name))), key=31)
# col is 0 when the csv has an index column, False otherwise
# (presumably forwarded to the csv reader as the index column — the call is not part of this excerpt)
if phdr == 'yes':
col = 0
else:
col = False

BARTHES Nicolas
committed
# Earlier form, kept for reference: it called col_cat twice, once per returned element
# spectra = col_cat(imp)[0]
# meta_data = col_cat(imp)[1]
# col_cat returns a pair, unpacked here into spectra and meta_data (`imp` is defined outside this excerpt)
spectra, meta_data = col_cat(imp)
st.success("The data have been loaded successfully", icon="✅")
elif test == '.dx':
# Create a temporary file to save the uploaded file
# NOTE(review): presumably tempfile.NamedTemporaryFile — with delete=False the file stays on disk after the with block; the body of the with statement is not part of this excerpt
with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:

BARTHES Nicolas
committed
# read_dx returns three values: the first is discarded, the other two become spectra and meta_data
# (tmp_path is defined outside this excerpt)
_, spectra, meta_data = read_dx(file = tmp_path)
# Save the figure into the report folder as Spectra_Plot.png (fig is created outside this excerpt)
fig.savefig("./Report/figures/Spectra_Plot.png")
############################## Exploratory data analysis ###############################
container2 = st.container(border=True)
container2.header("Exploratory Data Analysis-Multivariable Data Analysis", divider='blue')
# Layout columns for the plots and the method pickers (pc is the narrow picker column).
scores, loadings, pc = st.columns([2, 3, 0.5])
influence, hotelling, qexp = st.columns([2, 2, 1])
st.header('Selected samples for chemical analysis')
selected_s, selected_samples_metd = st.columns([3, 3])
selected_s.write('Samples scores')

# The empty first entry of each list stands for "no method selected".
dim_red_methods = ['', 'PCA', 'UMAP', 'NMF']  # List of dimensionality reduction algos
cluster_methods = ['', 'Kmeans', 'HDBSCAN', 'AP']  # List of clustering algos
dr_model = None  # dimensionality reduction model
cl_model = None  # clustering model
# Empty DataFrame instances (not the bare class) so that `.empty` is a real
# boolean before any model has been fitted.
t = pd.DataFrame()  # scores
p = pd.DataFrame()  # loadings
labels = []
# Offer the method pickers only when spectra is not empty
if not spectra.empty:
# Both option lists start with '' so that no method is selected by default
dim_red_method = pc.selectbox("Dimensionality reduction techniques: ", options = dim_red_methods, key = 37)
clus_method = pc.selectbox("Clustering techniques: ", options = cluster_methods, key = 38)

BARTHES Nicolas
committed
# PCA selected (dim_red_methods[1]) — NOTE(review): the body of this branch is not part of this excerpt
if dim_red_method == dim_red_methods[1]:
# UMAP selected (dim_red_methods[2])
elif dim_red_method == dim_red_methods[2]:
# When meta-data are available, one of their columns (all but the first) can be picked to supervise UMAP
if not meta_data.empty:
# NOTE(review): `filter` shadows the builtin of the same name
filter = meta_data.columns[1:]
col = pc.selectbox('Supervised UMAP by:', options= filter, key=108)
supervised = meta_data[col]
else:
supervised = None
# Spectra are passed through MinMaxScale before being handed to Umap
dr_model = Umap(data_import = imp, numerical_data = MinMaxScale(spectra), cat_data = supervised)
if dr_model:
# Let the user choose which score columns are used as x, y and z axes (defaults: first three columns)
axis1 = pc.selectbox("x-axis", options = dr_model.scores_.columns, index=0)
axis2 = pc.selectbox("y-axis", options = dr_model.scores_.columns, index=1)
axis3 = pc.selectbox("z-axis", options = dr_model.scores_.columns, index=2)
# t keeps only the three chosen score columns, side by side
t = pd.concat([dr_model.scores_.loc[:,axis1], dr_model.scores_.loc[:,axis2], dr_model.scores_.loc[:,axis3]], axis = 1)
# clustering
if not t.empty:
# Kmeans selected (cluster_methods[1])
if clus_method == cluster_methods[1]:
ncluster = scores.number_input(min_value=2, max_value=30, value=3, label = 'Select the desired number of clusters')
# NOTE(review): tcr is defined outside this excerpt
cl_model = Sk_Kmeans(tcr, max_clusters = 30)
# Plot the inertia values exposed by the model, show the chart, then export it as Elbow.png
fig2 = px.scatter(cl_model.inertia_.T, y = 'inertia')
scores.plotly_chart(fig2,use_container_width=True)
img = pio.to_image(fig2, format="png")
with open("./Report/figures/Elbow.png", "wb") as f:
f.write(img)
# Fit with the user-chosen number of clusters; returns the data, the labels and the cluster centres
data, labels, clu_centers = cl_model.fit_optimal(nclusters = ncluster)
# NOTE(review): presumably an `elif` for HDBSCAN (cluster_methods[2]) precedes the next two lines — it is not part of this excerpt
optimized_hdbscan = Hdbscan(dr_model.scores_raw_)
# HDBSCAN_scores_ is unpacked as (labels, score)
labels, hdbscan_score = optimized_hdbscan.HDBSCAN_scores_
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#####################################################################################################
# Pick representative samples from the clusters and list them (with their
# meta-data when available).
selec_strategy = ['center','random']
samples_df_chem = pd.DataFrame()
selected_samples = []
# Positional row numbers (0..n-1) of the selected samples. Everything below
# consumes them positionally (np.asarray(labels)[...], spectra.index[...],
# .iloc[...]), so both strategies must store positions, never index labels.
selected_samples_idx = []

# len() works for both a list and a numpy array of labels, whereas
# truth-testing a multi-element array raises ValueError.
if len(labels) > 0:
    selection = scores.radio('Select samples selection strategy:', options = selec_strategy)
    #################### selection strategy to be corrected
    if selection == selec_strategy[0]:
        # list samples at clusters centers - Use sklearn.metrics.pairwise_distances_argmin if you want more than 1 sample per cluster
        closest, _ = pairwise_distances_argmin_min(clu_centers, tcr)
        selected_samples_idx = list(closest)
    elif selection == selec_strategy[1]:
        selection_number = scores.number_input('How many samples per cluster?', min_value = 1, step=1, value = 3)
        labels_arr = np.asarray(labels)
        # NOTE: despite its name, this strategy is not a random draw: each
        # cluster is split into `selection_number` sub-clusters and the sample
        # closest to each sub-cluster centre is kept.
        for i in np.unique(labels_arr):
            C = np.where(labels_arr == i)[0]  # positions of the members of cluster i
            if C.shape[0] >= selection_number:
                cluster_rows = tcr.iloc[C,:]
                km2 = KMeans(n_clusters = selection_number)
                km2.fit(cluster_rows)
                clos, _ = pairwise_distances_argmin_min(km2.cluster_centers_, cluster_rows)
                # `clos` indexes rows of the cluster subset; C maps them back
                # to positions in the full table.
                selected_samples_idx.extend(C[clos].tolist())
            else:
                # Fewer members than requested: keep the whole cluster.
                selected_samples_idx.extend(C.tolist())

# list indexes of selected samples for colored plot
if len(labels) > 0 and selected_samples_idx:
    selected_clusters = np.asarray(labels)[selected_samples_idx]
    selected_index = spectra.index[selected_samples_idx]
    sam = pd.DataFrame({'cluster': selected_clusters, 'index': selected_index})
    selected_s.write(sam)
    if not meta_data.empty:
        selected_samples_metd.write('Corresponding meta-data')
        # Copy the slice so that adding columns never writes into meta_data.
        meta = meta_data.iloc[selected_samples_idx,:].copy()
        meta['cluster'] = selected_clusters
        meta['index'] = selected_index
        selected_samples_metd.write(meta)
## Scores
# 3D scores plot, coloured by cluster labels and/or a meta-data column depending on what is available
if not t.empty:
with scores:
st.write('Scores plot')
# scores plot with clustering
# Case 1: cluster labels but no meta-data -> colour by label
if list(labels) and meta_data.empty:
fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = labels)
# Case 2: meta-data but no cluster labels -> colour by a user-chosen meta-data column (all but the first)
elif len(list(labels)) == 0 and not meta_data.empty:
# NOTE(review): `filter` shadows the builtin of the same name
filter = meta_data.columns[1:]
col = st.selectbox('Color by:', options= filter)
# NOTE(review): col comes from the column names above, so comparing it with 0 looks unintended — confirm
if col == 0:
fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = list(map(str.lower,meta_data[col])) )
# color with scores and metadata
# Case 3: both cluster labels and meta-data are available
elif len(list(labels)) > 0 and not meta_data.empty:
if clus_method in cluster_methods[1:]:
filter = ['None', clus_method]
filter.extend(meta_data.columns[1:])
# NOTE(review): presumably an `else:` precedes the next line in the full file; as shown it would overwrite the list built just above
filter = meta_data.columns[1:].insert(0,'None')
col = st.selectbox('Color by:', options= filter)
if col == "None":
fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3)
elif col == clus_method:
fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = labels)
# NOTE(review): the branch headers that select between the next two assignments are not part of this excerpt
fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = list(map(str.lower,meta_data[col])))
fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3)
fig.update_traces(marker=dict(size=4))
# Overlay the selected samples as larger black markers
if selected_samples_idx:
tt = tcr.iloc[selected_samples_idx,:]
fig.add_scatter3d(x = tt.loc[:,axis1], y = tt.loc[:,axis2],
z = tt.loc[:,axis3], mode ='markers', marker = dict(size = 7, color = 'black'),
name = 'selected samples')
st.plotly_chart(fig, use_container_width=True)
# Restrict the Plotly qualitative palette to num_clusters colours (num_clusters is defined outside this excerpt)
custom_color_palette = px.colors.qualitative.Plotly[:num_clusters]
color_discrete_sequence=custom_color_palette
# Create and export the Axis1-Axis2 plot as PNG (the export call itself is not part of this excerpt)
fig_axe1_axe2 = px.scatter(tcr, x=axis1, y=axis2, color=labels if list(labels) else None, color_discrete_sequence=custom_color_palette)
fig_axe1_axe2.update_layout(title='Axe1-Axe2')
fig_axe1_axe2.update_traces(marker=dict(size=4))
# Create and export the Axis1-Axis3 plot as PNG (the export call itself is not part of this excerpt)
fig_axe1_axe3 = px.scatter(tcr, x=axis1, y=axis3, color=labels if list(labels) else None, color_discrete_sequence=custom_color_palette)
fig_axe1_axe3.update_layout(title='Axe1-Axe3')
fig_axe1_axe3.update_traces(marker=dict(size=4))
# Create and export the Axis2-Axis3 plot as PNG (the export call itself is not part of this excerpt)
fig_axe2_axe3 = px.scatter(tcr, x=axis2, y=axis3, color=labels if list(labels) else None, color_discrete_sequence=custom_color_palette)
fig_axe2_axe3.update_layout(title='Axe2-Axe3')
fig_axe2_axe3.update_traces(marker=dict(size=4))
# Plots shown only when PCA (dim_red_methods[1]) is the selected method
if dim_red_method == dim_red_methods[1]:
with loadings:
st.write('Loadings plot')
p = dr_model.loadings_
# Append a 'wl' column holding the row positions 0..n-1 of the loadings table
pp = pd.concat([p, pd.DataFrame(np.arange(p.shape[0]), index=p.index, columns=['wl'])], axis=1)
# NOTE(review): df1 is not defined in this excerpt — presumably a long-format version of pp with 'wl', 'variable' and 'value' columns
fig = px.line(df1, x='wl', y='value', color='variable', color_discrete_sequence=px.colors.qualitative.Plotly)
fig.update_layout(legend=dict(x=1, y=0, font=dict(family="Courier", size=12, color="black"),
bordercolor="black", borderwidth=2))
st.plotly_chart(fig, use_container_width=True)
# Export the figure
img = pio.to_image(fig, format="png")
with open("./Report/figures/graphe_loadings.png", "wb") as f:
f.write(img)
with influence:
st.write('Influence plot')
# NOTE(review): index=3 pre-selects the 4th score column, so at least four columns are required
ax1 = st.selectbox("Component", options=dr_model.scores_.columns, index=3)
leverage = dr_model.leverage_
residuals = dr_model.residuals_
# Leverage against residuals for the chosen component, coloured by their product
fig = px.scatter(x=leverage[ax1], y=residuals[ax1], color=leverage[ax1]*residuals[ax1], color_continuous_scale='Blues')
fig.update_layout(xaxis_title="Leverage", yaxis_title="Residuals")
img = pio.to_image(fig, format="png")
# NOTE(review): the lines that display/save the influence figure and open the next layout column are not part of this excerpt
st.write('T²-Hotelling vs Q residuals plot')
# NOTE(review): this rebinds `hotelling`, which earlier in the file names a layout column
hotelling = dr_model.hotelling_
# NOTE(review): index=4 pre-selects the 5th score column, so at least five columns are required
ax2 = st.selectbox("Component", options=dr_model.scores_.columns, index=4)
# NOTE(review): repeats the assignment made two statements above
hotelling = dr_model.hotelling_
fig = px.scatter(t, x=hotelling[ax2], y=residuals[ax2]).update_layout(xaxis_title="T²",yaxis_title="Residuals")
st.plotly_chart(fig, use_container_width=True)
fig.write_image("./Report/figures/graphe_hotelling.png", format="png")
if dim_red_method == dim_red_methods[2] and clus_method == cluster_methods[2]: # UMAP clustered by HDBSCAN
    with loadings: # Display some clustering metrics
        st.write('Clustering metrics:')
        # The label -1 marks unclassified samples and is not a cluster.
        # discard() is used because -1 is absent when every sample was
        # assigned to a cluster; remove() would raise KeyError in that case.
        clusters_number = set(labels)
        clusters_number.discard(-1)
        st.write('Optimal number of clusters = ' + str(len(clusters_number)))
        st.write('DBCV score (-1 to 1 - higher is better) = ' + str(round(hdbscan_score,3)))
        # Count through an array so that a plain list of labels works too
        # (`labels == -1` on a list is just False, not an element-wise mask).
        n_unclassified = int(np.sum(np.asarray(labels) == -1))
        n_samples = len(t)
        st.write('Unclassified samples: ' + str(n_unclassified) + ' on ' + str(n_samples) + ' samples (' + str(round(n_unclassified/n_samples*100, 1)) + '%).')