diff --git a/Class_Mod/HDBSCAN_Clustering.py b/Class_Mod/HDBSCAN_Clustering.py index 1a9df2d72833121b79f19a6d9c0618868fc0ffc3..63b32a48dcc71789b14ab41a7e2e9352545253dd 100644 --- a/Class_Mod/HDBSCAN_Clustering.py +++ b/Class_Mod/HDBSCAN_Clustering.py @@ -1,9 +1,9 @@ from Packages import * class Hdbscan: """ - Runs an automatic optimized sklearn.HDBSCAN clustering on Dimensionality reducted space. + Runs an automatic optimized sklearn.HDBSCAN clustering on Dimensionality reduced space. Vars: - data: the Dimensionality reducted space, raw result of the UMAP.fit() + data: the Dimensionality reduced space, raw result of the UMAP.fit() param_dist: the HDBSCAN optimization parameters to test Density-Based Clustering Validation - DBCV (https://github.com/christopherjenness/DBCV/tree/master ; Moulavi, Davoud, et al. "Density-based clustering validation." Proceedings of the 2014 SIAM @@ -17,14 +17,21 @@ class Hdbscan: The HDBSCAN_scores_ @property return the cluster number of each sample (_labels) and the DBCV best score. """ def __init__(self, data): + # Really fast # self._param_dist = {'min_samples': [1], - # 'min_cluster_size':[5,10], + # 'min_cluster_size':[5], # 'metric' : ['euclidean','manhattan'], # } - self._param_dist = {'min_samples': [1,5,10,], - 'min_cluster_size':[5,25,50,], - 'metric' : ['euclidean','manhattan'], - } + # Medium + self._param_dist = {'min_samples': [1,10], + 'min_cluster_size':[5,50], + 'metric' : ['euclidean','manhattan'], + } + # Complete + # self._param_dist = {'min_samples': [1,5,10,], + # 'min_cluster_size':[5,25,50,], + # 'metric' : ['euclidean','manhattan'], + # } self._clusterable_embedding = data diff --git a/Class_Mod/UMAP_.py b/Class_Mod/UMAP_.py index 8d415ebb9b32761ea9c53c06a88363e0300206da..2bb51177a0329fa118e956d9a31b6359b158bc82 100644 --- a/Class_Mod/UMAP_.py +++ b/Class_Mod/UMAP_.py @@ -1,6 +1,6 @@ # UMAP function for the Sample Selection module -from Packages import * -from Class_Mod.DATA_HANDLING import * +from Packages import * +from Class_Mod.DATA_HANDLING import * class Umap: @@ -10,13 +10,14 @@ class Umap: def __init__(self, data_import, numerical_data, cat_data): self.x = data_import self.numerical_data = numerical_data - if len(cat_data) > 0: + if cat_data is None: + self.categorical_data_encoded = cat_data + elif len(cat_data) > 0: self.categorical_data = cat_data self.le = LabelEncoder() self.categorical_data_encoded = self.le.fit_transform(self.categorical_data) - else: - self.categorical_data = False + self.categorical_data_encoded = None self.model = UMAP(n_neighbors=20, n_components=3, min_dist=0.0, random_state=42,) self.model.fit(self.numerical_data, y = self.categorical_data_encoded) diff --git a/Packages.py b/Packages.py index ec7d83f23abc877b99e5eb07c3abc95a2280edba..fe21dcae4da468f573c90071dc6e646c642a9359 100644 --- a/Packages.py +++ b/Packages.py @@ -64,6 +64,8 @@ import joblib from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, anneal - - +import plotly.graph_objects as go +import plotly.express as px +import plotly.io as pio +import streamlit as st st.set_option('deprecation.showPyplotGlobalUse', False) diff --git a/Report/figures/Elbow.png b/Report/figures/Elbow.png new file mode 100644 index 0000000000000000000000000000000000000000..bccaf706eaa6f87684d2571862067e32ec337745 Binary files /dev/null and b/Report/figures/Elbow.png differ diff --git a/Report/figures/Spectra_Plot.png b/Report/figures/Spectra_Plot.png new file mode 100644 index 0000000000000000000000000000000000000000..38d71f6619f41e4d2e60c415ffa217a67a3d8269 Binary files /dev/null and b/Report/figures/Spectra_Plot.png differ diff --git a/Report/figures/graphe_hotelling.png b/Report/figures/graphe_hotelling.png new file mode 100644 index 0000000000000000000000000000000000000000..5018f3fa6df582ee8bdb1a1237953893acfdefa4 Binary files /dev/null and b/Report/figures/graphe_hotelling.png differ diff --git a/Report/figures/graphe_influence.png b/Report/figures/graphe_influence.png new file mode 100644 index 0000000000000000000000000000000000000000..3a7ad7924e3754459d2b9ab43f1515df6d201b19 Binary files /dev/null and b/Report/figures/graphe_influence.png differ diff --git a/Report/figures/graphe_loadings.png b/Report/figures/graphe_loadings.png new file mode 100644 index 0000000000000000000000000000000000000000..8c12f8f35021dd100f494d7c3bf9f5b9fa0d7982 Binary files /dev/null and b/Report/figures/graphe_loadings.png differ diff --git a/Report/figures/scores_plot_2d_axis1_axis2.png b/Report/figures/scores_plot_2d_axis1_axis2.png new file mode 100644 index 0000000000000000000000000000000000000000..3107ca9465252cf925b5c00e8eafb524922fb95e Binary files /dev/null and b/Report/figures/scores_plot_2d_axis1_axis2.png differ diff --git a/Report/figures/scores_plot_2d_axis1_axis3.png b/Report/figures/scores_plot_2d_axis1_axis3.png new file mode 100644 index 0000000000000000000000000000000000000000..75c1b4681ee6d22d802cc99baa8854b0a5925372 Binary files /dev/null and b/Report/figures/scores_plot_2d_axis1_axis3.png differ diff --git a/Report/figures/scores_plot_2d_axis2_axis3.png b/Report/figures/scores_plot_2d_axis2_axis3.png new file mode 100644 index 0000000000000000000000000000000000000000..859b3251a719aa16196b71866f5858e4e6f74db4 Binary files /dev/null and b/Report/figures/scores_plot_2d_axis2_axis3.png differ diff --git a/app.py b/app.py index 027393b8079257a75cae2893f29cd52e15db1c8a..3ae15c08841d69be3ba72b75a3737d00269f65e6 100644 --- a/app.py +++ b/app.py @@ -6,6 +6,16 @@ from Modules import * from Class_Mod.DATA_HANDLING import * +# HTML pour le bandeau "CEFE - CNRS" +bandeau_html = """ +<div style="width: 100%; background-color: #4682B4; padding: 10px; margin-bottom: 10px;"> + <h1 style="text-align: center; color: white;">CEFE - CNRS</h1> +</div> +""" + +# Injecter le code HTML du bandeau +st.markdown(bandeau_html, unsafe_allow_html=True) + # # TOC menu on the left show_pages( [Page("app.py", "Home"), @@ -18,6 +28,7 @@ hide_pages("Samples Selection") hide_pages("Models Creation") hide_pages("Predictions") + with st.sidebar: interface = st.selectbox(label="Interface", options=['simple', 'advanced'], key='interface') st.page_link('pages\\1-samples_selection.py') @@ -35,17 +46,35 @@ with st.sidebar: st.page_link('pages\\2-model_creation.py') st.page_link('pages\\3-prediction.py') + # Page header with st.container(): - st.subheader("Plateforme d'Analyses Chimiques pour l'Ecologie-PACE :goat:") - st.title("NIRS Utils") - st.write("Samples selection (PCA, [UMAP](https://umap-learn.readthedocs.io/en/latest/how_umap_works.html), ...), Predictive Modelling ([Pinard](https://github.com/GBeurier/pinard), [LWPLSR](https://doi.org/10.1002/cem.3209), ...), and Predictions using your data (CSV or DX files) and/or PACE NIRS Database.") - #st.image(img_general) - st.markdown("### We could add documentation here ###") + + # Centrer les boutons + st.markdown( + """ + <style> + .stButton>button { + display: block; + margin: 0 auto; + width: 200px; + height: 50px; + font-size: 16px; + } + </style> + """, + unsafe_allow_html=True + ) + header1, header2, header3 = st.columns(3) if header1.button("Samples Selection"): st.switch_page('pages\\1-samples_selection.py') if header2.button("Model Creation"): st.switch_page('pages\\2-model_creation.py') if header3.button("Predictions"): - st.switch_page('pages\\3-prediction.py') \ No newline at end of file + st.switch_page('pages\\3-prediction.py') + st.subheader("Plateforme d'Analyses Chimiques pour l'Ecologie-PACE :goat:") + st.title("NIRS Utils") + st.write("Samples selection (PCA, [UMAP](https://umap-learn.readthedocs.io/en/latest/how_umap_works.html), ...), Predictive Modelling ([Pinard](https://github.com/GBeurier/pinard), [LWPLSR](https://doi.org/10.1002/cem.3209), ...), and Predictions using your data (CSV or DX files) and/or PACE NIRS Database.") + #st.image(img_general) + st.markdown("### We could add documentation here ###") \ No newline at end of file diff --git a/graphe.png b/graphe.png new file mode 100644 index 0000000000000000000000000000000000000000..3a7ad7924e3754459d2b9ab43f1515df6d201b19 Binary files /dev/null and b/graphe.png differ diff --git a/pages/1-samples_selection.py b/pages/1-samples_selection.py index a75dba662ab993f73becb488e6243f919c086a7e..e0654390e6c46cbf3d7f703464ecb1457fedde46 100644 --- a/pages/1-samples_selection.py +++ b/pages/1-samples_selection.py @@ -3,6 +3,19 @@ st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide") from Modules import * from Class_Mod.DATA_HANDLING import * + + +# HTML pour le bandeau "CEFE - CNRS" +bandeau_html = """ +<div style="width: 100%; background-color: #4682B4; padding: 10px; margin-bottom: 10px;"> + <h1 style="text-align: center; color: white;">CEFE - CNRS</h1> +</div> +""" + + +# Injecter le code HTML du bandeau +st.markdown(bandeau_html, unsafe_allow_html=True) + st.session_state["interface"] = st.session_state.get('interface') if st.session_state["interface"] == 'simple': hide_pages("Predictions") @@ -55,13 +68,12 @@ if data_file: st.success("The data have been loaded successfully", icon="✅") os.unlink(tmp_path) - ## Visualize spectra if not spectra.empty: with col2: fig = plot_spectra(spectra) st.pyplot(fig) - + fig.savefig("./Report/figures/Spectra_Plot.png") ############################## Exploratory data analysis ############################### container2 = st.container(border=True) @@ -91,7 +103,13 @@ if not spectra.empty: if dim_red_method == dim_red_methods[1]: dr_model = LinearPCA(xc, Ncomp=5) elif dim_red_method == dim_red_methods[2]: - dr_model = Umap(data_import = imp, numerical_data = MinMaxScale(spectra), cat_data = meta_data) + if not meta_data.empty: + filter = meta_data.columns[1:] + col = pc.selectbox('Supervised UMAP by:', options= filter, key=108) + supervised = meta_data[col] + else: + supervised = None + dr_model = Umap(data_import = imp, numerical_data = MinMaxScale(spectra), cat_data = supervised) if dr_model: axis1 = pc.selectbox("x-axis", options = dr_model.scores_.columns, index=0) @@ -108,11 +126,14 @@ if not t.empty: ncluster = scores.number_input(min_value=2, max_value=30, value=3, label = 'Select the desired number of clusters') cl_model = Sk_Kmeans(tcr, max_clusters = 30) fig2 = px.scatter(cl_model.inertia_.T, y = 'inertia') - scores.plotly_chart(fig2) + scores.plotly_chart(fig2,use_container_width=True) + img = pio.to_image(fig2, format="png") + with open("./Report/figures/Elbow.png", "wb") as f: + f.write(img) data, labels, clu_centers = cl_model.fit_optimal(nclusters = ncluster) elif clus_method == cluster_methods[2]: - optimized_hdbscan = Hdbscan(model.scores_raw_) + optimized_hdbscan = Hdbscan(dr_model.scores_raw_) labels, hdbscan_score = optimized_hdbscan.HDBSCAN_scores_ ##### Plots @@ -168,11 +189,11 @@ if not t.empty: # scores plot with clustering if list(labels) and meta_data.empty: fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = labels) - + # scores plot with metadata elif len(list(labels)) == 0 and not meta_data.empty: filter = meta_data.columns[1:] - col = st.selectbox('Group by:', options= filter) + col = st.selectbox('Color by:', options= filter) if col == 0: fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3) else: @@ -186,7 +207,7 @@ if not t.empty: else: filter = meta_data.columns[1:].insert(0,'None') - col = st.selectbox('Group by:', options= filter) + col = st.selectbox('Color by:', options= filter) if col == "None": fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3) elif col == clus_method: @@ -203,38 +224,87 @@ if not t.empty: fig.add_scatter3d(x = tt.loc[:,axis1], y = tt.loc[:,axis2], z = tt.loc[:,axis3], mode ='markers', marker = dict(size = 7, color = 'black'), name = 'selected samples') - st.plotly_chart(fig) + st.plotly_chart(fig, use_container_width=True) + + +## Export en 2d Axe1..Axe3 +if not t.empty: + if dim_red_method == dim_red_methods[1]: + + # nombre de clusters + num_clusters = len(np.unique(labels)) + # Une couleur par cluster + custom_color_palette = px.colors.qualitative.Plotly[:num_clusters] + # Graphique pour les dimensions (axis1, axis2) + fig_2d_axis1_axis2 = px.scatter(t, x=axis1, y=axis2, color=labels, color_discrete_sequence=custom_color_palette) + img_2d_axis1_axis2 = pio.to_image(fig_2d_axis1_axis2, format="png") + with open("./Report/figures/scores_plot_2d_axis1_axis2.png", "wb") as f: + f.write(img_2d_axis1_axis2) + + # Graphique pour les dimensions (axis1, axis3) + fig_2d_axis1_axis3 = px.scatter(t, x=axis1, y=axis3, color=labels, color_discrete_sequence=custom_color_palette) + img_2d_axis1_axis3 = pio.to_image(fig_2d_axis1_axis3, format="png") + with open("./Report/figures/scores_plot_2d_axis1_axis3.png", "wb") as f: + f.write(img_2d_axis1_axis3) + + # Graphique pour les dimensions (axis2, axis3) + fig_2d_axis2_axis3 = px.scatter(t, x=axis2, y=axis3, color=labels, color_discrete_sequence=custom_color_palette) + img_2d_axis2_axis3 = pio.to_image(fig_2d_axis2_axis3, format="png") + with open("./Report/figures/scores_plot_2d_axis2_axis3.png", "wb") as f: + f.write(img_2d_axis2_axis3) if not spectra.empty: if dim_red_method == dim_red_methods[1]: + with loadings: st.write('Loadings plot') p = dr_model.loadings_ - pp = pd.concat([p, pd.DataFrame(np.arange(p.shape[0]), index=p.index, columns=['wl'])], axis =1) + pp = pd.concat([p, pd.DataFrame(np.arange(p.shape[0]), index=p.index, columns=['wl'])], axis=1) df1 = pp.melt(id_vars="wl") - fig = px.line(df1, x = 'wl', y = 'value', color='variable') - fig.update_layout(legend=dict(x=1, y=0,font=dict(family="Courier", size=12, color="black"), - bordercolor="Black", borderwidth=2)) - st.plotly_chart(fig, use_container_width = True) + fig = px.line(df1, x='wl', y='value', color='variable', color_discrete_sequence=px.colors.qualitative.Plotly) + fig.update_layout(legend=dict(x=1, y=0, font=dict(family="Courier", size=12, color="black"), + bordercolor="black", borderwidth=2)) + st.plotly_chart(fig, use_container_width=True) + + # Export du graphique + img = pio.to_image(fig, format="png") + with open("./Report/figures/graphe_loadings.png", "wb") as f: + f.write(img) with influence: st.write('Influence plot') ax1 = st.selectbox("Component", options=dr_model.scores_.columns, index=3) leverage = dr_model.leverage_ residuals = dr_model.residuals_ - fig = px.scatter(x=leverage[ax1], y=residuals[ax1], color = leverage[ax1]*residuals[ax1]).update_layout(xaxis_title="Leverage",yaxis_title="Residuals") + fig = px.scatter(x=leverage[ax1], y=residuals[ax1], color=leverage[ax1]*residuals[ax1], color_continuous_scale='Blues') + fig.update_layout(xaxis_title="Leverage", yaxis_title="Residuals") st.plotly_chart(fig) + img = pio.to_image(fig, format="png") + with open("./Report/figures/graphe_influence.png", "wb") as f: + f.write(img) + with hotelling: - st.write('T²-Hotelling vs Q residuals plot') - hotelling = dr_model.hotelling_ - ax2 = st.selectbox("Component", options=dr_model.scores_.columns, index=4) + st.write('T²-Hotelling vs Q residuals plot') + hotelling = dr_model.hotelling_ + ax2 = st.selectbox("Component", options=dr_model.scores_.columns, index=4) + + hotelling = dr_model.hotelling_ + fig = px.scatter(t, x=hotelling[ax2], y=residuals[ax2]).update_layout(xaxis_title="T²",yaxis_title="Residuals") + st.plotly_chart(fig, use_container_width=True) + fig.write_image("./Report/figures/graphe_hotelling.png", format="png") + + if dim_red_method == dim_red_methods[2] and clus_method == cluster_methods[2]: # UMAP clustered by HDBSCAN + with loadings: # Display some clustering metrics + st.write('Clustering metrics:') + clusters_number = set(labels) + clusters_number.remove(-1) + st.write('Optimal number of clusters = ' + str(len(clusters_number))) + st.write('DBCV score (-1 to 1 - higher is better) = ' + str(round(hdbscan_score,3))) + st.write('Unclassified samples: ' + str(len(t[labels==-1])) + ' on ' + str(len(t)) + ' samples (' + str(round(len(t[labels==-1])/len(t)*100, 1)) + '%).') - hotelling = dr_model.hotelling_ - fig = px.scatter(t, x=hotelling[ax2], y=residuals[ax2]).update_layout(xaxis_title="T²",yaxis_title="Residuals") - st.plotly_chart(fig) diff --git a/pages/2-model_creation.py b/pages/2-model_creation.py index 3a4dea1c8eb1f2edf4aed8979ea0696b5cb27851..50f07cdae1e7f51c466148e255bb7a8fb6ba9a1d 100644 --- a/pages/2-model_creation.py +++ b/pages/2-model_creation.py @@ -2,7 +2,15 @@ from Packages import * st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide") from Modules import * from Class_Mod.DATA_HANDLING import * - +# HTML pour le bandeau "CEFE - CNRS" +bandeau_html = """ +<div style="width: 100%; background-color: #4682B4; padding: 10px; margin-bottom: 10px;"> + <h1 style="text-align: center; color: white;">CEFE - CNRS</h1> +</div> +""" + +# Injecter le code HTML du bandeau +st.markdown(bandeau_html, unsafe_allow_html=True) st.session_state["interface"] = st.session_state.get('interface') if st.session_state["interface"] == 'simple': diff --git a/pages/3-prediction.py b/pages/3-prediction.py index e2acfc13702b1944a36fb8341797f9912853c354..a3eccd090b2fe97e090c23cf52beb5092bf61ce4 100644 --- a/pages/3-prediction.py +++ b/pages/3-prediction.py @@ -2,7 +2,15 @@ from Packages import * st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide") from Modules import * from Class_Mod.DATA_HANDLING import * - +# HTML pour le bandeau "CEFE - CNRS" +bandeau_html = """ +<div style="width: 100%; background-color: #4682B4; padding: 10px; margin-bottom: 10px;"> + <h1 style="text-align: center; color: white;">CEFE - CNRS</h1> +</div> +""" + +# Injecter le code HTML du bandeau +st.markdown(bandeau_html, unsafe_allow_html=True) st.session_state["interface"] = st.session_state.get('interface')