From 3461d3b07f77a1605d234dbb79a99e0d8a58aad3 Mon Sep 17 00:00:00 2001
From: barthes <nicolas.barthes@cefe.cnrs.fr>
Date: Tue, 16 Apr 2024 15:44:09 +0200
Subject: [PATCH] HDBSCAN working with metadata & supervised UMAP + fixed
 widget width

---
 Class_Mod/HDBSCAN_Clustering.py | 21 ++++++++++++++-------
 pages/1-samples_selection.py    | 33 +++++++++++++++++++++------------
 2 files changed, 35 insertions(+), 19 deletions(-)

diff --git a/Class_Mod/HDBSCAN_Clustering.py b/Class_Mod/HDBSCAN_Clustering.py
index 1a9df2d..63b32a4 100644
--- a/Class_Mod/HDBSCAN_Clustering.py
+++ b/Class_Mod/HDBSCAN_Clustering.py
@@ -1,9 +1,9 @@
 from Packages import *
 class Hdbscan:
     """
-    Runs an automatic optimized sklearn.HDBSCAN clustering on Dimensionality reducted space.
+    Runs an automatic optimized sklearn.HDBSCAN clustering on Dimensionality reduced space.
     Vars:
-        data: the Dimensionality reducted space, raw result of the UMAP.fit()
+        data: the Dimensionality reduced space, raw result of the UMAP.fit()
         param_dist: the HDBSCAN optimization parameters to test
         Density-Based Clustering Validation - DBCV (https://github.com/christopherjenness/DBCV/tree/master ;
             Moulavi, Davoud, et al. "Density-based clustering validation." Proceedings of the 2014 SIAM
@@ -17,14 +17,21 @@ class Hdbscan:
         The HDBSCAN_scores_ @property return the cluster number of each sample (_labels) and the DBCV best score.
     """
     def __init__(self, data):
+        # Really fast
         # self._param_dist = {'min_samples': [1],
-        #               'min_cluster_size':[5,10],
+        #               'min_cluster_size':[5],
         #               'metric' : ['euclidean','manhattan'],
         #               }
-        self._param_dist = {'min_samples': [1,5,10,],
-                      'min_cluster_size':[5,25,50,],
-                      'metric' : ['euclidean','manhattan'],
-                      }
+        # Medium
+        self._param_dist = {'min_samples': [1,10],
+            'min_cluster_size':[5,50],
+            'metric' : ['euclidean','manhattan'],
+            }
+        # Complete
+        # self._param_dist = {'min_samples': [1,5,10,],
+        #       'min_cluster_size':[5,25,50,],
+        #       'metric' : ['euclidean','manhattan'],
+        #       }
 
         self._clusterable_embedding = data
 
diff --git a/pages/1-samples_selection.py b/pages/1-samples_selection.py
index 8f11ea6..f97708c 100644
--- a/pages/1-samples_selection.py
+++ b/pages/1-samples_selection.py
@@ -111,7 +111,7 @@ if not t.empty:
         ncluster = scores.number_input(min_value=2, max_value=30, value=3, label = 'Select the desired number of clusters')
         cl_model = Sk_Kmeans(tcr, max_clusters = 30)
         fig2 = px.scatter(cl_model.inertia_.T, y = 'inertia')
-        scores.plotly_chart(fig2)
+        scores.plotly_chart(fig2, use_container_width=True)
         data, labels = cl_model.fit_optimal(nclusters = ncluster)
 
     elif clus_method == cluster_methods[2]:
@@ -131,7 +131,7 @@ if not t.empty:
         # scores plot with metadata
         elif len(list(labels)) == 0 and not meta_data.empty:
             filter = meta_data.columns[1:]
-            col = st.selectbox('Group by:', options= filter)
+            col = st.selectbox('Color by:', options= filter)
             if col == 0:
                 fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3)
             else:
@@ -145,7 +145,7 @@ if not t.empty:
             else:
                 filter = meta_data.columns[1:].insert(0,'None')
 
-            col = st.selectbox('Group by:', options= filter)
+            col = st.selectbox('Color by:', options= filter)
             if col == "None":
                 fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3)
             elif col == clus_method:
@@ -156,7 +156,7 @@ if not t.empty:
         else:
             fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3)
         fig.update_traces(marker=dict(size=4))
-        st.plotly_chart(fig)
+        st.plotly_chart(fig, use_container_width=True)
 
 
 
@@ -178,13 +178,22 @@ if not spectra.empty:
             leverage = dr_model.leverage_
             residuals = dr_model.residuals_
             fig = px.scatter(x=leverage[ax1], y=residuals[ax1], color = leverage[ax1]*residuals[ax1]).update_layout(xaxis_title="Leverage",yaxis_title="Residuals")
-            st.plotly_chart(fig)
+            st.plotly_chart(fig, use_container_width=True)
 
         with hotelling:
-                st.write('T²-Hotelling vs Q residuals plot')
-                hotelling = dr_model.hotelling_
-                ax2 = st.selectbox("Component", options=dr_model.scores_.columns, index=4)
-
-                hotelling = dr_model.hotelling_
-                fig = px.scatter(t, x=hotelling[ax2], y=residuals[ax2]).update_layout(xaxis_title="T²",yaxis_title="Residuals")
-                st.plotly_chart(fig)
\ No newline at end of file
+            st.write('T²-Hotelling vs Q residuals plot')
+            hotelling = dr_model.hotelling_
+            ax2 = st.selectbox("Component", options=dr_model.scores_.columns, index=4)
+
+            hotelling = dr_model.hotelling_
+            fig = px.scatter(t, x=hotelling[ax2], y=residuals[ax2]).update_layout(xaxis_title="T²",yaxis_title="Residuals")
+            st.plotly_chart(fig, use_container_width=True)
+
+    if dim_red_method == dim_red_methods[2] and clus_method == cluster_methods[2]: # UMAP clustered by HDBSCAN
+        with loadings: # Display some clustering metrics
+            st.write('Clustering metrics:')
+            clusters_number = set(labels)
+            clusters_number.remove(-1)
+            st.write('Optimal number of clusters = ' + str(len(clusters_number)))
+            st.write('DBCV score (-1 to 1 - higher is better) = ' + str(round(hdbscan_score,3)))
+            st.write('Unclassified samples: ' + str(len(t[labels==-1])) + ' out of ' + str(len(t)) + ' samples (' + str(round(len(t[labels==-1])/len(t)*100, 1)) + '%).')
-- 
GitLab