From 32c20f0a31ea5fc90d34bdda7d9bd370858db9ba Mon Sep 17 00:00:00 2001
From: barthes <nicolas.barthes@cefe.cnrs.fr>
Date: Tue, 30 Apr 2024 15:20:24 +0200
Subject: [PATCH] improved HDBSCAN clustering (random still buggy)

---
 src/Class_Mod/HDBSCAN_Clustering.py | 13 ++++++++----
 src/pages/1-samples_selection.py    | 31 ++++++++++-------------------
 2 files changed, 20 insertions(+), 24 deletions(-)

diff --git a/src/Class_Mod/HDBSCAN_Clustering.py b/src/Class_Mod/HDBSCAN_Clustering.py
index d19b990..d20aa7e 100644
--- a/src/Class_Mod/HDBSCAN_Clustering.py
+++ b/src/Class_Mod/HDBSCAN_Clustering.py
@@ -57,11 +57,15 @@ class Hdbscan:
                 self._ij_hdbscan_score = self.DBCV(self._clusterable_embedding, self._ij_label,)# dist_function=euclidean)
                 self._score.at[i,j] = self._ij_hdbscan_score
         # get the best DBCV score
-        self._hdbscan_score  = max(self._score.max())
+        self._hdbscan_bscore  = max(self._score.max())
         # find the coordinates of the best clustering parameters and run HDBSCAN below
-        self._bparams = np.where(self._score == self._hdbscan_score)
+        self._bparams = np.where(self._score == self._hdbscan_bscore)
         # run HDBSCAN with best params
-        self._labels = HDBSCAN(min_samples=self._param_dist['min_samples'][self._bparams[0][0]], min_cluster_size=self._param_dist['min_cluster_size'][self._bparams[1][0]], metric=self._param_dist['metric'][self._bparams[1][0]]).fit_predict(self._clusterable_embedding)
+
+        self.best_hdbscan = HDBSCAN(min_samples=self._param_dist['min_samples'][self._bparams[0][0]], min_cluster_size=self._param_dist['min_cluster_size'][self._bparams[1][0]], metric=self._param_dist['metric'][self._bparams[1][0]], store_centers="medoid", )
+        self.best_hdbscan.fit_predict(self._clusterable_embedding)
+        self._labels = self.best_hdbscan.labels_
+        self._centers = self.best_hdbscan.medoids_
 
     def DBCV(self, X, labels, dist_function=euclidean):
         """
@@ -316,4 +320,5 @@ class Hdbscan:
 
     @property
     def HDBSCAN_scores_(self):
-         return self._labels, self._hdbscan_score
+        clu = [f'cluster#{i}' for i in self._labels]
+        return clu, self._hdbscan_bscore, self._centers
diff --git a/src/pages/1-samples_selection.py b/src/pages/1-samples_selection.py
index de03b98..83c7c25 100644
--- a/src/pages/1-samples_selection.py
+++ b/src/pages/1-samples_selection.py
@@ -2,19 +2,8 @@ from Packages import *
 st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
 from Modules import *
 
-
-
-# HTML pour le bandeau "CEFE - CNRS"
-# bandeau_html = """
-# <div style="width: 100%; background-color: #4682B4; padding: 10px; margin-bottom: 10px;">
-#   <h1 style="text-align: center; color: white;">CEFE - CNRS</h1>
-# </div>
-# """
-# # Injecter le code HTML du bandeau
-# st.markdown(bandeau_html, unsafe_allow_html=True)
 add_header()
 
-
 st.session_state["interface"] = st.session_state.get('interface')
 if st.session_state["interface"] == 'simple':
     hide_pages("Predictions")
@@ -152,10 +141,12 @@ if not t.empty:
         data, labels, clu_centers = cl_model.fit_optimal(nclusters = ncluster)
 
     elif clus_method == cluster_methods[2]:
-        optimized_hdbscan = Hdbscan(np.array(t))
-        labels, hdbscan_score = optimized_hdbscan.HDBSCAN_scores_
-        non_clustered = np.where(labels == -1)
-        labels[non_clustered] = 1000
+        optimized_hdbscan = Hdbscan(np.array(tcr))
+        labels, hdbscan_score, clu_centers = optimized_hdbscan.HDBSCAN_scores_
+        labels = np.array(labels)
+        non_clustered = np.where(np.array(labels) == "cluster#-1")
+        unclustered_number = len(non_clustered[0])
+        labels[non_clustered] = "unclust."
         labels = labels.tolist()
     
     elif clus_method == cluster_methods[3]:
@@ -193,7 +184,6 @@ if labels:
             # list indexes of selected samples for colored plot    
 
     if selected_samples_idx:
-        
         if meta_data.empty:
             sam = pd.DataFrame({'name': spectra.index[selected_samples_idx],
                                 'cluster':np.array(labels)[selected_samples_idx]},
@@ -359,11 +349,12 @@ if not spectra.empty:
     if dim_red_method == dim_red_methods[2] and clus_method == cluster_methods[2]: # UMAP clustered by HDBSCAN
         with loadings: # Display some clustering metrics
             st.write('Clustering metrics:')
-            clusters_number = set(labels)
-            clusters_number.remove(-1)
-            st.write('Optimal number of clusters = ' + str(len(clusters_number)))
+            clusters_number = len(set(labels))
+            if unclustered_number > 0:
+                clusters_number -= 1
+            st.write('Optimal number of clusters = ' + str(clusters_number))
             st.write('DBCV score (-1 to 1 - higher is better) = ' + str(round(hdbscan_score,3)))
-            st.write('Unclassified samples: ' + str(len(t[labels==-1])) + ' on ' + str(len(t)) + ' samples (' + str(round(len(t[labels==-1])/len(t)*100, 1)) + '%).')
+            st.write('Unclassified samples: ' + str(unclustered_number) + ' on ' + str(len(tcr)) + ' samples (' + str(round(unclustered_number/len(tcr)*100, 1)) + '%).')
     
 
 
-- 
GitLab