From 32c20f0a31ea5fc90d34bdda7d9bd370858db9ba Mon Sep 17 00:00:00 2001 From: barthes <nicolas.barthes@cefe.cnrs.fr> Date: Tue, 30 Apr 2024 15:20:24 +0200 Subject: [PATCH] improved HDBSCAN clustering (random still buggy) --- src/Class_Mod/HDBSCAN_Clustering.py | 13 ++++++++---- src/pages/1-samples_selection.py | 31 ++++++++++------------------- 2 files changed, 20 insertions(+), 24 deletions(-) diff --git a/src/Class_Mod/HDBSCAN_Clustering.py b/src/Class_Mod/HDBSCAN_Clustering.py index d19b990..d20aa7e 100644 --- a/src/Class_Mod/HDBSCAN_Clustering.py +++ b/src/Class_Mod/HDBSCAN_Clustering.py @@ -57,11 +57,15 @@ class Hdbscan: self._ij_hdbscan_score = self.DBCV(self._clusterable_embedding, self._ij_label,)# dist_function=euclidean) self._score.at[i,j] = self._ij_hdbscan_score # get the best DBCV score - self._hdbscan_score = max(self._score.max()) + self._hdbscan_bscore = max(self._score.max()) # find the coordinates of the best clustering parameters and run HDBSCAN below - self._bparams = np.where(self._score == self._hdbscan_score) + self._bparams = np.where(self._score == self._hdbscan_bscore) # run HDBSCAN with best params - self._labels = HDBSCAN(min_samples=self._param_dist['min_samples'][self._bparams[0][0]], min_cluster_size=self._param_dist['min_cluster_size'][self._bparams[1][0]], metric=self._param_dist['metric'][self._bparams[1][0]]).fit_predict(self._clusterable_embedding) + + self.best_hdbscan = HDBSCAN(min_samples=self._param_dist['min_samples'][self._bparams[0][0]], min_cluster_size=self._param_dist['min_cluster_size'][self._bparams[1][0]], metric=self._param_dist['metric'][self._bparams[1][0]], store_centers="medoid", ) + self.best_hdbscan.fit_predict(self._clusterable_embedding) + self._labels = self.best_hdbscan.labels_ + self._centers = self.best_hdbscan.medoids_ def DBCV(self, X, labels, dist_function=euclidean): """ @@ -316,4 +320,5 @@ class Hdbscan: @property def HDBSCAN_scores_(self): - return self._labels, self._hdbscan_score + clu = [f'cluster#{i}' for i in self._labels] + return clu, self._hdbscan_bscore, self._centers diff --git a/src/pages/1-samples_selection.py b/src/pages/1-samples_selection.py index de03b98..83c7c25 100644 --- a/src/pages/1-samples_selection.py +++ b/src/pages/1-samples_selection.py @@ -2,19 +2,8 @@ from Packages import * st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide") from Modules import * - - -# HTML pour le bandeau "CEFE - CNRS" -# bandeau_html = """ -# <div style="width: 100%; background-color: #4682B4; padding: 10px; margin-bottom: 10px;"> -# <h1 style="text-align: center; color: white;">CEFE - CNRS</h1> -# </div> -# """ -# # Injecter le code HTML du bandeau -# st.markdown(bandeau_html, unsafe_allow_html=True) add_header() - st.session_state["interface"] = st.session_state.get('interface') if st.session_state["interface"] == 'simple': hide_pages("Predictions") @@ -152,10 +141,12 @@ if not t.empty: data, labels, clu_centers = cl_model.fit_optimal(nclusters = ncluster) elif clus_method == cluster_methods[2]: - optimized_hdbscan = Hdbscan(np.array(t)) - labels, hdbscan_score = optimized_hdbscan.HDBSCAN_scores_ - non_clustered = np.where(labels == -1) - labels[non_clustered] = 1000 + optimized_hdbscan = Hdbscan(np.array(tcr)) + labels, hdbscan_score, clu_centers = optimized_hdbscan.HDBSCAN_scores_ + labels = np.array(labels) + non_clustered = np.where(np.array(labels) == "cluster#-1") + unclustered_number = len(non_clustered[0]) + labels[non_clustered] = "unclust." labels = labels.tolist() elif clus_method == cluster_methods[3]: @@ -193,7 +184,6 @@ if labels: # list indexes of selected samples for colored plot if selected_samples_idx: - if meta_data.empty: sam = pd.DataFrame({'name': spectra.index[selected_samples_idx], 'cluster':np.array(labels)[selected_samples_idx]}, @@ -359,11 +349,12 @@ if not spectra.empty: if dim_red_method == dim_red_methods[2] and clus_method == cluster_methods[2]: # UMAP clustered by HDBSCAN with loadings: # Display some clustering metrics st.write('Clustering metrics:') - clusters_number = set(labels) - clusters_number.remove(-1) - st.write('Optimal number of clusters = ' + str(len(clusters_number))) + clusters_number = len(set(labels)) + if unclustered_number > 0: + clusters_number -= 1 + st.write('Optimal number of clusters = ' + str(clusters_number)) st.write('DBCV score (-1 to 1 - higher is better) = ' + str(round(hdbscan_score,3))) - st.write('Unclassified samples: ' + str(len(t[labels==-1])) + ' on ' + str(len(t)) + ' samples (' + str(round(len(t[labels==-1])/len(t)*100, 1)) + '%).') + st.write('Unclassified samples: ' + str(unclustered_number) + ' on ' + str(len(tcr)) + ' samples (' + str(round(unclustered_number/len(tcr)*100, 1)) + '%).') -- GitLab