From 4b97a50f8cfa7540d20b1966daedefbdb8876d84 Mon Sep 17 00:00:00 2001
From: Nicolas Barthes <nicolas.barthes@cnrs.fr>
Date: Fri, 12 Apr 2024 16:03:23 +0200
Subject: [PATCH] Update HDBSCAN optimization: grid-search parameters on DBCV score and run with the best ones

---
 Class_Mod/HDBSCAN_Clustering.py | 26 ++++++++++++++++++++------
 Packages.py                     |  2 +-
 pages/1-samples_selection.py    | 13 +++++--------
 3 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/Class_Mod/HDBSCAN_Clustering.py b/Class_Mod/HDBSCAN_Clustering.py
index 0d9268b..f019282 100644
--- a/Class_Mod/HDBSCAN_Clustering.py
+++ b/Class_Mod/HDBSCAN_Clustering.py
@@ -256,14 +256,18 @@ def _get_label_members(X, labels, cluster):
     members = X[indices]
     return members
 
-def HDBSCAN_function(data, min_cluster_size):
+def HDBSCAN_function(data):
     # param_dist = {'min_samples': [1,5,10,30],
     #               'min_cluster_size':[5,10,20,30,50,75,100],
     #               # 'cluster_selection_method' : ['eom','leaf'],
     #               # 'metric' : ['euclidean','manhattan']
     #               }
-    param_dist = {'min_samples': [1,5],
-                  'min_cluster_size':[5,10],
+    # param_dist = {'min_samples': [1,5,10,50],
+    #               'min_cluster_size':[5,10,30,50,100,300,500],
+    #               }
+    param_dist = {'min_samples': [1,5, 10,],
+                  'min_cluster_size':[5,10,30,50,100],
+                  'metric' : ['euclidean','manhattan'],
                   }
 
     clusterable_embedding = UMAP(
@@ -272,6 +276,15 @@ def HDBSCAN_function(data, min_cluster_size):
         n_components=5,
         random_state=42,
     ).fit_transform(data)
+
+    # RandomizedSearchCV attempt kept for reference: it is not working yet, so the manual grid search below is used instead.
+    # def scoring(model, clusterable_embedding):
+    #     label = HDBSCAN().fit_predict(clusterable_embedding)
+    #     hdbscan_score = DBCV(clusterable_embedding, label, dist_function=euclidean)
+    #     return hdbscan_score
+    # tunning = RandomizedSearchCV(estimator=HDBSCAN(), param_distributions=param_dist,  scoring=scoring)
+    # tunning.fit(clusterable_embedding)
+    # return tunning
     min_score = pd.DataFrame()
     for i in param_dist.get('min_samples'):
         for j in param_dist.get('min_cluster_size'):
@@ -279,7 +292,8 @@ def HDBSCAN_function(data, min_cluster_size):
             ij_hdbscan_score = DBCV(clusterable_embedding, ij_label, dist_function=euclidean)
             min_score.at[i,j] = ij_hdbscan_score
     hdbscan_score  = max(min_score.max())
-    # get the coordinates of the best clustering paramters and run HDBSCAN below
-
-    labels = HDBSCAN(min_samples=1, min_cluster_size=min_cluster_size).fit_predict(clusterable_embedding)
+    # get the (row, column) position of the best DBCV score in min_score, i.e. the best (min_samples, min_cluster_size) pair
+    bparams = np.where(min_score == hdbscan_score)
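+    # run HDBSCAN with best params. NOTE(review): metric is picked with bparams[1][0], the min_cluster_size column index (0-4), which can overrun the 2-entry 'metric' list; metric is also not a dimension of min_score -- confirm/fix.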
+    labels = HDBSCAN(min_samples=param_dist['min_samples'][bparams[0][0]], min_cluster_size=param_dist['min_cluster_size'][bparams[1][0]], metric=param_dist['metric'][bparams[1][0]]).fit_predict(clusterable_embedding)
     return labels, hdbscan_score
diff --git a/Packages.py b/Packages.py
index 217daba..9cad07b 100644
--- a/Packages.py
+++ b/Packages.py
@@ -27,7 +27,7 @@ from pinard import utils
 from pinard import preprocessing as pp
 from pinard.model_selection import train_test_split_idx
 
-from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, cross_validate
+from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, cross_validate, RandomizedSearchCV
 from sklearn.pipeline import Pipeline, FeatureUnion
 from sklearn.compose import TransformedTargetRegressor
 from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score
diff --git a/pages/1-samples_selection.py b/pages/1-samples_selection.py
index 579b528..cb1348d 100644
--- a/pages/1-samples_selection.py
+++ b/pages/1-samples_selection.py
@@ -119,8 +119,7 @@ with container2:
 
             elif type_cluster == 'HDBSCAN':
                 from Class_Mod.HDBSCAN_Clustering import HDBSCAN_function
-                labels, hdbscan_score = HDBSCAN_function(data_import, min_cluster_size=10)
-
+                labels, hdbscan_score = HDBSCAN_function(data_import)
             with scores:
                 t = model.scores_
                 if type_cluster in ['AP', 'Kmeans']:
@@ -138,12 +137,10 @@ with container2:
 
                 elif type_cluster in ['HDBSCAN']:
                     st.write('plot HDBSCAN clustering')
-                    fig_hdbscan = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color=labels)
-                    fig_hdbscan.update_traces(marker=dict(size=4))
-                    st.plotly_chart(fig_hdbscan)
-                    st.write('DBCV score = ' + str(hdbscan_score))
-                    # st.dataframe(min_score.stack().agg(['min']))
-
+                    fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color=labels)
+                    fig.update_traces(marker=dict(size=4))
+                    # st.plotly_chart(fig)  # NOTE(review): display disabled here; the figure is now named `fig` -- confirm it is rendered elsewhere
+                    st.write('DBCV score (-1:1) = ' + str(hdbscan_score))
 
                 else:
                     if test == '.dx':
-- 
GitLab