From 4b97a50f8cfa7540d20b1966daedefbdb8876d84 Mon Sep 17 00:00:00 2001 From: Nicolas Barthes <nicolas.barthes@cnrs.fr> Date: Fri, 12 Apr 2024 16:03:23 +0200 Subject: [PATCH] updated HDBSCAN optimization --- Class_Mod/HDBSCAN_Clustering.py | 26 ++++++++++++++++++++------ Packages.py | 2 +- pages/1-samples_selection.py | 13 +++++-------- 3 files changed, 26 insertions(+), 15 deletions(-) diff --git a/Class_Mod/HDBSCAN_Clustering.py b/Class_Mod/HDBSCAN_Clustering.py index 0d9268b..f019282 100644 --- a/Class_Mod/HDBSCAN_Clustering.py +++ b/Class_Mod/HDBSCAN_Clustering.py @@ -256,14 +256,18 @@ def _get_label_members(X, labels, cluster): members = X[indices] return members -def HDBSCAN_function(data, min_cluster_size): +def HDBSCAN_function(data): # param_dist = {'min_samples': [1,5,10,30], # 'min_cluster_size':[5,10,20,30,50,75,100], # # 'cluster_selection_method' : ['eom','leaf'], # # 'metric' : ['euclidean','manhattan'] # } - param_dist = {'min_samples': [1,5], - 'min_cluster_size':[5,10], + # param_dist = {'min_samples': [1,5,10,50], + # 'min_cluster_size':[5,10,30,50,100,300,500], + # } + param_dist = {'min_samples': [1,5, 10,], + 'min_cluster_size':[5,10,30,50,100], + 'metric' : ['euclidean','manhattan'], } clusterable_embedding = UMAP( @@ -272,6 +276,15 @@ def HDBSCAN_function(data, min_cluster_size): n_components=5, random_state=42, ).fit_transform(data) + + # RandomizedSearchCV not working... + # def scoring(model, clusterable_embedding): + # label = HDBSCAN().fit_predict(clusterable_embedding) + # hdbscan_score = DBCV(clusterable_embedding, label, dist_function=euclidean) + # return hdbscan_score + # tunning = RandomizedSearchCV(estimator=HDBSCAN(), param_distributions=param_dist, scoring=scoring) + # tunning.fit(clusterable_embedding) + # return tunning min_score = pd.DataFrame() for i in param_dist.get('min_samples'): for j in param_dist.get('min_cluster_size'): @@ -279,7 +292,8 @@ def HDBSCAN_function(data, min_cluster_size): ij_hdbscan_score = DBCV(clusterable_embedding, ij_label, dist_function=euclidean) min_score.at[i,j] = ij_hdbscan_score hdbscan_score = max(min_score.max()) - # get the coordinates of the best clustering paramters and run HDBSCAN below - - labels = HDBSCAN(min_samples=1, min_cluster_size=min_cluster_size).fit_predict(clusterable_embedding) + # get the coordinates of the best clustering parameters and run HDBSCAN below + bparams = np.where(min_score == hdbscan_score) + # run HDBSCAN with best params + labels = HDBSCAN(min_samples=param_dist['min_samples'][bparams[0][0]], min_cluster_size=param_dist['min_cluster_size'][bparams[1][0]], metric=param_dist['metric'][bparams[1][0]]).fit_predict(clusterable_embedding) return labels, hdbscan_score diff --git a/Packages.py b/Packages.py index 217daba..9cad07b 100644 --- a/Packages.py +++ b/Packages.py @@ -27,7 +27,7 @@ from pinard import utils from pinard import preprocessing as pp from pinard.model_selection import train_test_split_idx -from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, cross_validate +from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, cross_validate, RandomizedSearchCV from sklearn.pipeline import Pipeline, FeatureUnion from sklearn.compose import TransformedTargetRegressor from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score diff --git a/pages/1-samples_selection.py b/pages/1-samples_selection.py index 579b528..cb1348d 100644 --- a/pages/1-samples_selection.py +++ b/pages/1-samples_selection.py @@ -119,8 +119,7 @@ with container2: elif type_cluster == 'HDBSCAN': from Class_Mod.HDBSCAN_Clustering import HDBSCAN_function - labels, hdbscan_score = HDBSCAN_function(data_import, min_cluster_size=10) - + labels, hdbscan_score = HDBSCAN_function(data_import) with scores: t = model.scores_ if type_cluster in ['AP', 'Kmeans']: @@ -138,12 +137,10 @@ with container2: elif type_cluster in ['HDBSCAN']: st.write('plot HDBSCAN clustering') - fig_hdbscan = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color=labels) - fig_hdbscan.update_traces(marker=dict(size=4)) - st.plotly_chart(fig_hdbscan) - st.write('DBCV score = ' + str(hdbscan_score)) - # st.dataframe(min_score.stack().agg(['min'])) - + fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color=labels) + fig.update_traces(marker=dict(size=4)) + # st.plotly_chart(fig_hdbscan) + st.write('DBCV score (-1:1) = ' + str(hdbscan_score)) else: if test == '.dx': -- GitLab