From 3461d3b07f77a1605d234dbb79a99e0d8a58aad3 Mon Sep 17 00:00:00 2001 From: barthes <nicolas.barthes@cefe.cnrs.fr> Date: Tue, 16 Apr 2024 15:44:09 +0200 Subject: [PATCH] HDBSCAN working with metadata & supervised UMAP + fixed widget width --- Class_Mod/HDBSCAN_Clustering.py | 21 ++++++++++++++------- pages/1-samples_selection.py | 33 +++++++++++++++++++++------------ 2 files changed, 35 insertions(+), 19 deletions(-) diff --git a/Class_Mod/HDBSCAN_Clustering.py b/Class_Mod/HDBSCAN_Clustering.py index 1a9df2d..63b32a4 100644 --- a/Class_Mod/HDBSCAN_Clustering.py +++ b/Class_Mod/HDBSCAN_Clustering.py @@ -1,9 +1,9 @@ from Packages import * class Hdbscan: """ - Runs an automatic optimized sklearn.HDBSCAN clustering on Dimensionality reducted space. + Runs an automatic optimized sklearn.HDBSCAN clustering on Dimensionality reduced space. Vars: - data: the Dimensionality reducted space, raw result of the UMAP.fit() + data: the Dimensionality reduced space, raw result of the UMAP.fit() param_dist: the HDBSCAN optimization parameters to test Density-Based Clustering Validation - DBCV (https://github.com/christopherjenness/DBCV/tree/master ; Moulavi, Davoud, et al. "Density-based clustering validation." Proceedings of the 2014 SIAM @@ -17,14 +17,21 @@ class Hdbscan: The HDBSCAN_scores_ @property return the cluster number of each sample (_labels) and the DBCV best score. 
""" def __init__(self, data): + # Really fast # self._param_dist = {'min_samples': [1], - # 'min_cluster_size':[5,10], + # 'min_cluster_size':[5], # 'metric' : ['euclidean','manhattan'], # } - self._param_dist = {'min_samples': [1,5,10,], - 'min_cluster_size':[5,25,50,], - 'metric' : ['euclidean','manhattan'], - } + # Medium + self._param_dist = {'min_samples': [1,10], + 'min_cluster_size':[5,50], + 'metric' : ['euclidean','manhattan'], + } + # Complete + # self._param_dist = {'min_samples': [1,5,10,], + # 'min_cluster_size':[5,25,50,], + # 'metric' : ['euclidean','manhattan'], + # } self._clusterable_embedding = data diff --git a/pages/1-samples_selection.py b/pages/1-samples_selection.py index 8f11ea6..f97708c 100644 --- a/pages/1-samples_selection.py +++ b/pages/1-samples_selection.py @@ -111,7 +111,7 @@ if not t.empty: ncluster = scores.number_input(min_value=2, max_value=30, value=3, label = 'Select the desired number of clusters') cl_model = Sk_Kmeans(tcr, max_clusters = 30) fig2 = px.scatter(cl_model.inertia_.T, y = 'inertia') - scores.plotly_chart(fig2) + scores.plotly_chart(fig2, use_container_width=True) data, labels = cl_model.fit_optimal(nclusters = ncluster) elif clus_method == cluster_methods[2]: @@ -131,7 +131,7 @@ if not t.empty: # scores plot with metadata elif len(list(labels)) == 0 and not meta_data.empty: filter = meta_data.columns[1:] - col = st.selectbox('Group by:', options= filter) + col = st.selectbox('Color by:', options= filter) if col == 0: fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3) else: @@ -145,7 +145,7 @@ if not t.empty: else: filter = meta_data.columns[1:].insert(0,'None') - col = st.selectbox('Group by:', options= filter) + col = st.selectbox('Color by:', options= filter) if col == "None": fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3) elif col == clus_method: @@ -156,7 +156,7 @@ if not t.empty: else: fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3) fig.update_traces(marker=dict(size=4)) - st.plotly_chart(fig) 
+ st.plotly_chart(fig, use_container_width=True) @@ -178,13 +178,22 @@ if not spectra.empty: leverage = dr_model.leverage_ residuals = dr_model.residuals_ fig = px.scatter(x=leverage[ax1], y=residuals[ax1], color = leverage[ax1]*residuals[ax1]).update_layout(xaxis_title="Leverage",yaxis_title="Residuals") - st.plotly_chart(fig) + st.plotly_chart(fig, use_container_width=True) with hotelling: - st.write('T²-Hotelling vs Q residuals plot') - hotelling = dr_model.hotelling_ - ax2 = st.selectbox("Component", options=dr_model.scores_.columns, index=4) - - hotelling = dr_model.hotelling_ - fig = px.scatter(t, x=hotelling[ax2], y=residuals[ax2]).update_layout(xaxis_title="T²",yaxis_title="Residuals") - st.plotly_chart(fig) \ No newline at end of file + st.write('T²-Hotelling vs Q residuals plot') + hotelling = dr_model.hotelling_ + ax2 = st.selectbox("Component", options=dr_model.scores_.columns, index=4) + + hotelling = dr_model.hotelling_ + fig = px.scatter(t, x=hotelling[ax2], y=residuals[ax2]).update_layout(xaxis_title="T²",yaxis_title="Residuals") + st.plotly_chart(fig, use_container_width=True) + + if dim_red_method == dim_red_methods[2] and clus_method == cluster_methods[2]: # UMAP clustered by HDBSCAN + with loadings: # Display some clustering metrics + st.write('Clustering metrics:') + clusters_number = set(labels) + clusters_number.discard(-1) # drop the noise label; discard() because -1 is absent when every sample is clustered (remove() would raise KeyError) + st.write('Optimal number of clusters = ' + str(len(clusters_number))) + st.write('DBCV score (-1 to 1 - higher is better) = ' + str(round(hdbscan_score,3))) + st.write('Unclassified samples: ' + str(len(t[labels==-1])) + ' on ' + str(len(t)) + ' samples (' + str(round(len(t[labels==-1])/len(t)*100, 1)) + '%).') -- GitLab