added HDBSCAN clustering with optimization

edd165e7 · BARTHES Nicolas · 8df9593c · edd165e7 · edd165e7 · edd165e7
Commit edd165e7 authored 1 year ago by BARTHES Nicolas
--- a/Class_Mod/UMAP_.py
+++ b/Class_Mod/UMAP_.py
@@ -10,7 +10,7 @@ class Umap:

        self.x = scaled_values
        
-        self.model = UMAP(random_state=42, n_neighbors=20, n_components=4, min_dist=0.0)
+        self.model = UMAP(n_neighbors=20, n_components=4, min_dist=0.0,) # random_state=42,)
        self.model.fit(self.x)
        self.scores = self.model.transform(self.x)
        self.scores = pd.DataFrame(self.scores, index = self.numerical_data.index)

--- a/Packages.py
+++ b/Packages.py
@@ -17,8 +17,7 @@ from umap.umap_ import UMAP
 from sklearn.decomposition import PCA, NMF

 # Clustering
-from sklearn.cluster import KMeans
-#import hdbscan
+from sklearn.cluster import KMeans, HDBSCAN

 # Modelling
 # import julia
@@ -40,7 +39,7 @@ import plotly.express as px
 import matplotlib.pyplot as plt
 import seaborn as sns
 ### Important Metrics
-from sklearn.metrics import pairwise_distances_argmin_min
+from sklearn.metrics import pairwise_distances_argmin_min, adjusted_rand_score, adjusted_mutual_info_score

 ## Web app construction
 import streamlit as st

--- a/app.py
+++ b/app.py
@@ -27,4 +27,10 @@ with st.container():
    st.write("Samples selection (PCA, [UMAP](https://umap-learn.readthedocs.io/en/latest/how_umap_works.html), ...), Predictive Modelling ([Pinard](https://github.com/GBeurier/pinard), [LWPLSR](https://doi.org/10.1002/cem.3209), ...), and Predictions using your data (CSV or DX files) and/or PACE NIRS Database.")
    #st.image(img_general)
    st.markdown("### We could add documentation here ###")
-
+    header1, header2, header3 = st.columns(3)  # one column per navigation button
+    if header1.button("Samples Selection"):
+        st.switch_page('pages/1-samples_selection.py')  # forward slashes: backslash paths only resolve on Windows
+    if header2.button("Model Creation"):
+        st.switch_page('pages/2-model_creation.py')  # path is relative to the main script (app.py)
+    if header3.button("Predictions"):
+        st.switch_page('pages/3-prediction.py')
\ No newline at end of file
--- a/pages/1-samples_selection.py
+++ b/pages/1-samples_selection.py
@@ -88,7 +88,7 @@ with container1:
 with container2:
    if sselectx_csv is not None:
        plot_type=['', 'PCA','UMAP', 'NMF']
-        cluster_methods = ['', 'Kmeans','UMAP', 'AP']
+        cluster_methods = ['', 'Kmeans','HDBSCAN', 'AP']

        with pc:
            type_plot = st.selectbox("Dimensionality reduction techniques: ", options=plot_type, key=37)
@@ -110,9 +110,13 @@ with container2:
            if type_cluster == 'Kmeans':
                cl = Sk_Kmeans(pd.concat([model.scores_.loc[:,axis1], model.scores_.loc[:,axis2], model.scores_.loc[:,axis3]], axis = 1), max_clusters = 30)

+            elif type_cluster == 'HDBSCAN':
+                from hdbscan import HDBSCAN_function
+                labels, hdbscan_score = HDBSCAN_function(data_import, min_cluster_size=10)
+
            with scores:
                t = model.scores_
-                if type_cluster in ['Kmeans','UMAP', 'AP']:
+                if type_cluster in ['AP', 'Kmeans']:
                    st.write('Scree plot')
                    fig2 = px.scatter(cl.inertia_.T, y = 'inertia')
                    st.plotly_chart(fig2)
@@ -123,14 +127,22 @@ with container2:
                    st.write('Scores plot')
                    fig = px.scatter_3d(data, x=axis1, y=axis2, z = axis3, color=colors)
                    fig.update_traces(marker=dict(size=4))
+                    st.plotly_chart(fig)
+

+                elif type_cluster in ['HDBSCAN']:
+                    st.write('plot HDBSCAN clustering')
+                    fig_hdbscan = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color=labels)
+                    fig_hdbscan.update_traces(marker=dict(size=4))
+                    st.plotly_chart(fig_hdbscan)
+                    st.write('DBCV score = ' + str(hdbscan_score))
+                    # st.dataframe(min_score.stack().agg(['min']))


                else:
                    fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3)
                    fig.update_traces(marker=dict(size=4))
-
-                st.plotly_chart(fig)
+                    st.plotly_chart(fig)


            if type_plot =='PCA':