Prepare code for clustering methods other than k-means (UMAP/HDBSCAN)

3232d795 · Nicolas Barthes · 48b54236 · 3232d795 · 3232d795
Commit 3232d795 authored 1 year ago by Nicolas Barthes
--- a/app.py
+++ b/app.py
@@ -51,7 +51,7 @@ with st.container():

        data_import = pd.read_csv(sselectx_csv, sep=psep, index_col=col)
        # Select type of plot
-        plot_type=['pca','umap']
+        plot_type=['', 'pca','umap']
        type_plot = settings_column.selectbox("Dimensional reduction: ", options=plot_type, key=37)
        # compute UMAP - umap_maker in application_functions.py
        if type_plot == 'umap':
@@ -59,82 +59,93 @@ with st.container():
        # compute PCA - pca_maker function in application_functions.py
        if type_plot == 'pca':
            pc_data, cat_cols, pc_cols = pca_maker(data_import)
-        # add 2 select lists to choose which component to plot
-        pc_1 = settings_column.selectbox("First Principle Component", options=pc_cols, index=0)
-        pc_2 = settings_column.selectbox("Second Principle Component", options=pc_cols, index=1)
-        # if categorical variables exist, add 2 select lists to choose the categorical variables to color the PCA
-        if cat_cols[0] == "no categories":
-            plot_pc = scatter_column.plotly_chart(px.scatter(data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800, hover_name=pc_data.index, title="PC plot of sample spectra"))
-        else:
-            categorical_variable = settings_column.selectbox("Variable Select", options = cat_cols)
-            categorical_variable_2 = settings_column.selectbox("Second Variable Select (hover data)", options = cat_cols)
-            plot_pc = scatter_column.plotly_chart(px.scatter(data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800, color=categorical_variable, hover_data = [categorical_variable_2], hover_name=pc_data.index, title="PC plot of sample spectra"))
-        # Clustering method
-        cluster_type = ['k-means', 'umap']
-        type_cluster = settings_column.selectbox("Clustering method: ", options=cluster_type, key=38)
-        if type_cluster == 'k-means':
-            #K-Means
-            ## K-Means choose number of clusters
-            wcss_samples = []
-            cluster_max = settings_column.slider("Max clusters (K-Means)", min_value=2, max_value=100, value=50, format="%i")
-            clusters_sample = np.arange(2, cluster_max)
-            for i in clusters_sample:
-                kmeans_samples = km(n_clusters = i, init = 'k-means++', random_state = 42)
+        if type_plot == 'umap' or type_plot == 'pca':
+            # add 2 select lists to choose which component to plot
+            pc_1 = settings_column.selectbox("First Principle Component", options=pc_cols, index=0)
+            pc_2 = settings_column.selectbox("Second Principle Component", options=pc_cols, index=1)
+            # if categorical variables exist, add 2 select lists to choose the categorical variables to color the PCA
+            if cat_cols[0] == "no categories":
+                plot_pc = scatter_column.plotly_chart(px.scatter(data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800, hover_name=pc_data.index, title="PC plot of sample spectra"))
+            else:
+                categorical_variable = settings_column.selectbox("Variable Select", options = cat_cols)
+                categorical_variable_2 = settings_column.selectbox("Second Variable Select (hover data)", options = cat_cols)
+                plot_pc = scatter_column.plotly_chart(px.scatter(data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800, color=categorical_variable, hover_data = [categorical_variable_2], hover_name=pc_data.index, title="PC plot of sample spectra"))
+            # Clustering method
+            cluster_type = ['', 'k-means']
+            # cluster_type = ['k-means', 'umap'] # uncomment if more clustering algorithms available
+            type_cluster = settings_column.selectbox("Clustering method: ", options=cluster_type, key=38)
+            # clustering via K-Means
+            if type_cluster == 'k-means':
+                #K-Means
+                ## K-Means choose number of clusters
+                wcss_samples = []
+                cluster_max = settings_column.slider("Max clusters (K-Means)", min_value=2, max_value=100, value=50, format="%i")
+                clusters_sample = np.arange(2, cluster_max)
+                for i in clusters_sample:
+                    kmeans_samples = km(n_clusters = i, init = 'k-means++', random_state = 42)
+                    kmeans_samples.fit(pc_data.loc[:,[pc_1,pc_2]])
+                    wcss_samples.append(kmeans_samples.inertia_)
+                settings_column.plotly_chart(px.line(x=clusters_sample, y=wcss_samples, title="K-Means clusters nb sel", width=200))
+                ## Draw clustering
+                nb_select = settings_column.slider("Choose cluster number (K-Means)", min_value=2, max_value=cluster_max, value=5, format="%i")
+                kmeans_samples = km(n_clusters=nb_select, random_state=42)
                kmeans_samples.fit(pc_data.loc[:,[pc_1,pc_2]])
-                wcss_samples.append(kmeans_samples.inertia_)
-            settings_column.plotly_chart(px.line(x=clusters_sample, y=wcss_samples, title="K-Means clusters nb sel", width=200))
-            ## Draw clustering
-            nb_select = settings_column.slider("Choose cluster number (K-Means)", min_value=2, max_value=cluster_max, value=5, format="%i")
-            kmeans_samples = km(n_clusters=nb_select, random_state=42)
-            kmeans_samples.fit(pc_data.loc[:,[pc_1,pc_2]])
-            # plot the pc with clustering only (no selected samples)
-            # graph = px.scatter(data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800, color=kmeans_samples.labels_, hover_name=pc_data.index, title="PC projection with K-Means Clusters")
-            # plot = scatter_column.plotly_chart(graph)
-            # choose between cluster centered sample and random samples
-            selection = settings_column.select_slider('Centered samples or random ones', options=['center','random'])
-            export = []
-            scatter_column.write("Selected samples for chemical analysis:")
-            if selection == 'center':
-                # list samples at clusters centers - Use sklearn.metrics.pairwise_distances_argmin if you want more than 1 sample per cluster
-                closest, _ = pairwise_distances_argmin_min(kmeans_samples.cluster_centers_, pc_data.loc[:,[pc_1,pc_2]])
-                scatter_column.dataframe(pc_data.loc[pc_data.index[closest],[pc_1,pc_2]].index, use_container_width=False)
-                export.append(pc_data.loc[pc_data.index[closest],[pc_1,pc_2]].index.T)
-                # list indexes of selected samples for colored plot
-                te = pc_data.loc[pc_data.index[closest],[pc_1,pc_2]].index.values.tolist()
-            elif selection == 'random':
-                selection_number = settings_column.number_input('How many samples per cluster?', step=1, value = 3)
-                for i in np.unique(kmeans_samples.labels_):
-                    if len(pd.DataFrame(pc_data.loc[pc_data.index[kmeans_samples.labels_==i],[pc_1,pc_2]])) >= selection_number:
-                        # export.append(pc_data.loc[pc_data.index[kmeans_samples.labels_==i]].sample(n=selection_number).index)
-                        # another k-means to cluster in 'selection_number' clusters and random to ensure the selected samples are far from each other in each cluster
-                        kmeans_selected_samples = km(n_clusters=selection_number, random_state=42)
-                        kmeans_selected_samples.fit(pc_data.loc[pc_data.index[kmeans_samples.labels_==i],[pc_1,pc_2]])
-                        closest_selected_samples, _ = pairwise_distances_argmin_min(kmeans_selected_samples.cluster_centers_, pc_data.loc[:,[pc_1,pc_2]])
-                        export.append(pc_data.loc[pc_data.index[closest_selected_samples],[pc_1,pc_2]].index)
-                    else:
-                        export.append(pc_data.loc[pc_data.index[kmeans_samples.labels_==i]].index)
-                # list indexes of selected samples for colored plot
-                te = []
-                for sublist in export:
-                    for item in sublist:
-                        te.append(item)
-                # display a matrix of selected samples
-                scatter_column.write(pd.DataFrame(export).T)
-            # convert cluster number to text for optimized coloring
-            kmeans_samples.labels_ = kmeans_samples.labels_.astype(str)
-            for j in te:
-                kmeans_samples.labels_[pc_data.index.get_loc(j)] = 'selected'
-            # plot de pc with colored clusters and selected samples
-            graph_selected = px.scatter(data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800, color=kmeans_samples.labels_, hover_name=pc_data.index, title="PC projection with K-Means Clusters and selected samples")
-            plot = scatter_column.plotly_chart(graph_selected)
-            # button to export the names of selected samples - by cluster if random - in a csv
-            if scatter_column.button('Export'):
-                pd.DataFrame(export).T.to_csv('./data/Samples_for_Chemical_Analysis.csv')
-        else:
-            scatter_column.write("_Please Choose a file_")
-        if type_cluster == 'umap':
-            pass
-
+                # plot the pc with clustering only (no selected samples)
+                # graph = px.scatter(data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800, color=kmeans_samples.labels_, hover_name=pc_data.index, title="PC projection with K-Means Clusters")
+                # plot = scatter_column.plotly_chart(graph)
+                # choose between cluster centered sample and random samples
+                selection = settings_column.select_slider('Centered samples or random ones', options=['center','random'])
+                export = []
+                scatter_column.write("Selected samples for chemical analysis:")
+                if selection == 'center':
+                    # list samples at clusters centers - Use sklearn.metrics.pairwise_distances_argmin if you want more than 1 sample per cluster
+                    closest, _ = pairwise_distances_argmin_min(kmeans_samples.cluster_centers_, pc_data.loc[:,[pc_1,pc_2]])
+                    scatter_column.dataframe(pc_data.loc[pc_data.index[closest],[pc_1,pc_2]].index, use_container_width=False)
+                    export.append(pc_data.loc[pc_data.index[closest],[pc_1,pc_2]].index.T)
+                    # list indexes of selected samples for colored plot
+                    te = pc_data.loc[pc_data.index[closest],[pc_1,pc_2]].index.values.tolist()
+                elif selection == 'random':
+                    selection_number = settings_column.number_input('How many samples per cluster?', step=1, value = 3)
+                    for i in np.unique(kmeans_samples.labels_):
+                        if len(pd.DataFrame(pc_data.loc[pc_data.index[kmeans_samples.labels_==i],[pc_1,pc_2]])) >= selection_number:
+                            # export.append(pc_data.loc[pc_data.index[kmeans_samples.labels_==i]].sample(n=selection_number).index)
+                            # another k-means to cluster in 'selection_number' clusters and random to ensure the selected samples are far from each other in each cluster
+                            kmeans_selected_samples = km(n_clusters=selection_number, random_state=42)
+                            kmeans_selected_samples.fit(pc_data.loc[pc_data.index[kmeans_samples.labels_==i],[pc_1,pc_2]])
+                            closest_selected_samples, _ = pairwise_distances_argmin_min(kmeans_selected_samples.cluster_centers_, pc_data.loc[:,[pc_1,pc_2]])
+                            export.append(pc_data.loc[pc_data.index[closest_selected_samples],[pc_1,pc_2]].index)
+                        else:
+                            export.append(pc_data.loc[pc_data.index[kmeans_samples.labels_==i]].index)
+                    # list indexes of selected samples for colored plot
+                    te = []
+                    for sublist in export:
+                        for item in sublist:
+                            te.append(item)
+                    # display a matrix of selected samples
+                    scatter_column.write(pd.DataFrame(export).T)
+                # convert cluster number to text for optimized coloring
+                kmeans_samples.labels_ = kmeans_samples.labels_.astype(str)
+                for j in te:
+                    kmeans_samples.labels_[pc_data.index.get_loc(j)] = 'selected'
+                # plot the pc with colored clusters and selected samples
+                graph_selected = px.scatter(data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800, color=kmeans_samples.labels_, hover_name=pc_data.index, title="PC projection with K-Means Clusters and selected samples")
+                plot = scatter_column.plotly_chart(graph_selected)
+                # button to export the names of selected samples - by cluster if random - in a csv
+                if scatter_column.button('Export'):
+                    pd.DataFrame(export).T.to_csv('./data/Samples_for_Chemical_Analysis.csv')
+            else:
+                scatter_column.write("_Please Choose a file_")
+            # clustering via UMAP / HDBSCAN
+            if type_cluster == 'umap':
+                import hdbscan
+                # plot the pc with colored clusters and selected samples
+                # graph_selected = px.scatter(data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800, color=kmeans_samples.labels_, hover_name=pc_data.index, title="PC projection with K-Means Clusters and selected samples")
+                # plot = scatter_column.plotly_chart(graph_selected)
+                scatter_column.dataframe(pc_data)
+                labels = hdbscan.HDBSCAN(min_samples=4, min_cluster_size=10,).fit_predict(pc_data.loc[:,[pc_1,pc_2]])
+                clustered = (labels >= 0)
+                graph_clustered = plt.scatter(standard_embedding[clustered, 0], standard_embedding[clustered, 1], c=labels[clustered], s=0.1, cmap='Spectral')
+                plot = scatter_column.plotly_chart(graph_selected)
 # graphical delimiter
 st.write("---")
 # Model creation module

--- a/application_functions.py
+++ b/application_functions.py
@@ -51,7 +51,7 @@ def col_cat(data_import):
 # UMAP function for the Sample Selection module
 def umap_maker(data_import):
    numerical_data, categorical_data, scaled_values = col_cat(data_import)
-    umap_func = UMAP(random_state=42, n_neighbors=30, n_components=4)
+    umap_func = UMAP(random_state=42, n_neighbors=20, n_components=4, min_dist=0.0,)
    umap_fit = umap_func.fit(scaled_values)
    umap_data = umap_fit.transform(scaled_values)
    umap_data = pd.DataFrame(umap_data, index=numerical_data.index)