From 48b5423688f2290207ba957b38c37a3432badefc Mon Sep 17 00:00:00 2001
From: Nicolas Barthes <nicolas.barthes@cnrs.fr>
Date: Fri, 22 Mar 2024 15:44:44 +0100
Subject: [PATCH] Add UMAP as an alternative to PCA for dimension reduction;
 add a 2nd k-means so that the random samples picked in a cluster are not
 close to each other in k-means sample selection

---
 app.py                   | 139 ++++++++++++++++++++++-----------------
 application_functions.py |  24 ++++++-
 2 files changed, 100 insertions(+), 63 deletions(-)

diff --git a/app.py b/app.py
index 0bac962..426b733 100644
--- a/app.py
+++ b/app.py
@@ -8,7 +8,7 @@ import pandas as pd
 import plotly.express as px
 from sklearn.cluster import KMeans as km
 from sklearn.metrics import pairwise_distances_argmin_min
-from application_functions import pca_maker, model, predict, find_delimiter
+from application_functions import pca_maker, model, predict, find_delimiter, umap_maker
 
 # load images for web interface
 img_sselect = Image.open("images\sselect.JPG")
@@ -50,73 +50,90 @@ with st.container():
             col = False
 
         data_import = pd.read_csv(sselectx_csv, sep=psep, index_col=col)
+        # Select the dimension-reduction method used to build the scatter plot
+        plot_type=['pca','umap']
+        type_plot = settings_column.selectbox("Dimensional reduction: ", options=plot_type, key=37)
+        # compute UMAP - umap_maker function in application_functions.py
+        if type_plot == 'umap':
+            pc_data, cat_cols, pc_cols = umap_maker(data_import)
         # compute PCA - pca_maker function in application_functions.py
-        pca_data, cat_cols, pca_cols = pca_maker(data_import)
+        if type_plot == 'pca':
+            pc_data, cat_cols, pc_cols = pca_maker(data_import)
         # add 2 select lists to choose which component to plot
-        pca_1 = settings_column.selectbox("First Principle Component", options=pca_cols, index=0)
-        pca_2 = settings_column.selectbox("Second Principle Component", options=pca_cols, index=1)
+        pc_1 = settings_column.selectbox("First Principle Component", options=pc_cols, index=0)
+        pc_2 = settings_column.selectbox("Second Principle Component", options=pc_cols, index=1)
         # if categorical variables exist, add 2 select lists to choose the categorical variables to color the PCA
         if cat_cols[0] == "no categories":
-            plot_pca = scatter_column.plotly_chart(px.scatter(data_frame=pca_data, x=pca_1, y=pca_2, template="simple_white", height=800, hover_name=pca_data.index, title="PCA plot of sample spectra"))
+            plot_pc = scatter_column.plotly_chart(px.scatter(data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800, hover_name=pc_data.index, title="PC plot of sample spectra"))
         else:
             categorical_variable = settings_column.selectbox("Variable Select", options = cat_cols)
             categorical_variable_2 = settings_column.selectbox("Second Variable Select (hover data)", options = cat_cols)
-            plot_pca = scatter_column.plotly_chart(px.scatter(data_frame=pca_data, x=pca_1, y=pca_2, template="simple_white", height=800, color=categorical_variable, hover_data = [categorical_variable_2], hover_name=pca_data.index, title="PCA plot of sample spectra"))
-        #K-Means
-        ## K-Means choose number of clusters
-        wcss_samples = []
-        cluster_max = settings_column.slider("Max clusters (K-Means)", min_value=2, max_value=100, value=50, format="%i")
-        clusters_sample = np.arange(2, cluster_max)
-        for i in clusters_sample:
-            kmeans_samples = km(n_clusters = i, init = 'k-means++', random_state = 42)
-            kmeans_samples.fit(pca_data.loc[:,[pca_1,pca_2]])
-            wcss_samples.append(kmeans_samples.inertia_)
-        settings_column.plotly_chart(px.line(x=clusters_sample, y=wcss_samples, title="K-Means clusters nb sel", width=200))
-        ## Draw clustering
-        nb_select = settings_column.slider("Choose cluster number (K-Means)", min_value=2, max_value=cluster_max, value=5, format="%i")
-        kmeans_samples = km(n_clusters=nb_select, random_state=42)
-        kmeans_samples.fit(pca_data.loc[:,[pca_1,pca_2]])
-        # plot the pca with clustering only (no selected samples)
-        # graph = px.scatter(data_frame=pca_data, x=pca_1, y=pca_2, template="simple_white", height=800, color=kmeans_samples.labels_, hover_name=pca_data.index, title="PCA projection with K-Means Clusters")
-        # plot = scatter_column.plotly_chart(graph)
-        # choose between cluster centered sample and random samples
-        selection = settings_column.select_slider('Centered samples or random ones', options=['center','random'])
-        export = []
-        scatter_column.write("Selected samples for chemical analysis:")
-        if selection == 'center':
-            # list samples at clusters centers - Use sklearn.metrics.pairwise_distances_argmin if you want more than 1 sample per cluster
-            closest, _ = pairwise_distances_argmin_min(kmeans_samples.cluster_centers_, pca_data.loc[:,[pca_1,pca_2]])
-            scatter_column.dataframe(pca_data.loc[pca_data.index[closest],[pca_1,pca_2]], use_container_width=True)
-            export.append(pca_data.loc[pca_data.index[closest],[pca_1,pca_2]].index.T)
-            # list indexes of selected samples for colored plot
-            te = pca_data.loc[pca_data.index[closest],[pca_1,pca_2]].index.values.tolist()
-        elif selection == 'random':
-            selection_number = settings_column.number_input('How many samples per cluster?', step=1, value = 3)
-            for i in np.unique(kmeans_samples.labels_):
-                if len(pd.DataFrame(pca_data.loc[pca_data.index[kmeans_samples.labels_==i],[pca_1,pca_2]])) >= selection_number:
-                    export.append(pca_data.loc[pca_data.index[kmeans_samples.labels_==i]].sample(n=selection_number).index)
-                else:
-                    export.append(pca_data.loc[pca_data.index[kmeans_samples.labels_==i]].index)
-            # list indexes of selected samples for colored plot
-            te = []
-            for sublist in export:
-                for item in sublist:
-                    te.append(item)
-            # display a matrix of selected samples
-            scatter_column.write(pd.DataFrame(export).T)
-        # convert cluster number to text for optimized coloring
-        kmeans_samples.labels_ = kmeans_samples.labels_.astype(str)
-        for j in te:
-            kmeans_samples.labels_[pca_data.index.get_loc(j)] = 'selected'
-        # plot de pca with colored clusters and selected samples
-        graph_selected = px.scatter(data_frame=pca_data, x=pca_1, y=pca_2, template="simple_white", height=800, color=kmeans_samples.labels_, hover_name=pca_data.index, title="PCA projection with K-Means Clusters and selected samples")
-        plot = scatter_column.plotly_chart(graph_selected)
-        # button to export the names of selected samples - by cluster if random - in a csv
-        if scatter_column.button('Export'):
-            pd.DataFrame(export).T.to_csv('./data/Samples_for_Chemical_Analysis.csv')
-    else:
-        scatter_column.write("_Please Choose a file_")
-
+            plot_pc = scatter_column.plotly_chart(px.scatter(data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800, color=categorical_variable, hover_data = [categorical_variable_2], hover_name=pc_data.index, title="PC plot of sample spectra"))
+        # Clustering method ('umap' is listed but is only a placeholder for now)
+        cluster_type = ['k-means', 'umap']
+        type_cluster = settings_column.selectbox("Clustering method: ", options=cluster_type, key=38)
+        if type_cluster == 'k-means':
+            # coordinates of every sample on the two components selected above
+            coords = pc_data.loc[:, [pc_1, pc_2]]
+            ## K-Means choose number of clusters: inertia (WCSS) curve for 2..cluster_max-1 clusters
+            wcss_samples = []
+            cluster_max = settings_column.slider("Max clusters (K-Means)", min_value=2, max_value=100, value=50, format="%i")
+            clusters_sample = np.arange(2, cluster_max)
+            for i in clusters_sample:
+                kmeans_samples = km(n_clusters = i, init = 'k-means++', random_state = 42)
+                kmeans_samples.fit(coords)
+                wcss_samples.append(kmeans_samples.inertia_)
+            settings_column.plotly_chart(px.line(x=clusters_sample, y=wcss_samples, title="K-Means clusters nb sel", width=200))
+            ## Draw clustering
+            # the default is clamped so that it stays inside [2, cluster_max] when cluster_max < 5
+            nb_select = settings_column.slider("Choose cluster number (K-Means)", min_value=2, max_value=cluster_max, value=min(5, cluster_max), format="%i")
+            kmeans_samples = km(n_clusters=nb_select, random_state=42)
+            kmeans_samples.fit(coords)
+            # choose between cluster centered sample and random samples
+            selection = settings_column.select_slider('Centered samples or random ones', options=['center','random'])
+            export = []
+            scatter_column.write("Selected samples for chemical analysis:")
+            if selection == 'center':
+                # one sample per cluster: the sample closest to each cluster center
+                closest, _ = pairwise_distances_argmin_min(kmeans_samples.cluster_centers_, coords)
+                scatter_column.dataframe(coords.index[closest], use_container_width=False)
+                export.append(coords.index[closest])
+                # list indexes of selected samples for colored plot
+                te = coords.index[closest].tolist()
+            elif selection == 'random':
+                # min_value=1: the per-cluster k-means below needs at least one cluster
+                selection_number = settings_column.number_input('How many samples per cluster?', min_value=1, step=1, value = 3)
+                for i in np.unique(kmeans_samples.labels_):
+                    cluster_points = coords.loc[kmeans_samples.labels_ == i]
+                    if len(cluster_points) >= selection_number:
+                        # another k-means splits this cluster in 'selection_number' sub-clusters so the selected samples are far from each other
+                        kmeans_selected_samples = km(n_clusters=selection_number, random_state=42)
+                        kmeans_selected_samples.fit(cluster_points)
+                        # look for the nearest samples inside this cluster only, so a sample of a neighbouring cluster is never returned
+                        closest_selected_samples, _ = pairwise_distances_argmin_min(kmeans_selected_samples.cluster_centers_, cluster_points)
+                        export.append(cluster_points.index[closest_selected_samples])
+                    else:
+                        # fewer samples than requested: keep the whole cluster
+                        export.append(cluster_points.index)
+                # list indexes of selected samples for colored plot
+                te = [item for sublist in export for item in sublist]
+                # display a matrix of selected samples
+                scatter_column.write(pd.DataFrame(export).T)
+            # convert cluster number to text for optimized coloring
+            kmeans_samples.labels_ = kmeans_samples.labels_.astype(str)
+            for j in te:
+                kmeans_samples.labels_[pc_data.index.get_loc(j)] = 'selected'
+            # plot the pc with colored clusters and selected samples
+            graph_selected = px.scatter(data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800, color=kmeans_samples.labels_, hover_name=pc_data.index, title="PC projection with K-Means Clusters and selected samples")
+            plot = scatter_column.plotly_chart(graph_selected)
+            # button to export the names of selected samples - by cluster if random - in a csv
+            if scatter_column.button('Export'):
+                pd.DataFrame(export).T.to_csv('./data/Samples_for_Chemical_Analysis.csv')
+        elif type_cluster == 'umap':
+            pass  # UMAP-based clustering is not implemented yet
+    else:
+        # this message belongs to the enclosing `if`, as before the refactoring, not to the clustering choice
+        scatter_column.write("_Please Choose a file_")
 
 # graphical delimiter
 st.write("---")
diff --git a/application_functions.py b/application_functions.py
index f311192..5c62ca9 100644
--- a/application_functions.py
+++ b/application_functions.py
@@ -4,6 +4,7 @@ import pandas as pd
 from sklearn.decomposition import PCA
 from sklearn.preprocessing import StandardScaler
 import csv
+from umap.umap_ import UMAP
 
 # local CSS
 ## load the custom CSS in the style folder
@@ -19,8 +20,8 @@ def find_delimiter(filename):
         delimiter = sniffer.sniff(fp.read(5000)).delimiter
     return delimiter
 
-# PCA function for the Sample Selection module
-def pca_maker(data_import):
+# detection of columns categories and scaling
+def col_cat(data_import):
     # detect numerical and categorical columns in the csv
     numerical_columns_list = []
     categorical_columns_list = []
@@ -45,6 +46,25 @@ def pca_maker(data_import):
     # Scale the numerical data
     scaler = StandardScaler()
     scaled_values = scaler.fit_transform(numerical_data)
+    return numerical_data, categorical_data, scaled_values
+
+# Build a UMAP embedding for the Sample Selection module
+def umap_maker(data_import):
+    """Return (input data + UMAP columns, categorical column names, UMAP column names)."""
+    numerical_data, categorical_data, scaled_values = col_cat(data_import)
+    # fit the reducer on the scaled numerical values, then project those same values
+    reducer = UMAP(random_state=42, n_neighbors=30, n_components=4).fit(scaled_values)
+    embedding = reducer.transform(scaled_values)
+    # one "UMAP_<k>" label per embedding column, numbered from 1
+    new_column_names = ["UMAP_" + str(k) for k in range(1, embedding.shape[1] + 1)]
+    # label the embedding with the sample index and the names built above
+    umap_data = pd.DataFrame(embedding, index=numerical_data.index, columns=new_column_names)
+    # append the embedding to the imported table, column-wise
+    output = pd.concat([data_import, umap_data], axis=1)
+    return output, list(categorical_data.columns), new_column_names
+# PCA function for the Sample Selection module
+def pca_maker(data_import):
+    numerical_data, categorical_data, scaled_values = col_cat(data_import)
     # Compute a 6 components PCA on scaled values
     pca = PCA(n_components=6)
     pca_fit = pca.fit(scaled_values)
-- 
GitLab