plot clusters and colored selected samples in both center and random options

5e548c9f · Nicolas Barthes · 5b62e1dc · 5e548c9f · 5e548c9f
Commit 5e548c9f authored 1 year ago by Nicolas Barthes
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="" vcs="Git" />
+  </component>
+</project>
\ No newline at end of file
--- a/app.py
+++ b/app.py
@@ -48,8 +48,7 @@ with st.container():
            col = 0
        else:
            col = False
-    import_button = settings_column.button('Import')
-    if import_button:
        data_import = pd.read_csv(sselectx_csv, sep=psep, index_col=col)
        # compute PCA - pca_maker function in application_functions.py
        pca_data, cat_cols, pca_cols = pca_maker(data_import)
@@ -58,11 +57,11 @@ with st.container():
        pca_2 = settings_column.selectbox("Second Principle Component", options=pca_cols, index=1)
        # if categorical variables exist, add 2 select lists to choose the categorical variables to color the PCA
        if cat_cols[0] == "no categories":
-            scatter_column.plotly_chart(px.scatter(data_frame=pca_data, x=pca_1, y=pca_2, template="simple_white", height=800, hover_name=pca_data.index, title="PCA plot of sample spectra"))
+            plot_pca = scatter_column.plotly_chart(px.scatter(data_frame=pca_data, x=pca_1, y=pca_2, template="simple_white", height=800, hover_name=pca_data.index, title="PCA plot of sample spectra"))
        else:
            categorical_variable = settings_column.selectbox("Variable Select", options = cat_cols)
            categorical_variable_2 = settings_column.selectbox("Second Variable Select (hover data)", options = cat_cols)
-            scatter_column.plotly_chart(px.scatter(data_frame=pca_data, x=pca_1, y=pca_2, template="simple_white", height=800, color=categorical_variable, hover_data = [categorical_variable_2], hover_name=pca_data.index, title="PCA plot of sample spectra"))
+            plot_pca = scatter_column.plotly_chart(px.scatter(data_frame=pca_data, x=pca_1, y=pca_2, template="simple_white", height=800, color=categorical_variable, hover_data = [categorical_variable_2], hover_name=pca_data.index, title="PCA plot of sample spectra"))
        #K-Means
        ## K-Means choose number of clusters
        wcss_samples = []
@@ -72,14 +71,14 @@ with st.container():
            kmeans_samples = km(n_clusters = i, init = 'k-means++', random_state = 42)
            kmeans_samples.fit(pca_data.loc[:,[pca_1,pca_2]])
            wcss_samples.append(kmeans_samples.inertia_)
-        settings_column.plotly_chart(px.line(x=clusters_sample, y=wcss_samples, title="K-Means clusters number selection", width=200))
+        settings_column.plotly_chart(px.line(x=clusters_sample, y=wcss_samples, title="K-Means clusters nb sel", width=200))
-        # scatter_column.plotly_chart(px.line(x=clusters_sample, y=wcss_samples, title="K-Means clusters number selection"))
        ## Draw clustering
        nb_select = settings_column.slider("Choose cluster number (K-Means)", min_value=2, max_value=cluster_max, value=5, format="%i")
        kmeans_samples = km(n_clusters=nb_select, random_state=42)
        kmeans_samples.fit(pca_data.loc[:,[pca_1,pca_2]])
-        # kmeans_samples.labels_
+        # plot the pca with clustering only (no selected samples)
-        plot = scatter_column.plotly_chart(px.scatter(data_frame=pca_data, x=pca_1, y=pca_2, template="simple_white", height=800, color=kmeans_samples.labels_, hover_name=pca_data.index, title="PCA projection with K-Means Clusters"))
+        # graph = px.scatter(data_frame=pca_data, x=pca_1, y=pca_2, template="simple_white", height=800, color=kmeans_samples.labels_, hover_name=pca_data.index, title="PCA projection with K-Means Clusters")
+        # plot = scatter_column.plotly_chart(graph)
        # choose between cluster centered sample and random samples
        selection = settings_column.select_slider('Centered samples or random ones', options=['center','random'])
        export = []
@@ -89,25 +88,30 @@ with st.container():
            closest, _ = pairwise_distances_argmin_min(kmeans_samples.cluster_centers_, pca_data.loc[:,[pca_1,pca_2]])
            scatter_column.dataframe(pca_data.loc[pca_data.index[closest],[pca_1,pca_2]], use_container_width=True)
            export.append(pca_data.loc[pca_data.index[closest],[pca_1,pca_2]].index.T)
-            # plot.empty()
+            # list indexes of selected samples for colored plot
+            te = pca_data.loc[pca_data.index[closest],[pca_1,pca_2]].index.values.tolist()
        elif selection == 'random':
            selection_number = settings_column.number_input('How many samples per cluster?', step=1, value = 3)
            for i in np.unique(kmeans_samples.labels_):
                if len(pd.DataFrame(pca_data.loc[pca_data.index[kmeans_samples.labels_==i],[pca_1,pca_2]])) >= selection_number:
-                    # scatter_column.write('cluster number - ')
-                    # scatter_column.write(i)
-                    # scatter_column.write('_samples in this cluster_')
-                    # scatter_column.write(len(pd.DataFrame(pca_data.loc[pca_data.index[kmeans_samples.labels_==i],[pca_1,pca_2]])))
-                    # scatter_column.dataframe(pd.DataFrame(pca_data.loc[pca_data.index[kmeans_samples.labels_==i],[pca_1,pca_2]]).sample(n=selection_number))
                    export.append(pca_data.loc[pca_data.index[kmeans_samples.labels_==i]].sample(n=selection_number).index)
                else:
-                    # scatter_column.write('cluster number - ')
-                    # scatter_column.write(i)
-                    # scatter_column.write("_whole cluster (not enough samples)_")
-                    # scatter_column.write(len(pd.DataFrame(pca_data.loc[pca_data.index[kmeans_samples.labels_==i],[pca_1,pca_2]])))
-                    # scatter_column.dataframe(pd.DataFrame(pca_data.loc[pca_data.index[kmeans_samples.labels_==i],[pca_1,pca_2]]))
                    export.append(pca_data.loc[pca_data.index[kmeans_samples.labels_==i]].index)
+            # list indexes of selected samples for colored plot
+            te = []
+            for sublist in export:
+                for item in sublist:
+                    te.append(item)
+            # display a matrix of selected samples
            scatter_column.write(pd.DataFrame(export).T)
+        # convert cluster labels to strings so that 'selected' can be written into the same array and each label is colored as a separate category
+        kmeans_samples.labels_ = kmeans_samples.labels_.astype(str)
+        for j in te:
+            kmeans_samples.labels_[pca_data.index.get_loc(j)] = 'selected'
+        # plot the PCA with colored clusters and highlighted selected samples
+        graph_selected = px.scatter(data_frame=pca_data, x=pca_1, y=pca_2, template="simple_white", height=800, color=kmeans_samples.labels_, hover_name=pca_data.index, title="PCA projection with K-Means Clusters and selected samples")
+        plot = scatter_column.plotly_chart(graph_selected)
+        # button to export the names of selected samples - by cluster if random - in a csv
        if scatter_column.button('Export'):
            pd.DataFrame(export).T.to_csv('./data/Samples_for_Chemical_Analysis.csv')
    else: