From 3232d7953c81a9a56bd6141724735adb7fdc0417 Mon Sep 17 00:00:00 2001 From: Nicolas Barthes <nicolas.barthes@cnrs.fr> Date: Fri, 22 Mar 2024 16:45:26 +0100 Subject: [PATCH] prepare code for other clusterings than k-means (UMAP/HDBSCAN) --- app.py | 163 +++++++++++++++++++++------------------ application_functions.py | 2 +- 2 files changed, 88 insertions(+), 77 deletions(-) diff --git a/app.py b/app.py index 426b733..67673ee 100644 --- a/app.py +++ b/app.py @@ -51,7 +51,7 @@ with st.container(): data_import = pd.read_csv(sselectx_csv, sep=psep, index_col=col) # Select type of plot - plot_type=['pca','umap'] + plot_type=['', 'pca','umap'] type_plot = settings_column.selectbox("Dimensional reduction: ", options=plot_type, key=37) # compute UMAP - umap_maker in application_functions.py if type_plot == 'umap': @@ -59,82 +59,93 @@ with st.container(): # compute PCA - pca_maker function in application_functions.py if type_plot == 'pca': pc_data, cat_cols, pc_cols = pca_maker(data_import) - # add 2 select lists to choose which component to plot - pc_1 = settings_column.selectbox("First Principle Component", options=pc_cols, index=0) - pc_2 = settings_column.selectbox("Second Principle Component", options=pc_cols, index=1) - # if categorical variables exist, add 2 select lists to choose the categorical variables to color the PCA - if cat_cols[0] == "no categories": - plot_pc = scatter_column.plotly_chart(px.scatter(data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800, hover_name=pc_data.index, title="PC plot of sample spectra")) - else: - categorical_variable = settings_column.selectbox("Variable Select", options = cat_cols) - categorical_variable_2 = settings_column.selectbox("Second Variable Select (hover data)", options = cat_cols) - plot_pc = scatter_column.plotly_chart(px.scatter(data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800, color=categorical_variable, hover_data = [categorical_variable_2], hover_name=pc_data.index, title="PC plot of sample spectra")) - # Clustering method - cluster_type = ['k-means', 'umap'] - type_cluster = settings_column.selectbox("Clustering method: ", options=cluster_type, key=38) - if type_cluster == 'k-means': - #K-Means - ## K-Means choose number of clusters - wcss_samples = [] - cluster_max = settings_column.slider("Max clusters (K-Means)", min_value=2, max_value=100, value=50, format="%i") - clusters_sample = np.arange(2, cluster_max) - for i in clusters_sample: - kmeans_samples = km(n_clusters = i, init = 'k-means++', random_state = 42) + if type_plot == 'umap' or type_plot == 'pca': + # add 2 select lists to choose which component to plot + pc_1 = settings_column.selectbox("First Principle Component", options=pc_cols, index=0) + pc_2 = settings_column.selectbox("Second Principle Component", options=pc_cols, index=1) + # if categorical variables exist, add 2 select lists to choose the categorical variables to color the PCA + if cat_cols[0] == "no categories": + plot_pc = scatter_column.plotly_chart(px.scatter(data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800, hover_name=pc_data.index, title="PC plot of sample spectra")) + else: + categorical_variable = settings_column.selectbox("Variable Select", options = cat_cols) + categorical_variable_2 = settings_column.selectbox("Second Variable Select (hover data)", options = cat_cols) + plot_pc = scatter_column.plotly_chart(px.scatter(data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800, color=categorical_variable, hover_data = [categorical_variable_2], hover_name=pc_data.index, title="PC plot of sample spectra")) + # Clustering method + cluster_type = ['', 'k-means'] + # cluster_type = ['k-means', 'umap'] # uncomment if more clustering algorithms available + type_cluster = settings_column.selectbox("Clustering method: ", options=cluster_type, key=38) + # clustering via K-Means + if type_cluster == 'k-means': + #K-Means + ## K-Means choose number of clusters + wcss_samples = [] + cluster_max = settings_column.slider("Max clusters (K-Means)", min_value=2, max_value=100, value=50, format="%i") + clusters_sample = np.arange(2, cluster_max) + for i in clusters_sample: + kmeans_samples = km(n_clusters = i, init = 'k-means++', random_state = 42) + kmeans_samples.fit(pc_data.loc[:,[pc_1,pc_2]]) + wcss_samples.append(kmeans_samples.inertia_) + settings_column.plotly_chart(px.line(x=clusters_sample, y=wcss_samples, title="K-Means clusters nb sel", width=200)) + ## Draw clustering + nb_select = settings_column.slider("Choose cluster number (K-Means)", min_value=2, max_value=cluster_max, value=5, format="%i") + kmeans_samples = km(n_clusters=nb_select, random_state=42) kmeans_samples.fit(pc_data.loc[:,[pc_1,pc_2]]) - wcss_samples.append(kmeans_samples.inertia_) - settings_column.plotly_chart(px.line(x=clusters_sample, y=wcss_samples, title="K-Means clusters nb sel", width=200)) - ## Draw clustering - nb_select = settings_column.slider("Choose cluster number (K-Means)", min_value=2, max_value=cluster_max, value=5, format="%i") - kmeans_samples = km(n_clusters=nb_select, random_state=42) - kmeans_samples.fit(pc_data.loc[:,[pc_1,pc_2]]) - # plot the pc with clustering only (no selected samples) - # graph = px.scatter(data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800, color=kmeans_samples.labels_, hover_name=pc_data.index, title="PC projection with K-Means Clusters") - # plot = scatter_column.plotly_chart(graph) - # choose between cluster centered sample and random samples - selection = settings_column.select_slider('Centered samples or random ones', options=['center','random']) - export = [] - scatter_column.write("Selected samples for chemical analysis:") - if selection == 'center': - # list samples at clusters centers - Use sklearn.metrics.pairwise_distances_argmin if you want more than 1 sample per cluster - closest, _ = pairwise_distances_argmin_min(kmeans_samples.cluster_centers_, pc_data.loc[:,[pc_1,pc_2]]) - scatter_column.dataframe(pc_data.loc[pc_data.index[closest],[pc_1,pc_2]].index, use_container_width=False) - export.append(pc_data.loc[pc_data.index[closest],[pc_1,pc_2]].index.T) - # list indexes of selected samples for colored plot - te = pc_data.loc[pc_data.index[closest],[pc_1,pc_2]].index.values.tolist() - elif selection == 'random': - selection_number = settings_column.number_input('How many samples per cluster?', step=1, value = 3) - for i in np.unique(kmeans_samples.labels_): - if len(pd.DataFrame(pc_data.loc[pc_data.index[kmeans_samples.labels_==i],[pc_1,pc_2]])) >= selection_number: - # export.append(pc_data.loc[pc_data.index[kmeans_samples.labels_==i]].sample(n=selection_number).index) - # another k-means to cluster in 'selection_number' clusters and random to ensure the selected samples are far from each other in each cluster - kmeans_selected_samples = km(n_clusters=selection_number, random_state=42) - kmeans_selected_samples.fit(pc_data.loc[pc_data.index[kmeans_samples.labels_==i],[pc_1,pc_2]]) - closest_selected_samples, _ = pairwise_distances_argmin_min(kmeans_selected_samples.cluster_centers_, pc_data.loc[:,[pc_1,pc_2]]) - export.append(pc_data.loc[pc_data.index[closest_selected_samples],[pc_1,pc_2]].index) - else: - export.append(pc_data.loc[pc_data.index[kmeans_samples.labels_==i]].index) - # list indexes of selected samples for colored plot - te = [] - for sublist in export: - for item in sublist: - te.append(item) - # display a matrix of selected samples - scatter_column.write(pd.DataFrame(export).T) - # convert cluster number to text for optimized coloring - kmeans_samples.labels_ = kmeans_samples.labels_.astype(str) - for j in te: - kmeans_samples.labels_[pc_data.index.get_loc(j)] = 'selected' - # plot de pc with colored clusters and selected samples - graph_selected = px.scatter(data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800, color=kmeans_samples.labels_, hover_name=pc_data.index, title="PC projection with K-Means Clusters and selected samples") - plot = scatter_column.plotly_chart(graph_selected) - # button to export the names of selected samples - by cluster if random - in a csv - if scatter_column.button('Export'): - pd.DataFrame(export).T.to_csv('./data/Samples_for_Chemical_Analysis.csv') - else: - scatter_column.write("_Please Choose a file_") - if type_cluster == 'umap': - pass - + # plot the pc with clustering only (no selected samples) + # graph = px.scatter(data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800, color=kmeans_samples.labels_, hover_name=pc_data.index, title="PC projection with K-Means Clusters") + # plot = scatter_column.plotly_chart(graph) + # choose between cluster centered sample and random samples + selection = settings_column.select_slider('Centered samples or random ones', options=['center','random']) + export = [] + scatter_column.write("Selected samples for chemical analysis:") + if selection == 'center': + # list samples at clusters centers - Use sklearn.metrics.pairwise_distances_argmin if you want more than 1 sample per cluster + closest, _ = pairwise_distances_argmin_min(kmeans_samples.cluster_centers_, pc_data.loc[:,[pc_1,pc_2]]) + scatter_column.dataframe(pc_data.loc[pc_data.index[closest],[pc_1,pc_2]].index, use_container_width=False) + export.append(pc_data.loc[pc_data.index[closest],[pc_1,pc_2]].index.T) + # list indexes of selected samples for colored plot + te = pc_data.loc[pc_data.index[closest],[pc_1,pc_2]].index.values.tolist() + elif selection == 'random': + selection_number = settings_column.number_input('How many samples per cluster?', step=1, value = 3) + for i in np.unique(kmeans_samples.labels_): + if len(pd.DataFrame(pc_data.loc[pc_data.index[kmeans_samples.labels_==i],[pc_1,pc_2]])) >= selection_number: + # export.append(pc_data.loc[pc_data.index[kmeans_samples.labels_==i]].sample(n=selection_number).index) + # another k-means to cluster in 'selection_number' clusters and random to ensure the selected samples are far from each other in each cluster + kmeans_selected_samples = km(n_clusters=selection_number, random_state=42) + kmeans_selected_samples.fit(pc_data.loc[pc_data.index[kmeans_samples.labels_==i],[pc_1,pc_2]]) + closest_selected_samples, _ = pairwise_distances_argmin_min(kmeans_selected_samples.cluster_centers_, pc_data.loc[:,[pc_1,pc_2]]) + export.append(pc_data.loc[pc_data.index[closest_selected_samples],[pc_1,pc_2]].index) + else: + export.append(pc_data.loc[pc_data.index[kmeans_samples.labels_==i]].index) + # list indexes of selected samples for colored plot + te = [] + for sublist in export: + for item in sublist: + te.append(item) + # display a matrix of selected samples + scatter_column.write(pd.DataFrame(export).T) + # convert cluster number to text for optimized coloring + kmeans_samples.labels_ = kmeans_samples.labels_.astype(str) + for j in te: + kmeans_samples.labels_[pc_data.index.get_loc(j)] = 'selected' + # plot de pc with colored clusters and selected samples + graph_selected = px.scatter(data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800, color=kmeans_samples.labels_, hover_name=pc_data.index, title="PC projection with K-Means Clusters and selected samples") + plot = scatter_column.plotly_chart(graph_selected) + # button to export the names of selected samples - by cluster if random - in a csv + if scatter_column.button('Export'): + pd.DataFrame(export).T.to_csv('./data/Samples_for_Chemical_Analysis.csv') + else: + scatter_column.write("_Please Choose a file_") + # clustering via UMAP / HDBSCAN + if type_cluster == 'umap': + import hdbscan + # plot de pc with colored clusters and selected samples + # graph_selected = px.scatter(data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800, color=kmeans_samples.labels_, hover_name=pc_data.index, title="PC projection with K-Means Clusters and selected samples") + # plot = scatter_column.plotly_chart(graph_selected) + scatter_column.dataframe(pc_data) + labels = hdbscan.HDBSCAN(min_samples=4, min_cluster_size=10,).fit_predict(pc_data.loc[:,[pc_1,pc_2]]) + clustered = (labels >= 0) + graph_clustered = plt.scatter(standard_embedding[clustered, 0], standard_embedding[clustered, 1], c=labels[clustered], s=0.1, cmap='Spectral') + plot = scatter_column.plotly_chart(graph_selected) # graphical delimiter st.write("---") # Model creation module diff --git a/application_functions.py b/application_functions.py index 5c62ca9..364b108 100644 --- a/application_functions.py +++ b/application_functions.py @@ -51,7 +51,7 @@ def col_cat(data_import): # UMAP function for the Sample Selection module def umap_maker(data_import): numerical_data, categorical_data, scaled_values = col_cat(data_import) - umap_func = UMAP(random_state=42, n_neighbors=30, n_components=4) + umap_func = UMAP(random_state=42, n_neighbors=20, n_components=4, min_dist=0.0,) umap_fit = umap_func.fit(scaled_values) umap_data = umap_fit.transform(scaled_values) umap_data = pd.DataFrame(umap_data, index=numerical_data.index) -- GitLab