diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000000000000000000000000000000000000..35eb1ddfbbc029bcab630581847471d7f238ec53 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project version="4"> + <component name="VcsDirectoryMappings"> + <mapping directory="" vcs="Git" /> + </component> +</project> \ No newline at end of file diff --git a/app.py b/app.py index 6cc896d1fb8ea68b59d881c7aba8ad3cac75aafe..0bac962ed2f64a1255e59383a9eee65725291175 100644 --- a/app.py +++ b/app.py @@ -48,8 +48,7 @@ with st.container(): col = 0 else: col = False - import_button = settings_column.button('Import') - if import_button: + data_import = pd.read_csv(sselectx_csv, sep=psep, index_col=col) # compute PCA - pca_maker function in application_functions.py pca_data, cat_cols, pca_cols = pca_maker(data_import) @@ -58,11 +57,11 @@ with st.container(): pca_2 = settings_column.selectbox("Second Principle Component", options=pca_cols, index=1) # if categorical variables exist, add 2 select lists to choose the categorical variables to color the PCA if cat_cols[0] == "no categories": - scatter_column.plotly_chart(px.scatter(data_frame=pca_data, x=pca_1, y=pca_2, template="simple_white", height=800, hover_name=pca_data.index, title="PCA plot of sample spectra")) + plot_pca = scatter_column.plotly_chart(px.scatter(data_frame=pca_data, x=pca_1, y=pca_2, template="simple_white", height=800, hover_name=pca_data.index, title="PCA plot of sample spectra")) else: categorical_variable = settings_column.selectbox("Variable Select", options = cat_cols) categorical_variable_2 = settings_column.selectbox("Second Variable Select (hover data)", options = cat_cols) - scatter_column.plotly_chart(px.scatter(data_frame=pca_data, x=pca_1, y=pca_2, template="simple_white", height=800, color=categorical_variable, hover_data = [categorical_variable_2], hover_name=pca_data.index, title="PCA plot of sample spectra")) + plot_pca = 
scatter_column.plotly_chart(px.scatter(data_frame=pca_data, x=pca_1, y=pca_2, template="simple_white", height=800, color=categorical_variable, hover_data = [categorical_variable_2], hover_name=pca_data.index, title="PCA plot of sample spectra")) #K-Means ## K-Means choose number of clusters wcss_samples = [] @@ -72,14 +71,14 @@ with st.container(): kmeans_samples = km(n_clusters = i, init = 'k-means++', random_state = 42) kmeans_samples.fit(pca_data.loc[:,[pca_1,pca_2]]) wcss_samples.append(kmeans_samples.inertia_) - settings_column.plotly_chart(px.line(x=clusters_sample, y=wcss_samples, title="K-Means clusters number selection", width=200)) - # scatter_column.plotly_chart(px.line(x=clusters_sample, y=wcss_samples, title="K-Means clusters number selection")) + settings_column.plotly_chart(px.line(x=clusters_sample, y=wcss_samples, title="K-Means clusters nb sel", width=200)) ## Draw clustering nb_select = settings_column.slider("Choose cluster number (K-Means)", min_value=2, max_value=cluster_max, value=5, format="%i") kmeans_samples = km(n_clusters=nb_select, random_state=42) kmeans_samples.fit(pca_data.loc[:,[pca_1,pca_2]]) - # kmeans_samples.labels_ - plot = scatter_column.plotly_chart(px.scatter(data_frame=pca_data, x=pca_1, y=pca_2, template="simple_white", height=800, color=kmeans_samples.labels_, hover_name=pca_data.index, title="PCA projection with K-Means Clusters")) + # plot the pca with clustering only (no selected samples) + # graph = px.scatter(data_frame=pca_data, x=pca_1, y=pca_2, template="simple_white", height=800, color=kmeans_samples.labels_, hover_name=pca_data.index, title="PCA projection with K-Means Clusters") + # plot = scatter_column.plotly_chart(graph) # choose between cluster centered sample and random samples selection = settings_column.select_slider('Centered samples or random ones', options=['center','random']) export = [] @@ -89,25 +88,30 @@ with st.container(): closest, _ = 
pairwise_distances_argmin_min(kmeans_samples.cluster_centers_, pca_data.loc[:,[pca_1,pca_2]]) scatter_column.dataframe(pca_data.loc[pca_data.index[closest],[pca_1,pca_2]], use_container_width=True) export.append(pca_data.loc[pca_data.index[closest],[pca_1,pca_2]].index.T) - # plot.empty() + # list indexes of selected samples for colored plot + te = pca_data.loc[pca_data.index[closest],[pca_1,pca_2]].index.values.tolist() elif selection == 'random': selection_number = settings_column.number_input('How many samples per cluster?', step=1, value = 3) for i in np.unique(kmeans_samples.labels_): if len(pd.DataFrame(pca_data.loc[pca_data.index[kmeans_samples.labels_==i],[pca_1,pca_2]])) >= selection_number: - # scatter_column.write('cluster number - ') - # scatter_column.write(i) - # scatter_column.write('_samples in this cluster_') - # scatter_column.write(len(pd.DataFrame(pca_data.loc[pca_data.index[kmeans_samples.labels_==i],[pca_1,pca_2]]))) - # scatter_column.dataframe(pd.DataFrame(pca_data.loc[pca_data.index[kmeans_samples.labels_==i],[pca_1,pca_2]]).sample(n=selection_number)) export.append(pca_data.loc[pca_data.index[kmeans_samples.labels_==i]].sample(n=selection_number).index) else: - # scatter_column.write('cluster number - ') - # scatter_column.write(i) - # scatter_column.write("_whole cluster (not enough samples)_") - # scatter_column.write(len(pd.DataFrame(pca_data.loc[pca_data.index[kmeans_samples.labels_==i],[pca_1,pca_2]]))) - # scatter_column.dataframe(pd.DataFrame(pca_data.loc[pca_data.index[kmeans_samples.labels_==i],[pca_1,pca_2]])) export.append(pca_data.loc[pca_data.index[kmeans_samples.labels_==i]].index) + # list indexes of selected samples for colored plot + te = [] + for sublist in export: + for item in sublist: + te.append(item) + # display a matrix of selected samples scatter_column.write(pd.DataFrame(export).T) + # convert cluster number to text for optimized coloring + kmeans_samples.labels_ = kmeans_samples.labels_.astype(str) + for j in te: 
+ kmeans_samples.labels_[pca_data.index.get_loc(j)] = 'selected' + # plot the pca with colored clusters and selected samples + graph_selected = px.scatter(data_frame=pca_data, x=pca_1, y=pca_2, template="simple_white", height=800, color=kmeans_samples.labels_, hover_name=pca_data.index, title="PCA projection with K-Means Clusters and selected samples") + plot = scatter_column.plotly_chart(graph_selected) + # button to export the names of selected samples - by cluster if random - in a csv if scatter_column.button('Export'): pd.DataFrame(export).T.to_csv('./data/Samples_for_Chemical_Analysis.csv') else: