Skip to content
Snippets Groups Projects
Commit 5e548c9f authored by Nicolas Barthes
Browse files

plot clusters and colored selected samples in both center and random options

parent 5b62e1dc
No related branches found
No related tags found
No related merge requests found
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="Git" />
</component>
</project>
\ No newline at end of file
...@@ -48,8 +48,7 @@ with st.container(): ...@@ -48,8 +48,7 @@ with st.container():
col = 0 col = 0
else: else:
col = False col = False
import_button = settings_column.button('Import')
if import_button:
data_import = pd.read_csv(sselectx_csv, sep=psep, index_col=col) data_import = pd.read_csv(sselectx_csv, sep=psep, index_col=col)
# compute PCA - pca_maker function in application_functions.py # compute PCA - pca_maker function in application_functions.py
pca_data, cat_cols, pca_cols = pca_maker(data_import) pca_data, cat_cols, pca_cols = pca_maker(data_import)
...@@ -58,11 +57,11 @@ with st.container(): ...@@ -58,11 +57,11 @@ with st.container():
pca_2 = settings_column.selectbox("Second Principle Component", options=pca_cols, index=1) pca_2 = settings_column.selectbox("Second Principle Component", options=pca_cols, index=1)
# if categorical variables exist, add 2 select lists to choose the categorical variables to color the PCA # if categorical variables exist, add 2 select lists to choose the categorical variables to color the PCA
if cat_cols[0] == "no categories": if cat_cols[0] == "no categories":
scatter_column.plotly_chart(px.scatter(data_frame=pca_data, x=pca_1, y=pca_2, template="simple_white", height=800, hover_name=pca_data.index, title="PCA plot of sample spectra")) plot_pca = scatter_column.plotly_chart(px.scatter(data_frame=pca_data, x=pca_1, y=pca_2, template="simple_white", height=800, hover_name=pca_data.index, title="PCA plot of sample spectra"))
else: else:
categorical_variable = settings_column.selectbox("Variable Select", options = cat_cols) categorical_variable = settings_column.selectbox("Variable Select", options = cat_cols)
categorical_variable_2 = settings_column.selectbox("Second Variable Select (hover data)", options = cat_cols) categorical_variable_2 = settings_column.selectbox("Second Variable Select (hover data)", options = cat_cols)
scatter_column.plotly_chart(px.scatter(data_frame=pca_data, x=pca_1, y=pca_2, template="simple_white", height=800, color=categorical_variable, hover_data = [categorical_variable_2], hover_name=pca_data.index, title="PCA plot of sample spectra")) plot_pca = scatter_column.plotly_chart(px.scatter(data_frame=pca_data, x=pca_1, y=pca_2, template="simple_white", height=800, color=categorical_variable, hover_data = [categorical_variable_2], hover_name=pca_data.index, title="PCA plot of sample spectra"))
#K-Means #K-Means
## K-Means choose number of clusters ## K-Means choose number of clusters
wcss_samples = [] wcss_samples = []
...@@ -72,14 +71,14 @@ with st.container(): ...@@ -72,14 +71,14 @@ with st.container():
kmeans_samples = km(n_clusters = i, init = 'k-means++', random_state = 42) kmeans_samples = km(n_clusters = i, init = 'k-means++', random_state = 42)
kmeans_samples.fit(pca_data.loc[:,[pca_1,pca_2]]) kmeans_samples.fit(pca_data.loc[:,[pca_1,pca_2]])
wcss_samples.append(kmeans_samples.inertia_) wcss_samples.append(kmeans_samples.inertia_)
settings_column.plotly_chart(px.line(x=clusters_sample, y=wcss_samples, title="K-Means clusters number selection", width=200)) settings_column.plotly_chart(px.line(x=clusters_sample, y=wcss_samples, title="K-Means clusters nb sel", width=200))
# scatter_column.plotly_chart(px.line(x=clusters_sample, y=wcss_samples, title="K-Means clusters number selection"))
## Draw clustering ## Draw clustering
nb_select = settings_column.slider("Choose cluster number (K-Means)", min_value=2, max_value=cluster_max, value=5, format="%i") nb_select = settings_column.slider("Choose cluster number (K-Means)", min_value=2, max_value=cluster_max, value=5, format="%i")
kmeans_samples = km(n_clusters=nb_select, random_state=42) kmeans_samples = km(n_clusters=nb_select, random_state=42)
kmeans_samples.fit(pca_data.loc[:,[pca_1,pca_2]]) kmeans_samples.fit(pca_data.loc[:,[pca_1,pca_2]])
# kmeans_samples.labels_ # plot the pca with clustering only (no selected samples)
plot = scatter_column.plotly_chart(px.scatter(data_frame=pca_data, x=pca_1, y=pca_2, template="simple_white", height=800, color=kmeans_samples.labels_, hover_name=pca_data.index, title="PCA projection with K-Means Clusters")) # graph = px.scatter(data_frame=pca_data, x=pca_1, y=pca_2, template="simple_white", height=800, color=kmeans_samples.labels_, hover_name=pca_data.index, title="PCA projection with K-Means Clusters")
# plot = scatter_column.plotly_chart(graph)
# choose between cluster centered sample and random samples # choose between cluster centered sample and random samples
selection = settings_column.select_slider('Centered samples or random ones', options=['center','random']) selection = settings_column.select_slider('Centered samples or random ones', options=['center','random'])
export = [] export = []
...@@ -89,25 +88,30 @@ with st.container(): ...@@ -89,25 +88,30 @@ with st.container():
closest, _ = pairwise_distances_argmin_min(kmeans_samples.cluster_centers_, pca_data.loc[:,[pca_1,pca_2]]) closest, _ = pairwise_distances_argmin_min(kmeans_samples.cluster_centers_, pca_data.loc[:,[pca_1,pca_2]])
scatter_column.dataframe(pca_data.loc[pca_data.index[closest],[pca_1,pca_2]], use_container_width=True) scatter_column.dataframe(pca_data.loc[pca_data.index[closest],[pca_1,pca_2]], use_container_width=True)
export.append(pca_data.loc[pca_data.index[closest],[pca_1,pca_2]].index.T) export.append(pca_data.loc[pca_data.index[closest],[pca_1,pca_2]].index.T)
# plot.empty() # list indexes of selected samples for colored plot
te = pca_data.loc[pca_data.index[closest],[pca_1,pca_2]].index.values.tolist()
elif selection == 'random': elif selection == 'random':
selection_number = settings_column.number_input('How many samples per cluster?', step=1, value = 3) selection_number = settings_column.number_input('How many samples per cluster?', step=1, value = 3)
for i in np.unique(kmeans_samples.labels_): for i in np.unique(kmeans_samples.labels_):
if len(pd.DataFrame(pca_data.loc[pca_data.index[kmeans_samples.labels_==i],[pca_1,pca_2]])) >= selection_number: if len(pd.DataFrame(pca_data.loc[pca_data.index[kmeans_samples.labels_==i],[pca_1,pca_2]])) >= selection_number:
# scatter_column.write('cluster number - ')
# scatter_column.write(i)
# scatter_column.write('_samples in this cluster_')
# scatter_column.write(len(pd.DataFrame(pca_data.loc[pca_data.index[kmeans_samples.labels_==i],[pca_1,pca_2]])))
# scatter_column.dataframe(pd.DataFrame(pca_data.loc[pca_data.index[kmeans_samples.labels_==i],[pca_1,pca_2]]).sample(n=selection_number))
export.append(pca_data.loc[pca_data.index[kmeans_samples.labels_==i]].sample(n=selection_number).index) export.append(pca_data.loc[pca_data.index[kmeans_samples.labels_==i]].sample(n=selection_number).index)
else: else:
# scatter_column.write('cluster number - ')
# scatter_column.write(i)
# scatter_column.write("_whole cluster (not enough samples)_")
# scatter_column.write(len(pd.DataFrame(pca_data.loc[pca_data.index[kmeans_samples.labels_==i],[pca_1,pca_2]])))
# scatter_column.dataframe(pd.DataFrame(pca_data.loc[pca_data.index[kmeans_samples.labels_==i],[pca_1,pca_2]]))
export.append(pca_data.loc[pca_data.index[kmeans_samples.labels_==i]].index) export.append(pca_data.loc[pca_data.index[kmeans_samples.labels_==i]].index)
# list indexes of selected samples for colored plot
te = []
for sublist in export:
for item in sublist:
te.append(item)
# display a matrix of selected samples
scatter_column.write(pd.DataFrame(export).T) scatter_column.write(pd.DataFrame(export).T)
# convert cluster number to text for optimized coloring
kmeans_samples.labels_ = kmeans_samples.labels_.astype(str)
for j in te:
kmeans_samples.labels_[pca_data.index.get_loc(j)] = 'selected'
# plot the pca with colored clusters and selected samples
graph_selected = px.scatter(data_frame=pca_data, x=pca_1, y=pca_2, template="simple_white", height=800, color=kmeans_samples.labels_, hover_name=pca_data.index, title="PCA projection with K-Means Clusters and selected samples")
plot = scatter_column.plotly_chart(graph_selected)
# button to export the names of selected samples - by cluster if random - in a csv
if scatter_column.button('Export'): if scatter_column.button('Export'):
pd.DataFrame(export).T.to_csv('./data/Samples_for_Chemical_Analysis.csv') pd.DataFrame(export).T.to_csv('./data/Samples_for_Chemical_Analysis.csv')
else: else:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment