diff --git a/Class_Mod/KMEANS_.py b/Class_Mod/KMEANS_.py index 526a43597155183de2241e0fd0b850f8b4af13ad..60d77ae9f702093083095064c2b647326faa6c90 100644 --- a/Class_Mod/KMEANS_.py +++ b/Class_Mod/KMEANS_.py @@ -21,4 +21,5 @@ class Sk_Kmeans: model.fit(self.x) yp = model.predict(self.x)+1 clu = [f'cluster#{i}' for i in yp] - return self.x, clu \ No newline at end of file + + return self.x, clu, model.cluster_centers_ \ No newline at end of file diff --git a/pages/1-samples_selection.py b/pages/1-samples_selection.py index 2edb875368402305ff6bcbf9bfdbe77a37fffecc..a75dba662ab993f73becb488e6243f919c086a7e 100644 --- a/pages/1-samples_selection.py +++ b/pages/1-samples_selection.py @@ -68,6 +68,9 @@ container2 = st.container(border=True) container2.header("Exploratory Data Analysis-Multivariable Data Analysis", divider='blue') scores, loadings, pc = st.columns([2, 3, 0.5]) influence, hotelling, qexp = st.columns([2, 2, 1]) +st.header('Selected samples for chemical analysis') +selected_s, selected_samples_metd = st.columns([3, 3]) +selected_s.write('Samples scores') dim_red_methods=['', 'PCA','UMAP', 'NMF'] # List of dimensionality reduction algos cluster_methods = ['', 'Kmeans','HDBSCAN', 'AP'] # List of clustering algos @@ -106,7 +109,7 @@ if not t.empty: cl_model = Sk_Kmeans(tcr, max_clusters = 30) fig2 = px.scatter(cl_model.inertia_.T, y = 'inertia') scores.plotly_chart(fig2) - data, labels = cl_model.fit_optimal(nclusters = ncluster) + data, labels, clu_centers = cl_model.fit_optimal(nclusters = ncluster) elif clus_method == cluster_methods[2]: optimized_hdbscan = Hdbscan(model.scores_raw_) @@ -114,7 +117,51 @@ if not t.empty: ##### Plots -## Scores + +##################################################################################################### +selec_strategy = ['center','random'] +samples_df_chem = pd.DataFrame +selected_samples = [] +selected_samples_idx = [] + +if labels: + selection = scores.radio('Select samples selection strategy:', options = selec_strategy) +#################### selection strategy to be corrected + if selection == selec_strategy[0]: + # list samples at clusters centers - Use sklearn.metrics.pairwise_distances_argmin if you want more than 1 sample per cluster + closest, _ = pairwise_distances_argmin_min(clu_centers, tcr) + selected_samples_idx = list(closest) + elif selection == selec_strategy[1]: + selection_number = scores.number_input('How many samples per cluster?', min_value = 1, step=1, value = 3) + for i in np.unique(labels): + C = np.where(np.array(labels) ==i)[0] + if C.shape[0] >= selection_number: + #scores.write(list(tcr.index)[labels== i]) + km2 = KMeans(n_clusters = selection_number) + km2.fit(tcr.iloc[C,:]) + clos, _ = pairwise_distances_argmin_min(km2.cluster_centers_, tcr.iloc[C,:]) + selected_samples_idx2 = list(clos) + selected_samples_idx.extend(tcr.iloc[C,:].index[selected_samples_idx2]) + # selected_samples_idx.extend(tcr.iloc[C,:].sample(n=selection_number).index.to_list()) + else: + selected_samples_idx.extend(tcr.iloc[C,:].index.to_list()) + # list indexes of selected samples for colored plot + +if labels: + if selected_samples_idx: + sam = pd.DataFrame({'cluster':np.array(labels)[selected_samples_idx], + 'index': spectra.index[selected_samples_idx]}) + selected_s.write(sam) + + if not meta_data.empty: + selected_samples_metd.write('Corresponding meta-data') + meta = meta_data.iloc[selected_samples_idx,:] + meta['cluster'] = np.array(labels)[selected_samples_idx] + meta['index'] = spectra.index[selected_samples_idx] + selected_samples_metd.write(meta) + + + ## Scores if not t.empty: with scores: st.write('Scores plot') @@ -150,6 +197,12 @@ if not t.empty: else: fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3) fig.update_traces(marker=dict(size=4)) + + if selected_samples_idx: + tt = tcr.iloc[selected_samples_idx,:] + fig.add_scatter3d(x = tt.loc[:,axis1], y = tt.loc[:,axis2], + z = tt.loc[:,axis3], mode ='markers', marker = dict(size = 7, color = 'black'), + name = 'selected samples') st.plotly_chart(fig) @@ -181,4 +234,12 @@ if not spectra.empty: hotelling = dr_model.hotelling_ fig = px.scatter(t, x=hotelling[ax2], y=residuals[ax2]).update_layout(xaxis_title="T²",yaxis_title="Residuals") - st.plotly_chart(fig) \ No newline at end of file + st.plotly_chart(fig) + + + + + + + +