diff --git a/src/Class_Mod/PCA_.py b/src/Class_Mod/PCA_.py
index fc0d430828167f12f6d4185bc3b576b771a8782b..0d2afdb2d00add778fdfdd2f1a56e34e57886e5f 100644
--- a/src/Class_Mod/PCA_.py
+++ b/src/Class_Mod/PCA_.py
@@ -32,13 +32,10 @@ class LinearPCA:
             # Matrix reconstruction- prediction
             self._xp[i] = np.dot(self._t[:,:i+1], self._p.T[:i+1,:])
 
-            # Q residuals: Q residuals represent the magnitude of the variation remaining in each sample after projection through the model
-            self._qres[i] = np.diag(np.subtract(self.__x, self._xp[i])@ np.subtract(self.__x, self._xp[i]).T)
-            self.T2[i] = np.diag(self._t[:,:i+1] @ np.transpose(self._t[:,:i+1]))
+
+            #self.T2[i] = np.diag(self._t[:,:i+1] @ np.transpose(self._t[:,:i+1]))
 
-            # Laverage
-            Hat = self._t[:,:i+1] @ np.linalg.inv(np.transpose(self._t[:,:i+1]) @ self._t[:,:i+1]) @ np.transpose(self._t[:,:i+1])
-            self.leverage[i] = np.diag(Hat) / np.trace(Hat)
+
 
 
     @property
@@ -49,20 +46,8 @@ class LinearPCA:
     def loadings_(self):
        return pd.DataFrame(self._p, columns=self.__pcnames)
 
-    @property
-    def leverage_(self):
-        lev = pd.DataFrame(self.leverage)
-        lev.columns =self.__pcnames
-        return lev
-
     @property
     def residuals_(self):
         res = pd.DataFrame(self._qres)
         res.columns=self.__pcnames
-        return res
-
-    @property
-    def hotelling_(self):
-        hot = pd.DataFrame(self.T2)
-        hot.columns=self.__pcnames
-        return hot
\ No newline at end of file
+        return res
\ No newline at end of file
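
Note: this file change removes the per-component Q-residual, T² and leverage computations from `LinearPCA.__init__`, and the `leverage_` and `hotelling_` properties go with them; the commit recomputes these diagnostics once, over the three displayed components, in `src/pages/1-samples_selection.py` below. For reference, here is a minimal standalone sketch of the removed per-sample statistics, assuming a score matrix `t` (n samples, k components) and a reconstruction `xp` of the preprocessed data `x`; the helper names are illustrative, not from the codebase:

    import numpy as np

    def q_residuals(x, xp):
        # Q residual of each sample: squared reconstruction error, i.e. diag((X - Xp)(X - Xp)')
        e = x - xp
        return np.einsum('ij,ij->i', e, e)   # row-wise ||e_i||^2 without forming the n x n product

    def leverage(t):
        # diagonal of the hat matrix H = T (T'T)^-1 T', normalised by its trace
        hat = t @ np.linalg.inv(t.T @ t) @ t.T
        return np.diag(hat) / np.trace(hat)

The `einsum` form returns the same values as the `np.diag(E @ E.T)` idiom used throughout the patch, but materialises only the n diagonal entries.
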
diff --git a/src/Packages.py b/src/Packages.py
index 131385146d58e4d179258e26bca85f150a462ab7..e871c2090556f3cff4d3bb79f671d2f2ea5b6f87 100644
--- a/src/Packages.py
+++ b/src/Packages.py
@@ -13,7 +13,7 @@ from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
 import time
 from scipy.stats import skew, kurtosis
 from scipy.signal import savgol_filter, find_peaks_cwt, detrend
-
+import scipy as sc
 ### Exploratory data analysis-Dimensionality reduction
 from umap.umap_ import UMAP
 from sklearn.decomposition import PCA, NMF
diff --git a/src/pages/1-samples_selection.py b/src/pages/1-samples_selection.py
index c4f310189a8962c698d9d99dfd180925fef203e6..70db8fc07c142c661d5dcf8f916530a035addf67 100644
--- a/src/pages/1-samples_selection.py
+++ b/src/pages/1-samples_selection.py
@@ -30,7 +30,7 @@ selected_samples = pd.DataFrame
 non_clustered = None
 colnames = []
 rownames = []
-
+l1 = []
 # loader for datafile
 data_file = col1.file_uploader("Load NIRS Data", type=["csv","dx"], help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5)
 
@@ -132,9 +132,11 @@ if not spectra.empty:
         axis1 = pc.selectbox("x-axis", options = dr_model.scores_.columns, index=0)
         axis2 = pc.selectbox("y-axis", options = dr_model.scores_.columns, index=1)
         axis3 = pc.selectbox("z-axis", options = dr_model.scores_.columns, index=2)
+
         t = pd.concat([dr_model.scores_.loc[:,axis1], dr_model.scores_.loc[:,axis2], dr_model.scores_.loc[:,axis3]], axis = 1)
+
 
 ###### II - clustering #######
 if not t.empty:
     tcr = standardize(t)
@@ -153,20 +155,28 @@ if not t.empty:
 
     # 2- HDBSCAN clustering
     elif clus_method == cluster_methods[2]:
-        optimized_hdbscan = Hdbscan(np.array(t))
+        optimized_hdbscan = Hdbscan(np.array(tcr))
         all_labels, hdbscan_score, clu_centers = optimized_hdbscan.HDBSCAN_scores_
         labels = [f'cluster#{i+1}' if i !=-1 else 'Non clustered' for i in all_labels]
-        non_clustered = np.where(np.array(labels) == 'Non clustered')
-        clustered = np.where(np.array(labels) != 'Non clustered')[0]
-
-        #st.write(optimized_hdbscan.non_clustered)
+
     # 3- Affinity propagation
     elif clus_method == cluster_methods[3]:
         cl_model = AP(X = tcr)
         data, labels, clu_centers = cl_model.fit_optimal_
+
+    if clus_method == cluster_methods[2]:
+        #clustered = np.where(np.array(labels) != 'Non clustered')[0]
+        clustered = np.arange(tcr.shape[0])
+        non_clustered = np.where(np.array(labels) == 'Non clustered')[0]
+    else:
+        clustered = np.arange(tcr.shape[0])
+        non_clustered = None
+
+    new_tcr = tcr.iloc[clustered,:]
+
 
-###### III - Samples selection using the reduced data preentation ######
+#################################################### III - Samples selection using the reduced data presentation ######
 selec_strategy = ['center','random']
 samples_df_chem = pd.DataFrame
 selected_samples = []
@@ -175,20 +185,21 @@ selected_samples_idx = []
 
 if labels:
     if clus_method:
-        selection = scores.radio('Select samples selection strategy:',
+        selection = scores.radio('Select samples selection strategy:',
                                     options = selec_strategy)
-    if clus_method == cluster_methods[2]:
-        tcr = tcr.iloc[clustered,:]
-
+
     # Strategy 0
     if selection == selec_strategy[0]:
         # list samples at clusters centers - Use sklearn.metrics.pairwise_distances_argmin if you want more than 1 sample per cluster
-        closest, _ = pairwise_distances_argmin_min(clu_centers, tcr)
-        selected_samples_idx = list(closest)
-
+        closest, _ = pairwise_distances_argmin_min(clu_centers, new_tcr)
+        selected_samples_idx = np.array(new_tcr.index)[list(closest)]
+        selected_samples_idx = selected_samples_idx.tolist()
+
+    #### Strategy 1
     elif selection == selec_strategy[1]:
         selection_number = scores.number_input('How many samples per cluster?', min_value = 1, step=1, value = 3)
-        for i in np.unique(labels):
+        s = np.array(labels)[np.where(np.array(labels) !='Non clustered')[0]]
+        for i in np.unique(s):
             C = np.where(np.array(labels) == i)[0]
             if C.shape[0] >= selection_number:
                 # scores.write(list(tcr.index)[labels== i])
@@ -197,26 +208,44 @@ if labels:
                 km2 = KMeans(n_clusters = selection_number)
                 clos, _ = pairwise_distances_argmin_min(km2.cluster_centers_, tcr.iloc[C,:])
                 selected_samples_idx.extend(tcr.iloc[C,:].iloc[list(clos)].index)
             else:
-                selected_samples_idx.extend(tcr.iloc[C,:].index.to_list())
+                selected_samples_idx.extend(new_tcr.iloc[C,:].index.to_list())
 
     # list indexes of selected samples for colored plot
     if selected_samples_idx:
         if meta_data.empty:
-            sam = pd.DataFrame({'name': spectra.index[selected_samples_idx],
-                                'cluster':np.array(labels)[selected_samples_idx]},
+            sam1 = pd.DataFrame({'name': spectra.index[clustered][selected_samples_idx],
+                                 'cluster':np.array(labels)[clustered][selected_samples_idx]},
                                index = selected_samples_idx)
         else:
-            sam = meta_data.iloc[selected_samples_idx,:]
-            sam.insert(loc=0, column='index', value=selected_samples_idx)
-            sam.insert(loc=1, column='cluster', value=np.array(labels)[selected_samples_idx])
-
-        sam.index = np.arange(len(selected_samples_idx))+1
-
-        selected_s.write(f' The selected subset consists of {sam.shape[0]} samples')
-        selected_s.write(sam)
-        selected_s.checkbox("Include non clustered samples (for HDBSCAN clustering)")
-
+            sam1 = meta_data.iloc[clustered,:].iloc[selected_samples_idx,:]
+            sam1.insert(loc=0, column='index', value=selected_samples_idx)
+            sam1.insert(loc=1, column='cluster', value=np.array(labels)[selected_samples_idx])
+        sam1.index = np.arange(len(selected_samples_idx))+1
+        selected_s.write(f' - The total number of samples: {tcr.shape[0]}.\n- The number of selected samples for chemical analysis: {sam1.shape[0]}.')
+        sam = sam1
+        unclus = selected_s.checkbox("Include non clustered samples (for HDBSCAN clustering)", value=True)
+
+        if clus_method == cluster_methods[2]:
+            if selected_samples_idx:
+                if unclus:
+                    if meta_data.empty:
+                        sam2 = pd.DataFrame({'name': spectra.index[non_clustered],
+                                             'cluster':['Non clustered']*len(spectra.index[non_clustered])},
+                                            index = spectra.index[non_clustered])
+                    else:
+                        sam2 = meta_data.iloc[non_clustered,:]
+                        sam2.insert(loc=0, column='index', value= spectra.index[non_clustered])
+                        sam2.insert(loc=1, column='cluster', value=['Non clustered']*len(spectra.index[non_clustered]))
+
+                    sam = pd.concat([sam1, sam2], axis = 0)
+                    sam.index = np.arange(sam.shape[0])+1
+                    selected_s.write(f' The number of non-clustered samples is {sam2.shape[0]}.')
+                else:
+                    sam = sam1
+        st.write(sam)
 ################################ Plots visualization ############################################
+
+
 ## Scores
 if not t.empty:
     with scores:
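
Note on the selection logic: `pairwise_distances_argmin_min(clu_centers, new_tcr)` returns positional row indices into `new_tcr` (plus the matching distances), which is why the patch now maps those positions through `new_tcr.index` to recover the actual sample identifiers rather than storing raw positions. A self-contained sketch of that pattern; the data and names here are illustrative, not from the app:

    import numpy as np
    import pandas as pd
    from sklearn.metrics import pairwise_distances_argmin_min

    scores = pd.DataFrame(np.random.rand(6, 3), index=['s1', 's2', 's3', 's4', 's5', 's6'])
    centers = np.array([[0.2, 0.2, 0.2], [0.8, 0.8, 0.8]])

    closest, _ = pairwise_distances_argmin_min(centers, scores)  # positions into `scores`, e.g. [3, 1]
    selected = np.array(scores.index)[closest].tolist()          # map positions back to sample names

With HDBSCAN the patch now keeps every row in `new_tcr` (`clustered = np.arange(tcr.shape[0])`) and only records the 'Non clustered' rows separately, so non-clustered samples can be appended to the selection table when the checkbox is ticked.
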
@@ -348,16 +377,63 @@ if not spectra.empty:
             img = pio.to_image(fig, format="png")
             with open("./Report/figures/graphe_loadings.png", "wb") as f:
                 f.write(img)
-
+#############################################################################################################
     if dim_red_method == dim_red_methods[1]:
         with influence:
             st.write('Influence plot')
-            ax1 = st.selectbox("Component", options=dr_model.scores_.columns, index=3)
-            leverage = dr_model.leverage_
-            residuals = dr_model.residuals_
-            fig = px.scatter(x=leverage[ax1], y=residuals[ax1], color=leverage[ax1]*residuals[ax1], color_continuous_scale='Blues')
-            fig.update_layout(xaxis_title="Leverage", yaxis_title="Residuals")
-            st.plotly_chart(fig, use_container_width=True)
+
+            # Leverage
+            Hat = t.to_numpy() @ np.linalg.inv(np.transpose(t.to_numpy()) @ t.to_numpy()) @ np.transpose(t.to_numpy())
+            leverage = np.diag(Hat) / np.trace(Hat)
+            tresh3 = 2 * t.shape[1]/t.shape[0]
+            # Loadings
+            p = pd.concat([dr_model.loadings_.loc[:,axis1], dr_model.loadings_.loc[:,axis2], dr_model.loadings_.loc[:,axis3]], axis = 1)
+            # Matrix reconstruction
+            xp = np.dot(t,p.T)
+            # Q residuals: the magnitude of the variation remaining in each sample after projection through the model
+            residuals = np.diag(np.subtract(xc.to_numpy(), xp)@ np.subtract(xc.to_numpy(), xp).T)
+            tresh4 = sc.stats.chi2.ppf(0.05, df = 3)
+
+            # color with metadata
+            if not meta_data.empty and clus_method:
+                if col == "None":
+                    l1 = ["Samples"]* t.shape[0]
+
+                elif col == clus_method:
+                    l1 = labels
+
+                else:
+                    l1 = list(map(str.lower,md_df_st_[col]))
+
+            elif meta_data.empty and clus_method:
+                l1 = labels
+
+            elif meta_data.empty and not clus_method:
+                l1 = ["Samples"]* t.shape[0]
+
+            elif not meta_data.empty and not clus_method:
+                l1 = list(map(str.lower,md_df_st_[col]))
+
+
+            fig = px.scatter(x = leverage, y = residuals, color = l1)
+            fig.add_vline(x = tresh3, line_width = 1, line_dash = 'solid', line_color = 'red')
+            fig.add_hline(y=tresh4, line_width=1, line_dash='solid', line_color='red')
+            fig.update_layout(xaxis_title="Leverage", yaxis_title = "Residuals")
+
+            out3 = leverage > tresh3
+            out4 = residuals > tresh4
+
+            for i in range(t.shape[0]):
+                if out3[i]:
+                    if not meta_data.empty:
+                        ann = meta_data.loc[:,'name'][i]
+                    else:
+                        ann = t.index[i]
+                    fig.add_annotation(dict(x = leverage[i], y = residuals[i], showarrow=True, text = ann,
+                                            xanchor = 'auto', yanchor = 'auto'))
+
+            st.plotly_chart(fig, use_container_width = True)
             img = pio.to_image(fig, format="png")
             with open("./Report/figures/graphe_influence.png", "wb") as f:
                 f.write(img)
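
Note on the control limits: `tresh3 = 2 * t.shape[1] / t.shape[0]` is the common leverage warning limit 2k/n for k retained components (here k = 3; some references use 3k/n). One caution: `sc.stats.chi2.ppf(0.05, df = 3)` is the lower 5% quantile of χ² with 3 degrees of freedom, so `residuals > tresh4` will flag most of the data; an upper 95% limit would be `ppf(0.95, ...)` or, equivalently, `isf(0.05, ...)`. A minimal sketch of both cut-offs under these assumptions (n and k are illustrative):

    from scipy import stats

    n, k = 100, 3
    leverage_limit = 2 * k / n              # rule-of-thumb leverage limit 2k/n
    q_limit = stats.chi2.isf(0.05, df=k)    # upper 95% point of chi2_k, about 7.81 for k = 3
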
@@ -365,10 +441,35 @@ if not spectra.empty:
         with hotelling:
             st.write('T²-Hotelling vs Q residuals plot')
-            hotelling = dr_model.hotelling_
-            ax2 = st.selectbox("Component", options=dr_model.scores_.columns, index=4)
+            # Hotelling
+            hotelling = t.var(axis = 1)
+            # Q residuals: the magnitude of the variation remaining in each sample after projection through the model
+            residuals = np.diag(np.subtract(xc.to_numpy(), xp)@ np.subtract(xc.to_numpy(), xp).T)
+
+            I = t.shape[0]
+            fcri = sc.stats.f.isf(0.05, 3, I)
+            tresh0 = (3 * (I ** 2 - 1) * fcri) / (I * (I - 3))
+            tresh1 = sc.stats.chi2.ppf(0.05, df = 3)
+
+            fig = px.scatter(t, x = hotelling, y = residuals, color = l1)
+            fig.update_layout(xaxis_title="T²", yaxis_title="Q-Residuals")
+            fig.add_vline(x=tresh0, line_width=1, line_dash='solid', line_color='red')
+            fig.add_hline(y=tresh1, line_width=1, line_dash='solid', line_color='red')
+
+            out0 = hotelling > tresh0
+            out1 = residuals > tresh1
 
-            hotelling = dr_model.hotelling_
-            fig = px.scatter(t, x=hotelling[ax2], y=residuals[ax2]).update_layout(xaxis_title="T²",yaxis_title="Residuals")
+
+            for i in range(t.shape[0]):
+                if out0[i]:
+                    if not meta_data.empty:
+                        ann = meta_data.loc[:,'name'][i]
+                    else:
+                        ann = t.index[i]
+                    fig.add_annotation(dict(x = hotelling[i], y = residuals[i], showarrow=True, text = ann,
+                                            xanchor = 'auto', yanchor = 'auto'))
+
             st.plotly_chart(fig, use_container_width=True)
-            fig.write_image("./Report/figures/graphe_hotelling.png", format="png")
\ No newline at end of file
+            fig.write_image("./Report/figures/graphe_hotelling.png", format="png")
+            #st.write()
+            #st.write()
\ No newline at end of file
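
Note on the T² plot: `t.var(axis = 1)` is a simplified stand-in for Hotelling's T²; it measures the spread of each sample's three scores but does not scale them by the per-component score variances. The threshold `tresh0` follows the usual T² control-limit form k(n² − 1)·F(k, n − k) / (n(n − k)), except that the textbook version takes the F denominator degrees of freedom as n − k rather than n; the lower-versus-upper quantile caution raised for the influence plot applies to `tresh1` as well. For comparison, a sketch of the conventional statistic and its limit, assuming a score array `t` with n rows and k columns:

    import numpy as np
    from scipy import stats

    def hotelling_t2(t, alpha=0.05):
        n, k = t.shape
        t2 = np.sum((t / t.std(axis=0, ddof=1)) ** 2, axis=1)  # scores scaled by their std devs
        fcrit = stats.f.isf(alpha, k, n - k)
        limit = k * (n**2 - 1) / (n * (n - k)) * fcrit         # common T² upper control limit
        return t2, limit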