diff --git a/src/Class_Mod/NMF.py b/src/Class_Mod/NMF.py deleted file mode 100644 index d4f6dae604a46c42780132f43cae55b5c7bf6848..0000000000000000000000000000000000000000 --- a/src/Class_Mod/NMF.py +++ /dev/null @@ -1,11 +0,0 @@ -from Packages import * -class Nmf: - def __init__(self,x , ncomp =3): - self._x = x - self._ncp = ncomp - - model = NMF(n_components='warn', init=None, solver='cd', beta_loss='frobenius', - tol=0.0001, max_iter=200, random_state=None, alpha_W=0.0, alpha_H='same', - l1_ratio=0.0, verbose=0, shuffle=False ) - - sc = pd.DataFrame(model.fit_transform(self._x), ) diff --git a/src/Class_Mod/NMF_.py b/src/Class_Mod/NMF_.py new file mode 100644 index 0000000000000000000000000000000000000000..fead5eb4f82b256d0591fc16b44fd5ca0acc4114 --- /dev/null +++ b/src/Class_Mod/NMF_.py @@ -0,0 +1,28 @@ +from Packages import * + + +class Nmf: + def __init__(self, X, Ncomp=3): + ## input matrix + if np.min(X)<0: + self.__x = np.array(X-np.min(X)) + else: + self.__x = np.array(X) + ## set the number of components to compute and fit the model + self.__ncp = Ncomp + + # Fit PCA model + Mo = NMF(n_components=self.__ncp, init=None, solver='cd', beta_loss='frobenius', + tol=0.0001, max_iter=300, random_state=None, alpha_W=0.0, alpha_H='same', + l1_ratio=0.0, verbose=0, shuffle=False) + Mo.fit(self.__x) + # Results + self._p = Mo.components_.T + self._t = Mo.transform(self.__x) + @property + def scores_(self): + return pd.DataFrame(self._t) + + @property + def loadings_(self): + return pd.DataFrame(self._p) \ No newline at end of file diff --git a/src/Class_Mod/PCA_.py b/src/Class_Mod/PCA_.py index 4f7fe3d074ac685702f022837a08d8b038405135..fc0d430828167f12f6d4185bc3b576b771a8782b 100644 --- a/src/Class_Mod/PCA_.py +++ b/src/Class_Mod/PCA_.py @@ -1,15 +1,9 @@ from Packages import * + class LinearPCA: def __init__(self, X, Ncomp=10): ## input matrix - - if isinstance(X, pd.DataFrame): - self.__x = X.to_numpy() - self._rownames = X.index - else: - self.__x = X - - + self.__x = np.array(X) ## set the number of components to compute and fit the model self.__ncp = Ncomp diff --git a/src/Class_Mod/__init__.py b/src/Class_Mod/__init__.py index 7fec54a6af4530aef9b5fc5737e6bc69afe12643..7c978eaa95e1831c19a48bbf7d39737534a38d0b 100644 --- a/src/Class_Mod/__init__.py +++ b/src/Class_Mod/__init__.py @@ -12,4 +12,5 @@ from .Miscellaneous import resid_plot, reg_plot from .DxReader import DxRead, read_dx from .HDBSCAN_Clustering import Hdbscan from .SK_PLSR_ import PlsR -from .PLSR_Preprocess import PlsProcess \ No newline at end of file +from .PLSR_Preprocess import PlsProcess +from .NMF_ import Nmf \ No newline at end of file diff --git a/src/Modules.py b/src/Modules.py index cb51a5c8e3fc5e3b62af93dedb35b8b3eb0dc639..fff8d5b222b9b094ee174c7355353e6f90d2e454 100644 --- a/src/Modules.py +++ b/src/Modules.py @@ -1,6 +1,6 @@ -from Class_Mod import PlsR, LinearPCA, Umap, find_col_index, PinardPlsr +from Class_Mod import PlsR, LinearPCA, Umap, find_col_index, PinardPlsr, Nmf from Class_Mod import LWPLSR, list_files, metrics, TpeIpls, reg_plot, resid_plot, Sk_Kmeans, DxRead, Hdbscan, read_dx, PlsProcess - +from Class_Mod.DATA_HANDLING import * from Class_Mod.Miscellaneous import prediction, download_results, plot_spectra, local_css from style.header import add_header local_css("style/style.css") \ No newline at end of file diff --git a/src/pages/1-samples_selection.py b/src/pages/1-samples_selection.py index 75c4571a6efce97d006887ca48dd884b5b45bcf6..0526d9d8d34cb20946e0e9e601312e8dba279cad 100644 --- a/src/pages/1-samples_selection.py +++ b/src/pages/1-samples_selection.py @@ -1,7 +1,6 @@ from Packages import * st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide") from Modules import * -from Class_Mod.DATA_HANDLING import * @@ -20,7 +19,7 @@ st.session_state["interface"] = st.session_state.get('interface') if st.session_state["interface"] == 'simple': hide_pages("Predictions") -################################### Data Loading and Visualization ######################################## +################################### I - Data Loading and Visualization ######################################## col2, col1 = st.columns([3, 1]) col1.header("Data Loading", divider='blue') col2.header("Spectral Data Visualization", divider='blue') @@ -30,12 +29,12 @@ col2.header("Spectral Data Visualization", divider='blue') spectra = pd.DataFrame meta_data = pd.DataFrame selected_samples = pd.DataFrame - +colnames = [] +rownames = [] # loader for datafile data_file = col1.file_uploader("Load NIRS Data", type=["csv","dx"], help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5) - if data_file: # Retrieve the extension of the file test = data_file.name[data_file.name.find('.'):] @@ -68,8 +67,15 @@ if data_file: st.success("The data have been loaded successfully", icon="✅") os.unlink(tmp_path) + + ## Visualize spectra if not spectra.empty: + # retrieve columns name and rows name of spectra + colnames = list(spectra.columns) + rownames = [str(i) for i in list(spectra.index)] + spectra.index = rownames + with col2: if test =='.dx': if meta_data.loc[:,'xunits'][0] == '1/cm': @@ -84,11 +90,6 @@ if not spectra.empty: fig.savefig("./Report/figures/Spectra_Plot.png") - - - - - ############################## Exploratory data analysis ############################### container2 = st.container(border=True) container2.header("Exploratory Data Analysis-Multivariable Data Analysis", divider='blue') @@ -103,7 +104,7 @@ cluster_methods = ['', 'Kmeans','HDBSCAN', 'AP'] # List of clustering algos dr_model = None # dimensionality reduction model cl_model = None # clustering model -# Dimensionality reduction +###### 1- Dimensionality reduction ###### t = pd.DataFrame # scores p = pd.DataFrame # loadings labels = [] @@ -115,6 +116,7 @@ if not spectra.empty: if dim_red_method == dim_red_methods[1]: dr_model = LinearPCA(xc, Ncomp=8) + elif dim_red_method == dim_red_methods[2]: if not meta_data.empty: filter = meta_data.columns[1:] @@ -124,6 +126,9 @@ if not spectra.empty: supervised = None dr_model = Umap(numerical_data = MinMaxScale(spectra), cat_data = supervised) + elif dim_red_method == dim_red_methods[3]: + dr_model = Nmf(spectra, Ncomp=3) + if dr_model: axis1 = pc.selectbox("x-axis", options = dr_model.scores_.columns, index=0) axis2 = pc.selectbox("y-axis", options = dr_model.scores_.columns, index=1) @@ -131,7 +136,7 @@ if not spectra.empty: t = pd.concat([dr_model.scores_.loc[:,axis1], dr_model.scores_.loc[:,axis2], dr_model.scores_.loc[:,axis3]], axis = 1) -# clustering +###### 2- clustering ####### if not t.empty: tcr = standardize(t) # Clustering @@ -152,9 +157,8 @@ if not t.empty: non_clustered = np.where(labels == -1) labels[non_clustered] = 1000 labels = labels.tolist() -##### Plots -##################################################################################################### +###### 3- Samples selection using the reduced data preentation ###### selec_strategy = ['center','random'] samples_df_chem = pd.DataFrame selected_samples = [] @@ -163,7 +167,6 @@ selected_samples_idx = [] if labels: selection = scores.radio('Select samples selection strategy:', options = selec_strategy) - #################### selection strategy to be corrected if selection == selec_strategy[0]: # list samples at clusters centers - Use sklearn.metrics.pairwise_distances_argmin if you want more than 1 sample per cluster closest, _ = pairwise_distances_argmin_min(clu_centers, tcr) @@ -184,31 +187,23 @@ if labels: selected_samples_idx.extend(tcr.iloc[C,:].index.to_list()) # list indexes of selected samples for colored plot -if labels: if selected_samples_idx: - e = [] - if isinstance(selected_samples_idx[0], str): - for i in range(len(selected_samples_idx)): - e.append(np.where(np.array(spectra.index) == selected_samples_idx[i])[0]) - selected_samples_idx = list(np.array(e).reshape(-1)) - - ############################# - sam = pd.DataFrame({'cluster':np.array(labels)[selected_samples_idx], - 'index': spectra.index[selected_samples_idx]}, - index = selected_samples_idx) - + if meta_data.empty: + sam = pd.DataFrame({'name': spectra.index[selected_samples_idx], + 'cluster':np.array(labels)[selected_samples_idx]}, + index = selected_samples_idx) + else: + sam = meta_data.iloc[selected_samples_idx,:] + sam.insert(loc=0, column='index', value=selected_samples_idx) + sam.insert(loc=1, column='cluster', value=np.array(labels)[selected_samples_idx]) + + sam.index = np.arange(len(selected_samples_idx))+1 + + selected_s.write(f' The selected subset consists of {sam.shape[0]} samples') selected_s.write(sam) - - if not meta_data.empty: - selected_samples_metd.write('Corresponding meta-data') - meta = meta_data.iloc[selected_samples_idx,:] - meta['cluster'] = np.array(labels)[selected_samples_idx] - meta['index'] = spectra.index[selected_samples_idx] - selected_samples_metd.write(meta) - -############################################################################ +################################ Plots visualization ############################################ ## Scores if not t.empty: with scores: @@ -273,8 +268,6 @@ if not t.empty: plt.savefig("./Report/Figures/test.png") st.plotly_chart(fig, use_container_width=True) - import plotly.express as px - if labels: num_clusters = len(np.unique(labels)) @@ -303,17 +296,13 @@ if not t.empty: if not spectra.empty: - if dim_red_method == dim_red_methods[1]: - - + if dim_red_method == dim_red_methods[1] or dim_red_method == dim_red_methods[3]: with loadings: st.write('Loadings plot') p = dr_model.loadings_ - ######################################## - if isinstance(spectra.columns[0], str): - freq = pd.DataFrame(np.arange(p.shape[0]), index=p.index) - else: - freq = pd.DataFrame(spectra.columns, index=p.index) + freq = pd.DataFrame(colnames, index=p.index) + + if test =='.dx': if meta_data.loc[:,'xunits'][0] == '1/cm': @@ -322,24 +311,23 @@ if not spectra.empty: freq.columns = ['Wavelength (nm)'] else: freq.columns = ['Wavelength/Wavenumber'] - - ############## pp = pd.concat([p, freq], axis=1) ######################################### - - #pp = pd.concat([p, pd.DataFrame(np.arange(p.shape[0]), index=p.index, columns=['wl'])], axis=1) df1 = pp.melt(id_vars=freq.columns) fig = px.line(df1, x=freq.columns, y='value', color='variable', color_discrete_sequence=px.colors.qualitative.Plotly) fig.update_layout(legend=dict(x=1, y=0, font=dict(family="Courier", size=12, color="black"), bordercolor="black", borderwidth=2)) + st.plotly_chart(fig, use_container_width=True) + # Export du graphique img = pio.to_image(fig, format="png") with open("./Report/figures/graphe_loadings.png", "wb") as f: f.write(img) + if dim_red_method == dim_red_methods[1]: with influence: st.write('Influence plot') ax1 = st.selectbox("Component", options=dr_model.scores_.columns, index=3) @@ -371,6 +359,7 @@ if not spectra.empty: st.write('Optimal number of clusters = ' + str(len(clusters_number))) st.write('DBCV score (-1 to 1 - higher is better) = ' + str(round(hdbscan_score,3))) st.write('Unclassified samples: ' + str(len(t[labels==-1])) + ' on ' + str(len(t)) + ' samples (' + str(round(len(t[labels==-1])/len(t)*100, 1)) + '%).') +