From 023dfa7d416099d0e9867fcbf323ce9d3cc3d6ab Mon Sep 17 00:00:00 2001
From: DIANE <abderrahim.diane@cefe.cnrs.fr>
Date: Fri, 12 Apr 2024 16:07:06 +0200
Subject: [PATCH] Enhance code readability, reduce complexity, and
 incorporate modifications

---
 Class_Mod/DATA_HANDLING.py   |  12 +-
 Class_Mod/DxReader.py        |  37 +++--
 Class_Mod/Miscellaneous.py   |  15 ++
 Class_Mod/UMAP_.py           |   5 +-
 Class_Mod/__init__.py        |   2 +-
 Modules.py                   |   4 +-
 Packages.py                  |   2 +
 pages/1-samples_selection.py | 300 ++++++++++++++++-------------------
 pages/3-prediction.py        |   1 -
 predictions/.gitkeep         |   0
 10 files changed, 183 insertions(+), 195 deletions(-)
 delete mode 100644 predictions/.gitkeep

diff --git a/Class_Mod/DATA_HANDLING.py b/Class_Mod/DATA_HANDLING.py
index 02e5694..10fb7ab 100644
--- a/Class_Mod/DATA_HANDLING.py
+++ b/Class_Mod/DATA_HANDLING.py
@@ -31,17 +31,13 @@ def col_cat(data_import):
     if len(categorical_columns_list) > 0:
         categorical_data = pd.concat(categorical_columns_list, axis=1)
     if len(categorical_columns_list) == 0:
-        empty = ["" for x in range(len(data_import))]
-        categorical_columns_list.append(empty)
-        categorical_data = pd.DataFrame(categorical_columns_list).T
-        categorical_data.columns = ['no categories']
+        categorical_data = pd.DataFrame
 
     # Create numerical data matrix from the numerical columns list and fill na with the mean of the column
     numerical_data = pd.concat(numerical_columns_list, axis=1)
     numerical_data = numerical_data.apply(lambda x: x.fillna(x.mean())) #np.mean(x)))
-    # Scale the numerical data
-    scaler = StandardScaler()
-    scaled_values = scaler.fit_transform(numerical_data)
-    return numerical_data, categorical_data, scaled_values
+
+    return numerical_data, categorical_data
+
 
 def list_files(mypath, import_type):
diff --git a/Class_Mod/DxReader.py b/Class_Mod/DxReader.py
index d877ff2..f024894 100644
--- a/Class_Mod/DxReader.py
+++ b/Class_Mod/DxReader.py
@@ -29,21 +29,21 @@ class DxRead:
             block_met = {'name': block['title'],
                          'origin': block['origin'],
                          'date': block['date'],
-                         'time': block['time'],
-                         'spectrometer/data system': block['spectrometer/data system'],
-                         'instrumental parameters': block['instrumental parameters'],
-                         'xunits': block['xunits'],
-                         'yunits': block['yunits'],
-                         'xfactor': block['xfactor'],
-                         'yfactor': block['yfactor'],
-                         'firstx': block['firstx'],
-                         'lastx': block['lastx'],
-                         'firsty':block['firsty'],
-                         'miny': block['miny'],
-                         'maxy': block['maxy'],
-                         'npoints': block['npoints'],
+                         # 'time': block['time'],
+                         # 'spectrometer/data system': block['spectrometer/data system'],
+                         # 'instrumental parameters': block['instrumental parameters'],
+                         # 'xunits': block['xunits'],
+                         # 'yunits': block['yunits'],
+                         # 'xfactor': block['xfactor'],
+                         # 'yfactor': block['yfactor'],
+                         # 'firstx': block['firstx'],
+                         # 'lastx': block['lastx'],
+                         # 'firsty':block['firsty'],
+                         # 'miny': block['miny'],
+                         # 'maxy': block['maxy'],
+                         # 'npoints': block['npoints'],
                          'concentrations':block['concentrations'],
-                         'deltax':block['deltax']
+                         # 'deltax':block['deltax']
                          }
             self.__met[f'{i}'] = block_met
         self.metadata_ = pd.DataFrame(self.__met).T
@@ -87,8 +87,13 @@ class DxRead:
         return self.spectra
 
     @property
    def md_df_(self):
-        return self.metadata_
+        return self.metadata_.drop("concentrations", axis = 1)
    @property
    def chem_data_(self):
-        return self.chem_data
\ No newline at end of file
+        return self.chem_data
+
+@st.cache_data
+def read_dx(file):
+    M = DxRead(file)
+    return M.chem_data, M.specs_df_, M.md_df_
\ No newline at end of file
diff --git a/Class_Mod/Miscellaneous.py b/Class_Mod/Miscellaneous.py
index 1627b39..79d1708 100644
--- a/Class_Mod/Miscellaneous.py
+++ b/Class_Mod/Miscellaneous.py
@@ -47,3 +47,18 @@ def resid_plot( meas, pred):
 def download_results(data, export_name):
     with open(data) as f:
         st.download_button('Download Results', f, export_name)
+
+@st.cache_resource
+def plot_spectra(df):
+    if isinstance(df.columns[0], str):
+        m = 0
+    else:
+        m = np.min(df.columns)
+
+    fig, ax = plt.subplots(figsize = (30,7))
+    df.T.plot(legend=False, ax = ax, color = 'blue')
+    ax.set_xlabel('Wavelength/Wavenumber', fontsize=18)
+    ax.set_ylabel('Signal intensity', fontsize=18)
+    plt.margins(x = 0)
+    plt.annotate(text = f'The total number of spectra is {df.shape[0]}', xy =(m, np.max(df)), size=20, color = 'black', backgroundcolor='red')
+    return fig
diff --git a/Class_Mod/UMAP_.py b/Class_Mod/UMAP_.py
index e9ae0dc..21d2f82 100644
--- a/Class_Mod/UMAP_.py
+++ b/Class_Mod/UMAP_.py
@@ -5,10 +5,7 @@ from Class_Mod.DATA_HANDLING import *
 
 class Umap:
     def __init__(self, x, n_components, n_neighbors, min_dist):
-        self.numerical_data, categorical_data, scaled_values = col_cat(x)
-        self.catdata = list(categorical_data.columns)
-
-        self.x = scaled_values
+        self.x = x
         self.model = UMAP(n_neighbors=20, n_components=4, min_dist=0.0,) # random_state=42,)
         self.model.fit(self.x)
 
diff --git a/Class_Mod/__init__.py b/Class_Mod/__init__.py
index eb2dbb5..63b5b5f 100644
--- a/Class_Mod/__init__.py
+++ b/Class_Mod/__init__.py
@@ -7,4 +7,4 @@ from .LWPLSR_ import model_LWPLSR
 from .Regression_metrics import metrics
 from .VarSel import TpeIpls
 from .Miscellaneous import resid_plot, reg_plot
-from .DxReader import DxRead
+from .DxReader import DxRead, read_dx
diff --git a/Modules.py b/Modules.py
index 5439917..f447cdd 100644
--- a/Modules.py
+++ b/Modules.py
@@ -1,4 +1,4 @@
-from Class_Mod import LinearPCA, Umap, find_col_index, PinardPlsr, model_LWPLSR, list_files, metrics, TpeIpls, reg_plot, resid_plot, Sk_Kmeans, DxRead
+from Class_Mod import LinearPCA, Umap, find_col_index, PinardPlsr, model_LWPLSR, list_files, metrics, TpeIpls, reg_plot, resid_plot, Sk_Kmeans, DxRead, read_dx
 
 # find_col_index
-from Class_Mod.Miscellaneous import prediction, download_results
+from Class_Mod.Miscellaneous import prediction, download_results, plot_spectra
diff --git a/Packages.py b/Packages.py
index 554f28d..924b788 100644
--- a/Packages.py
+++ b/Packages.py
@@ -38,6 +38,8 @@ from PIL import Image
 import plotly.express as px
 import matplotlib.pyplot as plt
 import seaborn as sns
+import matplotlib
+
 
 ### Important Metrics
 from sklearn.metrics import pairwise_distances_argmin_min, adjusted_rand_score, adjusted_mutual_info_score
diff --git a/pages/1-samples_selection.py b/pages/1-samples_selection.py
index 6ca45f1..07139bf 100644
--- a/pages/1-samples_selection.py
+++ b/pages/1-samples_selection.py
@@ -3,193 +3,167 @@ st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
 from Modules import *
 from Class_Mod.DATA_HANDLING import *
+
+
+
+
 ################################### Data Loading and Visualization ########################################
-container1 = st.container(border=True)
+# container1 = st.header("Data loading",border=True)
 col2, col1 = st.columns([3, 1])
 col1.header("Data Loading", divider='blue')
 col2.header("Spectral Data Visualization", divider='blue')
-container2 = st.container(border=True)
-container2.header("Exploratory Data Analysis-Multivariable Data Analysis", divider='blue')
-scores, loadings, pc = st.columns([2, 3, 0.5])
-influence, hotelling, qexp = st.columns([2, 2, 1])
+## Preallocation of data structure
+data_import = pd.DataFrame
+meta_data = pd.DataFrame
+selected_samples = pd.DataFrame
 
-with container1:
-    # loader for csv file containing NIRS spectra
-    sselectx_csv = col1.file_uploader("Load NIRS Data", type=["csv","dx"], help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5)
-    if sselectx_csv is not None:
-        test = sselectx_csv.name[sselectx_csv.name.find('.'):]
-        if test== '.csv':
-            with col1:
-                # Select list for CSV delimiter
-                psep = st.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+sselectx_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+sselectx_csv.name))), key=9)
-                # Select list for CSV header True / False
-                phdr = st.selectbox("indexes column in csv? - _detected_: " + str(find_col_index('data/'+sselectx_csv.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+sselectx_csv.name))), key=31)
-                if phdr == 'yes':
-                    col = 0
-                else:
-                    col = False
-                data_import = pd.read_csv(sselectx_csv, sep=psep, index_col=col)
-                st.success("The data have been loaded successfully", icon="✅")
-                ## Visualize spectra
-
-            with col2:
-                fig, ax = plt.subplots(figsize = (30,7))
-                data_import.T.plot(legend=False, ax = ax, color = 'blue')
-                ax.set_xlabel('Wavelength/Wavenumber', fontsize=18)
-                ax.set_ylabel('Signal', fontsize=18)
-                plt.margins(x = 0)
-                st.pyplot(fig)
-
-                st.write("Summary")
-                info = pd.DataFrame({'N':[data_import.shape[0]],
-                                     'Min': [np.min(data_import)],
-                                     'Max':[np.max(data_import)],}, index = ['Values']).T
-                info.rename_axis('information')
-                st.table(data=info)
-
-        elif test == '.dx':
-            # Create a temporary file to save the uploaded file
-            with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
-                tmp.write(sselectx_csv.read())
-                tmp_path = tmp.name
-            with col1:
-                data = DxRead(path = tmp_path)
-                data_import = data.specs_df_
-                st.success("The data have been loaded successfully", icon="✅")
-
-            ## Visualize spectra
-
-            with col2:
-                fig, ax = plt.subplots(figsize = (30,7))
-                data_import.T.plot(legend=False, ax = ax, color = 'blue')
-                ax.set_xlabel('Wavelength/Wavenumber', fontsize=18)
-                ax.set_ylabel('Signal', fontsize=18)
-                plt.margins(x = 0)
-                st.pyplot(fig)
-
-                st.write("Summary")
-                info = pd.DataFrame({'N':[data_import.shape[0]],
-                                     'Min': [np.min(data_import)],
-                                     'Max':[np.max(data_import)],}, index = ['Values']).T
-                info.rename_axis('information')
-                st.table(data=info)
-                os.unlink(tmp_path)
-
-
-
-
-
-######################################################################################
+# loader for csv file containing NIRS spectra
+sselectx_csv = col1.file_uploader("Load NIRS Data", type=["csv","dx"], help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5)
-############################## Exploratory data analysis ###############################
-plot_type=['', 'PCA','UMAP', 'NMF']
-cluster_methods = ['', 'Kmeans','HDBSCAN', 'AP']
-with container2:
-    if sselectx_csv is not None:
-        plot_type=['', 'PCA','UMAP', 'NMF']
-        cluster_methods = ['', 'Kmeans','UMAP', 'AP']
-        with pc:
-            type_plot = st.selectbox("Dimensionality reduction techniques: ", options=plot_type, key=37)
-            type_cluster = st.selectbox("Clustering techniques: ", options=cluster_methods, key=38)
-        # compute UMAP - umap_maker in application_functions.py
-        if type_plot == 'PCA':
-            model = LinearPCA(data_import, Ncomp=5)
-        elif type_plot =='UMAP':
-            model = Umap(x = data_import, n_components = 5, n_neighbors = 20 , min_dist = 0)
+#with container1:
+if sselectx_csv:
+    test = sselectx_csv.name[sselectx_csv.name.find('.'):]
+    if test== '.csv':
+        with col1:
+            # Select list for CSV delimiter
+            psep = st.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+sselectx_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+sselectx_csv.name))), key=9)
+            # Select list for CSV header True / False
+            phdr = st.selectbox("indexes column in csv? - _detected_: " + str(find_col_index('data/'+sselectx_csv.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+sselectx_csv.name))), key=31)
+            if phdr == 'yes':
+                col = 0
+            else:
+                col = False
+            imp = pd.read_csv(sselectx_csv, sep=psep, index_col=col)
+            data_import = col_cat(imp)[0]
+            meta_data = col_cat(imp)[1]
+            st.success("The data have been loaded successfully", icon="✅")
+
+    elif test == '.dx':
+        # Create a temporary file to save the uploaded file
+        with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
+            tmp.write(sselectx_csv.read())
+            tmp_path = tmp.name
+        with col1:
+            _, data_import, meta_data = read_dx(file = tmp_path)
+            st.success("The data have been loaded successfully", icon="✅")
+        os.unlink(tmp_path)
-        if type_plot in ['PCA', 'UMAP']:
-            # add 2 select lists to choose which component to plot
-            axis1 = pc.selectbox("x-axis", options = model.scores_.columns, index=0)
-            axis2 = pc.selectbox("y-axis", options = model.scores_.columns, index=1)
-            axis3 = pc.selectbox("z-axis", options = model.scores_.columns, index=2)
+if not data_import.empty:
+    ## Visualize spectra
+    with col2:
+        fig = plot_spectra(data_import)
-            if type_cluster == 'Kmeans':
-                scsc = pd.concat([model.scores_.loc[:,axis1], model.scores_.loc[:,axis2], model.scores_.loc[:,axis3]], axis = 1)
-                cl = Sk_Kmeans(scsc, max_clusters = 30)
+        #plt.annotate(text = info.T, xy =(m, info.loc[:,"Max"]), size=20, color = 'black', backgroundcolor='red')
+        st.pyplot(fig)
-            elif type_cluster == 'HDBSCAN':
-                from hdbscan import HDBSCAN_function
-                labels, hdbscan_score = HDBSCAN_function(data_import, min_cluster_size=10)
-        with scores:
-            t = model.scores_
-            if type_cluster in ['AP', 'Kmeans']:
-                st.write('Scree plot')
-                fig2 = px.scatter(cl.inertia_.T, y = 'inertia')
-                st.plotly_chart(fig2)
 ############################## Exploratory data analysis ###############################
+container2 = st.container(border=True)
+container2.header("Exploratory Data Analysis-Multivariable Data Analysis", divider='blue')
+scores, loadings, pc = st.columns([2, 3, 0.5])
+influence, hotelling, qexp = st.columns([2, 2, 1])
-                ncluster = st.number_input(min_value=2, max_value=30, value=3, label = 'Select the desired number of clusters')
-                data, colors = cl.fit_optimal(nclusters=ncluster)
-                #fig = px.scatter(data, x=axis1, y=axis2, color= colors)
-                st.write('Scores plot')
-                fig = px.scatter_3d(data, x=axis1, y=axis2, z = axis3, color=colors)
-                fig.update_traces(marker=dict(size=4))
+dim_red_methods=['', 'PCA','UMAP', 'NMF']
+cluster_methods = ['', 'Kmeans','HDBSCAN', 'AP']
+dr_model = None
+cl_model = None
+
+# Dimensionality reduction
+t = pd.DataFrame
+if not data_import.empty:
+    dim_red_method = pc.selectbox("Dimensionality reduction techniques: ", options = dim_red_methods, key = 37)
+    clus_method = pc.selectbox("Clustering techniques: ", options = cluster_methods, key = 38)
+    if dim_red_method == dim_red_methods[1]:
+        dr_model = LinearPCA(data_import, Ncomp=5)
+    elif dim_red_method == dim_red_methods[2]:
+        dr_model = Umap(x = data_import, n_components = 5, n_neighbors = 20 , min_dist = 0)
+
+    if dr_model:
+        axis1 = pc.selectbox("x-axis", options = dr_model.scores_.columns, index=0)
+        axis2 = pc.selectbox("y-axis", options = dr_model.scores_.columns, index=1)
+        axis3 = pc.selectbox("z-axis", options = dr_model.scores_.columns, index=2)
+        t = pd.concat([dr_model.scores_.loc[:,axis1], dr_model.scores_.loc[:,axis2], dr_model.scores_.loc[:,axis3]], axis = 1)
+
+
+# clustering
+labels = pd.DataFrame
+if not t.empty:
+    # Clustering
+    if clus_method == cluster_methods[1]:
+        ncluster = scores.number_input(min_value=2, max_value=30, value=3, label = 'Select the desired number of clusters')
+        cl_model = Sk_Kmeans(t, max_clusters = 30)
+        fig2 = px.scatter(cl_model.inertia_.T, y = 'inertia')
+        scores.plotly_chart(fig2)
+        data, labels = cl_model.fit_optimal(nclusters = ncluster)
+
+    elif clus_method == cluster_methods[1]:
+        from hdbscan import HDBSCAN_function
+        labels, hdbscan_score = HDBSCAN_function(t, min_cluster_size=10)
+
+##### Plots
-            elif type_cluster in ['HDBSCAN']:
-                st.write('plot HDBSCAN clustering')
-                fig_hdbscan = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color=labels)
-                fig_hdbscan.update_traces(marker=dict(size=4))
-                st.plotly_chart(fig_hdbscan)
-                st.write('DBCV score = ' + str(hdbscan_score))
-                # st.dataframe(min_score.stack().agg(['min']))
+## Scores
+if not t.empty:
+    with scores:
+        st.write('Scores plot')
+        # scores plot with clustering
+        if not pd.DataFrame(labels).empty:
+            fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color = labels)
+        else:
+            # scores plot with metadata
+            if not meta_data.empty:
+                filter = meta_data.columns[1:]
+                col = st.selectbox('filter', options= filter)
+                if col == 0:
+                    fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3)
                 else:
-                if test == '.dx':
-                    filter = ['origin', 'date', 'time', 'spectrometer/data system']
-                    col = st.selectbox('filter', options= filter)
-
-                    fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color = data.md_df_[col])
-                    fig.update_traces(marker=dict(size=4))
-                else:
-                    fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3 )
-                    fig.update_traces(marker=dict(size=4))
+                    fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color = list(map(str.lower,meta_data[col])) )
+            else:
+                # scores plot with neither metadata nor clustering
+                fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3)
+        fig.update_traces(marker=dict(size=4))
+        st.plotly_chart(fig)
+
+
+
+
+if not data_import.empty:
+    if dim_red_method == dim_red_methods[1]:
+        with loadings:
+            st.write('Loadings plot')
+            p = dr_model.loadings_
+            pp = pd.concat([p, pd.DataFrame(np.arange(p.shape[0]), index=p.index, columns=['wl'])], axis =1)
+            df1 = pp.melt(id_vars="wl")
+            fig = px.line(df1, x = 'wl', y = 'value', color='variable')
+            fig.update_layout(legend=dict(x=1, y=0,font=dict(family="Courier", size=12, color="black"),
+                              bordercolor="Black", borderwidth=2))
+            st.plotly_chart(fig, use_container_width = True)
+
+        with influence:
+            st.write('Influence plot')
+            ax1 = st.selectbox("Component", options=dr_model.scores_.columns, index=3)
+            leverage = dr_model.leverage_
+            residuals = dr_model.residuals_
+            fig = px.scatter(x=leverage[ax1], y=residuals[ax1], color = leverage[ax1]*residuals[ax1]).update_layout(xaxis_title="Leverage",yaxis_title="Residuals")
+            st.plotly_chart(fig)
+
+        with hotelling:
+            st.write('T²-Hotelling vs Q residuals plot')
+            hotelling = dr_model.hotelling_
+            ax2 = st.selectbox("Component", options=dr_model.scores_.columns, index=4)
+            hotelling = dr_model.hotelling_
+            fig = px.scatter(t, x=hotelling[ax2], y=residuals[ax2]).update_layout(xaxis_title="T²",yaxis_title="Residuals")
             st.plotly_chart(fig)
-        if type_plot =='PCA':
-            with loadings:
-                st.write('Loadings plot')
-                p = model.loadings_
-                pp = pd.concat([p, pd.DataFrame(np.arange(p.shape[0]), index=p.index, columns=['wl'])], axis =1)
-                df1 = pp.melt(id_vars="wl")
-
-                fig = px.line(df1, x = 'wl', y = 'value', color='variable')
-                fig.update_layout(
-                    legend=dict(x=1, y=0,
-                                font=dict(
-                                    family="Courier", size=12, color="black"),
-                                bordercolor="Black", borderwidth=2)
-                    )
-                st.plotly_chart(fig, use_container_width = True)
-
-
-            with influence:
-                st.write('Influence plot')
-                ax1 = st.selectbox("Component", options=model.scores_.columns, index=3)
-                leverage = model.leverage_
-                residuals = model.residuals_
-                fig = px.scatter(x=leverage[ax1], y=residuals[ax1], color = leverage[ax1]*residuals[ax1]).update_layout(xaxis_title="Leverage",yaxis_title="Residuals")
-                st.plotly_chart(fig)
-
-            with hotelling:
-                st.write('T²-Hotelling vs Q residuals plot')
-                hotelling = model.hotelling_
-                ax2 = st.selectbox("Component", options=model.scores_.columns, index=4)
-
-                hotelling = model.hotelling_
-                fig = px.scatter(t, x=hotelling[ax2], y=residuals[ax2]).update_layout(xaxis_title="T²",yaxis_title="Residuals")
-                st.plotly_chart(fig)
-
-
-    else:
-        st.markdown('Select a dimensionality reduction technique from the dropdown list')
diff --git a/pages/3-prediction.py b/pages/3-prediction.py
index 6fba851..d215aa7 100644
--- a/pages/3-prediction.py
+++ b/pages/3-prediction.py
@@ -46,7 +46,6 @@ if NIRS_csv:
 
     if st.button("Predict"):
         if s:
-            result = model_loaded.predict(pred_data.iloc[:,idx])
 
         else:
            # use prediction function from application_functions.py to predict chemical values
diff --git a/predictions/.gitkeep b/predictions/.gitkeep
deleted file mode 100644
index e69de29..0000000
--
GitLab
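Usage note: the sketch below is a minimal, illustrative example of how the read_dx and plot_spectra helpers introduced by this patch are meant to be wired together, mirroring the flow of pages/1-samples_selection.py. It is not code from the repository; the uploader key and the local variable names are assumptions.

# Hedged sketch: assumes the module layout added by this patch
# (read_dx exported from Class_Mod.DxReader, plot_spectra from
# Class_Mod.Miscellaneous, both reachable through Modules).
import os
from tempfile import NamedTemporaryFile

import streamlit as st

from Modules import read_dx, plot_spectra

uploaded = st.file_uploader("Load NIRS Data", type=["dx"], key="dx_demo")  # illustrative key
if uploaded:
    # read_dx works on a file path, so the upload is written to a temporary .dx file first
    with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
        tmp.write(uploaded.read())
        tmp_path = tmp.name
    chem_data, spectra, metadata = read_dx(file=tmp_path)  # cached with @st.cache_data
    os.unlink(tmp_path)
    st.pyplot(plot_spectra(spectra))  # figure builder cached with @st.cache_resource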