From 162d171d32d004c80fa8c5d8b636d50d39392ae1 Mon Sep 17 00:00:00 2001 From: DIANE <abderrahim.diane@cefe.cnrs.fr> Date: Tue, 25 Jun 2024 11:49:29 +0200 Subject: [PATCH] Results adjustments for report generation +fix err --- src/Class_Mod/Miscellaneous.py | 6 +- src/Class_Mod/UMAP_.py | 1 + src/Report/report.py | 165 +++++++++++++++++++++---------- src/pages/1-samples_selection.py | 56 ++++++----- src/pages/2-model_creation.py | 75 ++++++++------ 5 files changed, 190 insertions(+), 113 deletions(-) diff --git a/src/Class_Mod/Miscellaneous.py b/src/Class_Mod/Miscellaneous.py index 4280bb3..69d7d24 100644 --- a/src/Class_Mod/Miscellaneous.py +++ b/src/Class_Mod/Miscellaneous.py @@ -130,7 +130,7 @@ def desc_stats(x): a['Mean'] = np.mean(x) a['Median'] = np.median(x) a['S'] = np.std(x) - a['RSD(%)'] = np.std(x)*100/np.mean(x) - a['Skewness'] = skew(x, axis=0, bias=True) - a['Kurtosis'] = kurtosis(x, axis=0, bias=True) + a['RSD'] = np.std(x)*100/np.mean(x) + a['Skew'] = skew(x, axis=0, bias=True) + a['Kurt'] = kurtosis(x, axis=0, bias=True) return a \ No newline at end of file diff --git a/src/Class_Mod/UMAP_.py b/src/Class_Mod/UMAP_.py index b3f0c67..75c8746 100644 --- a/src/Class_Mod/UMAP_.py +++ b/src/Class_Mod/UMAP_.py @@ -21,6 +21,7 @@ class Umap: self.model.fit(self.numerical_data, y = self.categorical_data_encoded) self.scores_raw = self.model.transform(self.numerical_data) self.scores = pd.DataFrame(self.scores_raw) + self.scores.columns = [f'axis_{i+1}' for i in range(self.scores_raw.shape[1])] @property def scores_(self): diff --git a/src/Report/report.py b/src/Report/report.py index 7e79ae7..78029e5 100644 --- a/src/Report/report.py +++ b/src/Report/report.py @@ -3,12 +3,14 @@ from pathlib import Path import os import pandas as pd import os.path +import re def intersect(l1, l2): return l1.intersection(set(l2)) def check(file): return os.path.isfile(file) def report(*args): + signal_preprocess = {'Snv':'Standard Normal Variate (SNV)'} dim_red_methods= 
{'PCA':'Principal Components Analysis (PCA)', 'UMAP':'Uniform Manifold Approximation and Projection (UMAP)', 'NMF':'Non-negative Matrix Factorization (NMF)'} # List of dimensionality reduction algos @@ -16,23 +18,28 @@ def report(*args): 'HDBSCAN':'Hierarchical Density-Based Spatial Clustering of Applications with Noise (HDBSCAN)', 'AP':'Affinity Propagation (AP)'} # List of clustering algos selec_strategy = {'center':'PCA','random':'PCA'} + reg_algo ={"Full-PLSR":'full Partial Least Squares (PLS)', + "Locally Weighted PLSR": 'Locally Weighted Partial Least Squares (LWPLS)', + "Interval-PLSR": "Tree-structured Parzen estimator-interval Partial Least Squares (TPE-iPLS)"} + to_report=[] j=0 for arg in args: - if isinstance(arg, str): - to_report.append(arg) + if isinstance(arg, str) or isinstance(arg, int): + to_report.append(str(arg)) elif isinstance(arg, list): - to_report.extend(arg) + to_report.extend(list(map(str, arg))) elif isinstance(arg, pd.DataFrame): df_name = 'df' + str(j) j+=1 - globals()[df_name] = arg - + globals()[df_name] = arg.select_dtypes(include=['float64', 'int64']) + latex_report = "" latex_report += r"""\documentclass[a4paper,10pt]{article} \usepackage{fancyhdr} \usepackage{graphicx} \usepackage{geometry} + \usepackage{changepage} \geometry{a4paper, left=2cm, right=2cm, top=1.5cm, bottom=3cm } \usepackage{caption, subcaption} \usepackage{hyperref} @@ -63,71 +70,66 @@ def report(*args): \begin{center} \textbf{{\Large NIRS WORKFLOW REPORT}} \\ \end{center}""" - if 'sample_selection' in to_report: - latex_report += r"""\noindent - \textbf{QUERY MADE: } Sample selection performing.\\ - \noindent - \textbf{ENTERED INPUTS: }{"""+to_report[1] + r"""}\\ - \textbf{PRINCIPLE OF RESPONSE TO THE QUERY:} Representative subset selection has - been performed using the "sample selection" workflow that consists of applying - a sequence of data processing techniques, specifically, dimensionality reduction, - clustering, and samples selection techniques.""" - - 
latex_report += r"""\section*{RESULTS}""" - latex_report += r"""\subsection*{Spectral data visualization}""" - latex_report += r"""Acquired spectra were visualized in fig1 by plotting the intensity - of absorption, reflectance, transmission, etc, against the wavelengths or wavenumbers. - This helps observe general patterns and trends in the spectra, and understand the - variability within the data. - \begin{figure}[h] - \centering - \includegraphics[width=1\linewidth]{spectra_plot.png} - \caption{Acquired spectra} - \label{fig:raw_spectra} - \end{figure}""" + latex_report += r"""\noindent + \textbf{QUERY MADE: }{"""+ re.sub(r'([_%])', r'\\\1',to_report[0])+ r"""}.\\ + \noindent + \textbf{ENTERED INPUTS: }{"""+ re.sub(r'([_%])', r"\\\1", to_report[1])+ r"""}.\\""" + latex_report += r"""\section*{Results}""" + latex_report += r"""\subsection*{Spectral data visualization}""" + latex_report += r"""Acquired spectra were visualized in fig1 by plotting the intensity + of absorption, reflectance, transmission, etc, against the wavelengths or wavenumbers. + This helps observe general patterns and trends in the spectra, and understand the + variability within the data. + \begin{figure}[h] + \centering + \includegraphics[width=1\linewidth]{spectra_plot.png} + \caption{Acquired spectra} + \label{fig:raw_spectra} + \end{figure}""" + if 'Representative subset selection' in to_report: latex_report += r"""\subsection*{Multivariable Data Analysis}""" latex_report += r""" For optimal selection of subset of the samples to analyze through the - reference method, a workflow consisting of consecutively applying features extraction/dimensionality + reference method, a pipeline consisting of consecutively applying features extraction/dimensionality reduction and clustering analysis was developed. 
Features extraction was performed by means of {"""+dim_red_methods[to_report[2]] + r"""} technique which helps represent the high dimensional spectra in a reduced perceptible 3D - subspace spanned by a few number of features (three features in our case), on of the spectra. - While clustering analysis was performed using the {"""+cluster_methods[to_report[3]] + r"""} technique which - helps group the data into groups of spectra that share the same carachteristics. - This workflow is widely used in the world of spectral data analysis for detecting outliers, - analysis the homogenity of the data, reducing the computational costs prior to supervised predictive modelling, etc.\\*""" + subspace spanned by a small number of features (three features in our case), while clustering analysis was performed + using the {"""+cluster_methods[to_report[3]] + r"""} technique which + helps group the data into groups of spectra that share the same characteristics.\\*""" if "PCA" in to_report: latex_report += r"""\indent To detect the presence of any spectral outliers, the influence and residuals plots were constructed, - with outlyingness limits established at the 95\% confidence level. Together, these plots helps distinguish Regular Observations (ROs), - which form a homogeneous group near the subspace generated by the PCs; Good Leverage Points (GLPs), - which are at the same plane as the subspace but distant from the ROs; Orthogonal Observations (OOs), which have a - large residual distance to the subspace, but whose projection is on the subspace; and, finally, Bad Leverage - Points (BLPs), which have a large residual distance such that the projection on the subspace is away from ROs.\\*""" + with outlyingness limits established at the 95\% confidence level.
Together, these plots helps distinguish regular observations, + which form a homogeneous group near the subspace generated by the PCs; good leverage points, + which are at the same plane as the subspace but distant from the ROs; orthogonal observations, which have a + large residual distance to the subspace, but whose projection is on the subspace; and, finally, bad leverage + points, which have a large residual distance such that the projection on the subspace is away from regular observations.\\*""" latex_report += """\indent Results of applying this workflow are displayed in fig. 1. Based of the features extracted using {"""+to_report[2]+ r"""}, {"""+to_report[3]+ r"""} revealed the existance of {"""+to_report[5] + r"""} data clusters that are visualized with different colors. \begin{figure}[h] + \captionsetup{justification=centering} \centering \begin{minipage}[b]{0.33\textwidth} - \centering \includegraphics[width=\linewidth]{scores_pc1_pc2.png} \end{minipage}% \begin{minipage}[b]{0.33\textwidth} - \centering \includegraphics[width=\linewidth]{scores_pc1_pc3.png} \end{minipage}% \begin{minipage}[b]{0.33\textwidth} - \centering \includegraphics[width=\linewidth]{scores_pc2_pc3.png} \end{minipage} - \caption{The pairwise projection of spectra on the reduced 3D subspace.} + \centering + \caption{Illustration of the pairwise projection of spectra onto the reduced 3 dimensional subspace, clustering, and sample selection + results: data points with the same color belong to the same cluster and data points colored in black correspond to the samples to be + analyzed by a standard reference analytical procedure} \label{pcaplots} \end{figure}""" + latex_report +=r""" """ if 'PCA' in to_report: latex_report += r""" - \begin{figure} + \begin{figure}[ht] \centering \begin{minipage}[b]{0.33\textwidth} \centering @@ -137,21 +139,82 @@ def report(*args): \centering \includegraphics[width=\linewidth]{hotelling_plot.png} \end{minipage} - \caption{The pairwise projection of 
spectra on the reduced 3D subspace.} + \caption{Outliers detection plots;(a) and (b) , respectively, correspond to the hotelling and influence plots} \label{hotelling_and_influence} \end{figure} """ latex_report += r"""Following the exploratory data analysis, a subset sampling method, consisting of""" - if 'random' in to_report: - latex_report += r""" selecting the sample with the least euclidian distance to the center of each data cluster identified by {"""+to_report[3]+ r"""},""" if 'center' in to_report: - latex_report += r""" fitting a second clustering model, specifically kmeans, to each data cluster and selecting - 3 samples or less from each subcluster (if a subcluster contains less than 3 samples, then all samples included - in this subcluster are selected)," + latex_report += r""" selecting {"""+to_report[7]+ r"""} samples, each from a distinct cluster, with the least Euclidean distance to the center of the cluster identified by {"""+to_report[3]+ r"""} and to which the sample belongs.""" + if 'random' in to_report: + latex_report += r""" fitting a second clustering model, specifically kmeans, to each individual data cluster and selecting {"""+to_report[7]+ r"""} + samples or less from each subcluster (if a subcluster contains less than 3 samples, then all samples included + in this subcluster are selected), was applied.""" + + latex_report += r"""The subset of selected samples are identified to be representative and are suggested to be used for robust NIR calibration development + , i.e, to be analyzed by adequate reference analytical procedures (generally requiring destructive sample preparation).""" - the center was applied to select representative samples to be used for robust NIR calibration developement - , i.e, to be analyzed by adequate reference analytical procedures (generally requiring destructive sample preparation)""" + elif 'Predictive model development' in to_report: + latex_report += r"""\paragraph{}To develop a robust NIR calibration
that formally correlates the spectral signature of the samples in the NIR region + with the corresponding reference data obtained by analyzing the samples using a suitable reference analytical procedure, + a pipeline consisting of consecutively performing spectral signal correction followed by multivariable predictive modelling was applied. + Signal correction was performed by """ + if 'No_transformation' not in to_report: + latex_report += r"""normalizing the raw spectra using {"""+signal_preprocess[to_report[3]]+ r""", then """ + + if to_report[3] !="No_derivation": + latex_report += r"""taking the {"""+to_report[2]+ r"""}-order derivative of the {"""+to_report[4]+ r"""}-order Savitzky-Golay (SG) + polynomial estimated over a moving window of {"""+to_report[5]+ r"""} data points""" + latex_report += r""". Subsequently, the obtained data was split into two subsets using Kennard-Stone (KS) algorithm; a calibration (Cal) and Validation + (Val) subsets, the former, consisting of 80\% of the data, was used for multivariable calibration development while the latter, consisting of + the remaining 20\% of the data, was used for evaluating the predictive and the generalizability performance of the developed calibration.""" + latex_report += r""" To optimally select hyperparameters of the model and the signal preprocessing methods, prevent the model from overfitting the data, + and optimize the predictive performance of the model, 5-fold Cross Validation (CV) was performed.""" + latex_report += r"""\paragraph{} Fig 5, and table 6 display descriptive summary of the input data, trainset, and testset.""" + + latex_report += r""" + \begin{figure}[h] + \centering + \includegraphics[width=1\linewidth]{Histogram.png} + \caption{Kde plot visualizing the distribution of the target variable, a subset of training, and testing sets} + \label{fig:Histogram} + \end{figure} + """ + df0.style.format("${:.2f}$").to_latex( position_float = 'centering', hrules = True, + caption = 'Descriptive
statistics of the target variable, subsets used to develop and validate the predictive model', + label= 'reg_perf') +r"""""" + + + latex_report += r"""Predictive modelling development was performed using the {"""+reg_algo[to_report[6]]+ r"""} regression method.""" + if "Full-PLSR" in to_report: + latex_report += r"""the most important and influential spectral regions in the model, were visualized in fig.5""" + elif "Locally Weighted PLSR" in to_report: + """""" + elif "Interval-PLSR" in to_report: + latex_report += r"""Three intervals were selected by the TPE-iPLS""" + + latex_report += r"""The calibration, CV, and prediction performance achieved by the developed model was evaluated + by measuring its scores on a set of agnostic statistical metrics widely used to evaluate NIR calibration models, + specifically, the Root Mean Squared Error (RMSE), the Ratio of Performance to Deviation (RPD), the Ratio of + performance to inter-quartile (RPIQ). A table summarizing the model performance is shown below (Table 4).\par"""""" + """ + df1.style.format("${:.2f}$").to_latex(position_float = 'centering', hrules = True, caption = 'Model performances summary', label= 'reg_perf') + r"""""" + + if "Full-PLSR" in to_report: + + latex_report += r""" To identify the important variables in the model, Variable Importance in Projection (VIP) test was applied, and the important variables in the model were + visualized in Fig.8 \par + + \begin{figure}[h] + \centering + \includegraphics[width=1\linewidth]{Variable_importance.png} + \caption{Visualizing important spectral regions identified in the PLS model on the raw and preprocessed average spectrum} + \label{fig:Histogram} + \end{figure} + """ + + latex_report += r"""After numerically analyzing the performance of the model, a visual investigation (figs 7 and 8) of goodness of model fit was performed to identify potential + issues such as a pattern, that has not been captured by the model, or outliers.\par.
+ """ latex_report += r""" \fontsize{8}{9}\selectfont diff --git a/src/pages/1-samples_selection.py b/src/pages/1-samples_selection.py index 37b4492..60e2de2 100644 --- a/src/pages/1-samples_selection.py +++ b/src/pages/1-samples_selection.py @@ -62,6 +62,8 @@ labels = [] color_palette = None dr_model = None # dimensionality reduction model cl_model = None # clustering model +selection = None +selection_number = None # loader for datafile data_file = col1.file_uploader("Load NIRS Data", type=["csv","dx"], help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5) @@ -202,11 +204,13 @@ if not t.empty: # all_labels, hdbscan_score, clu_centers = optimized_hdbscan.HDBSCAN_scores_ all_labels, clu_centers = optimized_hdbscan.HDBSCAN_scores_ labels = [f'cluster#{i+1}' if i !=-1 else 'Non clustered' for i in all_labels] + ncluster = len(clu_centers) # 3- Affinity propagation elif clus_method == cluster_methods[3]: cl_model = AP(X = tcr) data, labels, clu_centers = cl_model.fit_optimal_ + ncluster = len(clu_centers) if clus_method == cluster_methods[2]: #clustered = np.where(np.array(labels) != 'Non clustered')[0] @@ -227,6 +231,8 @@ selected_samples_idx = [] if labels: + num_clusters = len(np.unique(labels)) + custom_color_palette = px.colors.qualitative.Plotly[:num_clusters] if clus_method: selection = scores.radio('Select samples selection strategy:', options = selec_strategy, index = default_sample_selection_option, key=102) @@ -297,11 +303,8 @@ if not t.empty: st.write('Scores plot') # scores plot with clustering if list(labels) and meta_data.empty: - fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = labels) + fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color=labels ,color_discrete_sequence= custom_color_palette) sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = labels, ax = ax1) - - - # scores plot with metadata elif len(list(labels)) == 0 and not meta_data.empty: filter = md_df_st_.columns @@ -341,34 +344,33 
@@ if not t.empty: sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = list(map(str.lower,md_df_st_[col])), ax = ax3) else: - fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3) + fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color=labels if list(labels) else None,color_discrete_sequence= custom_color_palette) sns.scatterplot(data = tcr, x = axis1, y =axis2 , ax = ax1) fig.update_traces(marker=dict(size=4)) if selected_samples_idx: tt = tcr.iloc[selected_samples_idx,:] - fig.add_scatter3d(x = tt.loc[:,axis1], y = tt.loc[:,axis2], - z = tt.loc[:,axis3], mode ='markers', marker = dict(size = 5, color = 'black'), + fig.add_scatter3d(x = tt.loc[:,axis1], y = tt.loc[:,axis2],z = tt.loc[:,axis3], + mode ='markers', marker = dict(size = 5, color = 'black'), name = 'selected samples') - st.plotly_chart(fig, use_container_width=True) + st.plotly_chart(fig, use_container_width = True) if labels: - num_clusters = len(np.unique(labels)) - - custom_color_palette = px.colors.qualitative.Plotly[:num_clusters] - - # Créer et exporter le graphique Axe1-Axe2 en PNG + # export 2D scores plot comb = [i for i in combinations([1,2,3], 2)] subcap = ['a','b','c'] for i in range(len(comb)): - fig_axe1_axe2 = px.scatter(tcr, x=eval(f'axis{str(comb[i][0])}'), y=eval(f'axis{str(comb[i][1])}'), - color=labels if list(labels) else None, - color_discrete_sequence= custom_color_palette) - fig_axe1_axe2.update_layout(font=dict(size=23)) - fig_axe1_axe2.add_annotation(text= f'({subcap[i]})', align='center', showarrow= False, xref='paper', yref='paper', x=-0.13, y= 1, + fig_export = px.scatter(tcr, x = eval(f'axis{str(comb[i][0])}'), y=eval(f'axis{str(comb[i][1])}'), + color = labels if list(labels) else None, + color_discrete_sequence = custom_color_palette) + fig_export.add_scatter(x = tt.loc[:,eval(f'axis{str(comb[i][0])}')], y = tt.loc[:,eval(f'axis{str(comb[i][1])}')], + mode ='markers', marker = dict(size = 5, color = 'black'), + name = 'selected samples') + 
fig_export.update_layout(font=dict(size=23)) + fig_export.add_annotation(text= f'({subcap[i]})', align='center', showarrow= False, xref='paper', yref='paper', x=-0.13, y= 1, font= dict(color= "black", size= 35), bgcolor ='white', borderpad= 2, bordercolor= 'black', borderwidth= 3) - fig_axe1_axe2.update_traces(marker=dict(size= 10), showlegend= False) - fig_axe1_axe2.write_image(f'./Report/Figures/scores_pc{str(comb[i][0])}_pc{str(comb[i][1])}.png') + fig_export.update_traces(marker=dict(size= 10), showlegend= False) + fig_export.write_image(f'./Report/Figures/scores_pc{str(comb[i][0])}_pc{str(comb[i][1])}.png') @@ -378,9 +380,6 @@ if not spectra.empty: st.write('Loadings plot') p = dr_model.loadings_ freq = pd.DataFrame(colnames, index=p.index) - - - if test =='.dx': if meta_data.loc[:,'xunits'][0] == '1/cm': freq.columns = ['Wavenumber (1/cm)'] @@ -390,7 +389,6 @@ if not spectra.empty: freq.columns = ['Wavelength (nm)'] xlab = 'Wavelength (nm)' inv = None - else: freq.columns = ['Wavelength/Wavenumber'] xlab = 'Wavelength/Wavenumber' @@ -476,8 +474,10 @@ if not spectra.empty: annotation.font.size = 35 fig.update_layout(font=dict(size=23), width=800, height=600) fig.update_traces(marker=dict(size= 10), showlegend= False) + fig.add_annotation(text= '(a)', align='center', showarrow= False, xref='paper', yref='paper', x=-0.125, y= 1, + font= dict(color= "black", size= 35), bgcolor ='white', borderpad= 2, bordercolor= 'black', borderwidth= 3) fig.write_image('./Report/figures/influence_plot.png', engine = 'kaleido') - + with hotelling: st.write('T²-Hotelling vs Q-residuals plot') # Hotelling @@ -518,6 +518,8 @@ if not spectra.empty: annotation.font.size = 35 fig.update_layout(font=dict(size=23), width=800, height=600) fig.update_traces(marker=dict(size= 10), showlegend= False) + fig.add_annotation(text= '(b)', align='center', showarrow= False, xref='paper', yref='paper', x=-0.125, y= 1, + font= dict(color= "black", size= 35), bgcolor ='white', borderpad= 2, 
bordercolor= 'black', borderwidth= 3) fig.write_image("./Report/figures/hotelling_plot.png", format="png") @@ -528,5 +530,5 @@ nb_clu = str(sam1.shape[0]) if data_file: with st.container(): if st.button("Download report"): - latex_report = report.report('sample_selection', data_file.name, dim_red_method, clus_method, Nb_ech, ncluster, selection, nb_clu,tcr, sam) - report.compile_latex() + latex_report = report.report('Representative subset selection', data_file.name, dim_red_method, clus_method, Nb_ech, ncluster, selection, selection_number, nb_clu,tcr, sam) + report.compile_latex() \ No newline at end of file diff --git a/src/pages/2-model_creation.py b/src/pages/2-model_creation.py index bf4dd10..855d47e 100644 --- a/src/pages/2-model_creation.py +++ b/src/pages/2-model_creation.py @@ -16,8 +16,6 @@ if os.path.exists(repertoire_a_vider): elif os.path.isdir(chemin_fichier): os.rmdir(chemin_fichier) -json_sp = pd.DataFrame() - local_css(css_file / "style_model.css") ####################################### page Design ####################################### @@ -40,6 +38,7 @@ M9.write("-- Save the model --") ############################################################################################## reg_algo = ["","Full-PLSR", "Locally Weighted PLSR", "Interval-PLSR"] +regression_algo = None ####################################### ########################################### files_format = ['.csv', '.dx'] @@ -69,6 +68,7 @@ if file == files_format[0]: else: col = False if xcal_csv and ycal_csv: + file_name = str(xcal_csv.name) +' and '+ str(ycal_csv.name) xfile = pd.read_csv(xcal_csv, decimal='.', sep=sepx, index_col=col, header=0) yfile = pd.read_csv(ycal_csv, decimal='.', sep=sepy, index_col=col) @@ -103,6 +103,7 @@ if file == files_format[0]: elif file == files_format[1]: data_file = M00.file_uploader("Select Data", type=".dx", help=" :mushroom: select a dx file") if data_file: + file_name = str(data_file.name) with NamedTemporaryFile(delete=False, suffix=".dx") 
as tmp: tmp.write(data_file.read()) tmp_path = tmp.name @@ -141,7 +142,7 @@ if not spectra.empty and not y.empty: ax1.margins(0) plt.tight_layout() M0.pyplot(fig) ######## Loaded graph - fig.savefig("./Report/figures/Spectre_mod.png") + fig.savefig("./Report/figures/spectra_plot.png") fig, ax2 = plt.subplots(figsize = (12,3)) sns.histplot(y, color="deeppink", kde = True,label="y",ax = ax2, fill=True) sns.histplot(y_train, color="blue", kde = True,label="y (train)",ax = ax2, fill=True) @@ -151,19 +152,19 @@ if not spectra.empty and not y.empty: plt.tight_layout() M0.pyplot(fig) - fig.savefig("./Report/figures/histo.png") + fig.savefig("./Report/figures/Histogram.png") M0.write('Loaded data summary') M0.write(pd.DataFrame([desc_stats(y_train),desc_stats(y_test),desc_stats(y)], index =['Train', 'Test', 'Total'] ).round(2)) - LoDaSum=pd.DataFrame([desc_stats(y_train),desc_stats(y_test),desc_stats(y)], index =['Train', 'Test', 'Total'] ).round(2) + stats=pd.DataFrame([desc_stats(y_train),desc_stats(y_test),desc_stats(y)], index =['Train', 'Test', 'Total'] ).round(2) ####################################### Insight into the loaded data ####################################### - regression_algo = M1.selectbox("Choose the algorithm for regression", options=reg_algo, key = 12) + regression_algo = M1.selectbox("Choose the algorithm for regression", options= reg_algo, key = 12, placeholder ="Choose an option") if regression_algo == reg_algo[1]: # Train model with model function from application_functions.py - Reg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter=10) + Reg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter=1) reg_model = Reg.model_ #M2.dataframe(Pin.pred_data_) elif regression_algo == reg_algo[2]: @@ -262,7 +263,7 @@ if not spectra.empty and not y.empty: ############ cv2.write('-- Cross-Validation Summary--') cv2.write(Reg.CV_results_) - cv99=pd.DataFrame(Reg.CV_results_) + cv_results=pd.DataFrame(Reg.CV_results_) cv2.write('-- 
Out-of-Fold Predictions Visualization (All in one) --') fig1 = px.scatter(Reg.cv_data_[2], x ='Measured', y = 'Predicted' , trendline='ols', color='Folds', symbol="Folds", @@ -277,7 +278,7 @@ if not spectra.empty and not y.empty: cv1.write('-- Out-of-Fold Predictions Visualization (Separate plots) --') cv1.plotly_chart(fig0, use_container_width=True) - fig0.write_image("./Report/figures/Predictions_V.png") + fig1.write_image("./Report/figures/Predictions_V.png") yc = Reg.pred_data_[0] @@ -287,8 +288,6 @@ if not spectra.empty and not y.empty: M1.write('-- Spectral preprocessing info --') M1.write(Reg.best_hyperparams_print) - a_Test=Reg.best_hyperparams_print - with open("data/params/Preprocessing.json", "w") as outfile: json.dump(Reg.best_hyperparams_, outfile) @@ -304,10 +303,20 @@ if not spectra.empty and not y.empty: #my_circular_progress.st_circular_progress() #my_circular_progress.update_value(progress=20) - a=reg_plot([y_train, y_test],[yc, yt], train_idx = train_index, test_idx = test_index) + a = reg_plot([y_train, y_test],[yc, yt], train_idx = train_index, test_idx = test_index) M7.pyplot(a) plt.savefig('./Report/figures/Predictedvs.png') + prep_para = Reg.best_hyperparams_ + prep_para.pop('n_components') + + for i in ['deriv','polyorder']: + if Reg.best_hyperparams_[i] == 0: + prep_para[i] = '0' + elif Reg.best_hyperparams_[i] == 1: + prep_para[i] = '1st' + elif Reg.best_hyperparams_[i] > 1: + prep_para[i] = f"{Reg.best_hyperparams_[i]}nd" residual_plot = resid_plot([y_train, y_test], [yc, yt], train_idx=train_index, test_idx=test_index) M8.pyplot(residual_plot) @@ -352,28 +361,14 @@ if not spectra.empty and not y.empty: ] ) st.page_link('pages\\3-prediction.py', label = 'Keep on keepin\' on to predict your values !') - -## Load .dx file -Ac_Km = ['histo.png', 'Spectre_mod.png','Predictions_V.png','Allinone.png','Predictedvs.png','residual_plot.png'] -with st.container(): - if st.button("Download the report"): - if regression_algo == reg_algo[1]: - 
latex_report = report.report(LoDaSum, 'model',Ac_Km,a_Test,json_sp,model_per,'full_plsr',cv99) - report.compile_latex() - else: - pass - - else: - pass + -if not spectra.empty and not y.empty: +if not spectra.empty and not y.empty and regression_algo: if regression_algo in reg_algo[1:]: fig, (ax1, ax2) = plt.subplots(2,1, figsize = (12, 4), sharex=True) ax1.plot(colnames, np.mean(X_train, axis = 0), color = 'black', label = 'Average spectrum (Raw)') ax2.plot(colnames, np.mean(Reg.pretreated_spectra_ , axis = 0), color = 'black', label = 'Average spectrum (pretreated)') - - ax2.set_xlabel('Wavelenghts') plt.tight_layout() @@ -391,12 +386,28 @@ if not spectra.empty and not y.empty: eval(f'ax{i+1}').axvspan(min, max, color='#00ff00', alpha=0.5, lw=0) if regression_algo == reg_algo[1]: - ax1.scatter(colnames[np.array(Reg.sel_ratio_.index)], np.mean(X_train, axis = 0).ravel()[np.array(Reg.sel_ratio_.index)], + ax1.scatter(colnames[np.array(Reg.sel_ratio_.index)], np.mean(X_train, axis = 0)[np.array(Reg.sel_ratio_.index)], color = 'red', label = 'Important variables') - ax2.scatter(colnames[Reg.sel_ratio_.index], np.mean(Reg.pretreated_spectra_, axis = 0).ravel()[np.array(Reg.sel_ratio_.index)], + ax2.scatter(colnames[Reg.sel_ratio_.index], np.mean(Reg.pretreated_spectra_, axis = 0)[np.array(Reg.sel_ratio_.index)], color = 'red', label = 'Important variables') ax1.legend() ax2.legend() - M2.write('-- Visualization of the spectral regions used for model creation -- ') - M2.pyplot(fig) \ No newline at end of file + M2.write('-- Visualization of the spectral regions used for model creation --') + fig.savefig("./Report/figures/Variable_importance.png") + M2.pyplot(fig) + +## Load .dx file + +with st.container(): + if st.button("Download the report"): + if regression_algo == reg_algo[1]: + latex_report = report.report('Predictive model development', file_name, stats, list(Reg.best_hyperparams_.values()), regression_algo, model_per, cv_results) + report.compile_latex() + if 
regression_algo is None: + st.warning('Data processing has not been performed or finished yet!', icon = "âš ï¸") + else: + pass + + else: + pass -- GitLab