From 162d171d32d004c80fa8c5d8b636d50d39392ae1 Mon Sep 17 00:00:00 2001 From: DIANE <abderrahim.diane@cefe.cnrs.fr> Date: Tue, 25 Jun 2024 11:49:29 +0200 Subject: [PATCH] Results adjustments for report generation +fix err --- src/Class_Mod/Miscellaneous.py | 6 +- src/Class_Mod/UMAP_.py | 1 + src/Report/report.py | 165 +++++++++++++++++++++---------- src/pages/1-samples_selection.py | 56 ++++++----- src/pages/2-model_creation.py | 75 ++++++++------ 5 files changed, 190 insertions(+), 113 deletions(-) diff --git a/src/Class_Mod/Miscellaneous.py b/src/Class_Mod/Miscellaneous.py index 4280bb3..69d7d24 100644 --- a/src/Class_Mod/Miscellaneous.py +++ b/src/Class_Mod/Miscellaneous.py @@ -130,7 +130,7 @@ def desc_stats(x): a['Mean'] = np.mean(x) a['Median'] = np.median(x) a['S'] = np.std(x) - a['RSD(%)'] = np.std(x)*100/np.mean(x) - a['Skewness'] = skew(x, axis=0, bias=True) - a['Kurtosis'] = kurtosis(x, axis=0, bias=True) + a['RSD'] = np.std(x)*100/np.mean(x) + a['Skew'] = skew(x, axis=0, bias=True) + a['Kurt'] = kurtosis(x, axis=0, bias=True) return a \ No newline at end of file diff --git a/src/Class_Mod/UMAP_.py b/src/Class_Mod/UMAP_.py index b3f0c67..75c8746 100644 --- a/src/Class_Mod/UMAP_.py +++ b/src/Class_Mod/UMAP_.py @@ -21,6 +21,7 @@ class Umap: self.model.fit(self.numerical_data, y = self.categorical_data_encoded) self.scores_raw = self.model.transform(self.numerical_data) self.scores = pd.DataFrame(self.scores_raw) + self.scores.columns = [f'axis_{i+1}' for i in range(self.scores_raw.shape[1])] @property def scores_(self): diff --git a/src/Report/report.py b/src/Report/report.py index 7e79ae7..78029e5 100644 --- a/src/Report/report.py +++ b/src/Report/report.py @@ -3,12 +3,14 @@ from pathlib import Path import os import pandas as pd import os.path +import re def intersect(l1, l2): return l1.intersection(set(l2)) def check(file): return os.path.isfile(file) def report(*args): + signal_preprocess = {'Snv':'Standard Normal Variate (SNV)'} dim_red_methods= 
{'PCA':'Principal Components Analysis (PCA)', 'UMAP':'Uniform Manifold Approximation and Projection (UMAP)', 'NMF':'Non-negative Matrix Factorization (NMF)'} # List of dimensionality reduction algos @@ -16,23 +18,28 @@ def report(*args): 'HDBSCAN':'Hierarchical Density-Based Spatial Clustering of Applications with Noise (HDBSCAN)', 'AP':'Affinity Propagation (AP)'} # List of clustering algos selec_strategy = {'center':'PCA','random':'PCA'} + reg_algo ={"Full-PLSR":'full Partial Least Squares (PLS)', + "Locally Weighted PLSR": 'Locally Weighted Partial Least Squares (LWPLS)', + "Interval-PLSR": "Tree-structured Parzen estimator-interval Partial Least Squares (TPE-iPLS)"} + to_report=[] j=0 for arg in args: - if isinstance(arg, str): - to_report.append(arg) + if isinstance(arg, str) or isinstance(arg, int): + to_report.append(str(arg)) elif isinstance(arg, list): - to_report.extend(arg) + to_report.extend(list(map(str, arg))) elif isinstance(arg, pd.DataFrame): df_name = 'df' + str(j) j+=1 - globals()[df_name] = arg - + globals()[df_name] = arg.select_dtypes(include=['float64', 'int64']) + latex_report = "" latex_report += r"""\documentclass[a4paper,10pt]{article} \usepackage{fancyhdr} \usepackage{graphicx} \usepackage{geometry} + \usepackage{changepage} \geometry{a4paper, left=2cm, right=2cm, top=1.5cm, bottom=3cm } \usepackage{caption, subcaption} \usepackage{hyperref} @@ -63,71 +70,66 @@ def report(*args): \begin{center} \textbf{{\Large NIRS WORKFLOW REPORT}} \\ \end{center}""" - if 'sample_selection' in to_report: - latex_report += r"""\noindent - \textbf{QUERY MADE: } Sample selection performing.\\ - \noindent - \textbf{ENTERED INPUTS: }{"""+to_report[1] + r"""}\\ - \textbf{PRINCIPLE OF RESPONSE TO THE QUERY:} Representative subset selection has - been performed using the "sample selection" workflow that consists of applying - a sequence of data processing techniques, specifically, dimensionality reduction, - clustering, and samples selection techniques.""" - - 
latex_report += r"""\section*{RESULTS}""" - latex_report += r"""\subsection*{Spectral data visualization}""" - latex_report += r"""Acquired spectra were visualized in fig1 by plotting the intensity - of absorption, reflectance, transmission, etc, against the wavelengths or wavenumbers. - This helps observe general patterns and trends in the spectra, and understand the - variability within the data. - \begin{figure}[h] - \centering - \includegraphics[width=1\linewidth]{spectra_plot.png} - \caption{Acquired spectra} - \label{fig:raw_spectra} - \end{figure}""" + latex_report += r"""\noindent + \textbf{QUERY MADE: }{"""+ re.sub(r'([_%])', r'\\\1',to_report[0])+ r"""}.\\ + \noindent + \textbf{ENTERED INPUTS: }{"""+ re.sub(r'([_%])', r"\\\1", to_report[1])+ r"""}.\\""" + latex_report += r"""\section*{Results}""" + latex_report += r"""\subsection*{Spectral data visualization}""" + latex_report += r"""Acquired spectra were visualized in fig1 by plotting the intensity + of absorption, reflectance, transmission, etc, against the wavelengths or wavenumbers. + This helps observe general patterns and trends in the spectra, and understand the + variability within the data. + \begin{figure}[h] + \centering + \includegraphics[width=1\linewidth]{spectra_plot.png} + \caption{Acquired spectra} + \label{fig:raw_spectra} + \end{figure}""" + if 'Representative subset selection' in to_report: latex_report += r"""\subsection*{Multivariable Data Analysis}""" latex_report += r""" For optimal selection of subset of the samples to analyze through the - reference method, a workflow consisting of consecutively applying features extraction/dimensionality + reference method, a pipeline consisting of consecutively applying features extraction/dimensionality reduction and clustering analysis was developed. 
Features extraction was performed by means of {"""+dim_red_methods[to_report[2]] + r"""} technique which helps represent the high dimensional spectra in a reduced perceptible 3D - subspace spanned by a few number of features (three features in our case), on of the spectra. - While clustering analysis was performed using the {"""+cluster_methods[to_report[3]] + r"""} technique which - helps group the data into groups of spectra that share the same carachteristics. - This workflow is widely used in the world of spectral data analysis for detecting outliers, - analysis the homogenity of the data, reducing the computational costs prior to supervised predictive modelling, etc.\\*""" + subspace spanned by a small number of features (three features in our case), while clustering analysis was performed + using the {"""+cluster_methods[to_report[3]] + r"""} technique which + helps group the data into groups of spectra that share the same characteristics.\\*""" if "PCA" in to_report: latex_report += r"""\indent To detect the presence of any spectral outliers, the influence and residuals plots were constructed, - with outlyingness limits established at the 95\% confidence level. Together, these plots helps distinguish Regular Observations (ROs), - which form a homogeneous group near the subspace generated by the PCs; Good Leverage Points (GLPs), - which are at the same plane as the subspace but distant from the ROs; Orthogonal Observations (OOs), which have a - large residual distance to the subspace, but whose projection is on the subspace; and, finally, Bad Leverage - Points (BLPs), which have a large residual distance such that the projection on the subspace is away from ROs.\\*""" + with outlyingness limits established at the 95\% confidence level.
Together, these plots helps distinguish regular observations, + which form a homogeneous group near the subspace generated by the PCs; good leverage points, + which are at the same plane as the subspace but distant from the ROs; orthogonal observations, which have a + large residual distance to the subspace, but whose projection is on the subspace; and, finally, bad leverage + points, which have a large residual distance such that the projection on the subspace is away from regular observations.\\*""" latex_report += """\indent Results of applying this workflow are displayed in fig. 1. Based of the features extracted using {"""+to_report[2]+ r"""}, {"""+to_report[3]+ r"""} revealed the existance of {"""+to_report[5] + r"""} data clusters that are visualized with different colors. \begin{figure}[h] + \captionsetup{justification=centering} \centering \begin{minipage}[b]{0.33\textwidth} - \centering \includegraphics[width=\linewidth]{scores_pc1_pc2.png} \end{minipage}% \begin{minipage}[b]{0.33\textwidth} - \centering \includegraphics[width=\linewidth]{scores_pc1_pc3.png} \end{minipage}% \begin{minipage}[b]{0.33\textwidth} - \centering \includegraphics[width=\linewidth]{scores_pc2_pc3.png} \end{minipage} - \caption{The pairwise projection of spectra on the reduced 3D subspace.} + \centering + \caption{Illustration of the pairwise projection of spectra onto the reduced 3 dimensional subspace, clustering, and sample selection + results: data points with the same color belong to the same cluster and data points colored in black correspond to the samples to be + analyzed by a standard reference analytical procedure} \label{pcaplots} \end{figure}""" + latex_report +=r""" """ if 'PCA' in to_report: latex_report += r""" - \begin{figure} + \begin{figure}[ht] \centering \begin{minipage}[b]{0.33\textwidth} \centering @@ -137,21 +139,82 @@ def report(*args): \centering \includegraphics[width=\linewidth]{hotelling_plot.png} \end{minipage} - \caption{The pairwise projection of 
spectra on the reduced 3D subspace.} + \caption{Outliers detection plots;(a) and (b) , respectively, correspond to the hotelling and influence plots} \label{hotelling_and_influence} \end{figure} """ latex_report += r"""Following the exploratory data analysis, a subset sampling method, consisting of""" - if 'random' in to_report: - latex_report += r""" selecting the sample with the least euclidian distance to the center of each data cluster identified by {"""+to_report[3]+ r"""},""" if 'center' in to_report: - latex_report += r""" fitting a second clustering model, specifically kmeans, to each data cluster and selecting - 3 samples or less from each subcluster (if a subcluster contains less than 3 samples, then all samples included - in this subcluster are selected)," + latex_report += r""" selecting {"""+to_report[7]+ r"""} samples, each from a distinct cluster, with the least Euclidean distance to the center of the cluster identified by {"""+to_report[3]+ r"""} and to which the sample belongs.""" + if 'random' in to_report: + latex_report += r""" fitting a second clustering model, specifically kmeans, to each individual data cluster and selecting {"""+to_report[7]+ r"""} + samples or less from each subcluster (if a subcluster contains less than 3 samples, then all samples included + in this subcluster are selected), was applied.""" + + latex_report += r"""The subset of selected samples are identified to be representative and are suggested to be used for robust NIR calibration development + , i.e, to be analyzed by adequate reference analytical procedures (generally requiring destructive sample preparation).""" - the center was applied to select representative samples to be used for robust NIR calibration developement - , i.e, to be analyzed by adequate reference analytical procedures (generally requiring destructive sample preparation)""" + elif 'Predictive model development' in to_report: + latex_report += r"""\paragraph{}To develop a robust NIR calibration
that formally correlates the spectral signature of the samples in the NIR region + with the corresponding reference data obtained by analyzing the samples using a suitable reference analytical procedure, + a pipeline consisting of consecutively performing spectral signal correction followed by multivariable predictive modelling was applied. + Signal correction was performed by """ + if 'No_transformation' not in to_report: + latex_report += r"""normalizing the raw spectra using {"""+signal_preprocess[to_report[3]]+ r""", then """ + + if to_report[3] !="No_derivation": + latex_report += r"""taking the {"""+to_report[2]+ r"""}-order derivative of the {"""+to_report[4]+ r"""}-order Savitzky-Golay (SG) + polynomial estimated over a moving window of {"""+to_report[5]+ r"""} data points""" + latex_report += r""". Subsequently, the obtained data was split into two subsets using Kennard-Stone (KS) algorithm; a calibration (Cal) and Validation + (Val) subsets, the former, consisting of 80\% of the data, was used for multivariable calibration development while the latter, consisting of + the remaining 20\% of the data, was used for evaluating the predictive and the generalizability performance of the developed calibration.""" + latex_report += r""" To optimally select hyperparameters of the model and the signal preprocessing methods, prevent the model from overfitting the data, + and optimize the predictive performance of the model, 5-fold Cross Validation (CV) was performed.""" + latex_report += r"""\paragraph{} Fig 5, and table 6 display descriptive summary of the input data, trainset, and testset.""" + + latex_report += r""" + \begin{figure}[h] + \centering + \includegraphics[width=1\linewidth]{Histogram.png} + \caption{Kde plot visualizing the distribution of the target variable, a subset of training, and testing sets} + \label{fig:Histogram} + \end{figure} + """ + df0.style.format("${:.2f}$").to_latex( position_float = 'centering', hrules = True, + caption = 'Descriptive
statistics of the target variable, subsets used to develop and validate the predictive model', + label= 'reg_perf') +r"""""" + + + latex_report += r"""Predictive modelling development was performed using the {"""+reg_algo[to_report[6]]+ r"""} regression method.""" + if "Full-PLSR" in to_report: + latex_report += r"""the most important and influential spectral regions in the model, were visualized in fig.5""" + elif "Locally Weighted PLSR" in to_report: + """""" + elif "Interval-PLSR" in to_report: + latex_report += r"""Three intervals were selected by the TPE-iPLS""" + + latex_report += r"""The calibration, CV, and prediction performance achieved by the developed model was evaluated + by measuring its scores on a set of agnostic statistical metrics widely used to evaluate NIR calibration models, + specifically, the Root Mean Squared Error (RMSE), the Ratio of Performance to Deviation (RPD), the Ratio of + performance to inter-quartile (RPIQ). A table summarizing the model performance is shown below (Table 4).\par"""""" + """ + df1.style.format("${:.2f}$").to_latex(position_float = 'centering', hrules = True, caption = 'Model performances summary', label= 'reg_perf') + r"""""" + + if "Full-PLSR" in to_report: + + latex_report += r""" To identify the important variables in the model, Variable Importance in Projection (VIP) test was applied, and the important variables in the model were + visualized in Fig.8 \par + + \begin{figure}[h] + \centering + \includegraphics[width=1\linewidth]{Variable_importance.png} + \caption{Visualizing important spectral regions identified in the PLS model on the raw and preprocessed average spectrum} + \label{fig:Histogram} + \end{figure} + """ + + latex_report += r"""After numerically analyzing the performance of the model, a visual investigation (figs 7 and 8) of goodness of model fit was performed to identify potential + issues such as a pattern, that has not been captured by the model, or outliers.\par.
+ """ latex_report += r""" \fontsize{8}{9}\selectfont diff --git a/src/pages/1-samples_selection.py b/src/pages/1-samples_selection.py index 37b4492..60e2de2 100644 --- a/src/pages/1-samples_selection.py +++ b/src/pages/1-samples_selection.py @@ -62,6 +62,8 @@ labels = [] color_palette = None dr_model = None # dimensionality reduction model cl_model = None # clustering model +selection = None +selection_number = None # loader for datafile data_file = col1.file_uploader("Load NIRS Data", type=["csv","dx"], help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5) @@ -202,11 +204,13 @@ if not t.empty: # all_labels, hdbscan_score, clu_centers = optimized_hdbscan.HDBSCAN_scores_ all_labels, clu_centers = optimized_hdbscan.HDBSCAN_scores_ labels = [f'cluster#{i+1}' if i !=-1 else 'Non clustered' for i in all_labels] + ncluster = len(clu_centers) # 3- Affinity propagation elif clus_method == cluster_methods[3]: cl_model = AP(X = tcr) data, labels, clu_centers = cl_model.fit_optimal_ + ncluster = len(clu_centers) if clus_method == cluster_methods[2]: #clustered = np.where(np.array(labels) != 'Non clustered')[0] @@ -227,6 +231,8 @@ selected_samples_idx = [] if labels: + num_clusters = len(np.unique(labels)) + custom_color_palette = px.colors.qualitative.Plotly[:num_clusters] if clus_method: selection = scores.radio('Select samples selection strategy:', options = selec_strategy, index = default_sample_selection_option, key=102) @@ -297,11 +303,8 @@ if not t.empty: st.write('Scores plot') # scores plot with clustering if list(labels) and meta_data.empty: - fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = labels) + fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color=labels ,color_discrete_sequence= custom_color_palette) sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = labels, ax = ax1) - - - # scores plot with metadata elif len(list(labels)) == 0 and not meta_data.empty: filter = md_df_st_.columns @@ -341,34 +344,33 
@@ if not t.empty: sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = list(map(str.lower,md_df_st_[col])), ax = ax3) else: - fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3) + fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color=labels if list(labels) else None,color_discrete_sequence= custom_color_palette) sns.scatterplot(data = tcr, x = axis1, y =axis2 , ax = ax1) fig.update_traces(marker=dict(size=4)) if selected_samples_idx: tt = tcr.iloc[selected_samples_idx,:] - fig.add_scatter3d(x = tt.loc[:,axis1], y = tt.loc[:,axis2], - z = tt.loc[:,axis3], mode ='markers', marker = dict(size = 5, color = 'black'), + fig.add_scatter3d(x = tt.loc[:,axis1], y = tt.loc[:,axis2],z = tt.loc[:,axis3], + mode ='markers', marker = dict(size = 5, color = 'black'), name = 'selected samples') - st.plotly_chart(fig, use_container_width=True) + st.plotly_chart(fig, use_container_width = True) if labels: - num_clusters = len(np.unique(labels)) - - custom_color_palette = px.colors.qualitative.Plotly[:num_clusters] - - # Créer et exporter le graphique Axe1-Axe2 en PNG + # export 2D scores plot comb = [i for i in combinations([1,2,3], 2)] subcap = ['a','b','c'] for i in range(len(comb)): - fig_axe1_axe2 = px.scatter(tcr, x=eval(f'axis{str(comb[i][0])}'), y=eval(f'axis{str(comb[i][1])}'), - color=labels if list(labels) else None, - color_discrete_sequence= custom_color_palette) - fig_axe1_axe2.update_layout(font=dict(size=23)) - fig_axe1_axe2.add_annotation(text= f'({subcap[i]})', align='center', showarrow= False, xref='paper', yref='paper', x=-0.13, y= 1, + fig_export = px.scatter(tcr, x = eval(f'axis{str(comb[i][0])}'), y=eval(f'axis{str(comb[i][1])}'), + color = labels if list(labels) else None, + color_discrete_sequence = custom_color_palette) + fig_export.add_scatter(x = tt.loc[:,eval(f'axis{str(comb[i][0])}')], y = tt.loc[:,eval(f'axis{str(comb[i][1])}')], + mode ='markers', marker = dict(size = 5, color = 'black'), + name = 'selected samples') + 
fig_export.update_layout(font=dict(size=23)) + fig_export.add_annotation(text= f'({subcap[i]})', align='center', showarrow= False, xref='paper', yref='paper', x=-0.13, y= 1, font= dict(color= "black", size= 35), bgcolor ='white', borderpad= 2, bordercolor= 'black', borderwidth= 3) - fig_axe1_axe2.update_traces(marker=dict(size= 10), showlegend= False) - fig_axe1_axe2.write_image(f'./Report/Figures/scores_pc{str(comb[i][0])}_pc{str(comb[i][1])}.png') + fig_export.update_traces(marker=dict(size= 10), showlegend= False) + fig_export.write_image(f'./Report/Figures/scores_pc{str(comb[i][0])}_pc{str(comb[i][1])}.png') @@ -378,9 +380,6 @@ if not spectra.empty: st.write('Loadings plot') p = dr_model.loadings_ freq = pd.DataFrame(colnames, index=p.index) - - - if test =='.dx': if meta_data.loc[:,'xunits'][0] == '1/cm': freq.columns = ['Wavenumber (1/cm)'] @@ -390,7 +389,6 @@ if not spectra.empty: freq.columns = ['Wavelength (nm)'] xlab = 'Wavelength (nm)' inv = None - else: freq.columns = ['Wavelength/Wavenumber'] xlab = 'Wavelength/Wavenumber' @@ -476,8 +474,10 @@ if not spectra.empty: annotation.font.size = 35 fig.update_layout(font=dict(size=23), width=800, height=600) fig.update_traces(marker=dict(size= 10), showlegend= False) + fig.add_annotation(text= '(a)', align='center', showarrow= False, xref='paper', yref='paper', x=-0.125, y= 1, + font= dict(color= "black", size= 35), bgcolor ='white', borderpad= 2, bordercolor= 'black', borderwidth= 3) fig.write_image('./Report/figures/influence_plot.png', engine = 'kaleido') - + with hotelling: st.write('T²-Hotelling vs Q-residuals plot') # Hotelling @@ -518,6 +518,8 @@ if not spectra.empty: annotation.font.size = 35 fig.update_layout(font=dict(size=23), width=800, height=600) fig.update_traces(marker=dict(size= 10), showlegend= False) + fig.add_annotation(text= '(b)', align='center', showarrow= False, xref='paper', yref='paper', x=-0.125, y= 1, + font= dict(color= "black", size= 35), bgcolor ='white', borderpad= 2, 
bordercolor= 'black', borderwidth= 3) fig.write_image("./Report/figures/hotelling_plot.png", format="png") @@ -528,5 +530,5 @@ nb_clu = str(sam1.shape[0]) if data_file: with st.container(): if st.button("Download report"): - latex_report = report.report('sample_selection', data_file.name, dim_red_method, clus_method, Nb_ech, ncluster, selection, nb_clu,tcr, sam) - report.compile_latex() + latex_report = report.report('Representative subset selection', data_file.name, dim_red_method, clus_method, Nb_ech, ncluster, selection, selection_number, nb_clu,tcr, sam) + report.compile_latex() \ No newline at end of file diff --git a/src/pages/2-model_creation.py b/src/pages/2-model_creation.py index bf4dd10..855d47e 100644 --- a/src/pages/2-model_creation.py +++ b/src/pages/2-model_creation.py @@ -16,8 +16,6 @@ if os.path.exists(repertoire_a_vider): elif os.path.isdir(chemin_fichier): os.rmdir(chemin_fichier) -json_sp = pd.DataFrame() - local_css(css_file / "style_model.css") ####################################### page Design ####################################### @@ -40,6 +38,7 @@ M9.write("-- Save the model --") ############################################################################################## reg_algo = ["","Full-PLSR", "Locally Weighted PLSR", "Interval-PLSR"] +regression_algo = None ####################################### ########################################### files_format = ['.csv', '.dx'] @@ -69,6 +68,7 @@ if file == files_format[0]: else: col = False if xcal_csv and ycal_csv: + file_name = str(xcal_csv.name) +' and '+ str(ycal_csv.name) xfile = pd.read_csv(xcal_csv, decimal='.', sep=sepx, index_col=col, header=0) yfile = pd.read_csv(ycal_csv, decimal='.', sep=sepy, index_col=col) @@ -103,6 +103,7 @@ if file == files_format[0]: elif file == files_format[1]: data_file = M00.file_uploader("Select Data", type=".dx", help=" :mushroom: select a dx file") if data_file: + file_name = str(data_file.name) with NamedTemporaryFile(delete=False, suffix=".dx") 
as tmp: tmp.write(data_file.read()) tmp_path = tmp.name @@ -141,7 +142,7 @@ if not spectra.empty and not y.empty: ax1.margins(0) plt.tight_layout() M0.pyplot(fig) ######## Loaded graph - fig.savefig("./Report/figures/Spectre_mod.png") + fig.savefig("./Report/figures/spectra_plot.png") fig, ax2 = plt.subplots(figsize = (12,3)) sns.histplot(y, color="deeppink", kde = True,label="y",ax = ax2, fill=True) sns.histplot(y_train, color="blue", kde = True,label="y (train)",ax = ax2, fill=True) @@ -151,19 +152,19 @@ if not spectra.empty and not y.empty: plt.tight_layout() M0.pyplot(fig) - fig.savefig("./Report/figures/histo.png") + fig.savefig("./Report/figures/Histogram.png") M0.write('Loaded data summary') M0.write(pd.DataFrame([desc_stats(y_train),desc_stats(y_test),desc_stats(y)], index =['Train', 'Test', 'Total'] ).round(2)) - LoDaSum=pd.DataFrame([desc_stats(y_train),desc_stats(y_test),desc_stats(y)], index =['Train', 'Test', 'Total'] ).round(2) + stats=pd.DataFrame([desc_stats(y_train),desc_stats(y_test),desc_stats(y)], index =['Train', 'Test', 'Total'] ).round(2) ####################################### Insight into the loaded data ####################################### - regression_algo = M1.selectbox("Choose the algorithm for regression", options=reg_algo, key = 12) + regression_algo = M1.selectbox("Choose the algorithm for regression", options= reg_algo, key = 12, placeholder ="Choose an option") if regression_algo == reg_algo[1]: # Train model with model function from application_functions.py - Reg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter=10) + Reg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter=1) reg_model = Reg.model_ #M2.dataframe(Pin.pred_data_) elif regression_algo == reg_algo[2]: @@ -262,7 +263,7 @@ if not spectra.empty and not y.empty: ############ cv2.write('-- Cross-Validation Summary--') cv2.write(Reg.CV_results_) - cv99=pd.DataFrame(Reg.CV_results_) + cv_results=pd.DataFrame(Reg.CV_results_) cv2.write('-- 
Out-of-Fold Predictions Visualization (All in one) --') fig1 = px.scatter(Reg.cv_data_[2], x ='Measured', y = 'Predicted' , trendline='ols', color='Folds', symbol="Folds", @@ -277,7 +278,7 @@ if not spectra.empty and not y.empty: cv1.write('-- Out-of-Fold Predictions Visualization (Separate plots) --') cv1.plotly_chart(fig0, use_container_width=True) - fig0.write_image("./Report/figures/Predictions_V.png") + fig1.write_image("./Report/figures/Predictions_V.png") yc = Reg.pred_data_[0] @@ -287,8 +288,6 @@ if not spectra.empty and not y.empty: M1.write('-- Spectral preprocessing info --') M1.write(Reg.best_hyperparams_print) - a_Test=Reg.best_hyperparams_print - with open("data/params/Preprocessing.json", "w") as outfile: json.dump(Reg.best_hyperparams_, outfile) @@ -304,10 +303,20 @@ if not spectra.empty and not y.empty: #my_circular_progress.st_circular_progress() #my_circular_progress.update_value(progress=20) - a=reg_plot([y_train, y_test],[yc, yt], train_idx = train_index, test_idx = test_index) + a = reg_plot([y_train, y_test],[yc, yt], train_idx = train_index, test_idx = test_index) M7.pyplot(a) plt.savefig('./Report/figures/Predictedvs.png') + prep_para = Reg.best_hyperparams_ + prep_para.pop('n_components') + + for i in ['deriv','polyorder']: + if Reg.best_hyperparams_[i] == 0: + prep_para[i] = '0' + elif Reg.best_hyperparams_[i] == 1: + prep_para[i] = '1st' + elif Reg.best_hyperparams_[i] > 1: + prep_para[i] = f"{Reg.best_hyperparams_[i]}nd" residual_plot = resid_plot([y_train, y_test], [yc, yt], train_idx=train_index, test_idx=test_index) M8.pyplot(residual_plot) @@ -352,28 +361,14 @@ if not spectra.empty and not y.empty: ] ) st.page_link('pages\\3-prediction.py', label = 'Keep on keepin\' on to predict your values !') - -## Load .dx file -Ac_Km = ['histo.png', 'Spectre_mod.png','Predictions_V.png','Allinone.png','Predictedvs.png','residual_plot.png'] -with st.container(): - if st.button("Download the report"): - if regression_algo == reg_algo[1]: - 
latex_report = report.report(LoDaSum, 'model',Ac_Km,a_Test,json_sp,model_per,'full_plsr',cv99) - report.compile_latex() - else: - pass - - else: - pass + -if not spectra.empty and not y.empty: +if not spectra.empty and not y.empty and regression_algo: if regression_algo in reg_algo[1:]: fig, (ax1, ax2) = plt.subplots(2,1, figsize = (12, 4), sharex=True) ax1.plot(colnames, np.mean(X_train, axis = 0), color = 'black', label = 'Average spectrum (Raw)') ax2.plot(colnames, np.mean(Reg.pretreated_spectra_ , axis = 0), color = 'black', label = 'Average spectrum (pretreated)') - - ax2.set_xlabel('Wavelenghts') plt.tight_layout() @@ -391,12 +386,28 @@ if not spectra.empty and not y.empty: eval(f'ax{i+1}').axvspan(min, max, color='#00ff00', alpha=0.5, lw=0) if regression_algo == reg_algo[1]: - ax1.scatter(colnames[np.array(Reg.sel_ratio_.index)], np.mean(X_train, axis = 0).ravel()[np.array(Reg.sel_ratio_.index)], + ax1.scatter(colnames[np.array(Reg.sel_ratio_.index)], np.mean(X_train, axis = 0)[np.array(Reg.sel_ratio_.index)], color = 'red', label = 'Important variables') - ax2.scatter(colnames[Reg.sel_ratio_.index], np.mean(Reg.pretreated_spectra_, axis = 0).ravel()[np.array(Reg.sel_ratio_.index)], + ax2.scatter(colnames[Reg.sel_ratio_.index], np.mean(Reg.pretreated_spectra_, axis = 0)[np.array(Reg.sel_ratio_.index)], color = 'red', label = 'Important variables') ax1.legend() ax2.legend() - M2.write('-- Visualization of the spectral regions used for model creation -- ') - M2.pyplot(fig) \ No newline at end of file + M2.write('-- Visualization of the spectral regions used for model creation --') + fig.savefig("./Report/figures/Variable_importance.png") + M2.pyplot(fig) + +## Load .dx file + +with st.container(): + if st.button("Download the report"): + if regression_algo == reg_algo[1]: + latex_report = report.report('Predictive model development', file_name, stats, list(Reg.best_hyperparams_.values()), regression_algo, model_per, cv_results) + report.compile_latex() + if 
regression_algo is None: + st.warning('Data processing has not been performed or finished yet!', icon = "âš ï¸") + else: + pass + + else: + pass -- GitLab