diff --git a/src/Class_Mod/KMEANS_.py b/src/Class_Mod/KMEANS_.py
index 9e69ba5b1d187ecad370ccb5d6783323892db942..8c67f1d8eeccc528d54afe61a88814705c12746a 100644
--- a/src/Class_Mod/KMEANS_.py
+++ b/src/Class_Mod/KMEANS_.py
@@ -39,7 +39,7 @@ class Sk_Kmeans:
             idxidx.append(f'{i+1}_clust')
             values.append((s[i] - s[i+1])*100 / s[i])

-        id = np.max(np.where(np.array(values) > 20))+2
+        id = np.max(np.where(np.array(values) > 10))+2
         return id

     def fit_optimal(self, nclusters):
diff --git a/src/Class_Mod/KennardStone.py b/src/Class_Mod/KennardStone.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e5c5cbe32150e3fd7cf4cdfe860db2766888043
--- /dev/null
+++ b/src/Class_Mod/KennardStone.py
@@ -0,0 +1,26 @@
+from Packages import *
+from typing import Union
+
+class KS:
+    def __init__(self, x: Union[np.ndarray, pd.DataFrame], rset: Union[float, int]):
+        self.x = x
+        self.ratio = rset
+        self._train, self._test = ks.train_test_split(self.x, train_size = self.ratio/100)
+
+    @property
+    def calset(self):
+        clu = self._train.index.tolist()
+
+        return self.x, clu
+
+class RDM:
+    def __init__(self, x: Union[np.ndarray, pd.DataFrame], rset: Union[float, int]):
+        self.x = x
+        self.ratio = rset
+        self._train, self._test = train_test_split(self.x, train_size = self.ratio/100)
+
+    @property
+    def calset(self):
+        clu = self._train.index.tolist()
+
+        return self.x, clu
\ No newline at end of file
diff --git a/src/Class_Mod/RegModels.py b/src/Class_Mod/RegModels.py
index ce07a07e6bf541d8e078dfe12846d96d4868e28a..6e4cc01aa69614f1e85eb20165c68063a440d026 100644
--- a/src/Class_Mod/RegModels.py
+++ b/src/Class_Mod/RegModels.py
@@ -4,7 +4,7 @@ from Class_Mod import metrics, Snv, No_transformation, KF_CV, sel_ratio

 class Regmodel(object):
-    def __init__(self, train, test, n_iter, add_hyperparams = None, nfolds = 5, **kwargs):
+    def __init__(self, train, test, n_iter, add_hyperparams = None, nfolds = 3, **kwargs):
         self.SCORE = 100000000
         self._xc, self._xt, self._ytrain, self._ytest = train[0], test[0], train[1], test[1]
         self._nc, self._nt, self._p = train[0].shape[0], test[0].shape[0], train[0].shape[1]
@@ -152,7 +152,7 @@ class TpeIpls(Regmodel):
         self.n_arrets = self.n_intervall*2

-        r = {'n_components': hp.randint('n_components', 2,20)}
+        r = {'n_components': hp.randint('n_components', 2,10)}
         r.update({f'v{i}': hp.randint(f'v{i}', 0, train[0].shape[1]) for i in range(1,self.n_arrets+1)})

         super().__init__(train, test, n_iter, add_hyperparams = r)
diff --git a/src/Class_Mod/__init__.py b/src/Class_Mod/__init__.py
index 82330cbd9b33dc9a70d5ae8cb47b6fd7b01fdafc..82bee16645d433af12604ea26fdb7dfbeb26457f 100644
--- a/src/Class_Mod/__init__.py
+++ b/src/Class_Mod/__init__.py
@@ -15,4 +15,5 @@ from .SK_PLSR_ import PlsR
 from .PLSR_Preprocess import PlsProcess
 from .NMF_ import Nmf
 from .Ap import AP
-from .RegModels import Plsr, TpeIpls
\ No newline at end of file
+from .RegModels import Plsr, TpeIpls
+from .KennardStone import KS, RDM
\ No newline at end of file
diff --git a/src/Modules.py b/src/Modules.py
index ba9e78454bea329013c893e864d5b1042b72ef1a..f440038abae3c75c9085ec85ff607bf4c7ffd893 100644
--- a/src/Modules.py
+++ b/src/Modules.py
@@ -10,4 +10,4 @@ pages_folder = Path("pages/")
 from style.header import add_header, add_sidebar
 from config.config import pdflatex_path
 local_css(css_file / "style.css")
-
+from Class_Mod import KS, RDM
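Note for reviewers: the new `KS` class above delegates the split to `ks.train_test_split`, which presumably comes from the `kennard-stone` package pulled in via `from Packages import *` (an assumption; the import itself is not visible in this patch). As a reference point, here is a minimal, self-contained sketch of the max-min selection rule that a Kennard-Stone split implements; `kennard_stone_indices` is a hypothetical helper, not part of the patch.

import numpy as np

def kennard_stone_indices(X, n_select):
    """Hypothetical helper: pick n_select rows of X by the Kennard-Stone
    max-min criterion (illustration only, not the project's code)."""
    X = np.asarray(X, dtype=float)
    n = X.shape[0]
    # Pairwise Euclidean distances (O(n^2) memory; fine for a sketch)
    d = np.linalg.norm(X[:, None, :] - X[None, :, :], axis=-1)
    # Seed with the two most mutually distant samples
    i, j = np.unravel_index(np.argmax(d), d.shape)
    selected = [int(i), int(j)]
    remaining = [k for k in range(n) if k not in selected]
    while len(selected) < n_select and remaining:
        # Each candidate's distance to its nearest already-selected sample...
        nearest = d[np.ix_(remaining, selected)].min(axis=1)
        # ...and take the candidate farthest from the selected set
        selected.append(remaining.pop(int(np.argmax(nearest))))
    return selected

# Example: a 20% calibration subset, mirroring the rset=20 default of the KS class
rng = np.random.default_rng(0)
X = rng.normal(size=(50, 3))  # e.g. 50 samples in a 3D score space
print(sorted(kennard_stone_indices(X, n_select=int(0.2 * len(X)))))

Because the first picks are the two most mutually distant samples, the selected subset spans the extremes of the score space, which is presumably why the samples-selection page feeds these classes the reduced scores rather than the raw spectra.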
diff --git a/src/Report/report.py b/src/Report/report.py
index dfbdab5b5a78ac9e5bdb9c84c6c8dfd2d2f3d5b0..4a0b309ae62b78f29e6dafa4fd95146926e648ab 100644
--- a/src/Report/report.py
+++ b/src/Report/report.py
@@ -10,17 +10,21 @@ def intersect(l1, l2):
 def check(file):
     return os.path.isfile(file)

 def report(*args):
-    signal_preprocess = {'Snv':'Standard Normal Variate (SNV)'}
-    dim_red_methods= {'PCA':'Principal Components Analysis (PCA)',
-                      'UMAP':'Uniform Manifold Approximation and Projection (UMAP)',
-                      'NMF':'Non-negative Matrix Factorization (NMF)'} # List of dimensionality reduction algos
-    cluster_methods = {'Kmeans':'Kmeans',
-                       'HDBSCAN':'Hierarchical Density-Based Spatial Clustering of Applications with Noise (HDBSCAN)',
-                       'AP':'Affinity Propagation (AP)'} # List of clustering algos
+    signal_preprocess = {'Snv':'Standard Normal Variate (SNV) \cite{barnes1989standard}',
+                         'SG': 'Savitzky-Golay (SG) \cite{savitzky1964smoothing}'}
+    dim_red_methods= {'PCA':'Principal Components Analysis (PCA) \cite{wold1987principal,ringner2008principal,greenacre2022principal,JMLR:v12:pedregosa11a}',
+                      'UMAP':'Uniform Manifold Approximation and Projection (UMAP) \cite{ghojogh2021uniform,JMLR:v12:pedregosa11a}',
+                      'NMF':'Non-negative Matrix Factorization (NMF) \cite{lopes2015non}'} # List of dimensionality reduction algos
+    cluster_methods = {'Kmeans':'Kmeans \cite{chong2021k,JMLR:v12:pedregosa11a}',
+                       'HDBSCAN':'Hierarchical Density-Based Spatial Clustering of Applications with Noise (HDBSCAN) \cite{mcinnes2017hdbscan}',
+                       'AP':'Affinity Propagation (AP) \cite{dueck2009affinity,JMLR:v12:pedregosa11a}',
+                       'KS':'Kennard-Stone algorithm (KS)',
+                       'RDM': 'random approach'} # List of clustering/sampling algos
+
     selec_strategy = {'center':'PCA','random':'PCA'}
-    reg_algo ={"Full-PLSR":'full Partial Least Squares (PLS)',
-               "Locally Weighted PLSR": 'Locally Weighted Partial Least Squares (LWPLS)',
-               "Interval-PLSR": "Tree-structured Parzen estimator-interval Partial Least Squares (TPE-iPLS)"}
+    reg_algo ={"PLS":'Partial Least Squares (PLS) \cite{Wold2001,JMLR:v12:pedregosa11a}',
+               "LW-PLS": 'Locally Weighted-Partial Least Squares (LW-PLS) \cite{Lesnoff2020}',
+               "TPE-iPLS": "Tree-structured Parzen estimator-interval Partial Least Squares (TPE-iPLS)"}

     to_report=[]
     j=0
@@ -35,26 +39,30 @@ def report(*args):
         globals()[df_name] = arg.select_dtypes(include=['float64', 'int64'])

     latex_report = ""
-    latex_report += r"""\documentclass[a4paper,10pt]{article}
+    latex_report += r"""\documentclass[11pt]{article}
     \usepackage{fancyhdr}
     \usepackage{graphicx}
     \usepackage{geometry}
+    \geometry{a4paper, left=2cm, right=2cm, top=1.5cm, bottom=5cm,
+    headheight=0.05cm, footskip=1.7cm}
+    \usepackage{changepage}
-    \geometry{a4paper, left=2cm, right=2cm, top=1.5cm, bottom=3cm,
-    headheight=0.05cm, footskip=1cm}
-    \usepackage{caption, subcaption}
-    \usepackage{hyperref}
     \usepackage{hyphenat}
     \usepackage{booktabs}
     \usepackage{times}
-    \usepackage{etoolbox,fancyhdr,xcolor}
+    \usepackage{parskip}
+    \usepackage{float}
+    \setlength{\parskip}{\baselineskip} % Example setting
+    \usepackage{cite} % For citing with range compression
+    \usepackage{etoolbox}
+    \usepackage{xcolor}
     \newcommand{\headrulecolor}[1]{\patchcmd{\headrule}{\hrule}{\color{#1}\hrule}{}{}}
     \newcommand{\footrulecolor}[1]{\patchcmd{\footrule}{\hrule}{\color{#1}\hrule}{}{}}
     \renewcommand{\headrulewidth}{1pt}
     \headrulecolor{red!100}%
     \renewcommand{\footrulewidth}{1pt}
     \footrulecolor{red!100}%
-    \graphicspath{{images/}, {Figures/}}
+    \graphicspath{{images/}{Figures/}}
     \fancyhead[R]{\includegraphics[width=0.1\textwidth]{logo_cefe.png}}
     \fancyhead[L]{PACE - NIRS Analysis Report}
     \fancyfoot[L]{Project Name to fill}
@@ -64,7 +72,20 @@ def report(*args):
     \addtolength{\topmargin}{-9.2942pt}
     \pagestyle{fancy}

-    \DeclareCaptionLabelFormat{myfigureformat}{\textbf{Fig. #2.}}
+    % Customize appearance of figure references
+    \usepackage{xcolor} % For defining colors
+    \definecolor{myblue}{RGB}{0,0,128} % RGB values for blue
+
+    \usepackage{hyperref}
+    \hypersetup{colorlinks=true,linkcolor=myblue,citecolor=myblue,urlcolor=myblue}
+    \usepackage{cleveref} % For clever references
+
+
+    \usepackage{subcaption}
+    \usepackage{caption}
+    % Redefine cref formats for figures and tables
+
+    \DeclareCaptionLabelFormat{myfigureformat}{\textbf{Fig. #2}}
     \captionsetup[figure]{
     labelformat=myfigureformat, % Apply the custom format
     justification=centering, % Justify the caption text
@@ -74,11 +95,15 @@ def report(*args):
     \DeclareCaptionLabelFormat{mytableformat}{\textbf{Table #2}}
     \captionsetup[table]{
     labelformat=mytableformat, % Apply the custom format
-    justification=justified, % Justify the caption text
+    justification=centering, % Center the caption text
     singlelinecheck=false, % Allow the caption to occupy multiple lines
     skip=0pt, % Vertical space between caption and table
     position=top % Position the caption at the top of the table
     }
+    \crefformat{figure}{\textcolor{myblue}{Fig.~#2#1#3}}
+    \Crefformat{figure}{\textcolor{myblue}{Fig.~#2#1#3}} % Capitalized version for beginning of sentence
+    \crefformat{table}{\textcolor{myblue}{table~#2#1#3}}
+    \Crefformat{table}{\textcolor{myblue}{Table~#2#1#3}} % Capitalized version for beginning of sentence

     \begin{document}
@@ -90,10 +115,11 @@ def report(*args):
     latex_report += r"""\noindent
     \textbf{QUERY MADE: }{"""+ re.sub(r'([_%])', r'\\\1',to_report[0])+ r"""}.\\
     \noindent
-    \textbf{ENTERED INPUTS: }{"""+ re.sub(r'([_%])', r"\\\1", to_report[1])+ r"""}.\\"""
+    \textbf{INPUT DATA: }{"""+ re.sub(r'([_%])', r"\\\1", to_report[1])+ r"""}.\\"""

     latex_report += r"""\section*{Results}"""
+    latex_report += r"""\subsection*{Spectral data visualization}"""
-    latex_report += r"""Acquired spectra were visualized in fig.\ref{raw_spectra} by plotting the signal of the samples captured in the specific spectral range
+    latex_report += r"""Acquired spectra were visualized in \cref{raw_spectra} by plotting the signal of the samples captured in the specific spectral range.
     This helps observe general patterns and trends in the spectra, and understand the variability within the data.
     \begin{figure}[h]
     \centering
@@ -104,54 +130,107 @@ def report(*args):
     if 'Representative subset selection' in to_report:
         latex_report += r"""\subsection*{Multivariable Data Analysis}"""
-        latex_report += r"""\indent For optimal selection of subset of the samples to analyze through the \cite{Lesnoff2020}
-        reference method, a pipeline consisting of consecutively applying features extraction/dimensionality\cite{BellonMaurel2010,scikit-learn}
-        reduction and clustering analysis was developed. Features extraction was performed by means of {"""+dim_red_methods[to_report[2]] + r"""}
-        technique which helps represent the high dimensional spectra in a reduced perceptible 3D
-        subspace spanned by a few number of features (three features in our case), while clustering analysis was performed
-        using the {"""+cluster_methods[to_report[3]] + r"""} technique which
-        helps group the data into groups of spectra that share the same carachteristics. """
""" + latex_report += r""" Multivariable calibration models have widely been used for quantitative analysis and chemical analysis fields. + Different multivariable modelling techniques are used for calibration models developement, ranging from linear to non linear techniques, and the + performance of models developed using these techniques depends heavily on the overall quality of the data and the degree of representativeness + of calibration set, interchangeably called training set, used for its development, i.e, how much the training set captures the characteristics + and diversity of the entire population or dataset from which it is drawn \cite{li2016strategy}.\par""" + + latex_report += r""" For optimal selection of a reprentative subset of the samples to analyze through the + reference method and use for calibration models development, a pipeline consisting of consecutively applying features extraction (or dimensionality + reduction) and""" + + + if 'KS' in to_report or 'RDM' in to_report: + latex_report += r""" samples subset selection was developed.""" + else: + latex_report += r""" clustering analysis was developed.""" + + latex_report += r""" Features extraction was performed by means of {"""+dim_red_methods[to_report[2]] + r"""} technique that helps + represent the high dimensional spectra in a reduced perceptible 3D subspace spanned by a few number of features (three features in our case), while """ + + + if 'KS' in to_report or 'RDM' in to_report: + latex_report += r""" samples subset selection was performed using the {"""+cluster_methods[to_report[3]] + r"""} technique.\par""" + else: + latex_report += r""" clustering analysis was performed using the {"""+cluster_methods[to_report[3]] + r"""} technique that helps group the data into groups of spectra + that share the same carachteristics.\par""" - latex_report += r"""After applying the pipeline, a subset sampling method, consisting of""" - if 'center' in to_report: - latex_report += r""" selecting {"""+to_report[7]+ r"""} samples, each from a distict cluster, with the least euclidian distance to the center of the cluster identified by {"""+to_report[3]+ r"""} and to which it the sample belongs.""" - if 'random' in to_report: - latex_report += r""" fitting a second clustering model, specifically kmeans, to each individual data cluster and selecting {"""+to_report[7]+ r"""} - samples or less from each subcluster (if a subcluster contains less than {"""+to_report[7]+ r"""} samples, then all samples included - in this subcluster are selected), was applied.\\""" + + if 'KS' not in to_report and not 'RDM' in to_report: + latex_report += r""" After implementing the pipeline, a subset sampling method, consisting of""" + if 'center' in to_report: + latex_report += r""" selecting {"""+to_report[7]+ r"""} samples, each from a distict cluster, with the least euclidian distance to the center of the cluster identified by {"""+to_report[3]+ r"""} and to which it the sample belongs.""" + if 'random' in to_report: + latex_report += r""" fitting a second clustering model, specifically kmeans, to each individual data cluster and selecting {"""+to_report[7]+ r"""} + samples or less from each subcluster (if a subcluster contains less than {"""+to_report[7]+ r"""} samples, then all samples included + in this subcluster are selected), was applied.\par""" + if "PCA" in to_report: - latex_report += r"""\indent To detect the presence of any spectral outliers, the influence and residuals plots were constructed, + latex_report += r"""\indent To detect 
potential spectral outliers, the influence and residuals plots \cite{Mejia2017} were constructed, with outlyingness limits established at the 95\% confidence level. Together, these plots helps distinguish regular observations, which form a homogeneous group near the subspace generated by the PCs; good leverage points, which are at the same plane as the subspace but distant from the ROs; orthogonal observations, which have a large residual distance to the subspace, but whose projection is on the subspace; and, finally, bad leverage - points, which have a large residual distance such that the projection on the subspace is away from regular observations.\\*""" + points, which have a large residual distance such that the projection on the subspace is away from regular observations.\par""" - latex_report += """\indent Results of applying this workflow are displayed in fig. 2. Based of the features extracted using - {"""+to_report[2]+ r"""}, {"""+to_report[3]+ r"""} revealed the existance of {"""+to_report[5] + r"""} - data clusters that are visualized with different colors. - \begin{figure}[h!] + latex_report += """ Results of applying this workflow are displayed in""" + if 'PCA' in to_report: + latex_report += """ (\cref{pcaplots,hotelling_and_influence,loadings}).""" + elif 'NMF' in to_report: + latex_report += """ (\cref{pcaplots,loadings}).""" + else: + latex_report += """ (\cref{pcaplots}).""" + + if 'KS' in to_report or 'RDM' in to_report: + latex_report += """ Based of the features extracted using {"""+to_report[2]+ r"""}, """ + + else: + latex_report += """ Based of the features extracted using {"""+to_report[2]+ r"""}, + {"""+to_report[3]+ r"""} revealed the existance of {"""+to_report[5] + r"""} data clusters, visualized with different colors, from which """ + + latex_report += r"""a subset of {"""+to_report[8]+ r"""} samples was selected""" + if 'KS' in to_report or 'RDM' in to_report: + latex_report += r""", by the {"""+cluster_methods[to_report[3]] + r"""},""" + + latex_report += r""" and extracted to be representative of the whole data set, i.e, to reflect the variation included in the whole data set. 
+ This subset of samples is suggested to be used for a robust NIR calibration developement, + therefore should to be analyzed by adequate reference analytical procedures (generally requiring destructive sample preparation) to collect data for the target variable to be modelled.\par""" + + latex_report += r""" + \begin{figure}[h] \captionsetup{justification=centering} - \centering - \begin{minipage}[b]{0.33\textwidth} + \centering + \begin{minipage}[b]{0.33\textwidth} \includegraphics[width=\linewidth]{scores_pc1_pc2.png} - \end{minipage}% - \begin{minipage}[b]{0.33\textwidth} + \end{minipage}% + \begin{minipage}[b]{0.33\textwidth} \includegraphics[width=\linewidth]{scores_pc1_pc3.png} - \end{minipage}% - \begin{minipage}[b]{0.33\textwidth} + \end{minipage}% + \begin{minipage}[b]{0.33\textwidth} \includegraphics[width=\linewidth]{scores_pc2_pc3.png} - \end{minipage} - \centering - \caption{Illustration of the pairwise projection of spectra onto the reduced 3 dimensional subspace, clustering, and sample selection + \end{minipage} + \centering + \caption{Illustration of the pairwise projection of spectra onto the reduced 3 dimensional subspace, clustering, and sample selection results: data points with the same color belong to the same cluster and data points colored in black correspond to the samples to be analyzed by a standard reference analytical procedure} - \label{pcaplots} + \label{pcaplots} \end{figure} """ - + + if 'PCA' in to_report or 'NMF' in to_report: + latex_report += r""" + \begin{figure}[h!] + \centering + \includegraphics[width=.6\linewidth]{loadings_plot.png} + \caption{Loadings plot} + \label{loadings} + \end{figure} + """ if 'PCA' in to_report: latex_report += r""" + \newpage + \begin{raggedbottom} \begin{figure}[h!] \centering \begin{minipage}[b]{0.33\textwidth} @@ -165,31 +244,60 @@ def report(*args): \caption{Outliers detection plots;(a) and (b) , respectively, correspond to the hotelling and influence plots} \label{hotelling_and_influence} \end{figure} + \end{raggedbottom} """ - latex_report += r"""A subset of {"""+to_report[8]+ r"""} samples were identified and selected to be representative and were suggested to be used for robust NIR calibration developement - , i.e, to be analyzed by adequate reference analytical procedures (generally requiring destructive sample preparation). - """ + + + + + + + + + + + + + + + + + + elif 'Predictive model development' in to_report: latex_report += r"""\paragraph{}To develop a robust NIR calibration that formally correlates the spectral signature of the samples in the NIR region with the corresponding reference data obtained by analyzing the samples using a suitable reference analytical procedure, - a pipeline consisting of consecutively performing spectral signal correction followed by multivariable predictive modelling was applied. - Signal correction was performed by """ + a pipeline consisting of consecutively performing spectral signal preprocessing followed by multivariable predictive modelling was applied. + Signal preprocessing was performed by """ + if 'No_transformation' not in to_report: latex_report += r"""normalizing the raw spectra using {"""+signal_preprocess[to_report[3]]+ r""", then """ if to_report[3] !="No_derivation": latex_report += r"""taking the {"""+to_report[2]+ r"""}-order derivative of a the {"""+to_report[4]+ r"""}-order Savitzky-Golay (SG) polynomial estimated over a moving window of {"""+to_report[5]+ r"""} data points""" - latex_report += r""". 
-        (Val) subsets, the former ,consisting of 80\% of the data, was used for multivarible calibration development while the latter ,consisting of
-        the remaining 20\% of the data, was used for evaluating the predictive and the generalizability performance of the developed calibration."""
-        latex_report += r""" To optimally select hyperparameters of the model and the signal preprocessing methods, prevent that the model overfit the data,
-        and optimize the predictive performance of the model, 5-folds Cross Validation (CV) was performed."""
-        latex_report += r"""\paragraph{} Fig 5, and table 6 display descriptive summary of the input data, trainset, and testset."""
+
+        latex_report += r""". The obtained preprocessed spectra were appropriately matched with the reference values, then the Kennard-Stone (KS) algorithm \cite{ferreira2021kennard} was used
+        to split the dataset into two data subsets (\cref{fig:Histogram} and \cref{table:desc_stats}) for regression modelling: training and testing subsets; the former, consisting of 80\% of the data, was used to
+        develop a {"""+reg_algo[to_report[6]]+ r"""} predictive model, while the latter, consisting of the remaining 20\% of the data, was used to evaluate its
+        predictive and generalizability performance.\par"""
+
+        if any(i in to_report for i in ('PLS', 'TPE-iPLS')):
+            latex_report += r""" The latent variables for the {"""+to_report[6]+ r"""}-based model were estimated using the Non-linear Iterative Partial Least Squares (NIPALS) algorithm, first introduced by
+            the econometrician and statistician Herman Ole Andreas Wold \cite{wold1975path}."""
+
+        latex_report += r""" The performance of the model was evaluated by measuring its scores on a set of agnostic statistical metrics widely used to evaluate NIR calibration models,
+        specifically, the correlation coefficient (r), the coefficient of determination (R2), the Root Mean Squared Error (RMSE), the Mean Absolute Error (MAE), the Ratio of Performance to Deviation (RPD), and the Ratio of
+        Performance to Inter-Quartile distance (RPIQ) \cite{BellonMaurel2010}.\par"""
+        latex_report += r""" To optimize the performance of the calibration, the hyperparameters of the predictive model and the selection of signal preprocessing methods were
+        tuned simultaneously and automatically using the Tree-Structured Parzen Estimator (TPE) as an optimization algorithm. The optimal preprocessing-hyperparameters combination
+        was taken to be the one that minimizes the RMSE of 3-fold Cross-Validation (CV).\par"""
+
+
         latex_report += r"""
-        \begin{figure}[h]
+        \begin{figure}[H]
         \centering
         \includegraphics[width=1\linewidth]{Histogram.png}
         \caption{Kde plot visualizing the distribution of the target variable, a subset of training, and testing sets}
         \label{fig:Histogram}
         \end{figure}
         """ + df0.style.format("${:.2f}$").to_latex(
@@ -197,12 +305,14 @@ def report(*args):
               position_float = 'centering', hrules = True,
               caption = 'Descriptive statistics of the target variable, subsets used to develop and validate the predictive model',
-              label= 'reg_perf') +r""""""
+              label= 'table:desc_stats') + r""""""

-        latex_report += r"""Predictive modelling development was performed using the {"""+reg_algo[to_report[6]]+ r"""} regression method."""
         latex_report += r"""
-        For fig.\ref{fig:CV}
+        \cref{fig:CV} and \cref{table:CV} show the CV results achieved with the best hyperparameters-preprocessing combination found by the optimization algorithm.
+        These results are important for evaluating the bias-variance tradeoff. The best hyperparameters-preprocessing combination was identified
+        and used to create a predictive model that was evaluated for its explanatory (train) and predictive (test) performance (\cref{table:reg_perf}).\par
+
         \begin{figure}[h]
         \captionsetup{justification=centering}
         \centering
         \begin{minipage}[c]{0.5\textwidth}
         \includegraphics[width=\linewidth]{r2_cv.png}
         \end{minipage}%
         \begin{minipage}[c]{0.5\textwidth}
         \includegraphics[width=\linewidth]{meas_vs_pred_cv_all.png}
         \end{minipage}%
-        \caption{ Visualization of measured vs predicted values scatter plot for cross-validation }
-        \label{CV}
-        \end{figure}"""
-
-    if "Full-PLSR" in to_report:
-        latex_report += r"""the most important and influential spectral regions in the model, were visualized in fig.5"""
-    elif "Locally Weighted PLSR" in to_report:
-        """"""
-    elif "Interval-PLSR" in to_report:
-        latex_report += r"""Three intervalls were selected by the TPE-iPLS"""
-
-    latex_report += r"""The calibration, CV, and prediction performance achieved by the developed model was evaluated
-    by measuring its scores on a set of agnostic statistical metrics widely used to evaluate NIR calibration models.
-    specifically, the Root Mean Squared Error (RMSE), the Ratio of Performance to Deviation (RPD), the Ratio of
-    performance to inter-quartile (RPIQ). A table summarizing the model performance is shown bellow(Table. 4).\par""""""
-    """ + df1.style.format("${:.2f}$").to_latex(position_float = 'centering', hrules = True, caption = 'Model performances summary', label= 'reg_perf') + r""""""
+        \caption{ Visualization of measured vs predicted values for cross-validation }
+        \label{fig:CV}
+        \end{figure}
+        """ + df2.style.format("${:.2f}$").to_latex(position_float = 'centering', hrules = True, caption = 'Cross-Validation summary', label= 'table:CV') + r"""
+        """
+        latex_report += df1.style.format("${:.2f}$").to_latex(position_float = 'centering', hrules = True, caption = 'Model performances summary', label= 'table:reg_perf')

-    if "Full-PLSR" in to_report:
-
-        latex_report += r""" To identify the important variables in the model, Variable Importance in Projection (VIP) test applied, and the important variables in the model were
-        visualized in Fig.8 \par
+    if "PLS" in to_report:
+        latex_report += r"""To identify the most important and influential spectral regions in the model, Selectivity ratio (SR) \cite{kvalheim2020variable, farres2015comparison} test was applied, and the important variables in the model were
+        visualized in \cref{fig:importance}. \par
         \begin{figure}[h]
         \centering
         \includegraphics[width=1\linewidth]{Variable_importance.png}
         \caption{Visualizing important spectral regions identified in the PLS model on the raw and preprocessed average spectrum}
-        \label{fig:Histogram}
+        \label{fig:importance}
         \end{figure}
         """
-
-    latex_report += r"""After numerically analyzing the performance of the model, a visual investigation (figs 7 and 8) of goodness of model fit was performed to identify potential
-    issues such as a pattern, that has not been captured by the model, or outliers.\par.
+
+    elif "LW-PLS" in to_report:
+        """"""
+    elif "TPE-iPLS" in to_report:
+        latex_report += r"""
+        Many research papers have shown that interval selection methods, with different numbers of intervals, help reduce noise and model overfitting,
+        increase computational efficiency and the interpretability of results, and maximize the model's predictive accuracy. For the current analysis, the selected spectral
+        intervals or regions that were used for predictive model development are visualized in \cref{fig:importanceipls}. \par
+
+        \begin{figure}[h]
+        \centering
+        \includegraphics[width=1\linewidth]{Variable_importance.png}
+        \caption{Visualizing spectral regions used for TPE-iPLS model development on the raw and preprocessed average spectrum}
+        \label{fig:importanceipls}
+        \end{figure}
+        """
+
+    # latex_report += r"""""" + df1.style.format("${:.2f}$").to_latex(position_float = 'centering', hrules = True, caption = 'Model performances summary', label= 'table:reg_perf') + r""""""
+    latex_report += r""" Following a numerical analysis of the model performance, measured-versus-predicted values \cite{pauwels2019evaluating} and residuals-versus-measured \cite{belloto1985residual} plots (\cref{fig:diagnosis}) were analysed to
+    visually assess the goodness of model fit and to detect potential flaws such as a pattern that the model failed to capture or outliers.\par

     \begin{figure}[h]
     \captionsetup{justification=centering}
     \centering
     \begin{minipage}[c]{0.5\textwidth}
     \includegraphics[width=\linewidth]{measured_vs_predicted.png}
     \end{minipage}%
     \begin{minipage}[c]{0.5\textwidth}
     \includegraphics[width=\linewidth]{residuals_plot.png}
     \end{minipage}%
     \caption{Post-hoc analysis of the developed predictive model; measured vs predicted values (a) and measured vs residuals (b) plots }
-    \label{pcaplots}
+    \label{fig:diagnosis}
     \end{figure}"""

-    latex_report += r"""
+
+    latex_report += r"""
+    \clearpage
+    \pagebreak
+    \newpage
+    \section*{ACKNOWLEDGEMENTS}
+    This tool is provided by the Chemical Analysis Platform for Ecology - Montpellier, France.\\
+    Thanks to Abderrahim DIANE, Mouhcine MAIMOUNI, Alexandre GRANIER, Remy BEUGNON, Vincent NEGRE and Nicolas BARTHES.\\
+    Source code available at \href{https://src.koda.cnrs.fr/cefe/pace/nirs_workflow}{CNRS forge}.
+
+    \fontsize{8}{9}\selectfont
-    \bibliographystyle{apalike}
+    \bibliographystyle{IEEEtran}
     % \bibliographystyle{abbrv}
     \bibliography{refs.bib}
     \clearpage
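Note for reviewers: the new report text attributes the joint preprocessing/hyperparameter search to the Tree-Structured Parzen Estimator. The project already depends on hyperopt (see `hp.randint('n_components', 2,10)` in RegModels.py above), so the optimization loop presumably looks like the sketch below; the quadratic objective is a toy stand-in for the real RMSECV computation, not the project's code, and it assumes a hyperopt version where `hp.randint` accepts a (low, high) range, as the patch itself does.

from hyperopt import fmin, tpe, hp

def objective(params):
    # Stand-in for the real objective: RMSE of k-fold CV for a model
    # built with params; this toy function is minimized at n_components = 7.
    return (params['n_components'] - 7) ** 2

best = fmin(fn = objective,
            space = {'n_components': hp.randint('n_components', 2, 10)},
            algo = tpe.suggest,   # Tree-structured Parzen Estimator
            max_evals = 30)
print(best)  # e.g. {'n_components': 7}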
diff --git a/src/app.py b/src/app.py
index cc8f226ce4197e976a0f9ac28544e70fb3e25cf0..f7ae082d532b038faa680874f503df23b29cd538 100644
--- a/src/app.py
+++ b/src/app.py
@@ -5,23 +5,90 @@ from Modules import *
 from Class_Mod.DATA_HANDLING import *

-add_header()
+# page_element="""
+# <style>
+# [data-testid="stAppViewContainer"]{
+#   background-image: url("https://www.cefe.cnrs.fr/templates/rt_zephyr/images/backgrounds/img-sky.jpg");
+#   background-size: cover;
+# }
+# </style>
+# """
+# st.markdown(page_element, unsafe_allow_html=True)

+add_header()
 add_sidebar(pages_folder)
+
+st.markdown(
+    """
+    <style>
+    [data-testid="stAppViewContainer"]{
+    background-image: url("https://www.cefe.cnrs.fr/templates/rt_zephyr/images/backgrounds/img-sky.jpg");
+    background-size: cover;
+    }
+    .header1 { color: black; }
+    .green { color: green; }
+    .centered-text {
+        text-align: center;
+        color: black;}
+    .header1 { color: black;font-size: 70px;font-family: monospace; }
+    .header2 { color: rgb(74,165,41); }
+    .header3 { color: green; }
+    .blackfont {color: black;}
+
+    button {
+        height: auto;border-color:black;
+        width: 40px;
+        padding-top: 10px !important;
+        padding-bottom: 10px !important;}
+    </style>
+    """,
+    unsafe_allow_html=True
+)
+
+from PIL import Image
+
+
 # Page header
 with st.container():
     st.subheader("Plateforme d'Analyses Chimiques pour l'Ecologie-PACE :goat:")
-    st.title("NIRS Utils")
-    st.markdown("#### This app allows users to perform visualization, pre-treatments, samples selection & predictions on their NIRS spectra.")
+    # st.markdown("#### Welcome to")
+    st.markdown("""
+    <h1 class="header1">Easily process <br> your spectral data<br> with <span class="green">NIRS Utils</span></h1>
+    """, unsafe_allow_html=True)
+
+for i in range(5):
+    mm1s, mmd2=st.columns([2,2])
+image = Image.open("C:/Users/diane/Desktop/nirs_workflow/src/images/general.jpg")
+new_image = image.resize((700, 400))
+mmd2.image(new_image)
+
+mm1,mm, mm2=st.columns([2,1.5,2])
+with mm1:
+    # st.title("welcome to NIRS Utils")
+    st.markdown('<h2 class="centered-text">About</h2>', unsafe_allow_html=True)
+
+    st.markdown('<h3 class="centered-text"> NIRS Utils is a powerful tool developed to ease spectral data processing. It benefits from the synergy between web and data science frameworks to offer a user-friendly interface featuring a variety of analytical capabilities. Further information can be found here.</h3>', unsafe_allow_html=True)
+    #for easing the spectral data processing.
+    # st.markdown("We could add documentation here")
+    # st.write("Samples selection (PCA, [UMAP](https://umap-learn.readthedocs.io/en/latest/how_umap_works.html), ...), Predictive Modelling ([Pinard](https://github.com/GBeurier/pinard), [LWPLSR](https://doi.org/10.1002/cem.3209), ...), and Predictions using your data (CSV or DX files) and/or PACE NIRS Database.")
+
+with mm2:
+    # st.title("welcome to NIRS Utils")
+    st.markdown('<h2 class="centered-text">Key Features</h2>', unsafe_allow_html=True)
+    st.markdown('<h3 class="centered-text"> Our app features a variety of analytical capabilities that make it an optimal solution for spectral data processing: <br> - Easy to use. <br> - Built on advanced frameworks. <br> - Enhanced automation capabilities. <br> - Saves you time and effort.</h3>', unsafe_allow_html=True)
+
+
+
+for i in range(3):
     header1, header2, header3,header4 = st.columns(4)
-    if header1.button("Inputs"):
-        st.switch_page(pages_folder / '4-inputs.py')
-    if header2.button("Samples Selection"):
-        st.switch_page(pages_folder / '1-samples_selection.py')
-    if header3.button("Models Creation"):
-        st.switch_page(pages_folder / '2-model_creation.py')
-    if header4.button("Predictions"):
-        st.switch_page(pages_folder / '3-prediction.py')
-    st.markdown("We could add documentation here")
-    st.write("Samples selection (PCA, [UMAP](https://umap-learn.readthedocs.io/en/latest/how_umap_works.html), ...), Predictive Modelling ([Pinard](https://github.com/GBeurier/pinard), [LWPLSR](https://doi.org/10.1002/cem.3209), ...), and Predictions using your data (CSV or DX files) and/or PACE NIRS Database.")
+
+header1, header2, header3,header4 = st.columns(4)
+if header1.button("Inputs"):
+    st.switch_page(pages_folder / '4-inputs.py')
+if header2.button("Samples Selection"):
+    st.switch_page(pages_folder / '1-samples_selection.py')
+if header3.button("Models Creation"):
+    st.switch_page(pages_folder / '2-model_creation.py')
+if header4.button("Predictions"):
+    st.switch_page(pages_folder / '3-prediction.py')
diff --git a/src/config/config.py b/src/config/config.py
index 4aaa13569b9389fee298accd511a8d07357d7e40..d143aa7c810a12adbc1a16dff1a66360e22fd16c 100644
--- a/src/config/config.py
+++ b/src/config/config.py
@@ -2,5 +2,5 @@ from pathlib import Path

 # pdflatex_path = Path("C:/Users/maimouni/AppData/Local/Programs/MiKTeX/miktex/bin/x64/")
-pdflatex_path = Path("C:/Donnees/Logiciels/Papps/PortableApps/Notepad++Portable/LaTEX/texmfs/install/miktex/bin/")
-# pdflatex_path = Path("C:/Users/diane/AppData/Local/Programs/MiKTeX/miktex/bin/x64/")
+# pdflatex_path = Path("C:/Donnees/Logiciels/Papps/PortableApps/Notepad++Portable/LaTEX/texmfs/install/miktex/bin/")
+pdflatex_path = Path("C:/Users/diane/AppData/Local/Programs/MiKTeX/miktex/bin/x64/")
diff --git a/src/images/graphical_abstract.jpg b/src/images/graphical_abstract.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..c0b8d09f230d8f6ca36da81cbd59f633b2bb1aee
Binary files /dev/null and b/src/images/graphical_abstract.jpg differ
diff --git a/src/images/wp9684463-data-analytics-wallpapers.jpg b/src/images/wp9684463-data-analytics-wallpapers.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..6ca06796059f1eb207a3b214537fc1b27b0f03c3
Binary files /dev/null and b/src/images/wp9684463-data-analytics-wallpapers.jpg differ
diff --git a/src/pages/1-samples_selection.py b/src/pages/1-samples_selection.py
index 60e2de22c3a4ec06709230aed96b8e1fb564eee1..af79beaef0976b945fdb12b8f039842dfaf32390 100644
--- a/src/pages/1-samples_selection.py
+++ b/src/pages/1-samples_selection.py
@@ -21,7 +21,7 @@ add_sidebar(pages_folder)

 # algorithms available in our app
 dim_red_methods=['', 'PCA','UMAP', 'NMF'] # List of dimensionality reduction algos
-cluster_methods = ['', 'Kmeans','HDBSCAN', 'AP'] # List of clustering algos
+cluster_methods = ['', 'Kmeans','HDBSCAN', 'AP', 'KS', 'RDM'] # List of clustering/sampling algos
 selec_strategy = ['center','random']

 if st.session_state["interface"] == 'simple':
@@ -46,9 +46,9 @@ if st.session_state["interface"] == 'advanced':
     default_sample_selection_option = 0

 ################################### I - Data Loading and Visualization ########################################
-st.header("I - Spectral Data Visualization", divider='blue')
+st.title("Calibration Subset Selection") col2, col1 = st.columns([3, 1]) - +col2.image("C:/Users/diane/Desktop/nirs_workflow/src/images/graphical_abstract.jpg", use_column_width=True) ## Preallocation of data structure spectra = pd.DataFrame() meta_data = pd.DataFrame() @@ -66,8 +66,12 @@ selection = None selection_number = None # loader for datafile -data_file = col1.file_uploader("Load NIRS Data", type=["csv","dx"], help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5) -if data_file: +data_file = col1.file_uploader("Data file", type=["csv","dx"], help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5) + + +if not data_file: + col1.warning('⚠️ Please load data file !') +else: # Retrieve the extension of the file test = data_file.name[data_file.name.find('.'):] ## Load .csv file @@ -101,12 +105,15 @@ if data_file: ## Visualize spectra +st.header("I - Spectral Data Visualization", divider='blue') if not spectra.empty: + n_samples = spectra.shape[0] + nwl = spectra.shape[1] # retrieve columns name and rows name of spectra colnames = list(spectra.columns) rownames = [str(i) for i in list(spectra.index)] spectra.index = rownames - + col2, col1 = st.columns([3, 1]) with col2: fig, ax = plt.subplots(figsize = (30,7)) if test =='.dx': @@ -125,43 +132,49 @@ if not spectra.empty: plt.tight_layout() st.pyplot(fig) + # update lines size + for line in ax.get_lines(): + line.set_linewidth(0.8) # Set the desired line width here + # Update the size of plot axis for exprotation to report l, w = fig.get_size_inches() fig.set_size_inches(8, 3) for label in (ax.get_xticklabels()+ax.get_yticklabels()): - ax.xaxis.label.set_size(10) - ax.yaxis.label.set_size(10) + ax.xaxis.label.set_size(9.5) + ax.yaxis.label.set_size(9.5) plt.tight_layout() fig.savefig("./Report/figures/spectra_plot.png", dpi=400) ## Export report fig.set_size_inches(l, w)# reset the plot size to its original size data_info = pd.DataFrame({'Name': [data_file.name], - 'Number of scanned samples': [spectra.shape[0]]}, + 'Number of scanned samples': [n_samples]}, index = ['Input file']) + with col1: + st.info('Information on the loaded data file') st.write(data_info) ## table showing the number of samples in the data file ############################## Exploratory data analysis ############################### st.header("II - Exploratory Data Analysis-Multivariable Data Analysis", divider='blue') -scores, loadings, pc = st.columns([2, 3, 0.5]) -influence, hotelling, qexp = st.columns([2, 2, 1]) -st.header('III - Selected samples for chemical analysis', divider='blue') ###### 1- Dimensionality reduction ###### t = pd.DataFrame # scores p = pd.DataFrame # loadings if not spectra.empty: - dim_red_method = pc.selectbox("Dimensionality reduction techniques: ", options = dim_red_methods, index = default_reduction_option, key = 37) - clus_method = pc.selectbox("Clustering techniques: ", options = cluster_methods, index = default_clustering_option, key = 38) + bb1, bb2, bb3, bb4, bb5, bb6, bb7 = st.columns([1,1,0.6,0.6,0.6,1.5,1.5]) + dim_red_method = bb1.selectbox("Dimensionality reduction techniques: ", options = dim_red_methods, index = default_reduction_option, key = 37) + clus_method = bb2.selectbox("Clustering/sampling techniques: ", options = cluster_methods, index = default_clustering_option, key = 38) xc = standardize(spectra, center=True, scale=False) - if dim_red_method == dim_red_methods[1]: + if dim_red_method == dim_red_methods[0]: + bb1.warning('⚠️ Please choose an 
algothithm !') + elif dim_red_method == dim_red_methods[1]: dr_model = LinearPCA(xc, Ncomp=8) elif dim_red_method == dim_red_methods[2]: if not meta_data.empty: filter = md_df_st_.columns filter = filter.insert(0, 'Nothing') - col = pc.selectbox('Supervised UMAP by:', options= filter, key=108) + col = bb1.selectbox('Supervised UMAP by:', options= filter, key=108) if col == 'Nothing': supervised = None else: @@ -174,28 +187,37 @@ if not spectra.empty: dr_model = Nmf(spectra, Ncomp= 3) if dr_model: - axis1 = pc.selectbox("x-axis", options = dr_model.scores_.columns, index=0) - axis2 = pc.selectbox("y-axis", options = dr_model.scores_.columns, index=1) - axis3 = pc.selectbox("z-axis", options = dr_model.scores_.columns, index=2) + axis1 = bb3.selectbox("x-axis", options = dr_model.scores_.columns, index=0) + axis2 = bb4.selectbox("y-axis", options = dr_model.scores_.columns, index=1) + axis3 = bb5.selectbox("z-axis", options = dr_model.scores_.columns, index=2) t = pd.concat([dr_model.scores_.loc[:,axis1], dr_model.scores_.loc[:,axis2], dr_model.scores_.loc[:,axis3]], axis = 1) ###### II - clustering ####### + if not t.empty: + if dim_red_method == 'UMAP': + scores = st.container() + else: + scores, loadings= st.columns([3,3]) + tcr = standardize(t) # Clustering # 1- K-MEANS Clustering + if clus_method == cluster_methods[0]: + bb2.warning('⚠️ Please choose an algothithm !') + if clus_method == cluster_methods[1]: cl_model = Sk_Kmeans(tcr, max_clusters = 25) ncluster = scores.number_input(min_value=2, max_value=25, value=cl_model.suggested_n_clusters_, label = 'Select the desired number of clusters') - fig2 = px.bar(cl_model.inertia_.T, y = 'inertia') - scores.write(f"Suggested n_clusters : {cl_model.suggested_n_clusters_}") - scores.plotly_chart(fig2,use_container_width=True) - img = pio.to_image(fig2, format="png") - with open("./Report/figures/Elbow.png", "wb") as f: - f.write(img) + # fig2 = px.bar(cl_model.inertia_.T, y = 'inertia') + # scores.write(f"Suggested n_clusters : {cl_model.suggested_n_clusters_}") + # scores.plotly_chart(fig2,use_container_width=True) + # img = pio.to_image(fig2, format="png") + # with open("./Report/figures/Elbow.png", "wb") as f: + # f.write(img) data, labels, clu_centers = cl_model.fit_optimal(nclusters = ncluster) # 2- HDBSCAN clustering @@ -211,14 +233,30 @@ if not t.empty: cl_model = AP(X = tcr) data, labels, clu_centers = cl_model.fit_optimal_ ncluster = len(clu_centers) + + elif clus_method == cluster_methods[4]: + rset = scores.number_input(min_value=0, max_value=100, value=20, label = 'The ratio of data to be sampled (%)') + cl_model = KS(x = tcr, rset = rset) + calset = cl_model.calset + labels = ["ind"]*n_samples + ncluster = "1" + selection_number = 'None' + + elif clus_method == cluster_methods[5]: + rset = scores.number_input(min_value=0, max_value=100, value=20, label = 'The ratio of data to be sampled (%)') + cl_model = RDM(x = tcr, rset = rset) + calset = cl_model.calset + labels = ["ind"]*n_samples + ncluster = "1" + selection_number = 'None' if clus_method == cluster_methods[2]: #clustered = np.where(np.array(labels) != 'Non clustered')[0] - clustered = np.arange(tcr.shape[0]) + clustered = np.arange(n_samples) non_clustered = np.where(np.array(labels) == 'Non clustered')[0] else: - clustered = np.arange(tcr.shape[0]) + clustered = np.arange(n_samples) non_clustered = None new_tcr = tcr.iloc[clustered,:] @@ -229,72 +267,44 @@ samples_df_chem = pd.DataFrame selected_samples = [] selected_samples_idx = [] - -if labels: +if not labels: + 
custom_color_palette = px.colors.qualitative.Plotly[:1] +elif labels: num_clusters = len(np.unique(labels)) custom_color_palette = px.colors.qualitative.Plotly[:num_clusters] if clus_method: - selection = scores.radio('Select samples selection strategy:', - options = selec_strategy, index = default_sample_selection_option, key=102) - # Strategy 0 - if selection == selec_strategy[0]: - # list samples at clusters centers - Use sklearn.metrics.pairwise_distances_argmin if you want more than 1 sample per cluster - closest, _ = pairwise_distances_argmin_min(clu_centers, new_tcr) - selected_samples_idx = np.array(new_tcr.index)[list(closest)] - selected_samples_idx = selected_samples_idx.tolist() - - #### Strategy 1 - elif selection == selec_strategy[1]: - selection_number = scores.number_input('How many samples per cluster?', - min_value = 1, step=1, value = 3) - s = np.array(labels)[np.where(np.array(labels) !='Non clustered')[0]] - for i in np.unique(s): - C = np.where(np.array(labels) == i)[0] - if C.shape[0] >= selection_number: - # scores.write(list(tcr.index)[labels== i]) - km2 = KMeans(n_clusters = selection_number) - km2.fit(tcr.iloc[C,:]) - clos, _ = pairwise_distances_argmin_min(km2.cluster_centers_, tcr.iloc[C,:]) - selected_samples_idx.extend(tcr.iloc[C,:].iloc[list(clos)].index) - else: - selected_samples_idx.extend(new_tcr.iloc[C,:].index.to_list()) - # list indexes of selected samples for colored plot - - if selected_samples_idx: - if meta_data.empty: - sam1 = pd.DataFrame({'name': spectra.index[clustered][selected_samples_idx], - 'cluster':np.array(labels)[clustered][selected_samples_idx]}, - index = selected_samples_idx) + if clus_method == cluster_methods[4] or clus_method == cluster_methods[5]: + selected_samples_idx = calset[1] + selection = 'None' else: - sam1 = meta_data.iloc[clustered,:].iloc[selected_samples_idx,:] - sam1.insert(loc=0, column='index', value=selected_samples_idx) - sam1.insert(loc=1, column='cluster', value=np.array(labels)[selected_samples_idx]) - sam1.index = np.arange(len(selected_samples_idx))+1 - st.write(f' - The total number of samples: {tcr.shape[0]}.\n- The number of selected samples for chemical analysis: {sam1.shape[0]} - {round(sam1.shape[0]/tcr.shape[0]*100, 1)}%.') - sam = sam1 - if clus_method == cluster_methods[2]: - unclus = st.checkbox("Include non clustered samples (for HDBSCAN clustering)", value=True) + selection = scores.radio('Select samples selection strategy:', + options = selec_strategy, index = default_sample_selection_option, key=102) + # Strategy 0 + if selection == selec_strategy[0]: + # list samples at clusters centers - Use sklearn.metrics.pairwise_distances_argmin if you want more than 1 sample per cluster + closest, _ = pairwise_distances_argmin_min(clu_centers, new_tcr) + selected_samples_idx = np.array(new_tcr.index)[list(closest)] + selected_samples_idx = selected_samples_idx.tolist() + + #### Strategy 1 + elif selection == selec_strategy[1]: + selection_number = scores.number_input('How many samples per cluster?', + min_value = 1, step=1, value = 3) + s = np.array(labels)[np.where(np.array(labels) !='Non clustered')[0]] + for i in np.unique(s): + C = np.where(np.array(labels) == i)[0] + if C.shape[0] >= selection_number: + # scores.write(list(tcr.index)[labels== i]) + km2 = KMeans(n_clusters = selection_number) + km2.fit(tcr.iloc[C,:]) + clos, _ = pairwise_distances_argmin_min(km2.cluster_centers_, tcr.iloc[C,:]) + selected_samples_idx.extend(tcr.iloc[C,:].iloc[list(clos)].index) + else: + 
selected_samples_idx.extend(new_tcr.iloc[C,:].index.to_list()) + # list indexes of selected samples for colored plot - if clus_method == cluster_methods[2]: - if selected_samples_idx: - if unclus: - if meta_data.empty: - sam2 = pd.DataFrame({'name': spectra.index[non_clustered], - 'cluster':['Non clustered']*len(spectra.index[non_clustered])}, - index = spectra.index[non_clustered]) - else : - sam2 = meta_data.iloc[non_clustered,:] - sam2.insert(loc=0, column='index', value= spectra.index[non_clustered]) - sam2.insert(loc=1, column='cluster', value=['Non clustered']*len(spectra.index[non_clustered])) - - sam = pd.concat([sam1, sam2], axis = 0) - sam.index = np.arange(sam.shape[0])+1 - st.write(f' The number of Non-clustered samples is {sam2.shape[0]} samples. Total selected samples: {sam1.shape[0] + sam2.shape[0]} - {round((sam1.shape[0] + sam2.shape[0]) / tcr.shape[0] * 100, 1)}%.') - else: - sam = sam1 - st.write(sam) ################################ Plots visualization ############################################ - + ## Scores if not t.empty: @@ -412,12 +422,13 @@ if not spectra.empty: f.write(img) ############################################################################################################# if dim_red_method == dim_red_methods[1]: + influence, hotelling = st.columns([3, 3]) with influence: st.write('Influence plot') # Laverage Hat = t.to_numpy() @ np.linalg.inv(np.transpose(t.to_numpy()) @ t.to_numpy()) @ np.transpose(t.to_numpy()) leverage = np.diag(Hat) / np.trace(Hat) - tresh3 = 2 * t.shape[1]/t.shape[0] + tresh3 = 2 * tcr.shape[1]/n_samples # Loadings p = pd.concat([dr_model.loadings_.loc[:,axis1], dr_model.loadings_.loc[:,axis2], dr_model.loadings_.loc[:,axis3]], axis = 1) # Matrix reconstruction @@ -429,7 +440,7 @@ if not spectra.empty: # color with metadata if not meta_data.empty and clus_method: if col == "None": - l1 = ["Samples"]* t.shape[0] + l1 = ["Samples"]* n_samples elif col == clus_method: l1 = labels @@ -441,7 +452,7 @@ if not spectra.empty: l1 = labels elif meta_data.empty and not clus_method: - l1 = ["Samples"]* t.shape[0] + l1 = ["Samples"]* n_samples elif not meta_data.empty and not clus_method: l1 = list(map(str.lower,md_df_st_[col])) @@ -455,7 +466,7 @@ if not spectra.empty: out3 = leverage > tresh3 out4 = residuals > tresh4 - for i in range(t.shape[0]): + for i in range(n_samples): if out3[i]: if not meta_data.empty: ann = meta_data.loc[:,'name'][i] @@ -477,7 +488,8 @@ if not spectra.empty: fig.add_annotation(text= '(a)', align='center', showarrow= False, xref='paper', yref='paper', x=-0.125, y= 1, font= dict(color= "black", size= 35), bgcolor ='white', borderpad= 2, bordercolor= 'black', borderwidth= 3) fig.write_image('./Report/figures/influence_plot.png', engine = 'kaleido') - + + with hotelling: st.write('T²-Hotelling vs Q-residuals plot') # Hotelling @@ -485,9 +497,8 @@ if not spectra.empty: # Q residuals: Q residuals represent the magnitude of the variation remaining in each sample after projection through the model residuals = np.diag(np.subtract(xc.to_numpy(), xp)@ np.subtract(xc.to_numpy(), xp).T) - I = t.shape[0] - fcri = sc.stats.f.isf(0.05, 3, I) - tresh0 = (3 * (I ** 2 - 1) * fcri) / (I * (I - 3)) + fcri = sc.stats.f.isf(0.05, 3, n_samples) + tresh0 = (3 * (n_samples ** 2 - 1) * fcri) / (n_samples * (n_samples - 3)) tresh1 = sc.stats.chi2.ppf(0.05, df = 3) fig = px.scatter(t, x = hotelling, y = residuals, color=labels if list(labels) else None, @@ -500,7 +511,7 @@ if not spectra.empty: out1 = residuals > tresh1 - for i in 
range(t.shape[0]): + for i in range(n_samples): if out0[i]: if not meta_data.empty: ann = meta_data.loc[:,'name'][i] @@ -522,12 +533,49 @@ if not spectra.empty: font= dict(color= "black", size= 35), bgcolor ='white', borderpad= 2, bordercolor= 'black', borderwidth= 3) fig.write_image("./Report/figures/hotelling_plot.png", format="png") +st.header('III - Selected Samples for Reference Analysis', divider='blue') +if labels: + sel, info = st.columns([3, 1]) + sel.write("Tabular identifiers of selected samples for reference analysis:") + if selected_samples_idx: + if meta_data.empty: + sam1 = pd.DataFrame({'name': spectra.index[clustered][selected_samples_idx], + 'cluster':np.array(labels)[clustered][selected_samples_idx]}, + index = selected_samples_idx) + else: + sam1 = meta_data.iloc[clustered,:].iloc[selected_samples_idx,:] + sam1.insert(loc=0, column='index', value=selected_samples_idx) + sam1.insert(loc=1, column='cluster', value=np.array(labels)[selected_samples_idx]) + sam1.index = np.arange(len(selected_samples_idx))+1 + info.info(f'Information !\n - The total number of samples: {n_samples}.\n- The number of samples selected for reference analysis: {sam1.shape[0]}.\n - The proportion of samples selected for reference analysis: {round(sam1.shape[0]/n_samples*100)}%.') + sam = sam1 + if clus_method == cluster_methods[2]: + unclus = sel.checkbox("Include non clustered samples (for HDBSCAN clustering)", value=True) + + if clus_method == cluster_methods[2]: + if selected_samples_idx: + if unclus: + if meta_data.empty: + sam2 = pd.DataFrame({'name': spectra.index[non_clustered], + 'cluster':['Non clustered']*len(spectra.index[non_clustered])}, + index = spectra.index[non_clustered]) + else : + sam2 = meta_data.iloc[non_clustered,:] + sam2.insert(loc=0, column='index', value= spectra.index[non_clustered]) + sam2.insert(loc=1, column='cluster', value=['Non clustered']*len(spectra.index[non_clustered])) + + sam = pd.concat([sam1, sam2], axis = 0) + sam.index = np.arange(sam.shape[0])+1 + info.write(f' The number of Non-clustered samples is {sam2.shape[0]} samples. 
Total selected samples: {sam1.shape[0] + sam2.shape[0]} - {round((sam1.shape[0] + sam2.shape[0]) / n_samples * 100, 1)}%.') + else: + sam = sam1 + sel.write(sam) -Nb_ech = str(tcr.shape[0]) -nb_clu = str(sam1.shape[0]) # figs_list = os.listdir("./Report/figures") if data_file: + Nb_ech = str(n_samples) + nb_clu = str(sam1.shape[0]) with st.container(): if st.button("Download report"): latex_report = report.report('Representative subset selection', data_file.name, dim_red_method, clus_method, Nb_ech, ncluster, selection, selection_number, nb_clu,tcr, sam) diff --git a/src/pages/2-model_creation.py b/src/pages/2-model_creation.py index 84c0439c6ff038f68020c0896baff4571d432707..01890edfc0c5fedd345e1ef0f809d700ff188f93 100644 --- a/src/pages/2-model_creation.py +++ b/src/pages/2-model_creation.py @@ -22,20 +22,14 @@ local_css(css_file / "style_model.css") ####################################### page Design ####################################### st.title("Calibration Model Development") st.markdown("Create a predictive model, then use it for predicting your target variable (chemical data) from NIRS spectra") -st.header("I - Data visualization", divider='blue') M0, M00 = st.columns([1, .4]) -st.header("II - Model creation", divider='blue') -M1, M2 = st.columns([2 ,4]) -st.header("Cross-Validation results") -cv1, cv2 = st.columns([2,2]) +M0.image("C:/Users/diane/Desktop/nirs_workflow/src/images/graphical_abstract.jpg", use_column_width=True) +# st.header("II - Model creation", divider='blue') +# st.header("Cross-Validation results") +# cv1, cv2 = st.columns([2,2]) cv3 = st.container() -st.header("III - Model Diagnosis", divider='blue') -M7, M8 = st.columns([2,2]) -M7.write('Predicted vs Measured values') -M8.write('Residuals plot') -M9 = st.container() -M9.write("-- Save the model --") + ############################################################################################## @@ -84,8 +78,8 @@ if file == files_format[0]: spectra = pd.DataFrame(spectra).astype(float) - if not meta_data.empty : - st.write(meta_data) + # if not meta_data.empty : + # st.write(meta_data) if spectra.shape[0] != y.shape[0]: M00.warning('X and Y have different sample size') @@ -117,7 +111,9 @@ elif file == files_format[1]: os.unlink(tmp_path) ### split the data +st.header("I - Data visualization", divider='blue') if not spectra.empty and not y.empty: + M0, M000 = st.columns([1, .4]) if np.array(spectra.columns).dtype.kind in ['i','f']: colnames = spectra.columns else: @@ -153,21 +149,36 @@ if not spectra.empty and not y.empty: fig.savefig("./Report/figures/Histogram.png") - M0.write('Loaded data summary') - M0.write(pd.DataFrame([desc_stats(y_train),desc_stats(y_test),desc_stats(y)], index =['train', 'test', 'total'] ).round(2)) + M000.write('Loaded data summary') + M000.write(pd.DataFrame([desc_stats(y_train),desc_stats(y_test),desc_stats(y)], index =['train', 'test', 'total'] ).round(2)) stats=pd.DataFrame([desc_stats(y_train),desc_stats(y_test),desc_stats(y)], index =['train', 'test', 'total'] ).round(2) ####################################### Insight into the loaded data ####################################### Model creation ################################################### - reg_algo = ["","Full-PLSR", "Locally Weighted PLSR", "Interval-PLSR"] - regression_algo = M1.selectbox("Choose the algorithm for regression", options= reg_algo, key = 12, placeholder ="Choose an option") +st.header("II - Model creation", divider='blue') +if not spectra.empty and not y.empty: + M10, M20, M30, M40, M50 = 
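Note for reviewers: the influence/Hotelling hunks above keep three outlyingness limits. A compact restatement of what they compute, using the same scipy calls as the page (3 retained components; the sample count here is a toy value):

import numpy as np
from scipy import stats

n, a = 100, 3                                              # toy sample count; 3 components as in the page
fcri = stats.f.isf(0.05, a, n)                             # F critical value, mirroring the page's f.isf call
tresh_hotelling = (a * (n**2 - 1) * fcri) / (n * (n - a))  # Hotelling T-squared limit
tresh_leverage = 2 * a / n                                 # 2p/n rule of thumb for the influence plot
tresh_q = stats.chi2.ppf(0.05, df = a)                     # chi-square limit for Q residuals, as in the page
print(tresh_hotelling, tresh_leverage, tresh_q)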
st.columns([1,1,1,1,1]) + modes = ['regression', 'classification'] + mode =M10.radio("Supervised modelling mode", options=modes) + if mode == 'regression': + reg_algo = ["","PLS", "LW-PLS", "TPE-iPLS"] + regression_algo = M20.selectbox("Choose the regression algorithm", options= reg_algo, key = 12, placeholder ="Choose an option") + + elif mode == 'classification': + reg_algo = ["","PLS", "LW-PLS", "TPE-iPLS"] + regression_algo = M20.selectbox("Choose the classification algorithm", options= reg_algo, key = 12, placeholder ="Choose an option") + + + # split train data into nb_folds for cross_validation nb_folds = 3 folds = KF_CV.CV(X_train, y_train, nb_folds) if not regression_algo: - M1.warning('Choose a modelling algorithm from the dropdown list !') + M20.warning('Choose a modelling algorithm from the dropdown list !') + else: + M1, M2 = st.columns([2 ,4]) if regression_algo == reg_algo[1]: # Train model with model function from application_functions.py Reg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter=1) @@ -266,23 +277,24 @@ if not spectra.empty and not y.empty: elif regression_algo == reg_algo[3]: - s = M1.number_input(label='Enter the maximum number of intervals', min_value=1, max_value=6, value=3) - it = M1.number_input(label='Enter the number of iterations', min_value=2, max_value=10, value=3) + s = M20.number_input(label='Enter the maximum number of intervals', min_value=1, max_value=6, value=3) + it = M20.number_input(label='Enter the number of iterations', min_value=1, max_value=3, value=2) progress_text = "The model is being created. Please wait." Reg = TpeIpls(train = [X_train, y_train], test=[X_test, y_test], n_intervall = s, n_iter=it) pro = M1.progress(0, text="The model is being created. Please wait!") pro.empty() - M1.progress(100, text = "The model has successfully been created!") + M20.progress(100, text = "The model has successfully been created!") time.sleep(1) reg_model = Reg.model_ - M2.write('-- Important Spectral regions used for model creation --') + intervalls = Reg.selected_features_.T intervalls_with_cols = Reg.selected_features_.T for i in range(intervalls.shape[0]): for j in range(intervalls.shape[1]): intervalls_with_cols.iloc[i,j] = spectra.columns[intervalls.iloc[i,j]] + M2.write('-- Important Spectral regions used for model creation --') M2.table(intervalls_with_cols) # elif regression_algo == reg_algo[4]: @@ -293,7 +305,9 @@ if not spectra.empty and not y.empty: # ###############################################################################################################DDDVVVVVVVVVV + # ################# Model analysis ############ +if not spectra.empty and not y.empty: if regression_algo in reg_algo[1:] and Reg is not None: #M2.write('-- Pretreated data (train) visualization and important spectral regions in the model -- ') @@ -334,7 +348,8 @@ if not spectra.empty and not y.empty: # with M2: # st.components.v1.html(htmlfig, height=600) - + st.header("Cross-Validation results") + cv1, cv2 = st.columns([2,2]) ############ cv2.write('-- Cross-Validation Summary--') cv2.write(Reg.CV_results_) @@ -370,7 +385,7 @@ if not spectra.empty and not y.empty: # ########## M1.write("-- Model performance --") - if regression_algo != "Locally Weighted PLSR": + if regression_algo != reg_algo[2]: M1.dataframe(metrics(c = [y_train, yc], t = [y_test, yt], method='regression').scores_) else: M1.dataframe(metrics(c = [y_train, yc], t = [y_test, yt], method='regression').scores_) @@ -381,15 +396,24 @@ if not spectra.empty and not y.empty: 
@@ -381,15 +396,24 @@ if not spectra.empty and not y.empty:
         #my_circular_progress.st_circular_progress()
         #my_circular_progress.update_value(progress=20)
-        if regression_algo != "Locally Weighted PLSR":
+        if regression_algo != reg_algo[2]:
             a = reg_plot([y_train, y_test],[yc, yt], train_idx = train_index, test_idx = test_index)
         else:
             a = reg_plot([y_train, y_test],[yc, yt], train_idx = train_index, test_idx = test_index)
 
+st.header("III - Model Diagnosis", divider='blue')
+if not spectra.empty and not y.empty:
+    if regression_algo in reg_algo[1:] and Reg is not None:
+
+        M7, M8 = st.columns([2,2])
+        M7.write('Predicted vs Measured values')
+        M8.write('Residuals plot')
+
+
         M7.pyplot(a)
         plt.savefig('./Report/figures/measured_vs_predicted.png')
         prep_para = Reg.best_hyperparams_
-        if regression_algo != "Locally Weighted PLSR":
+        if regression_algo != reg_algo[2]:
             prep_para.pop('n_components')
             for i in ['deriv','polyorder']:
                 if Reg.best_hyperparams_[i] == 0:
@@ -399,7 +423,7 @@ if not spectra.empty and not y.empty:
                 elif Reg.best_hyperparams_[i] > 1:
                     prep_para[i] = f"{Reg.best_hyperparams_[i]}nd"
 
-        if regression_algo != "Locally Weighted PLSR":
+        if regression_algo != reg_algo[2]:
             residual_plot = resid_plot([y_train, y_test], [yc, yt], train_idx=train_index, test_idx=test_index)
         else:
             residual_plot = resid_plot([y_train, y_test], [yc, yt], train_idx=train_index, test_idx=test_index)
@@ -407,10 +431,13 @@ if not spectra.empty and not y.empty:
         M8.pyplot(residual_plot)
         plt.savefig('./Report/figures/residuals_plot.png')
 
-        if regression_algo != "Locally Weighted PLSR":
+        if regression_algo != reg_algo[2]:
             rega = Reg.selected_features_   ##### ADD FEATURES IMPORTANCE PLOT
             #model_export = M1.selectbox("Choose way to export", options=["pickle", "joblib"], key=20)
+
+        M9 = st.container()
+        M9.write("-- Save the model --")
         model_name = M9.text_input('Give it a name')
         date_time = datetime.datetime.strftime(datetime.date.today(), '_%Y_%m_%d_')
         if M9.button('Export Model'):
@@ -454,7 +481,7 @@ if not spectra.empty and not y.empty and regression_algo:
     if regression_algo in reg_algo[1:] and Reg is not None:
         fig, (ax1, ax2) = plt.subplots(2,1, figsize = (12, 4), sharex=True)
         ax1.plot(colnames, np.mean(X_train, axis = 0), color = 'black', label = 'Average spectrum (Raw)')
-        if regression_algo != "Locally Weighted PLSR_":
+        if regression_algo != reg_algo[2]:
             ax2.plot(colnames, np.mean(Reg.pretreated_spectra_ , axis = 0), color = 'black', label = 'Average spectrum (pretreated)')
         ax2.set_xlabel('Wavelenghts')
         plt.tight_layout()
@@ -471,8 +498,12 @@ if not spectra.empty and not y.empty and regression_algo:
             else:
                 min, max = intervalls['from'][j], intervalls['to'][j]
-                eval(f'ax{i+1}').axvspan(min, max, color='#00ff00', alpha=0.5, lw=0)
+                eval(f'ax{i+1}').axvspan(min, max, color='#00ff00', alpha=0.5, lw=0)
+
+        if regression_algo == reg_algo[1]:
+            # st.write(colnames[np.array(Reg.sel_ratio_.index)])
+            # st.write(colnames[np.array(Reg.sel_ratio_.index)])
             ax1.scatter(colnames[np.array(Reg.sel_ratio_.index)], np.mean(X_train, axis = 0)[np.array(Reg.sel_ratio_.index)],
                         color = 'red', label = 'Important variables')
             ax2.scatter(colnames[Reg.sel_ratio_.index], np.mean(Reg.pretreated_spectra_, axis = 0)[np.array(Reg.sel_ratio_.index)],
@@ -483,6 +514,9 @@ if not spectra.empty and not y.empty and regression_algo:
         M2.write('-- Visualization of the spectral regions used for model creation --')
         fig.savefig("./Report/figures/Variable_importance.png")
         M2.pyplot(fig)
+        # if regression_algo == reg_algo[3]:
+        #     M2.write('-- Important Spectral regions used for model creation --')
+        #     M2.table(intervalls_with_cols)
 
 ## Load .dx file
 if Reg is not None:
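
`reg_plot` and `resid_plot` are project helpers whose internals are not part of this patch. Below is a simplified sketch of the two diagnosis panels they populate (predicted vs. measured with a 1:1 line, residuals vs. predicted), under the assumption that they are plain matplotlib scatter plots:

```python
import numpy as np
import matplotlib
matplotlib.use('Agg')  # headless backend; the app only saves/embeds figures
import matplotlib.pyplot as plt

def diagnosis_plots(y_true: np.ndarray, y_pred: np.ndarray):
    # Hypothetical simplification of reg_plot/resid_plot.
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
    ax1.scatter(y_true, y_pred, alpha=0.6)
    lims = [min(y_true.min(), y_pred.min()), max(y_true.max(), y_pred.max())]
    ax1.plot(lims, lims, 'k--', lw=1)  # 1:1 reference line
    ax1.set_xlabel('Measured')
    ax1.set_ylabel('Predicted')
    ax2.scatter(y_pred, y_true - y_pred, alpha=0.6)
    ax2.axhline(0, color='k', ls='--', lw=1)
    ax2.set_xlabel('Predicted')
    ax2.set_ylabel('Residual')
    fig.tight_layout()
    return fig

rng = np.random.default_rng(1)
y = rng.normal(size=50)
diagnosis_plots(y, y + rng.normal(scale=0.2, size=50)).savefig('diagnosis.png')
```
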
@@ -490,14 +524,21 @@ if Reg is not None:
     if st.button("Download the report"):
         if regression_algo == reg_algo[1]:
             latex_report = report.report('Predictive model development', file_name, stats, list(Reg.best_hyperparams_.values()), regression_algo, model_per, cv_results)
-            report.compile_latex()
+
+        elif regression_algo == reg_algo[2]:
+            latex_report = report.report('Predictive model development', file_name, stats,
+                                         list({key: Reg.best_hyperparams_[key] for key in ['deriv', 'normalization', 'polyorder', 'window_length'] if key in Reg.best_hyperparams_}.values()), regression_algo, model_per, cv_results)
+
+        elif regression_algo == reg_algo[3]:
+            latex_report = report.report('Predictive model development', file_name, stats,
+                                         list({key: Reg.best_hyperparams_[key] for key in ['deriv', 'normalization', 'polyorder', 'window_length'] if key in Reg.best_hyperparams_}.values()), regression_algo, model_per, cv_results)
+
+
+
         if regression_algo is None:
             st.warning('Data processing has not been performed or finished yet!', icon = "⚠️")
         else:
             pass
-
+        report.compile_latex()
     else:
-        pass
-
-
-
\ No newline at end of file
+        pass
\ No newline at end of file
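
The two new report branches share the same filtering idiom: keep only the signal-preprocessing hyperparameters, in a fixed order, when present. Isolated for clarity below (the helper name is hypothetical):

```python
def preprocessing_params(best_hyperparams: dict) -> list:
    # Mirrors the dict comprehension in the report calls above: drop model
    # hyperparameters such as n_components, keep only preprocessing settings.
    keys = ['deriv', 'normalization', 'polyorder', 'window_length']
    return list({k: best_hyperparams[k] for k in keys if k in best_hyperparams}.values())

print(preprocessing_params({'n_components': 8, 'deriv': 1, 'polyorder': 2,
                            'window_length': 11, 'normalization': 'Snv'}))
# [1, 'Snv', 2, 11]
```
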
diff --git a/src/pages/3-prediction.py b/src/pages/3-prediction.py
index dcd59360d2870b62dfdd706e86b1d9df8c7fb782..7389167932eabcb6874540a0b303f913a53e204a 100644
--- a/src/pages/3-prediction.py
+++ b/src/pages/3-prediction.py
@@ -15,27 +15,29 @@ add_sidebar(pages_folder)
 local_css(css_file / "style_model.css")
 
+st.title("Prediction making using a previously developed model")
+M10, M20= st.columns([2, 1])
+M10.image("./images/graphical_abstract.jpg", use_column_width=True)
 
-st.header("Data loading", divider='blue')
-M1, M2= st.columns([2, 1])
+# M1, M2= st.columns([2, 1])
 
-st.header('Data preprocessing', divider='blue')
-M3, M4= st.columns([2, 1])
-st.header("Prediction making", divider='blue')
-M5, M6 = st.columns([2, 0.01])
+
+# st.header("Prediction making", divider='blue')
+# M5, M6 = st.columns([2, 0.01])
 
 files_format = ['.csv', '.dx']
-file = M2.file_uploader("Select NIRS Data to predict", type = files_format, help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
+file = M20.file_uploader("Select NIRS Data to predict", type = files_format, help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
 export_folder = './data/predictions/'
 export_name = 'Predictions_of_'
 reg_algo = ["Interval-PLS"]
 pred_data = pd.DataFrame()
 loaded_model = None
-
-if file:
+if not file:
+    M20.warning('Insert your spectral data file here!')
+else:
     test = file.name[file.name.find('.'):]
     export_name += file.name[:file.name.find('.')]
 
@@ -67,7 +69,9 @@ if file:
 
 # Load parameters
+st.header("I - Spectral data visualization", divider='blue')
 if not pred_data.empty:# Load the model with joblib
+    M1, M2= st.columns([2, 1])
     M1.write('Raw spectra')
     fig = plot_spectra(pred_data, xunits = 'lab', yunits = "meta_data.loc[:,'yunits'][0]")
     M1.pyplot(fig)
 
@@ -95,7 +99,9 @@ if not pred_data.empty:
 ################################################################################################
 
 ## plot preprocessed spectra
+st.header('II - Spectral data preprocessing', divider='blue')
 if not preprocessed.empty:
+    M3, M4= st.columns([2, 1])
     M3.write('Preprocessed spectra')
     fig2 = plot_spectra(preprocessed, xunits = 'lab', yunits = "meta_data.loc[:,'yunits'][0]")
     M3.pyplot(fig2)
@@ -104,7 +110,9 @@ if not preprocessed.empty:
     M4.write('The spectra were preprocessed using:\n'+SG+"\n"+Norm)
 
 ################### Predictions making ##########################
+st.header("III - Prediction making", divider='blue')
 if not pred_data.empty:# Load the model with joblib
+    M5, M6 = st.columns([2, 0.01])
     #dir = os.listdir('data/models/')[1:]
     dir = os.listdir('data/models/')
     dir.insert(0,'')
diff --git a/src/pages/4-inputs.py b/src/pages/4-inputs.py
index 671182332587000596b195cbeb46c1a573dd0895..5b3369fd37768eef214e320b5c13ca247159fa2d 100644
--- a/src/pages/4-inputs.py
+++ b/src/pages/4-inputs.py
@@ -25,7 +25,8 @@ with st.container():
 
     with st.form(key='my_form'):
-        st.header("Fill in your details:",divider="blue")
+        st.header("Complete and send the following form with the data context:",divider="blue")
+        st.warning('Make sure that the form is well completed, because the reliability of the results depends mainly on it!', icon="⚠️")
 
         col1, col3,col2 = st.columns((2,0.5,2))
diff --git a/src/style/header.py b/src/style/header.py
index 0c01027999a443d8899d52547030651cea7de5aa..30719da2c2df503a922ec3d7102c666c21922230 100644
--- a/src/style/header.py
+++ b/src/style/header.py
@@ -2,14 +2,15 @@ from Packages import *
 def add_header():
     st.markdown(
         """
-        <div style="width: 100%; background-color: #4682B4; padding: 10px; margin-bottom: 10px;">
-            <h1 style="text-align: center; color: white;">PACE - MEEB / CEFE</h1>
-            <h2 style="text-align: center; color: white;">NIRS Utils</h2>
+        <div style="width: 100%;height: 170px; background-color: rgb(122,176,199); padding: 10px; margin-bottom: 10px; ">
+            <h1 style="text-align: center; color: green;">PACE - MEEB / CEFE</h1>
+            <h2 style="text-align: center; color: green;">NIRS Utils</h2>
         </div>
         """,
         unsafe_allow_html=True,
     )
+
 def add_sidebar(pages_folder):
     if 'interface' not in st.session_state:
         st.session_state['interface'] = 'simple'
diff --git a/src/style/style_model.css b/src/style/style_model.css
index b346c7507c170f87f6893f5f2aa319b3ce0974ff..b6399243cd4f13a6bc74074bc8a91e4a25573551 100644
--- a/src/style/style_model.css
+++ b/src/style/style_model.css
@@ -1,14 +1,24 @@
 /* CSS Snippet from W3schools: https://www.w3schools.com/howto/howto_css_contact_form.asp */
-div[data-testid="column"]:nth-of-type(1) {
-    border:2px solid rgba(0,0,0, .4);border-radius: 20px;padding: 15px;
-}
-div[data-testid="column"]:nth-of-type(2) {
-    border:2px solid rgba(0,0,0, .4);border-radius: 20px;padding: 15px;
+/* div[data-testid="column"]:nth-of-type(2) {
+    border:2px solid rgba(0,0,0, 1);border-radius: 20px;padding: 15px;
     text-align: left;
+} */
+
+div[data-testid="column"]:nth-of-type(1) {
+    border: 2px solid rgba(0, 0, 0, 1);
+    border-radius: 20px;
+    padding: 15px;
+    text-align: left;
+    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.3); /* Example shadow */
+    transition: box-shadow 0.3s ease-in-out; /* Smooth transition for the shadow */
 }
-div[data-testid="column"]:nth-of-type(3) {
-    border:2px solid rgba(0,0,0, .4);border-radius: 20px;padding: 15px;
+div[data-testid="column"]:nth-of-type(2) {
+    border: 2px solid rgba(0, 0, 0, 1);
+    border-radius: 20px;
+    padding: 15px;
     text-align: left;
+    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.3); /* Example shadow */
+    transition: box-shadow 0.3s ease-in-out; /* Smooth transition for the shadow */
 }
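
The restyled `style_model.css` targets Streamlit's column containers through their `data-testid` attribute, which only takes effect because the pages inline the stylesheet. `local_css` is project code not shown in this patch; a minimal sketch of what such a helper plausibly does:

```python
from pathlib import Path
import streamlit as st

def local_css(css_file: Path) -> None:
    # Hypothetical equivalent of the project's local_css helper: inline the
    # stylesheet so selectors like div[data-testid="column"] reach the DOM.
    st.markdown(f"<style>{css_file.read_text()}</style>", unsafe_allow_html=True)

# Usage inside a page script, assuming the css_file layout used elsewhere here:
# local_css(css_file / "style_model.css")
```
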