Newer
Older

Nicolas Barthes
committed
import pandas as pd
def intersect(l1, l2):
    """Return the elements of *l1* that also occur in *l2*.

    Args:
        l1: A set-like object exposing ``intersection`` (used as is), or any
            other iterable of hashable items (converted to a ``set`` first).
        l2: Iterable of hashable items to intersect with.

    Returns:
        Whatever ``intersection`` yields for the type of *l1*; a ``set`` when
        *l1* had to be converted.
    """
    # Calling l1.intersection directly raises AttributeError for lists and
    # tuples. Convert only when the method is absent so that existing
    # set/frozenset callers keep exactly the same result type.
    left = l1 if hasattr(l1, "intersection") else set(l1)
    return left.intersection(set(l2))
def check(file):
    """Return True when *file* names an existing regular file.

    Args:
        file: Path (str, bytes or path-like) to test, or ``None``.

    Returns:
        bool: False for ``None``, for missing paths and for directories.
    """
    # os.path.isfile raises TypeError for None; treat "no path given" as
    # "no file" so callers can pass an optional path straight through.
    if file is None:
        return False
    return os.path.isfile(file)

Nicolas Barthes
committed
# Report builder: the positional arguments are consumed further down and
# LaTeX fragments are appended to latex_report.
def report(*args):
# Preprocessing key -> LaTeX phrase (with citation) spliced into the report text.
signal_preprocess = {'Snv':r'''Standard Normal Variate (SNV) \cite{barnes1989standard}''',
'SG': r'''Savitzky-Golay (SG) \cite{savitzky1964smoothing}'''}
# Dimensionality-reduction key -> LaTeX phrase; looked up with to_report[2] further down.
dim_red_methods= {'PCA':r'''Principal Components Analysis (PCA) \cite{wold1987principal,ringner2008principal,greenacre2022principal,JMLR:v12:pedregosa11a}''',
'UMAP':r'''Uniform Manifold Approximation and Projection (UMAP) \cite{ghojogh2021uniform,JMLR:v12:pedregosa11a}''',
'NMF':r'''Non-negative Matrix Factorization (NMF) \cite{lopes2015non}'''} # List of dimensionality reduction algos
# Clustering / subset-selection key -> LaTeX phrase; looked up with to_report[3] further down.
cluster_methods = {'Kmeans':r'''Kmeans \cite{chong2021k,JMLR:v12:pedregosa11a}''',
'HDBSCAN':r'''Hierarchical Density-Based Spatial Clustering of Applications with Noise (HDBSCAN) \cite{mcinnes2017hdbscan}''',
'AP':r'''Affinity Propagation (AP) \cite{dueck2009affinity,JMLR:v12:pedregosa11a}''',
'KS':r'''Kennard-Stone algorithm (KS)''',
'RDM': r'''random approach'''} # List of clustering algos

DIANE
committed
# Regression algorithm key -> LaTeX phrase used when describing the predictive
# model; looked up with to_report[6] further down.
reg_algo ={"PLS":r'''Partial Least Squares (PLS) \cite{Wold2001,JMLR:v12:pedregosa11a}''',
"LW-PLS": r'''Locally Weighted-Partial Least Squares (LW-PLS) \cite{Lesnoff2020}''',
"TPE-iPLS": r'''Tree-structured Parzen estimator-interval Partial Least Squares (TPE-iPLS)'''}

Nicolas Barthes
committed
# Collect the positional arguments into to_report as strings.
for arg in args:
# Scalars (str or int) are stored as a single string entry.
if isinstance(arg, str) or isinstance(arg, int):
to_report.append(str(arg))
# Every element of an iterable argument is stringified and appended individually.
# NOTE(review): the branch header guarding this line is not visible here - confirm which argument types reach it.
to_report.extend(list(map(str, arg)))

Nicolas Barthes
committed
# DataFrame arguments: keep only the float64/int64 columns and publish the
# result as a module-level global whose name is held in df_name.
# NOTE(review): df_name is assigned outside the visible lines; df0/df1/df2 are read later when the tables are rendered - presumably those are the names used; confirm.
elif isinstance(arg, pd.DataFrame):
globals()[df_name] = arg.select_dtypes(include=['float64', 'int64'])

DIANE
committed
latex_report += r"""\documentclass[11pt]{article}
\usepackage{fancyhdr}
\usepackage{graphicx}
\usepackage{geometry}

DIANE
committed
\geometry{a4paper, left=2cm, right=2cm, top=1.5cm, bottom=5cm,
headheight=0.05cm, footskip=1.7cm}

Nicolas Barthes
committed
\usepackage{booktabs}

DIANE
committed
\usepackage{parskip}
\usepackage{float}
\setlength{\parskip}{\baselineskip} % Example setting
\usepackage{cite} % For citing with range compression
\usepackage{etoolbox}
\usepackage{xcolor}
\newcommand{\headrulecolor}[1]{\patchcmd{\headrule}{\hrule}{\color{#1}\hrule}{}{}}
\newcommand{\footrulecolor}[1]{\patchcmd{\footrule}{\hrule}{\color{#1}\hrule}{}{}}
\renewcommand{\headrulewidth}{1pt}
\headrulecolor{red!100}%
\renewcommand{\footrulewidth}{1pt}
\footrulecolor{red!100}%
\fancyhead[R]{\includegraphics[width=0.1\textwidth]{logo_cefe.png}}
\fancyhead[L]{PACE - NIRS Analysis Report}
\fancyfoot[L]{Project Name to fill}
\fancyfoot[C]{Plateforme d'Analyses Chimiques en Ecologie}
\fancyfoot[R]{\thepage}
\setlength{\headheight}{52pt}
\addtolength{\topmargin}{-9.2942pt}

DIANE
committed
% Customize appearance of figure references
\usepackage{xcolor} % For defining colors
\definecolor{myblue}{RGB}{0,0,128} % RGB values for blue
\usepackage{hyperref}
\hypersetup{colorlinks=true,linkcolor=myblue,citecolor=myblue,urlcolor=myblue}
\usepackage{cleveref} % For clever references
\usepackage{subcaption}
\usepackage{caption}
% Redefine cref formats for figures and tables
\DeclareCaptionLabelFormat{myfigureformat}{\textbf{Fig. #2}}
\captionsetup[figure]{
labelformat=myfigureformat, % Apply the custom format
justification=centering, % Justify the caption text
singlelinecheck=false, % Allow the caption to occupy multiple lines
labelsep=space, % Add a space after the label
}
\DeclareCaptionLabelFormat{mytableformat}{\textbf{Table #2}}
\captionsetup[table]{
labelformat=mytableformat, % Apply the custom format

DIANE
committed
justification=centering, % Justify the caption text
singlelinecheck=false, % Allow the caption to occupy multiple lines
skip=0pt, % Vertical space between caption and table
position=top % Position the caption at the top of the table
}

DIANE
committed
\crefformat{figure}{\textcolor{myblue}{Fig.~#2#1#3}}
\Crefformat{figure}{\textcolor{myblue}{Fig.~#2#1#3}} % Capitalized version for beginning of sentence
\crefformat{table}{\textcolor{myblue}{table~#2#1#3}}
\Crefformat{table}{\textcolor{myblue}{Table~#2#1#3}} % Capitalized version for beginning of sentence
\begin{document}
\noindent
\begin{center}
\textbf{{\Large NIRS WORKFLOW REPORT}} \\
\end{center}"""
latex_report += r"""\noindent
\textbf{QUERY MADE: }{"""+ re.sub(r'([_%])', r'\\\1',to_report[0])+ r"""}.\\
\noindent

DIANE
committed
\textbf{INPUT DATA: }{"""+ re.sub(r'([_%])', r"\\\1", to_report[1])+ r"""}.\\"""
latex_report += r"""\section*{Results}"""

DIANE
committed
latex_report += r"""\subsection*{Spectral data visualization}"""

DIANE
committed
latex_report += r"""Acquired spectra were visualized in (\cref{raw_spectra}) by plotting the signal of the samples captured in the specific spectral range.
This helps observe general patterns and trends in the spectra, and understand the variability within the data.
\begin{figure}[h]
\centering
\includegraphics[width=1\linewidth]{spectra_plot.png}
\caption{Acquired spectra}
if 'Representative subset selection' in to_report:
latex_report += r"""\subsection*{Multivariable Data Analysis}"""

DIANE
committed
latex_report += r""" Multivariable calibration models have widely been used for quantitative analysis and chemical analysis fields.
Different multivariable modelling techniques are used for calibration models developement, ranging from linear to non linear techniques, and the
performance of models developed using these techniques depends heavily on the overall quality of the data and the degree of representativeness
of calibration set, interchangeably called training set, used for its development, i.e, how much the training set captures the characteristics
and diversity of the entire population or dataset from which it is drawn \cite{li2016strategy}.\par"""
latex_report += r""" For optimal selection of a reprentative subset of the samples to analyze through the
reference method and use for calibration models development, a pipeline consisting of consecutively applying features extraction (or dimensionality
reduction) and"""

DIANE
committed
if 'KS' in to_report or 'RDM' in to_report:
latex_report += r""" samples subset selection was developed."""
else:
latex_report += r""" clustering analysis was developed."""
latex_report += r""" Features extraction was performed by means of {"""+dim_red_methods[to_report[2]] + r"""} technique that helps
represent the high dimensional spectra in a reduced perceptible 3D subspace spanned by a few number of features, while """

DIANE
committed
if 'KS' in to_report or 'RDM' in to_report:
latex_report += r""" samples subset selection was performed using the {"""+cluster_methods[to_report[3]] + r"""} technique.\par"""
else:
latex_report += r""" clustering analysis was performed using the {"""+cluster_methods[to_report[3]] + r"""} technique that helps group the data into groups of spectra
that share the same carachteristics.\par"""
if 'KS' not in to_report and not 'RDM' in to_report:
latex_report += r""" After implementing the pipeline, a subset sampling method, consisting of"""
if 'center' in to_report:
latex_report += r""" selecting {"""+to_report[7]+ r"""} samples, each from a distict cluster, with the least euclidian distance to the center of the cluster identified by {"""+to_report[3]+ r"""} and to which it the sample belongs."""
if 'random' in to_report:
latex_report += r""" fitting a second clustering model, specifically kmeans, to each individual data cluster and selecting {"""+to_report[7]+ r"""}
samples or less from each subcluster (if a subcluster contains less than {"""+to_report[7]+ r"""} samples, then all samples included
in this subcluster are selected), was applied.\par"""

DIANE
committed
latex_report += r"""\indent To detect potential spectral outliers, the influence and residuals plots \cite{Mejia2017} were constructed,
with outlyingness limits established at the 95\% confidence level. Together, these plots helps distinguish regular observations,
which form a homogeneous group near the subspace generated by the PCs; good leverage points,
which are at the same plane as the subspace but distant from the ROs; orthogonal observations, which have a
large residual distance to the subspace, but whose projection is on the subspace; and, finally, bad leverage

DIANE
committed
points, which have a large residual distance such that the projection on the subspace is away from regular observations.\par"""
latex_report += r""" Results of applying this workflow are displayed in"""

DIANE
committed
if 'PCA' in to_report:
latex_report += r""" (\cref{pcaplots, hotelling_and_influence, loadings})."""

DIANE
committed
elif 'NMF' in to_report:

DIANE
committed
else:

DIANE
committed
if 'KS' in to_report or 'RDM' in to_report:
latex_report += r""" Based of the features extracted using {"""+to_report[2]+ r"""}, """

DIANE
committed
else:
latex_report += r""" Based of the features extracted using {"""+to_report[2]+ r"""},

DIANE
committed
{"""+to_report[3]+ r"""} revealed the existance of {"""+to_report[5] + r"""} data clusters, visualized with different colors, from which """
latex_report += r"""a subset of {"""+to_report[8]+ r"""} samples was selected"""
if 'KS' in to_report or 'RDM' in to_report:
latex_report += r""", by the {"""+cluster_methods[to_report[3]] + r"""},"""
latex_report += r""" and extracted to be representative of the whole data set, i.e, to reflect the variation included in the whole data set.
This subset of samples is suggested to be used for a robust NIR calibration developement,
therefore should to be analyzed by adequate reference analytical procedures (generally requiring destructive sample preparation) to collect data for the target variable to be modelled.\par"""
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
# List the generated figure files and keep those whose name starts with "score".
pathtofig = os.listdir("./Report/out/figures")
sc = [name for name in pathtofig if name.startswith("score")]
# Pick the word inserted into the score-plot caption ("... reduced <axisn> dimensional subspace"):
# the 1D/2D file names map to 'one'/'two', any other score file name maps to 'three'.
# NOTE(review): sc[0] raises IndexError when no file name starts with "score".
if sc[0] not in ["scores_plot1D.png","scores_plot2D.png"]:
axisn = 'three'
elif sc[0] == "scores_plot2D.png":
axisn = 'two'
elif sc[0] == "scores_plot1D.png":
axisn = "one"
if len(sc) == 3:
latex_report += r"""
\begin{figure}[h]
\captionsetup{justification=centering}
\centering
\begin{minipage}[b]{0.33\textwidth}
\includegraphics[width=\linewidth]{scores_pc1_pc2.png}
\end{minipage}%
\begin{minipage}[b]{0.33\textwidth}
\includegraphics[width=\linewidth]{scores_pc1_pc3.png}
\end{minipage}%
\begin{minipage}[b]{0.33\textwidth}
\includegraphics[width=\linewidth]{scores_pc2_pc3.png}
\end{minipage}
\centering
\caption{Illustration of the pairwise projection of spectra onto the reduced """ + axisn +r""" dimensional subspace, clustering, and sample selection
results: data points with the same color belong to the same cluster and data points colored in black correspond to the samples to be
analyzed by a standard reference analytical procedure}
\label{pcaplots}
\end{figure}"""
elif len(sc) == 1:
latex_report += r"""
\begin{figure}[h!]
\centering
\includegraphics[width=.6\linewidth]{"""+sc[0] +r"""}
\caption{Illustration of the pairwise projection of spectra onto the reduced """ + axisn +r""" dimensional subspace, clustering, and sample selection
results: data points with the same color belong to the same cluster and data points colored in black correspond to the samples to be
analyzed by a standard reference analytical procedure}

DIANE
committed
if 'PCA' in to_report or 'NMF' in to_report:
latex_report += r"""
\begin{figure}[h!]
\centering
\includegraphics[width=.6\linewidth]{loadings_plot.png}
\caption{Loadings plot}
\label{loadings}
\end{figure}
"""

DIANE
committed
\newpage
\begin{raggedbottom}
\includegraphics[width=\linewidth]{influence_plot.png}
\includegraphics[width=\linewidth]{hotelling_plot.png}
\caption{Outliers detection plots;(a) and (b) , respectively, correspond to the hotelling and influence plots}

DIANE
committed
\end{raggedbottom}

DIANE
committed
elif 'Predictive model development' in to_report:
latex_report += r"""\paragraph{}To develop a robust NIR calibration that formally correlates the spectral signature of the samples in the NIR region
with the corresponding reference data obtained by analyzing the samples using a suitable reference analytical procedure,

DIANE
committed
a pipeline consisting of consecutively performing spectral signal preprocessing followed by multivariable predictive modelling was applied.
Signal preprocessing was performed by """
# Describe the normalization step unless 'No_transformation' was reported.
if 'No_transformation' not in to_report:
# NOTE(review): the '{' opened in this fragment is not closed by the following ', then ' literal,
# unlike the other {...} insertions in this file - confirm the generated LaTeX is balanced.
latex_report += r"""normalizing the raw spectra using {"""+signal_preprocess[to_report[3]]+ r""", then """
# Describe the Savitzky-Golay derivative step (derivative order, polynomial order, window size).
# NOTE(review): to_report[3] is used above as the signal_preprocess key and here compared with
# "No_derivation" - confirm the intended indices.
if to_report[3] !="No_derivation":
latex_report += r"""taking the {"""+to_report[2]+ r"""}-order derivative of a the {"""+to_report[4]+ r"""}-order Savitzky-Golay (SG)
polynomial estimated over a moving window of {"""+to_report[5]+ r"""} data points"""

DIANE
committed
latex_report += r""". The obtained preprocessed spectra were appropriately matched with the reference values, then Kennard-Stone (KS) algorithm \cite{ferreira2021kennard} was used for
to split the dataset into two data subsets (\cref{fig:Histogram} and \cref{table:desc_stats}) for regression modeling; training and testing subsets, the former, consisting of 80\% of the data, was used to
develop a {"""+reg_algo[to_report[6]]+ r"""} predictive model, while the latter, consisting of the remaining 20\% of the data, was used to evaluate its
predictive and generalizability performance.\par"""
if any(i in to_report for i in ('PLS', 'TPE-iPLS')):
latex_report += r""" The latente variables for the {"""+to_report[6]+ r"""} based model were estimated using the Non-linear Iterative Partial Least Squares (NIPALS) algorithm that was first introduced by
the econometrician and statistician Herman Andreas Ole Wold \cite{wold1975path}."""
latex_report += r""" The evaluation of the model performance was performed by measuring its scores on a set of agnostic statistical metrics widely used to evaluate NIR calibration models,
specifically, the correlation coefficient (r), the coefficient of determination (R2), the Root Mean Squared Error (RMSE), the Mean Absolute Error (MAE), the Ratio of Performance to Deviation (RPD), the Ratio of
performance to inter-quartile (RPIQ) \cite{BellonMaurel2010}.\par"""
latex_report += r""" To optimize the performance of the calibration, the hyperparameters of predictive model and the selection of signal preprocessing methods were
performed simultaneously and automatically using the Tree-Structured Parzen Estimator (TPE) as an optimization algorithm. The optimal preprocessing-hyperparameters combination
was assumed to minimize the RMSE of 5-folds Cross-Validation (CV).\par"""

DIANE
committed
\begin{figure}[H]
\centering
\includegraphics[width=1\linewidth]{Histogram.png}
\caption{Kde plot visualizing the distribution of the target variable, a subset of training, and testing sets}
\label{fig:Histogram}
\end{figure}
""" + df0.style.format("${:.2f}$").to_latex( position_float = 'centering', hrules = True,
caption = 'Descriptive statistics of the target variable, subsets used to develop and validate the predictive model',

DIANE
committed
label= 'table:desc_stats') + r""""""

DIANE
committed
\cref{fig:CV} and \cref{table:CV} show the CV results achieved with the best hyperparameters-preprocessing combination found by the optimization algorithm.
These results are beneficial and important the evaluating of the bias-variance tradeoff. The best hyperparameters-preprocessing combination was identified
and used to create a predictive model that was evaluated for its explanatory (train) and predictive (test) performance (\cref{table:reg_perf}).\par
\begin{figure}[h]
\captionsetup{justification=centering}
\centering
\begin{minipage}[c]{0.5\textwidth}
\includegraphics[width=\linewidth]{meas_vs_pred_cv_onebyone.png}
\end{minipage}%
\begin{minipage}[c]{0.5\textwidth}
\includegraphics[width=\linewidth]{meas_vs_pred_cv_all.png}
\end{minipage}%

DIANE
committed
\caption{ Visualization of measured vs predicted values for cross-validation }
\label{fig:CV}
\end{figure}
""" + df2.style.format("${:.2f}$").to_latex(position_float = 'centering', hrules = True, caption = 'Cross-Validation summary', label= 'table:CV') + r"""
"""
latex_report += df1.style.format("${:.2f}$").to_latex(position_float = 'centering', hrules = True, caption = 'Model performances summary', label= 'table:reg_perf')

DIANE
committed
if "PLS" in to_report:
latex_report += r"""To identify the most important and influential spectral regions in the model, Selectivity ratio (SR) \cite{kvalheim2020variable, farres2015comparison} test applied, and the important variables in the model were
visualized in \cref{fig:importance}. \par
\begin{figure}[h]
\centering
\includegraphics[width=1\linewidth]{Variable_importance.png}
\caption{Visualizing important spectral regions identified in the PLS model on the raw and preprocessed average spectrum}

DIANE
committed
\label{fig:importance}
elif "LW-PLS" in to_report:
latex_report += r"""The average of raw and preprocessed spectra is visualized in \cref{fig:importance}. \par
\begin{figure}[h]
\centering
\includegraphics[width=1\linewidth]{Variable_importance.png}
\caption{Visualizing the average spectrum computed for raw and preprocessed spectra}
\label{fig:importance}
\end{figure}
"""

DIANE
committed
elif "TPE-iPLS" in to_report:
latex_report += r"""
Many research papers have proved that interval selection methods, with different number of intervalls, helps reduce noise and model overfitting,
increases computational efficiency and results interpretability, and maximizes the model's predictive accuracy. For the current analysis, the selected spectral
intervalls or regions that were used for predictive model development were visualized in \cref{fig:importanceipls}. \par
\begin{figure}[h]
\centering
\includegraphics[width=1\linewidth]{Variable_importance.png}
\caption{Visualizing spectral regions used for TPE-iPLS model development on the raw and preprocessed average spectrum}
\label{fig:importanceipls}
\end{figure}
"""
# latex_report += r"""""" + df1.style.format("${:.2f}$").to_latex(position_float = 'centering', hrules = True, caption = 'Model performances summary', label= 'table:reg_perf') + r""""""
latex_report += r""" Following a numerical analysis of the model performance, measured against predicted values \cite{pauwels2019evaluating} and residuals against measured \cite{belloto1985residual} plots (\cref{fig:diagnosis}) were analysed to
visually assess the goodness of model fit and to detect potential flaws such as a pattern that the model failed to capture or outliers.\par
\begin{figure}[h]
\captionsetup{justification=centering}
\centering
\begin{minipage}[b]{0.5\textwidth}
\includegraphics[width=\linewidth]{measured_vs_predicted.png}
\end{minipage}%
\begin{minipage}[b]{0.5\textwidth}
\includegraphics[width=\linewidth]{residuals_plot.png}
\end{minipage}%
\caption{Post-hoc analysis of the developed predictive model; measured vs predicted values (a) and measured vs residuals (b) plots }

DIANE
committed
\label{fig:diagnosis}

DIANE
committed
latex_report += r"""
\clearpage
\pagebreak
\newpage
\section*{ACKNOWLEDGEMENTS}
This tool is provided by the Chemical Analysis Platform for Ecology - Montpellier, France.\\
Thanks to Abderrahim DIANE, Mouhcine MAIMOUNI, Alexandre GRANIER, Remy BEUGNON, Vincent NEGRE et Nicolas BARTHES.\\
Source code available at \href{https://src.koda.cnrs.fr/cefe/pace/nirs_workflow}{CNRS forge}.
\fontsize{8}{9}\selectfont

DIANE
committed
\bibliographystyle{IEEEtran}
% \bibliographystyle{abbrv}
\bibliography{refs.bib}
\clearpage
\end{document}"""
# Export the accumulated LaTeX source as Report/report.tex.
filename_path = Path("Report/")
filename = r'report.tex'
# 'w+' truncates any existing report.tex before the new source is written;
# no encoding is passed, so the platform default encoding is used.
with open(filename_path / filename, 'w+') as latex_file:
latex_file.write(latex_report)
# create the Tex file - sections in args will be displayed: {'sample':'Sample Selection';'model':'Model Creation';'predict':'Predictions';'help':'LaTEX help for figs and tables';}

Nicolas Barthes
committed
# latex_report = report('sample', 'predict',)
# Delete a previously generated PDF, if one exists, before recompiling.
my = Path("./Report/report.pdf")
if my.is_file():
os.remove("./Report/report.pdf")
filename_path = Path("Report/")
filename = 'report.tex'
# Four passes run inside the Report folder: pdflatex, bibtex (second pass, i == 1),
# then pdflatex twice more - the usual sequence for resolving citations and references.
# filename[:-4] strips the ".tex" suffix, giving the job name passed to bibtex.
# NOTE(review): pdflatex_path is defined outside the visible lines, and the '.exe'
# names presumably tie this to Windows - confirm.
for i in range(4):
if i == 1:
proc = subprocess.Popen([pdflatex_path / 'bibtex.exe', filename[:-4]], cwd = filename_path)
proc.communicate()
else:
proc = subprocess.Popen([pdflatex_path / 'pdflatex.exe', filename], cwd = filename_path)
proc.communicate()
# Extensions of the pdflatex/bibtex by-products; currently unused because the
# cleanup loop below is commented out.
extensions = ['.log', '.aux', '.bbl', '.blg', '.out']
#for ext in extensions:
#os.unlink(str(filename_path / filename[:-4]) + ext)
# proc = subprocess.Popen([str(filename[:-4]) + '.pdf'], cwd = "./results", shell=True)