# Captured from a diff view ("Newer" / "Older" panes); original indentation was lost in extraction.
import os.path
import re
import subprocess
from pathlib import Path

import streamlit as st
def intersect(l1, l2):
    """Return the members of ``l1`` that also appear in ``l2``.

    ``l1`` must provide an ``intersection`` method (e.g. a set); ``l2`` may be
    any iterable of hashable items and is converted to a set first.
    """
    candidates = set(l2)
    return l1.intersection(candidates)
def check(file):
    """Tell whether ``file`` is the path of an existing regular file."""
    is_regular_file = os.path.isfile(file)
    return is_regular_file
def report(*args):
# Build the LaTeX source of the NIRS workflow report from the positional
# arguments; further down the function the text is written to report/report.tex.
# NOTE(review): this file reached review with its indentation stripped, so the
# nesting of the statements below must be confirmed against the original.
#
# LaTeX display names (with citation keys) for the signal preprocessing options.
signal_preprocess = {'Snv':r'''Standard Normal Variate (SNV) \cite{barnes1989standard}''',
'SG': r'''Savitzky-Golay (SG) \cite{savitzky1964smoothing}'''}
# LaTeX display names for the dimensionality reduction methods.
dim_red_methods= {'PCA':r'''Principal Components Analysis (PCA) \cite{wold1987principal,ringner2008principal,greenacre2022principal,JMLR:v12:pedregosa11a}''',
'UMAP':r'''Uniform Manifold Approximation and Projection (UMAP) \cite{ghojogh2021uniform,JMLR:v12:pedregosa11a}''',
'NMF':r'''Non-negative Matrix Factorization (NMF) \cite{lopes2015non}'''} # List of dimensionality reduction algos
# LaTeX display names for the clustering / sample selection methods.
cluster_methods = {'Kmeans':r'''Kmeans \cite{chong2021k,JMLR:v12:pedregosa11a}''',
'HDBSCAN':r'''Hierarchical Density-Based Spatial Clustering of Applications with Noise (HDBSCAN) \cite{mcinnes2017hdbscan}''',
'AP':r'''Affinity Propagation (AP) \cite{dueck2009affinity,JMLR:v12:pedregosa11a}''',
'KS':r'''Kennard-Stone algorithm (KS)''',
'RDM': r'''random approach'''} # List of clustering algos
# NOTE(review): selec_strategy is not referenced anywhere else in the visible part of this function.
selec_strategy = {'center':'PCA','random':'PCA'}
# LaTeX display names for the regression algorithms.
reg_algo ={"PLS":r'''Partial Least Squares (PLS) \cite{Wold2001,JMLR:v12:pedregosa11a}''',
"LW-PLS": r'''Locally Weighted-Partial Least Squares (LW-PLS) \cite{Lesnoff2020}''',
"TPE-iPLS": r'''Tree-structured Parzen estimator-interval Partial Least Squares (TPE-iPLS)'''}
# Textual arguments are collected, in call order, into to_report;
# j numbers the dfN globals created below.
to_report=[]
j=0
for arg in args:
# Strings and ints are stored as one string entry each.
if isinstance(arg, str) or isinstance(arg, int):
to_report.append(str(arg))
# Lists are flattened: every element is stringified and appended.
elif isinstance(arg, list):
to_report.extend(list(map(str, arg)))
# NOTE(review): with the indentation lost it is unclear which branch owns the
# next three lines; select_dtypes suggests a pandas DataFrame argument — confirm.
# The float64/int64 columns are published as module globals df0, df1, ...
# which the table-rendering code later in this function reads back.
df_name = 'df' + str(j)
j+=1
globals()[df_name] = arg.select_dtypes(include=['float64', 'int64'])
# Accumulator for the LaTeX document text.
latex_report = ""
latex_report += r"""\documentclass[11pt]{article}
\usepackage{fancyhdr}
\usepackage{graphicx}
\usepackage{geometry}
\geometry{a4paper, left=2cm, right=2cm, top=1.5cm, bottom=5cm,
headheight=0.05cm, footskip=1.7cm}
\usepackage{changepage}
\usepackage{hyphenat}
\usepackage{booktabs}
\usepackage{times}
\usepackage{parskip}
\usepackage{float}
\setlength{\parskip}{\baselineskip} % Example setting
\usepackage{cite} % For citing with range compression
\usepackage{etoolbox}
\usepackage{xcolor}
\newcommand{\headrulecolor}[1]{\patchcmd{\headrule}{\hrule}{\color{#1}\hrule}{}{}}
\newcommand{\footrulecolor}[1]{\patchcmd{\footrule}{\hrule}{\color{#1}\hrule}{}{}}
\renewcommand{\headrulewidth}{1pt}
\headrulecolor{red!100}%
\renewcommand{\footrulewidth}{1pt}
\footrulecolor{red!100}%
% (line-number residue from the diff viewer removed)
\fancyhead[R]{\includegraphics[width=0.1\textwidth]{logo_cefe.png}}
\fancyhead[L]{PACE - NIRS Analysis Report}
\fancyfoot[L]{Project Name to fill}
\fancyfoot[C]{Plateforme d'Analyses Chimiques en Ecologie}
\fancyfoot[R]{\thepage}
\setlength{\headheight}{52pt}
\addtolength{\topmargin}{-9.2942pt}
\pagestyle{fancy}
% Customize appearance of figure references
\usepackage{xcolor} % For defining colors
\definecolor{myblue}{RGB}{0,0,128} % RGB values for blue
\usepackage{hyperref}
\hypersetup{colorlinks=true,linkcolor=myblue,citecolor=myblue,urlcolor=myblue}
\usepackage{cleveref} % For clever references
\usepackage{subcaption}
\usepackage{caption}
% Redefine cref formats for figures and tables
\DeclareCaptionLabelFormat{myfigureformat}{\textbf{Fig. #2}}
\captionsetup[figure]{
labelformat=myfigureformat, % Apply the custom format
justification=centering, % Justify the caption text
singlelinecheck=false, % Allow the caption to occupy multiple lines
labelsep=space, % Add a space after the label
}
\DeclareCaptionLabelFormat{mytableformat}{\textbf{Table #2}}
\captionsetup[table]{
labelformat=mytableformat, % Apply the custom format
justification=centering, % Justify the caption text
singlelinecheck=false, % Allow the caption to occupy multiple lines
skip=0pt, % Vertical space between caption and table
position=top % Position the caption at the top of the table
}
\crefformat{figure}{\textcolor{myblue}{Fig.~#2#1#3}}
\Crefformat{figure}{\textcolor{myblue}{Fig.~#2#1#3}} % Capitalized version for beginning of sentence
\crefformat{table}{\textcolor{myblue}{table~#2#1#3}}
\Crefformat{table}{\textcolor{myblue}{Table~#2#1#3}} % Capitalized version for beginning of sentence
\begin{document}
\noindent
\begin{center}
\textbf{{\Large NIRS WORKFLOW REPORT}} \\
\end{center}"""
# Header fields: to_report[0] is printed as the query and to_report[1] as the
# input data; "_" and "%" are backslash-escaped so LaTeX prints them literally.
latex_report += r"""\noindent
\textbf{QUERY MADE: }{"""+ re.sub(r'([_%])', r'\\\1',to_report[0])+ r"""}.\\
\noindent
\textbf{INPUT DATA: }{"""+ re.sub(r'([_%])', r"\\\1", to_report[1])+ r"""}.\\"""
latex_report += r"""\section*{Results}"""
latex_report += r"""\subsection*{Spectral data visualization}"""
# Raw spectra figure (spectra_plot.png), referenced as raw_spectra.
latex_report += r"""Acquired spectra were visualized in (\cref{raw_spectra}) by plotting the signal of the samples captured in the specific spectral range.
This helps observe general patterns and trends in the spectra, and understand the variability within the data.
\begin{figure}[h]
\centering
\includegraphics[width=1\linewidth]{spectra_plot.png}
\caption{Acquired spectra}
\label{raw_spectra}
\end{figure}"""
# Text emitted when the 'Representative subset selection' workflow is reported.
if 'Representative subset selection' in to_report:
latex_report += r"""\subsection*{Multivariable Data Analysis}"""
latex_report += r""" Multivariable calibration models have widely been used for quantitative analysis and chemical analysis fields.
Different multivariable modelling techniques are used for calibration models developement, ranging from linear to non linear techniques, and the
performance of models developed using these techniques depends heavily on the overall quality of the data and the degree of representativeness
of calibration set, interchangeably called training set, used for its development, i.e, how much the training set captures the characteristics
and diversity of the entire population or dataset from which it is drawn \cite{li2016strategy}.\par"""
latex_report += r""" For optimal selection of a reprentative subset of the samples to analyze through the
reference method and use for calibration models development, a pipeline consisting of consecutively applying features extraction (or dimensionality
reduction) and"""
# Wording switches between "samples subset selection" (when 'KS' or 'RDM' is
# reported) and "clustering analysis" (otherwise).
if 'KS' in to_report or 'RDM' in to_report:
latex_report += r""" samples subset selection was developed."""
else:
latex_report += r""" clustering analysis was developed."""
# to_report[2] is looked up in dim_red_methods, to_report[3] in cluster_methods.
latex_report += r""" Features extraction was performed by means of {"""+dim_red_methods[to_report[2]] + r"""} technique that helps
represent the high dimensional spectra in a reduced perceptible 3D subspace spanned by a few number of features, while """
if 'KS' in to_report or 'RDM' in to_report:
latex_report += r""" samples subset selection was performed using the {"""+cluster_methods[to_report[3]] + r"""} technique.\par"""
else:
latex_report += r""" clustering analysis was performed using the {"""+cluster_methods[to_report[3]] + r"""} technique that helps group the data into groups of spectra
that share the same carachteristics.\par"""
# Sampling-strategy sentence, only when neither 'KS' nor 'RDM' is reported;
# to_report[7] is inserted as the number of samples taken per (sub)cluster.
if 'KS' not in to_report and not 'RDM' in to_report:
latex_report += r""" After implementing the pipeline, a subset sampling method, consisting of"""
if 'center' in to_report:
latex_report += r""" selecting {"""+to_report[7]+ r"""} samples, each from a distict cluster, with the least euclidian distance to the center of the cluster identified by {"""+to_report[3]+ r"""} and to which it the sample belongs."""
if 'random' in to_report:
latex_report += r""" fitting a second clustering model, specifically kmeans, to each individual data cluster and selecting {"""+to_report[7]+ r"""}
samples or less from each subcluster (if a subcluster contains less than {"""+to_report[7]+ r"""} samples, then all samples included
in this subcluster are selected), was applied.\par"""
# Outlier-detection paragraph, only when "PCA" is among the reported entries.
if "PCA" in to_report:
latex_report += r"""\indent To detect potential spectral outliers, the influence and residuals plots \cite{Mejia2017} were constructed,
with outlyingness limits established at the 95\% confidence level. Together, these plots helps distinguish regular observations,
which form a homogeneous group near the subspace generated by the PCs; good leverage points,
which are at the same plane as the subspace but distant from the ROs; orthogonal observations, which have a
large residual distance to the subspace, but whose projection is on the subspace; and, finally, bad leverage
points, which have a large residual distance such that the projection on the subspace is away from regular observations.\par"""
# The list of cross-referenced figures depends on the reduction method.
latex_report += r""" Results of applying this workflow are displayed in"""
if 'PCA' in to_report:
latex_report += r""" (\cref{pcaplots, hotelling_and_influence, loadings})."""
elif 'NMF' in to_report:
latex_report += r""" (\cref{pcaplots, loadings})."""
else:
latex_report += r""" (\cref{pcaplots})."""
# Summary sentence: to_report[5] is inserted as the number of clusters and
# to_report[8] as the size of the selected subset.
if 'KS' in to_report or 'RDM' in to_report:
latex_report += r""" Based of the features extracted using {"""+to_report[2]+ r"""}, """
else:
latex_report += r""" Based of the features extracted using {"""+to_report[2]+ r"""},
{"""+to_report[3]+ r"""} revealed the existance of {"""+to_report[5] + r"""} data clusters, visualized with different colors, from which """
latex_report += r"""a subset of {"""+to_report[8]+ r"""} samples was selected"""
if 'KS' in to_report or 'RDM' in to_report:
latex_report += r""", by the {"""+cluster_methods[to_report[3]] + r"""},"""
latex_report += r""" and extracted to be representative of the whole data set, i.e, to reflect the variation included in the whole data set.
This subset of samples is suggested to be used for a robust NIR calibration developement,
therefore should to be analyzed by adequate reference analytical procedures (generally requiring destructive sample preparation) to collect data for the target variable to be modelled.\par"""
# (line-number residue from the diff viewer removed)
# Score-plot file names decide how many dimensions the caption mentions.
# NOTE(review): pathtofig is not defined in the visible part of this function —
# confirm where it comes from.
sc = [name for name in pathtofig if name.startswith("score")]
if sc[0] not in ["scores_plot1D.png","scores_plot2D.png"]:
axisn = 'three'
elif sc[0] == "scores_plot2D.png":
axisn = 'two'
elif sc[0] == "scores_plot1D.png":
axisn = "one"
# Three score images: pairwise projections placed side by side.
if len(sc) == 3:
latex_report += r"""
\begin{figure}[h]
\captionsetup{justification=centering}
\centering
\begin{minipage}[b]{0.33\textwidth}
\includegraphics[width=\linewidth]{scores_pc1_pc2.png}
\end{minipage}%
\begin{minipage}[b]{0.33\textwidth}
\includegraphics[width=\linewidth]{scores_pc1_pc3.png}
\end{minipage}%
\begin{minipage}[b]{0.33\textwidth}
\includegraphics[width=\linewidth]{scores_pc2_pc3.png}
\end{minipage}
\centering
\caption{Illustration of the pairwise projection of spectra onto the reduced """ + axisn +r""" dimensional subspace, clustering, and sample selection
results: data points with the same color belong to the same cluster and data points colored in black correspond to the samples to be
analyzed by a standard reference analytical procedure}
\label{pcaplots}
\end{figure}"""
# A single score image: it is included directly by file name.
elif len(sc) == 1:
latex_report += r"""
\begin{figure}[h!]
\centering
\includegraphics[width=.6\linewidth]{"""+sc[0] +r"""}
\caption{Illustration of the pairwise projection of spectra onto the reduced """ + axisn +r""" dimensional subspace, clustering, and sample selection
results: data points with the same color belong to the same cluster and data points colored in black correspond to the samples to be
analyzed by a standard reference analytical procedure}
\label{pcaplots}
\end{figure}"""
# Loadings figure, when 'PCA' or 'NMF' is reported.
if 'PCA' in to_report or 'NMF' in to_report:
latex_report += r"""
\begin{figure}[h!]
\centering
\includegraphics[width=.6\linewidth]{loadings_plot.png}
\caption{Loadings plot}
\label{loadings}
\end{figure}
"""
# Influence / Hotelling figure, when 'PCA' is reported.
if 'PCA' in to_report:
latex_report += r"""
\newpage
\begin{raggedbottom}
\begin{figure}[h!]
\centering
\begin{minipage}[b]{0.33\textwidth}
\centering
\includegraphics[width=\linewidth]{influence_plot.png}
\end{minipage}%
\begin{minipage}[b]{0.33\textwidth}
\centering
\includegraphics[width=\linewidth]{hotelling_plot.png}
\end{minipage}
\caption{Outliers detection plots;(a) and (b) , respectively, correspond to the hotelling and influence plots}
\label{hotelling_and_influence}
\end{figure}
\end{raggedbottom}
"""
# Text emitted when the 'Predictive model development' workflow is reported.
elif 'Predictive model development' in to_report:
latex_report += r"""\paragraph{}To develop a robust NIR calibration that formally correlates the spectral signature of the samples in the NIR region
with the corresponding reference data obtained by analyzing the samples using a suitable reference analytical procedure,
a pipeline consisting of consecutively performing spectral signal preprocessing followed by multivariable predictive modelling was applied.
Signal preprocessing was performed by """
# NOTE(review): the "{" opened in the literal two lines below is not closed by
# that literal — check the generated LaTeX for balanced braces.
if 'No_transformation' not in to_report:
latex_report += r"""normalizing the raw spectra using {"""+signal_preprocess[to_report[3]]+ r""", then """
# NOTE(review): to_report[3] is used above as the signal_preprocess key, yet it
# is compared with "No_derivation" here — confirm the intended index.
if to_report[3] !="No_derivation":
latex_report += r"""taking the {"""+to_report[2]+ r"""}-order derivative of a the {"""+to_report[4]+ r"""}-order Savitzky-Golay (SG)
polynomial estimated over a moving window of {"""+to_report[5]+ r"""} data points"""
# to_report[6] is looked up in reg_algo to name the regression algorithm.
latex_report += r""". The obtained preprocessed spectra were appropriately matched with the reference values, then Kennard-Stone (KS) algorithm \cite{ferreira2021kennard} was used for
to split the dataset into two data subsets (\cref{fig:Histogram} and \cref{table:desc_stats}) for regression modeling; training and testing subsets, the former, consisting of 80\% of the data, was used to
develop a {"""+reg_algo[to_report[6]]+ r"""} predictive model, while the latter, consisting of the remaining 20\% of the data, was used to evaluate its
predictive and generalizability performance.\par"""
# NIPALS sentence, when an entry equal to 'PLS' or 'TPE-iPLS' is reported.
if any(i in to_report for i in ('PLS', 'TPE-iPLS')):
latex_report += r""" The latente variables for the {"""+to_report[6]+ r"""} based model were estimated using the Non-linear Iterative Partial Least Squares (NIPALS) algorithm that was first introduced by
the econometrician and statistician Herman Andreas Ole Wold \cite{wold1975path}."""
latex_report += r""" The evaluation of the model performance was performed by measuring its scores on a set of agnostic statistical metrics widely used to evaluate NIR calibration models,
specifically, the correlation coefficient (r), the coefficient of determination (R2), the Root Mean Squared Error (RMSE), the Mean Absolute Error (MAE), the Ratio of Performance to Deviation (RPD), the Ratio of
performance to inter-quartile (RPIQ) \cite{BellonMaurel2010}.\par"""
latex_report += r""" To optimize the performance of the calibration, the hyperparameters of predictive model and the selection of signal preprocessing methods were
performed simultaneously and automatically using the Tree-Structured Parzen Estimator (TPE) as an optimization algorithm. The optimal preprocessing-hyperparameters combination
was assumed to minimize the RMSE of 5-folds Cross-Validation (CV).\par"""
# Histogram figure followed by the df0 table (labelled table:desc_stats).
# df0, df1 and df2 are not locals: they are looked up as module globals,
# which the argument-parsing code of this function populates via globals().
latex_report += r"""
\begin{figure}[H]
\centering
\includegraphics[width=1\linewidth]{Histogram.png}
\caption{Kde plot visualizing the distribution of the target variable, a subset of training, and testing sets}
\label{fig:Histogram}
\end{figure}
""" + df0.style.format("${:.2f}$").to_latex( position_float = 'centering', hrules = True,
caption = 'Descriptive statistics of the target variable, subsets used to develop and validate the predictive model',
label= 'table:desc_stats') + r""""""
# Cross-validation figure followed by the df2 table (labelled table:CV).
latex_report += r"""
\cref{fig:CV} and \cref{table:CV} show the CV results achieved with the best hyperparameters-preprocessing combination found by the optimization algorithm.
These results are beneficial and important the evaluating of the bias-variance tradeoff. The best hyperparameters-preprocessing combination was identified
and used to create a predictive model that was evaluated for its explanatory (train) and predictive (test) performance (\cref{table:reg_perf}).\par
\begin{figure}[h]
\captionsetup{justification=centering}
\centering
\begin{minipage}[c]{0.5\textwidth}
\includegraphics[width=\linewidth]{meas_vs_pred_cv_onebyone.png}
\end{minipage}%
\begin{minipage}[c]{0.5\textwidth}
\includegraphics[width=\linewidth]{meas_vs_pred_cv_all.png}
\end{minipage}%
\caption{ Visualization of measured vs predicted values for cross-validation }
\label{fig:CV}
\end{figure}
""" + df2.style.format("${:.2f}$").to_latex(position_float = 'centering', hrules = True, caption = 'Cross-Validation summary', label= 'table:CV') + r"""
"""
# df1 is rendered as the model performance table (labelled table:reg_perf).
latex_report += df1.style.format("${:.2f}$").to_latex(position_float = 'centering', hrules = True, caption = 'Model performances summary', label= 'table:reg_perf')
# Algorithm-specific figure built on Variable_importance.png. These are list
# membership tests, so "PLS" matches only an entry equal to "PLS".
if "PLS" in to_report:
latex_report += r"""To identify the most important and influential spectral regions in the model, Selectivity ratio (SR) \cite{kvalheim2020variable, farres2015comparison} test applied, and the important variables in the model were
visualized in \cref{fig:importance}. \par
\begin{figure}[h]
\centering
\includegraphics[width=1\linewidth]{Variable_importance.png}
\caption{Visualizing important spectral regions identified in the PLS model on the raw and preprocessed average spectrum}
\label{fig:importance}
\end{figure}
"""
elif "LW-PLS" in to_report:
latex_report += r"""The average of raw and preprocessed spectra is visualized in \cref{fig:importance}. \par
\begin{figure}[h]
\centering
\includegraphics[width=1\linewidth]{Variable_importance.png}
\caption{Visualizing the average spectrum computed for raw and preprocessed spectra}
\label{fig:importance}
\end{figure}
"""
elif "TPE-iPLS" in to_report:
latex_report += r"""
Many research papers have proved that interval selection methods, with different number of intervalls, helps reduce noise and model overfitting,
increases computational efficiency and results interpretability, and maximizes the model's predictive accuracy. For the current analysis, the selected spectral
intervalls or regions that were used for predictive model development were visualized in \cref{fig:importanceipls}. \par
\begin{figure}[h]
\centering
\includegraphics[width=1\linewidth]{Variable_importance.png}
\caption{Visualizing spectral regions used for TPE-iPLS model development on the raw and preprocessed average spectrum}
\label{fig:importanceipls}
\end{figure}
"""
# latex_report += r"""""" + df1.style.format("${:.2f}$").to_latex(position_float = 'centering', hrules = True, caption = 'Model performances summary', label= 'table:reg_perf') + r""""""
# Measured-vs-predicted and residuals figure (fig:diagnosis).
latex_report += r""" Following a numerical analysis of the model performance, measured against predicted values \cite{pauwels2019evaluating} and residuals against measured \cite{belloto1985residual} plots (\cref{fig:diagnosis}) were analysed to
visually assess the goodness of model fit and to detect potential flaws such as a pattern that the model failed to capture or outliers.\par
\begin{figure}[h]
\captionsetup{justification=centering}
\centering
\begin{minipage}[b]{0.5\textwidth}
\includegraphics[width=\linewidth]{measured_vs_predicted.png}
\end{minipage}%
\begin{minipage}[b]{0.5\textwidth}
\includegraphics[width=\linewidth]{residuals_plot.png}
\end{minipage}%
\caption{Post-hoc analysis of the developed predictive model; measured vs predicted values (a) and measured vs residuals (b) plots }
\label{fig:diagnosis}
\end{figure}"""
# Closing matter: acknowledgements, bibliography (refs.bib, IEEEtran style)
# and the end of the document.
latex_report += r"""
\clearpage
\pagebreak
\newpage
\section*{ACKNOWLEDGEMENTS}
This tool is provided by the Chemical Analysis Platform for Ecology - Montpellier, France.\\
Thanks to Abderrahim DIANE, Mouhcine MAIMOUNI, Alexandre GRANIER, Remy BEUGNON, Vincent NEGRE et Nicolas BARTHES.\\
Source code available at \href{https://src.koda.cnrs.fr/cefe/pace/nirs_workflow}{CNRS forge}.
\fontsize{8}{9}\selectfont
\bibliographystyle{IEEEtran}
% \bibliographystyle{abbrv}
\bibliography{refs.bib}
\clearpage
\end{document}"""
# export the .tex file in the report folder
filename_path = Path("report/")
filename = r'report.tex'
# 'w+' truncates an existing report.tex (or creates it) before writing.
with open(filename_path / filename, 'w+') as latex_file:
latex_file.write(latex_report)
# create the Tex file - sections in args will be displayed: {'sample':'Sample Selection';'model':'Model Creation';'predict':'Predictions';'help':'LaTEX help for figs and tables';}
# latex_report = report('sample', 'predict',)
@st.cache_data
def generate_report(change):
    """Compile ``report/report.tex`` to PDF with pdflatex and bibtex.

    Any existing ``report/report.pdf`` is deleted first. The tools are then run
    inside the ``report`` directory in the order pdflatex, bibtex, pdflatex,
    pdflatex, each one waited for before the next starts. Exit codes are not
    checked and the auxiliary files (.log, .aux, .bbl, .blg, .out) are left in
    place.

    Args:
        change: Not read by the body. Presumably supplied so that a new value
            gives ``st.cache_data`` a new cache key and forces a rebuild;
            confirm against the callers.
    """
    report_dir = Path("report/")
    tex_name = 'report.tex'

    # Drop the previous PDF so a failed compilation cannot leave a stale one.
    stale_pdf = report_dir / "report.pdf"
    if stale_pdf.is_file():
        stale_pdf.unlink()

    # path to pdflatex
    from config.config import pdflatex_path

    # run pdflatex with bibtex compilation (2nd run): bibtex needs the .aux
    # file of a first pdflatex pass, and the two final pdflatex passes resolve
    # the citations and cross-references.
    job_name = tex_name[:-4]
    steps = (
        ('pdflatex.exe', tex_name),
        ('bibtex.exe', job_name),
        ('pdflatex.exe', tex_name),
        ('pdflatex.exe', tex_name),
    )
    for executable, target in steps:
        # subprocess.run waits for completion, like Popen(...).communicate().
        subprocess.run([pdflatex_path / executable, target], cwd=report_dir)