import subprocess
from pathlib import Path
import os
from pandas import DataFrame
import os.path
import re
import streamlit as st

def intersect(l1, l2): 
    return l1.intersection(set(l2))
def check(file):
    return os.path.isfile(file)
def report(*args):
    signal_preprocess = {'Snv':r'''Standard Normal Variate (SNV) \cite{barnes1989standard}''',
                         'SG': r'''Savitzky-Golay (SG) \cite{savitzky1964smoothing}'''}
    dim_red_methods= {'PCA':r'''Principal Components Analysis (PCA) \cite{wold1987principal,ringner2008principal,greenacre2022principal,JMLR:v12:pedregosa11a}''',
                      'UMAP':r'''Uniform Manifold Approximation and Projection (UMAP) \cite{ghojogh2021uniform,JMLR:v12:pedregosa11a}''',
                        'NMF':r'''Non-negative Matrix Factorization (NMF) \cite{lopes2015non}'''}  # List of dimensionality reduction algos
    cluster_methods = {'Kmeans':r'''Kmeans \cite{chong2021k,JMLR:v12:pedregosa11a}''',
                       'HDBSCAN':r'''Hierarchical Density-Based Spatial Clustering of Applications with Noise (HDBSCAN) \cite{mcinnes2017hdbscan}''',
                        'AP':r'''Affinity Propagation (AP) \cite{dueck2009affinity,JMLR:v12:pedregosa11a}''',
                        'KS':r'''Kennard-Stone algorithm (KS)''',
                        'RDM': r'''random approach'''} # List of clustering algos
    
    selec_strategy = {'center':'PCA','random':'PCA'}
    reg_algo ={"PLS":r'''Partial Least Squares (PLS) \cite{Wold2001,JMLR:v12:pedregosa11a}''',
                "LW-PLS": r'''Locally Weighted-Partial Least Squares (LW-PLS) \cite{Lesnoff2020}''',
                "TPE-iPLS": r'''Tree-structured Parzen estimator-interval Partial Least Squares (TPE-iPLS)'''} 

    to_report=[]
    j=0
    for arg in args:
        if isinstance(arg, str) or isinstance(arg, int):
            to_report.append(str(arg))
        elif isinstance(arg, list):
            to_report.extend(list(map(str, arg)))
        elif isinstance(arg, DataFrame):
            df_name = 'df' + str(j)
            j+=1
            globals()[df_name] = arg.select_dtypes(include=['float64', 'int64'])
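            # Note: each numeric DataFrame passed in is exposed as a module-level name
            # (df0, df1, ...) so it can be rendered further below as a LaTeX table via
            # DataFrame.style.to_latex().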
    
    latex_report = ""
    latex_report += r"""\documentclass[11pt]{article}
    \usepackage{fancyhdr}
    \usepackage{graphicx}
    \usepackage{geometry}
    \geometry{a4paper, left=2cm, right=2cm, top=1.5cm, bottom=5cm,
      headheight=0.05cm, footskip=1.7cm}

    \usepackage{changepage}
    \usepackage{hyphenat}
    \usepackage{booktabs}
    \usepackage{times}
    \usepackage{parskip}
    \usepackage{float}
    \setlength{\parskip}{\baselineskip} % Example setting
    \usepackage{cite}     % For citing with range compression
    \usepackage{etoolbox}
    \usepackage{xcolor}
    \newcommand{\headrulecolor}[1]{\patchcmd{\headrule}{\hrule}{\color{#1}\hrule}{}{}}
    \newcommand{\footrulecolor}[1]{\patchcmd{\footrule}{\hrule}{\color{#1}\hrule}{}{}}
    \renewcommand{\headrulewidth}{1pt}
    \headrulecolor{red!100}%
    \renewcommand{\footrulewidth}{1pt}
    \footrulecolor{red!100}%
    \graphicspath{{images/}, {results/figures/}}
    \fancyhead[R]{\includegraphics[width=0.1\textwidth]{logo_cefe.png}}
    \fancyhead[L]{PACE - NIRS Analysis Report}
    \fancyfoot[L]{Project Name to fill}
    \fancyfoot[C]{Plateforme d'Analyses Chimiques en Ecologie}
    \fancyfoot[R]{\thepage}
    \setlength{\headheight}{52pt}
    \addtolength{\topmargin}{-9.2942pt}
    \pagestyle{fancy}

    % Customize appearance of figure references
    \usepackage{xcolor}   % For defining colors    
    \definecolor{myblue}{RGB}{0,0,128} % RGB values for blue

    \usepackage{hyperref}
    \hypersetup{colorlinks=true,linkcolor=myblue,citecolor=myblue,urlcolor=myblue}
    \usepackage{cleveref} % For clever references
    

    \usepackage{subcaption}
    \usepackage{caption}
    % Redefine cref formats for figures and tables

    \DeclareCaptionLabelFormat{myfigureformat}{\textbf{Fig. #2}}
    \captionsetup[figure]{
        labelformat=myfigureformat, % Apply the custom format
        justification=centering, % Justify the caption text
        singlelinecheck=false, % Allow the caption to occupy multiple lines
        labelsep=space, % Add a space after the label
    }
    \DeclareCaptionLabelFormat{mytableformat}{\textbf{Table #2}}
    \captionsetup[table]{
        labelformat=mytableformat, % Apply the custom format
        justification=centering, % Justify the caption text
        singlelinecheck=false, % Allow the caption to occupy multiple lines
        skip=0pt, % Vertical space between caption and table
        position=top % Position the caption at the top of the table
    }
    \crefformat{figure}{\textcolor{myblue}{Fig.~#2#1#3}}
    \Crefformat{figure}{\textcolor{myblue}{Fig.~#2#1#3}} % Capitalized version for beginning of sentence
    \crefformat{table}{\textcolor{myblue}{table~#2#1#3}}
    \Crefformat{table}{\textcolor{myblue}{Table~#2#1#3}} % Capitalized version for beginning of sentence

    
    \begin{document}
    
    \noindent
    \begin{center}
    \textbf{{\Large NIRS WORKFLOW REPORT}} \\
    \end{center}"""
    latex_report += r"""\noindent
    \textbf{QUERY MADE: }{"""+ re.sub(r'([_%])', r'\\\1',to_report[0])+ r"""}.\\
    \noindent
    \textbf{INPUT DATA: }{"""+ re.sub(r'([_%])', r"\\\1", to_report[1])+ r"""}.\\"""
    latex_report += r"""\section*{Results}"""
    
    latex_report += r"""\subsection*{Spectral data visualization}"""
    latex_report += r"""Acquired spectra were visualized in (\cref{raw_spectra}) by plotting the signal of the samples captured in the specific spectral range.
    This helps observe general patterns and trends in the spectra, and understand the variability within the data.
    \begin{figure}[h]
    \centering
    \includegraphics[width=1\linewidth]{spectra_plot.png}
    \caption{Acquired spectra}
    \label{raw_spectra}
    \end{figure}"""

    if 'Representative subset selection' in to_report:
        latex_report += r"""\subsection*{Multivariable Data Analysis}"""
        latex_report += r""" Multivariable calibration models have widely been used for quantitative analysis and chemical analysis fields.
            Different multivariable modelling techniques are used for calibration models developement, ranging from linear to non linear techniques, and the
            performance of models developed using these techniques depends heavily on the overall quality of the data and the degree of representativeness
            of calibration set, interchangeably called training set, used for its development, i.e, how much the training set captures the characteristics
            and diversity of the entire population or dataset from which it is drawn \cite{li2016strategy}.\par"""
        
        latex_report += r""" For optimal selection of a reprentative subset of the samples to analyze through the 
        reference method and use for calibration models development, a pipeline consisting of consecutively applying features extraction (or dimensionality
        reduction) and"""
        

        if 'KS' in to_report or 'RDM' in to_report:
            latex_report += r""" samples subset selection was developed."""
        else:
            latex_report += r""" clustering analysis was developed."""
       
        latex_report += r""" Features extraction was performed by means of {"""+dim_red_methods[to_report[2]] + r"""} technique that helps 
            represent the high dimensional spectra in a reduced perceptible 3D subspace spanned by a few number of features, while """
        

        if 'KS' in to_report or 'RDM' in to_report:
            latex_report += r""" sample subset selection was performed using the {"""+cluster_methods[to_report[3]] + r"""} technique.\par"""
        else:
            latex_report += r""" clustering analysis was performed using the {"""+cluster_methods[to_report[3]] + r"""} technique, which groups the data into clusters of spectra
            that share the same characteristics.\par"""
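        # Illustrative sketch of the feature extraction + clustering pipeline described
        # above (not called by this report generator): project the spectra onto a 3D
        # subspace, then cluster the scores. `X` is assumed to be a (samples x wavelengths)
        # array; the application itself may rely on a different implementation.
        def _reduce_and_cluster(X, n_components=3, n_clusters=3):
            from sklearn.decomposition import PCA
            from sklearn.cluster import KMeans
            scores = PCA(n_components=n_components).fit_transform(X)
            labels = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit_predict(scores)
            return scores, labels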
        

        if 'KS' not in to_report and 'RDM' not in to_report:
            latex_report += r""" After implementing the pipeline, a subset sampling method, consisting of"""
            if 'center' in to_report:
                latex_report += r""" selecting {"""+to_report[7]+ r"""} samples, each from a distinct cluster, with the smallest Euclidean distance to the center of the cluster, identified by {"""+to_report[3]+ r"""}, to which the sample belongs, was applied.\par"""
            if 'random' in to_report:
                latex_report += r""" fitting a second clustering model, specifically kmeans, to each individual data cluster and selecting {"""+to_report[7]+ r"""}
                    samples or less from each subcluster (if a subcluster contains less than {"""+to_report[7]+ r"""} samples, then all samples included
                    in this subcluster are selected), was applied.\par"""
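            # Illustrative sketch of the 'center' selection strategy described above
            # (not called here): for each cluster, keep the sample with the smallest
            # Euclidean distance to the cluster centroid. `scores` and `labels` are
            # hypothetical names for the reduced features and cluster assignments.
            def _select_cluster_centers(scores, labels):
                import numpy as np
                selected = []
                for c in np.unique(labels):
                    members = np.flatnonzero(labels == c)
                    centroid = scores[members].mean(axis=0)
                    dists = np.linalg.norm(scores[members] - centroid, axis=1)
                    selected.append(members[int(np.argmin(dists))])
                return selected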
                
                  
        if "PCA" in to_report:
            latex_report += r"""\indent To detect potential spectral outliers, the influence and residuals plots \cite{Mejia2017} were constructed,
              with outlyingness limits established at the 95\% confidence level. Together, these plots helps distinguish regular observations,
                which form a homogeneous group near the subspace generated by the PCs; good leverage points,
                  which are at the same plane as the subspace but distant from the ROs; orthogonal observations, which have a
                    large residual distance to the subspace, but whose projection is on the subspace; and, finally, bad leverage
                      points, which have a large residual distance such that the projection on the subspace is away from regular observations.\par"""
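            # Minimal sketch of the two diagnostics behind these plots (computed elsewhere
            # in the application): Hotelling's T2 from the variance-scaled PCA scores and
            # Q residuals from the reconstruction error. `X` is assumed to be a
            # (samples x wavelengths) array; illustrative only, not called by this module.
            def _pca_outlier_stats(X, n_components=3):
                import numpy as np
                from sklearn.decomposition import PCA
                pca = PCA(n_components=n_components).fit(X)
                scores = pca.transform(X)
                t2 = np.sum(scores ** 2 / pca.explained_variance_, axis=1)
                q = np.sum((X - pca.inverse_transform(scores)) ** 2, axis=1)
                return t2, q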
              
        latex_report += r""" Results of applying this workflow are displayed in""" 
        if 'PCA' in to_report:
            latex_report += r""" (\cref{pcaplots, hotelling_and_influence, loadings})."""
        elif 'NMF' in to_report:
            latex_report += r""" (\cref{pcaplots, loadings})."""
        else:
            latex_report += r""" (\cref{pcaplots})."""

        if 'KS' in to_report or 'RDM' in to_report:
            latex_report += r""" Based of the features extracted using {"""+to_report[2]+ r"""}, """
        
        else:
            latex_report += r""" Based of the features extracted using {"""+to_report[2]+ r"""},
            {"""+to_report[3]+ r"""} revealed the  existance of {"""+to_report[5] + r"""} data clusters, visualized with different colors, from which """
        
        latex_report += r"""a subset of {"""+to_report[8]+ r"""} samples was selected"""
        if 'KS' in to_report or 'RDM' in to_report:
            latex_report += r""", by the {"""+cluster_methods[to_report[3]] + r"""},"""

        latex_report +=  r""" and extracted to be representative of the whole data set, i.e, to reflect the variation included in the whole data set.
              This subset of samples is suggested to be used for a robust NIR calibration developement,
                therefore should to be analyzed by adequate reference analytical procedures (generally requiring destructive sample preparation) to collect data for the target variable to be modelled.\par"""
        
        pathtofig = os.listdir("./report/results/figures")
        sc = [name for name in pathtofig if name.startswith("score")]
        if sc[0] not in ["scores_plot1D.png","scores_plot2D.png"]:
            axisn = 'three'
        elif sc[0] == "scores_plot2D.png":
            axisn = 'two'
        elif sc[0] == "scores_plot1D.png":
            axisn = "one"
        if len(sc) == 3:
            latex_report += r"""
            \begin{figure}[h]
            \captionsetup{justification=centering}
            \centering
            \begin{minipage}[b]{0.33\textwidth}
                    \includegraphics[width=\linewidth]{scores_pc1_pc2.png}
            \end{minipage}%
            \begin{minipage}[b]{0.33\textwidth}
                    \includegraphics[width=\linewidth]{scores_pc1_pc3.png}
            \end{minipage}%
            \begin{minipage}[b]{0.33\textwidth}
                    \includegraphics[width=\linewidth]{scores_pc2_pc3.png}
            \end{minipage}
            \centering
            \caption{Illustration of the pairwise projection of spectra onto the reduced """ + axisn + r"""-dimensional subspace, clustering, and sample selection
            results: data points with the same color belong to the same cluster and data points colored in black correspond to the samples to be
            analyzed by a standard reference analytical procedure}
            \label{pcaplots}
            \end{figure}"""
        
        elif len(sc) == 1:
            latex_report += r"""
            \begin{figure}[h!]
            \centering
            \includegraphics[width=.6\linewidth]{"""+sc[0] +r"""}
            \caption{Illustration of the pairwise projection of spectra onto the reduced """ + axisn + r"""-dimensional subspace, clustering, and sample selection
            results: data points with the same color belong to the same cluster and data points colored in black correspond to the samples to be
            analyzed by a standard reference analytical procedure}
            \label{pcaplots}
            \end{figure}"""

        if 'PCA' in to_report or 'NMF' in to_report:
            latex_report += r"""
            \begin{figure}[h!]
            \centering
            \includegraphics[width=.6\linewidth]{loadings_plot.png}
            \caption{Loadings plot}
            \label{loadings}
            \end{figure}
            """
        if 'PCA' in to_report:
            latex_report += r"""
            \newpage
            \begin{raggedbottom}            
            \begin{figure}[h!]
            \centering
            \begin{minipage}[b]{0.33\textwidth}
                \centering
                \includegraphics[width=\linewidth]{influence_plot.png}
            \end{minipage}%
            \begin{minipage}[b]{0.33\textwidth}
                \centering
                \includegraphics[width=\linewidth]{hotelling_plot.png}
            \end{minipage}
        \caption{Outlier detection plots; (a) and (b) correspond, respectively, to the influence and Hotelling plots}
        \label{hotelling_and_influence}
        \end{figure}
        \end{raggedbottom}
        """

    elif 'Predictive model development' in to_report:
        latex_report += r"""\paragraph{}To develop a robust NIR calibration that formally correlates the spectral signature of the samples in the NIR region
          with the corresponding reference data obtained by analyzing the samples using a suitable reference analytical procedure,
            a pipeline consisting of consecutively performing spectral signal preprocessing followed by multivariable predictive modelling was applied.
              Signal preprocessing was performed by """
        
        if 'No_transformation' not in to_report:
            latex_report += r"""normalizing the raw spectra using {"""+signal_preprocess[to_report[3]]+ r""", then """
        
        if to_report[3] !="No_derivation":
            latex_report += r"""taking the {"""+to_report[2]+ r"""}-order derivative of a the {"""+to_report[4]+ r"""}-order Savitzky-Golay (SG)
            polynomial estimated over a moving window of {"""+to_report[5]+ r"""} data points"""
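        # Minimal sketch of the preprocessing just described: SNV normalization followed
        # by a Savitzky-Golay derivative (the actual preprocessing is applied elsewhere
        # in the application; the parameter names here are only illustrative).
        def _snv_sg_derivative(spectra, deriv=1, polyorder=2, window=11):
            from scipy.signal import savgol_filter
            snv = (spectra - spectra.mean(axis=1, keepdims=True)) / spectra.std(axis=1, keepdims=True)
            return savgol_filter(snv, window_length=window, polyorder=polyorder, deriv=deriv, axis=1)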

        latex_report += r""". The obtained preprocessed spectra were appropriately matched with the reference values, then Kennard-Stone (KS) algorithm \cite{ferreira2021kennard} was used for 
        to split the dataset into two data subsets (\cref{fig:Histogram} and \cref{table:desc_stats}) for regression modeling; training and testing subsets, the former, consisting of 80\% of the data, was used to
          develop a {"""+reg_algo[to_report[6]]+ r"""} predictive model, while the latter, consisting of the remaining 20\% of the data, was used to evaluate its
            predictive and generalizability performance.\par"""
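        # Illustrative Kennard-Stone sketch (the application may use a dedicated
        # implementation): start from the two most distant samples, then repeatedly add
        # the candidate whose minimum distance to the already-selected samples is largest.
        # `X` is assumed to be a (samples x features) array; not called by this module.
        def _kennard_stone(X, n_select):
            import numpy as np
            d = np.linalg.norm(X[:, None, :] - X[None, :, :], axis=-1)
            selected = list(np.unravel_index(np.argmax(d), d.shape))
            while len(selected) < n_select:
                remaining = [i for i in range(len(X)) if i not in selected]
                min_d = d[np.ix_(remaining, selected)].min(axis=1)
                selected.append(remaining[int(np.argmax(min_d))])
            return selected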
        
        if any(i in to_report for i in ('PLS', 'TPE-iPLS')):
            latex_report += r""" The latente variables for the {"""+to_report[6]+ r"""} based model were estimated using the Non-linear Iterative Partial Least Squares (NIPALS) algorithm that was first introduced by 
            the econometrician and statistician Herman Andreas Ole Wold \cite{wold1975path}."""
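            # Minimal NIPALS sketch for a single-response PLS (PLS1), illustrative only;
            # the model in this workflow is fitted elsewhere. X (n x p) and y (n,) are
            # assumed to be centred numpy arrays.
            def _nipals_pls1(X, y, n_components):
                import numpy as np
                X, y = X.astype(float).copy(), y.astype(float).copy()
                W, P, q = [], [], []
                for _ in range(n_components):
                    w = X.T @ y
                    w /= np.linalg.norm(w)                  # weight vector
                    t = X @ w                               # scores
                    p = X.T @ t / (t @ t)                   # X loadings
                    c = (y @ t) / (t @ t)                   # y loading
                    X, y = X - np.outer(t, p), y - c * t    # deflation
                    W.append(w); P.append(p); q.append(c)
                return np.array(W).T, np.array(P).T, np.array(q)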

        latex_report += r""" The evaluation of the model performance was performed by measuring its scores on a set of agnostic statistical metrics widely used to evaluate NIR calibration models, 
        specifically, the correlation coefficient (r), the coefficient of determination (R2), the Root Mean Squared Error (RMSE), the Mean Absolute Error (MAE), the Ratio of Performance to Deviation (RPD), the Ratio of 
          performance to inter-quartile (RPIQ) \cite{BellonMaurel2010}.\par"""
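        # Sketch of the evaluation metrics listed above, assuming `y` (measured) and
        # `y_pred` (predicted) are 1-D numpy arrays; the values shown in the report are
        # computed elsewhere in the application.
        def _calibration_metrics(y, y_pred):
            import numpy as np
            rmse = np.sqrt(np.mean((y - y_pred) ** 2))
            mae = np.mean(np.abs(y - y_pred))
            r = np.corrcoef(y, y_pred)[0, 1]
            r2 = 1 - np.sum((y - y_pred) ** 2) / np.sum((y - y.mean()) ** 2)
            q1, q3 = np.percentile(y, [25, 75])
            return {'r': r, 'R2': r2, 'RMSE': rmse, 'MAE': mae,
                    'RPD': y.std(ddof=1) / rmse, 'RPIQ': (q3 - q1) / rmse}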
        latex_report += r""" To optimize the performance of the calibration, the hyperparameters of predictive model and the selection of signal preprocessing methods were
         performed simultaneously and automatically using the Tree-Structured Parzen Estimator (TPE) as an optimization algorithm. The optimal preprocessing-hyperparameters combination
           was assumed to minimize the RMSE of 5-folds Cross-Validation (CV).\par"""
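        # One possible implementation of the joint preprocessing/hyperparameter search
        # described above, sketched with the optuna library (an assumption: the
        # application may rely on a different TPE implementation). It minimizes the RMSE
        # of a 5-fold CV; preprocessing choices could be added via trial.suggest_categorical.
        # Illustrative only, not called by this module.
        def _tpe_search(X, y, n_trials=50):
            import optuna
            from sklearn.cross_decomposition import PLSRegression
            from sklearn.model_selection import cross_val_score

            def objective(trial):
                n_comp = trial.suggest_int('n_components', 1, 20)
                return -cross_val_score(PLSRegression(n_components=n_comp), X, y,
                                        cv=5, scoring='neg_root_mean_squared_error').mean()

            study = optuna.create_study(direction='minimize')  # TPE sampler by default
            study.optimize(objective, n_trials=n_trials)
            return study.best_params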
        
        
    
        latex_report += r"""
        \begin{figure}[H]
        \centering
        \includegraphics[width=1\linewidth]{Histogram.png}
        \caption{KDE plot visualizing the distribution of the target variable across the dataset and the training and testing subsets}
        \label{fig:Histogram}
        \end{figure}
        """ + df0.style.format("${:.2f}$").to_latex( position_float = 'centering', hrules = True,
                                                     caption = 'Descriptive statistics of the target variable, subsets used to develop and validate the predictive model',
                                                     label= 'table:desc_stats') + r""""""

        
        latex_report += r"""
            \cref{fig:CV} and \cref{table:CV} show the CV results achieved with the best hyperparameters-preprocessing combination found by the optimization algorithm.
            These results are beneficial and important the evaluating of the bias-variance tradeoff. The best hyperparameters-preprocessing combination was identified 
            and used to create a predictive model that was evaluated for its explanatory (train) and predictive (test) performance (\cref{table:reg_perf}).\par

            \begin{figure}[h]
            \captionsetup{justification=centering}
            \centering
            \begin{minipage}[c]{0.5\textwidth}
                \includegraphics[width=\linewidth]{meas_vs_pred_cv_onebyone.png}
            \end{minipage}%
            \begin{minipage}[c]{0.5\textwidth}
                \includegraphics[width=\linewidth]{meas_vs_pred_cv_all.png}
            \end{minipage}%
            \caption{Visualization of measured vs predicted values for cross-validation}
        \label{fig:CV}
        \end{figure}
        """ + df2.style.format("${:.2f}$").to_latex(position_float = 'centering', hrules = True, caption = 'Cross-Validation summary', label= 'table:CV') + r"""
        """
        latex_report += df1.style.format("${:.2f}$").to_latex(position_float = 'centering', hrules = True, caption = 'Model performances summary', label= 'table:reg_perf')

        if "PLS" in to_report:
            latex_report += r"""To identify the most important and influential spectral regions in the model, Selectivity ratio (SR) \cite{kvalheim2020variable, farres2015comparison} test applied, and the important variables in the model were 
            visualized in \cref{fig:importance}. \par
            
            \begin{figure}[h]
            \centering
            \includegraphics[width=1\linewidth]{Variable_importance.png}
            \caption{Visualizing important spectral regions identified in the PLS model on the raw and preprocessed average spectrum}
            \label{fig:importance}
            \end{figure}
            """
        elif "LW-PLS" in to_report:
            latex_report += r"""The average of raw and preprocessed spectra is visualized in \cref{fig:importance}. \par
            
            \begin{figure}[h]
            \centering
            \includegraphics[width=1\linewidth]{Variable_importance.png}
            \caption{Visualizing the average spectrum computed for raw and preprocessed  spectra}
            \label{fig:importance}
            \end{figure}
            """
        elif "TPE-iPLS" in to_report:
            latex_report += r"""
            Many studies have shown that interval selection methods, with different numbers of intervals, help reduce noise and model overfitting,
              increase computational efficiency and the interpretability of results, and maximize the model's predictive accuracy. For the current analysis, the spectral 
              intervals or regions selected for predictive model development are visualized in \cref{fig:importanceipls}. \par

            \begin{figure}[h]
            \centering
            \includegraphics[width=1\linewidth]{Variable_importance.png}
            \caption{Visualizing spectral regions used for TPE-iPLS model development on the raw and preprocessed average spectrum}
            \label{fig:importanceipls}
            \end{figure}
            """

        # latex_report += r"""""" + df1.style.format("${:.2f}$").to_latex(position_float = 'centering', hrules = True, caption = 'Model performances summary', label= 'table:reg_perf') + r""""""
        latex_report += r"""  Following a numerical analysis of the model performance, measured against predicted values \cite{pauwels2019evaluating} and residuals against measured \cite{belloto1985residual} plots (\cref{fig:diagnosis}) were analysed to 
        visually assess the goodness of model fit and to detect potential flaws such as a pattern that the model failed to capture or outliers.\par

        \begin{figure}[h]
        \captionsetup{justification=centering}
            \centering
            \begin{minipage}[b]{0.5\textwidth}
                \includegraphics[width=\linewidth]{measured_vs_predicted.png}
            \end{minipage}%
            \begin{minipage}[b]{0.5\textwidth}
                \includegraphics[width=\linewidth]{residuals_plot.png}
            \end{minipage}%
            \caption{Post-hoc analysis of the developed predictive model; measured vs predicted values (a) and residuals vs measured values (b) plots}
        \label{fig:diagnosis}
        \end{figure}"""            
        
    latex_report += r"""                                                   
    \clearpage
    \pagebreak
    \newpage
    \section*{ACKNOWLEDGEMENTS}
    This tool is provided by the Chemical Analysis Platform for Ecology - Montpellier, France.\\
            Thanks to Abderrahim DIANE, Mouhcine MAIMOUNI, Alexandre GRANIER, Remy BEUGNON, Vincent NEGRE and Nicolas BARTHES.\\
            Source code available at \href{https://src.koda.cnrs.fr/cefe/pace/nirs_workflow}{CNRS forge}.

    \fontsize{8}{9}\selectfont
    \bibliographystyle{IEEEtran}
    % \bibliographystyle{abbrv}
    \bibliography{refs.bib}
    \clearpage
    \end{document}"""

    # export the .tex file in the report folder
    filename_path = Path("report/")
    filename = r'report.tex'
    with open(filename_path / filename, 'w+') as latex_file:
        latex_file.write(latex_report)

# create the Tex file - sections in args will be displayed: {'sample': 'Sample Selection', 'model': 'Model Creation', 'predict': 'Predictions', 'help': 'LaTeX help for figs and tables'}
# latex_report = report('sample', 'predict',)
@st.cache_data
def generate_report(change):
    my = Path("./report/report.pdf")
    if my.is_file():
        os.remove("./report/report.pdf")

    # path to pdflatex
    from config.config import pdflatex_path
    filename_path = Path("report/")
    filename = 'report.tex'
    # run the LaTeX toolchain: pdflatex, then bibtex, then pdflatex twice more so citations and cross-references resolve
    for i in range(4):
        print(i)
        if i == 1:
            proc = subprocess.Popen([pdflatex_path / 'bibtex.exe', filename[:-4]], cwd = filename_path)
            proc.communicate()
        else:
            proc = subprocess.Popen([pdflatex_path / 'pdflatex.exe', filename], cwd = filename_path)
            proc.communicate()
    # remove pdflatex compilation files
    extensions = ['.log', '.aux', '.bbl', '.blg', '.out']
    #for ext in extensions:
        #os.unlink(str(filename_path / filename[:-4]) + ext)
    # open the report
    # proc = subprocess.Popen([str(filename[:-4]) + '.pdf'], cwd = "./results", shell=True)
    # proc.communicate()