diff --git a/requirements.txt b/requirements.txt
index 6298021f5c765695e1598f4ea09024c6f0e78223..ba7cbe31637ae005bf74dafc1419dbcbd09ae7bc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,10 +1,10 @@
-streamlit==1.33.0
+streamlit>=1.33.0
 st_pages==0.4.5
-requests==2.31.0
-Pillow==10.2.0
-protobuf==4.23.4
-watchdog==4.0.0
-pinard==1.1.0
+requests>=2.24.0
+Pillow==8.4.0
+protobuf==3.19.0
+watchdog==2.1.8
+pinard==1.0
 juliacall==0.9.19
 plotly==5.21.0
 pyodbc==5.1.0
diff --git a/src/Class_Mod/DATA_HANDLING.py b/src/Class_Mod/DATA_HANDLING.py
index 892c0c0854533b346a4e2363a61408c4d114a4ae..17e4dcb44161db3710fec2ccf94ad8363e35cfbc 100644
--- a/src/Class_Mod/DATA_HANDLING.py
+++ b/src/Class_Mod/DATA_HANDLING.py
@@ -80,6 +80,7 @@ def No_transformation(X):
 ######################################## Cross val split ############################
 class KF_CV:
     ### method for generating test sets index
+    ### CV returns a dict of test-set indices, one entry per fold
     @staticmethod
     def CV(x, y, n_folds:int):
         test_folds = {}
@@ -90,30 +91,45 @@ class KF_CV:
         for _, i_test in kf.split(x, y):
             d.append(i_test)
             test_folds[folds_name[i]] = d[i]
-        return test_folds
+        return test_folds ## returns a dict whose keys are the fold names and whose values are 1d numpy arrays holding the test-set indices
 
     ### Cross validate the model and return the predictions and samples index
     @staticmethod
-    def cross_val_predictor(model, x, y, n_folds:int):
+    def cross_val_predictor(model, folds, x, y):
+        """ model: the object to be cross-validated,
+        folds: a dict whose keys are the fold names and whose values are 1d numpy arrays of test-set indices (from the CV method),
+        x and y: the data used for CV"""
         x = np.array(x)
        y = np.array(y)
 
         yp = {}
-        folds = KF_CV.CV(x=x, y=y, n_folds=n_folds)### Test index
         key = list(folds.keys())
+        n_folds = len(folds.keys())
 
         for i in range(n_folds):
             model.fit(np.delete(x, folds[key[i]], axis=0), np.delete(y, folds[key[i]], axis=0))
             yp[key[i]] = model.predict(x[folds[key[i]]]) #### predictions/fold
+        return yp # returns a dict whose keys are the fold names and whose values are the y values predicted for each fold
 
+    @staticmethod
+    def meas_pred_eq(y, ypcv, folds):
+        """ y: the target variable,
+        ypcv: a dict whose keys are the fold names and whose values are 1d numpy arrays of predictions per fold (from the cross_val_predictor method),
+        folds: a dict whose keys are the fold names and whose values are 1d numpy arrays of test-set indices (from the CV method)
+
+        returns two dataframes:
+        - an n x 4 dataframe containing measured values, predicted values, the OLS regression equation, and the index (n is the total number of samples)
+        - a 2 x k dataframe containing the OLS regression coefficients (k is the number of folds)
+        """
         cvcv = {}
         coeff = {}
+        y = np.array(y)
         for i, Fname in enumerate(folds.keys()):
             r = pd.DataFrame()
-            r['Predicted'] = yp[Fname]
+            r['Predicted'] = ypcv[Fname]
             r['Measured'] = y[folds[Fname]]
-            ols = LinearRegression().fit(pd.DataFrame(y[folds[Fname]]),yp[Fname].reshape(-1,1))
+            ols = LinearRegression().fit(pd.DataFrame(y[folds[Fname]]), ypcv[Fname].reshape(-1,1))
             r.index = folds[Fname]
             r['Folds'] = [f'{Fname} (Predicted = {np.round(ols.intercept_[0], 2)} + {np.round(ols.coef_[0][0],2)} x Measured'] * r.shape[0]
             cvcv[i] = r
@@ -123,37 +139,47 @@ class KF_CV:
         data['index'] = [data.index[i][1] for i in range(data.shape[0])]
         data.index = data['index']
         coeff = pd.DataFrame(coeff, index = ['Slope', 'Intercept'])
-        return yp, folds, data, coeff
-
-    ### compute metrics for each fold
+        return data, coeff ## returns the values predicted in cross-validation and the regression coefficients
+
     @staticmethod
-    def process(model, x, y, n_folds:int):
-        f, idx,_ , _ = KF_CV.cross_val_predictor(model, x=x,y=y, n_folds=n_folds)
+    def metrics_cv(y, ypcv, folds):
+        y = np.array(y)
         e = {}
-        for i in idx.keys():
-            e[i] = metrics().reg_(y.iloc[idx[i]],f[i])
+        for i in folds.keys():
+            e[i] = metrics().reg_(y[folds[i]], ypcv[i])
         r = pd.DataFrame(e)
-        return r
+        r_print = r.copy()
+        r_print['mean'] = r.mean(axis = 1)
+        r_print['sd'] = r.std(axis = 1)
+        r_print['cv'] = 100*r.std(axis = 1)/r.mean(axis = 1)
+        return r.T, r_print.T
 
-    ### bias and variance
+    ### compute metrics for each fold
     @staticmethod
-    def cv_scores(model, x, y, n_folds:int):
-        x = KF_CV.process(model, x, y, n_folds)
-        mean = x.mean(axis = 1)
-        sd = x.std(axis = 1)
-        rsd = sd*100/mean
-        data = pd.concat([mean, sd, rsd], axis = 1).round(2)
-        data.columns = ['mean', 'sd', 'cv(%)']
-        return data
+    def cv_scores(y, ypcv, folds):
+        """ Takes as input the y vector, the dict of predicted values per fold (from the cross_val_predictor method), and the indices per fold (from the CV method),
+        and returns two dataframes: the first contains the metrics scores per fold, and the second is similar to the first but with additional mean, sd, and rsd variables
+        """
+        y = np.array(y)
+        e = {}
+        for i in folds.keys():
+            e[i] = metrics().reg_(y[folds[i]], ypcv[i])
+        r = pd.DataFrame(e)
+        r_print = r.copy()
+        r_print['mean'] = r.mean(axis = 1)
+        r_print['sd'] = r.std(axis = 1)
+        r_print['cv'] = 100*r.std(axis = 1)/r.mean(axis = 1)
+        return r.T, r_print.T
 
-    ### Return ycv
-    @staticmethod
-    def ycv(model, x, y, n_folds:int):
-        ycv = np.zeros(y.shape[0])
-        f, idx,_,_ = KF_CV.cross_val_predictor(model, x,y, n_folds)
-        for i in f.keys():
-            ycv[idx[i]] = f[i]
-        return ycv
+
+    # ### Return ycv
+    # @staticmethod
+    # def ycv(model, x, y, n_folds:int):
+    #     ycv = np.zeros(y.shape[0])
+    #     f, idx,_,_ = KF_CV.cross_val_predictor(model, x,y, n_folds)
+    #     for i in f.keys():
+    #         ycv[idx[i]] = f[i]
+    #     return ycv
 
 
 ### Selectivity ratio
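Reviewer note: the `KF_CV` refactor decouples fold generation (`CV`) from fitting (`cross_val_predictor`) and scoring (`metrics_cv`, `meas_pred_eq`). A minimal usage sketch mirroring the call sequence now used in `Plsr.objective`; the arrays are random placeholders, and the snippet assumes the import context of `DATA_HANDLING.py` (numpy, pandas, `LinearRegression`, the `metrics` class):

```python
# Hedged usage sketch of the new KF_CV API; data values are placeholders.
import numpy as np
from sklearn.cross_decomposition import PLSRegression

x = np.random.random((50, 200))   # placeholder spectra (samples x wavelengths)
y = np.random.random(50)          # placeholder target

folds = KF_CV.CV(x, y, n_folds=3)                    # dict: fold name -> test indices
model = PLSRegression(scale=False, n_components=5)
yp = KF_CV.cross_val_predictor(model, folds, x, y)   # dict: fold name -> predictions
scores_per_fold, scores_summary = KF_CV.metrics_cv(y, yp, folds)
cv_table, ols_coeffs = KF_CV.meas_pred_eq(y, yp, folds)
```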
diff --git a/src/Class_Mod/Miscellaneous.py b/src/Class_Mod/Miscellaneous.py
index 4280bb31df9e9516443cd3f5000d67523ee2865c..69d7d240a6f88a4580eea86c21e2c05c112458d8 100644
--- a/src/Class_Mod/Miscellaneous.py
+++ b/src/Class_Mod/Miscellaneous.py
@@ -130,7 +130,7 @@ def desc_stats(x):
     a['Mean'] = np.mean(x)
     a['Median'] = np.median(x)
     a['S'] = np.std(x)
-    a['RSD(%)'] = np.std(x)*100/np.mean(x)
-    a['Skewness'] = skew(x, axis=0, bias=True)
-    a['Kurtosis'] = kurtosis(x, axis=0, bias=True)
+    a['RSD'] = np.std(x)*100/np.mean(x)
+    a['Skew'] = skew(x, axis=0, bias=True)
+    a['Kurt'] = kurtosis(x, axis=0, bias=True)
     return a
\ No newline at end of file
diff --git a/src/Class_Mod/RegModels.py b/src/Class_Mod/RegModels.py
index 49813dbdd93dc5ca4a2a08d0c413a202fb3cf2cb..ce07a07e6bf541d8e078dfe12846d96d4868e28a 100644
--- a/src/Class_Mod/RegModels.py
+++ b/src/Class_Mod/RegModels.py
@@ -44,17 +44,17 @@ class Regmodel(object):
         return self.pretreated
 
     @property
-    def get_params_(self):
+    def get_params_(self): ### This method returns the search space in which the optimization algorithm looks for the optimal subset of hyperparameters
         return self._hyper_params
 
     def objective(self, params):
         pass
 
     @property
-    def best_hyperparams_(self):
+    def best_hyperparams_(self): ### This method returns the subset of selected hyperparameters
         return self._best
 
     @property
-    def best_hyperparams_print(self):
+    def best_hyperparams_print(self): ### This method returns a sentence describing the signal preprocessing methods that were applied
         if self._best['normalization'] == 'Snv':
             a = 'Standard Normal Variate (SNV)'
@@ -66,15 +66,15 @@ class Regmodel(object):
         return SG+"\n"+Norm
 
     @property
-    def model_(self):
+    def model_(self): # This method returns the developed model
         return self._model
 
     @property
-    def pred_data_(self):
+    def pred_data_(self): ## This method returns the data predicted in the training and testing steps
         return self._yc, self._yt
 
     @property
-    def cv_data_(self):
+    def cv_data_(self): ## Cross-validation data
         return self._ycv
 
     @property
@@ -91,7 +91,7 @@ class Regmodel(object):
     def sel_ratio_(self):
         return self._sel_ratio
 
-########################################### #########################################
+########################################### PLSR #########################################
 class Plsr(Regmodel):
     def __init__(self, train, test, n_iter = 10):
         super().__init__(train, test, n_iter, add_hyperparams = {'n_components': hp.randint('n_components', 2,20)})
@@ -115,19 +115,23 @@ class Plsr(Regmodel):
             x2 = [savgol_filter(x1[i], polyorder=params['polyorder'], deriv=params['deriv'], window_length = params['window_length']) for i in range(2)]
 
         Model = PLSRegression(scale = False, n_components = params['n_components'])
-        self._cv_df = KF_CV().process(model = Model, x = x2[0], y = self._ytrain, n_folds = self._nfolds)
-        self._cv_df['Average'] = self._cv_df.mean(axis = 1)
-        self._cv_df['S'] = self._cv_df.std(axis = 1)
-        self._cv_df['CV(%)'] = self._cv_df['S'] * 100 / self._cv_df['Average']
-        self._cv_df = self._cv_df.T.round(2)
-        score = self._cv_df.loc["CV(%)",'rmse']
+        # self._cv_df = KF_CV().process(model = Model, x = x2[0], y = self._ytrain, n_folds = self._nfolds)
+        # self._cv_df['Average'] = self._cv_df.mean(axis = 1)
+        # self._cv_df['S'] = self._cv_df.std(axis = 1)
+        # self._cv_df['CV(%)'] = self._cv_df['S'] * 100 / self._cv_df['Average']
+        # self._cv_df = self._cv_df.T.round(2)
+        folds = KF_CV().CV(x = x2[0], y = np.array(self._ytrain), n_folds = self._nfolds)
+        yp = KF_CV().cross_val_predictor(model = Model, folds = folds, x = x2[0], y = np.array(self._ytrain))
+        self._cv_df = KF_CV().metrics_cv(y = np.array(self._ytrain), ypcv = yp, folds = folds)[1]
+
+        score = self._cv_df.loc["cv",'rmse']
 
         Model = PLSRegression(scale = False, n_components = params['n_components'])
         Model.fit(x2[0], self._ytrain)
 
         if self.SCORE > score:
             self.SCORE = score
-            self._ycv = KF_CV().cross_val_predictor(model = Model, x = x2[0], y = self._ytrain, n_folds = self._nfolds)
+            self._ycv = KF_CV().meas_pred_eq(y = np.array(self._ytrain), ypcv = yp, folds = folds)
             self._yc = Model.predict(x2[0])
             self._yt = Model.predict(x2[1])
             self._model = Model
@@ -141,7 +145,7 @@ class Plsr(Regmodel):
         return score
 
 
-    ############################################ #########################################
+    ############################################ TPE-iPLS #########################################
 class TpeIpls(Regmodel):
     def __init__(self, train, test, n_iter = 10, n_intervall = 5):
         self.n_intervall = n_intervall
@@ -179,26 +183,29 @@ class TpeIpls(Regmodel):
         # print(x2)
 
         # ## Modelling
+        folds = KF_CV().CV(x = x2[0], y = np.array(self._ytrain), n_folds = self._nfolds)
         try:
             Model = PLSRegression(scale = False, n_components = params['n_components'])
-            self._cv_df = KF_CV().process(model = Model, x = x2[0][:,id], y = self._ytrain, n_folds = self._nfolds)
+            yp = KF_CV().cross_val_predictor(model = Model, folds = folds, x = x2[0][:,id], y = np.array(self._ytrain))
+            self._cv_df = KF_CV().metrics_cv(y = np.array(self._ytrain), ypcv = yp, folds = folds)[1]
         except ValueError as ve:
             params["n_components"] = 1
             Model = PLSRegression(scale = False, n_components = params['n_components'])
-            self._cv_df = KF_CV().process(model = Model, x = x2[0][:,id], y = self._ytrain, n_folds = self._nfolds)
-
-        self._cv_df['Average'] = self._cv_df.mean(axis = 1)
-        self._cv_df['S'] = self._cv_df.std(axis = 1)
-        self._cv_df['CV(%)'] = self._cv_df['S'] * 100 / self._cv_df['Average']
-        self._cv_df = self._cv_df.T.round(2)
-        score = self._cv_df.loc['CV(%)','rmse']
+            yp = KF_CV().cross_val_predictor(model = Model, folds = folds, x = x2[0][:,id], y = np.array(self._ytrain))
+            self._cv_df = KF_CV().metrics_cv(y = np.array(self._ytrain), ypcv = yp, folds = folds)[1]
+        # self._cv_df['Average'] = self._cv_df.mean(axis = 1)
+        # self._cv_df['S'] = self._cv_df.std(axis = 1)
+        # self._cv_df['CV(%)'] = self._cv_df['S'] * 100 / self._cv_df['Average']
+        # self._cv_df = self._cv_df.T.round(2)
+        score = self._cv_df.loc['cv','rmse']
 
         Model = PLSRegression(scale = False, n_components = params['n_components'])
         Model.fit(x2[0][:,id], self._ytrain)
 
         if self.SCORE > score:
             self.SCORE = score
-            self._ycv = KF_CV().cross_val_predictor(model = Model, x = x2[0][:,id], y = self._ytrain, n_folds = self._nfolds)
+            self._ycv = KF_CV().meas_pred_eq(y = np.array(self._ytrain), ypcv = yp, folds = folds)
+
             self._yc = Model.predict(x2[0][:,id])
             self._yt = Model.predict(x2[1][:,id])
             self._model = Model
@@ -216,7 +223,9 @@ class TpeIpls(Regmodel):
         return score
 
 
-    ############################################ #########################################
+
+    ########################################### LWPLSR #########################################
+    ############################################ Pcr #########################################
 class Pcr(Regmodel):
     def __init__(self, train, test, n_iter = 10, n_val = 5):
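Reviewer note: `best_hyperparams_print` and the objectives above refer to SNV normalization plus a Savitzky-Golay derivative parameterized by `deriv`, `polyorder`, and `window_length`. A hedged sketch of those two steps; the row-wise SNV definition is the textbook one and is an assumption, not code taken from this repository:

```python
# Minimal SNV + Savitzky-Golay preprocessing sketch; spectra are placeholders.
import numpy as np
from scipy.signal import savgol_filter

def snv(spectra: np.ndarray) -> np.ndarray:
    """Standard Normal Variate: center and scale each spectrum (row) individually."""
    return (spectra - spectra.mean(axis=1, keepdims=True)) / spectra.std(axis=1, keepdims=True)

x = np.random.random((10, 200))  # placeholder spectra
# 1st derivative of a 2nd-order polynomial over a 15-point moving window,
# matching the (deriv, polyorder, window_length) hyperparameters searched above
x_prep = savgol_filter(snv(x), window_length=15, polyorder=2, deriv=1)
```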
"Interval-PLSR": "Tree-structured Parzen estimator-interval Partial Least Squares (TPE-iPLS)"} + to_report=[] j=0 for arg in args: - if isinstance(arg, str): - to_report.append(arg) + if isinstance(arg, str) or isinstance(arg, int): + to_report.append(str(arg)) elif isinstance(arg, list): - to_report.extend(arg) + to_report.extend(list(map(str, arg))) elif isinstance(arg, pd.DataFrame): df_name = 'df' + str(j) j+=1 - globals()[df_name] = arg - + globals()[df_name] = arg.select_dtypes(include=['float64', 'int64']) + latex_report = "" latex_report += r"""\documentclass[a4paper,10pt]{article} \usepackage{fancyhdr} \usepackage{graphicx} \usepackage{geometry} + \usepackage{changepage} \geometry{a4paper, left=2cm, right=2cm, top=1.5cm, bottom=3cm } \usepackage{caption, subcaption} \usepackage{hyperref} @@ -63,71 +70,66 @@ def report(*args): \begin{center} \textbf{{\Large NIRS WORKFLOW REPORT}} \\ \end{center}""" - if 'sample_selection' in to_report: - latex_report += r"""\noindent - \textbf{QUERY MADE: } Sample selection performing.\\ - \noindent - \textbf{ENTERED INPUTS: }{"""+to_report[1] + r"""}\\ - \textbf{PRINCIPLE OF RESPONSE TO THE QUERY:} Representative subset selection has - been performed using the "sample selection" workflow that consists of applying - a sequence of data processing techniques, specifically, dimensionality reduction, - clustering, and samples selection techniques.""" - - latex_report += r"""\section*{RESULTS}""" - latex_report += r"""\subsection*{Spectral data visualization}""" - latex_report += r"""Acquired spectra were visualized in fig1 by plotting the intensity - of absorption, reflectance, transmission, etc, against the wavelengths or wavenumbers. - This helps observe general patterns and trends in the spectra, and understand the - variability within the data. - \begin{figure}[h] - \centering - \includegraphics[width=1\linewidth]{spectra_plot.png} - \caption{Acquired spectra} - \label{fig:raw_spectra} - \end{figure}""" + latex_report += r"""\noindent + \textbf{QUERY MADE: }{"""+ re.sub(r'([_%])', r'\\\1',to_report[0])+ r"""}.\\ + \noindent + \textbf{ENTERED INPUTS: }{"""+ re.sub(r'([_%])', r"\\\1", to_report[1])+ r"""}.\\""" + latex_report += r"""\section*{Results}""" + latex_report += r"""\subsection*{Spectral data visualization}""" + latex_report += r"""Acquired spectra were visualized in fig1 by plotting the intensity + of absorption, reflectance, transmission, etc, against the wavelengths or wavenumbers. + This helps observe general patterns and trends in the spectra, and understand the + variability within the data. + \begin{figure}[h] + \centering + \includegraphics[width=1\linewidth]{spectra_plot.png} + \caption{Acquired spectra} + \label{fig:raw_spectra} + \end{figure}""" + if 'Representative subset selection' in to_report: latex_report += r"""\subsection*{Multivariable Data Analysis}""" latex_report += r""" For optimal selection of subset of the samples to analyze through the - reference method, a workflow consisting of consecutively applying features extraction/dimensionality + reference method, a pipeline consisting of consecutively applying features extraction/dimensionality reduction and clustering analysis was developed. Features extraction was performed by means of {"""+dim_red_methods[to_report[2]] + r"""} technique which helps represent the high dimensional spectra in a reduced perceptible 3D - subspace spanned by a few number of features (three features in our case), on of the spectra. 
- While clustering analysis was performed using the {"""+cluster_methods[to_report[3]] + r"""} technique which - helps group the data into groups of spectra that share the same carachteristics. - This workflow is widely used in the world of spectral data analysis for detecting outliers, - analysis the homogenity of the data, reducing the computational costs prior to supervised predictive modelling, etc.\\*""" + subspace spanned by a few number of features (three features in our case), while clustering analysis was performed + using the {"""+cluster_methods[to_report[3]] + r"""} technique which + helps group the data into groups of spectra that share the same carachteristics.\\*""" if "PCA" in to_report: latex_report += r"""\indent To detect the presence of any spectral outliers, the influence and residuals plots were constructed, - with outlyingness limits established at the 95\% confidence level. Together, these plots helps distinguish Regular Observations (ROs), - which form a homogeneous group near the subspace generated by the PCs; Good Leverage Points (GLPs), - which are at the same plane as the subspace but distant from the ROs; Orthogonal Observations (OOs), which have a - large residual distance to the subspace, but whose projection is on the subspace; and, finally, Bad Leverage - Points (BLPs), which have a large residual distance such that the projection on the subspace is away from ROs.\\*""" + with outlyingness limits established at the 95\% confidence level. Together, these plots helps distinguish regular observations, + which form a homogeneous group near the subspace generated by the PCs; good leverage points, + which are at the same plane as the subspace but distant from the ROs; orthogonal observations, which have a + large residual distance to the subspace, but whose projection is on the subspace; and, finally, bad leverage + points, which have a large residual distance such that the projection on the subspace is away from regular observations.\\*""" latex_report += """\indent Results of applying this workflow are displayed in fig. 1. Based of the features extracted using {"""+to_report[2]+ r"""}, {"""+to_report[3]+ r"""} revealed the existance of {"""+to_report[5] + r"""} data clusters that are visualized with different colors. 
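Reviewer note: `report()` escapes `_` and `%` in user-supplied strings before embedding them in the LaTeX source. A self-contained demonstration of that `re.sub` pattern; note it deliberately covers only those two characters, so other LaTeX specials (`&`, `#`, `$`, ...) would need the character class extended:

```python
# Demonstration of the escaping used in report() for user-supplied strings.
import re

def latex_escape(text: str) -> str:
    return re.sub(r'([_%])', r'\\\1', text)

print(latex_escape("file_name_10%.csv"))  # -> file\_name\_10\%.csv
```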
         \begin{figure}[h]
+        \captionsetup{justification=centering}
         \centering
         \begin{minipage}[b]{0.33\textwidth}
-        \centering
         \includegraphics[width=\linewidth]{scores_pc1_pc2.png}
         \end{minipage}%
         \begin{minipage}[b]{0.33\textwidth}
-        \centering
         \includegraphics[width=\linewidth]{scores_pc1_pc3.png}
         \end{minipage}%
         \begin{minipage}[b]{0.33\textwidth}
-        \centering
         \includegraphics[width=\linewidth]{scores_pc2_pc3.png}
         \end{minipage}
-        \caption{The pairwise projection of spectra on the reduced 3D subspace.}
+        \centering
+        \caption{Illustration of the pairwise projection of spectra onto the reduced 3-dimensional subspace, clustering, and sample selection
+        results: data points with the same color belong to the same cluster, and data points colored in black correspond to the samples to be
+        analyzed by a standard reference analytical procedure}
         \label{pcaplots}
         \end{figure}"""
+        latex_report += r""" """
 
     if 'PCA' in to_report:
         latex_report += r"""
-        \begin{figure}
+        \begin{figure}[ht]
         \centering
         \begin{minipage}[b]{0.33\textwidth}
         \centering
@@ -137,21 +139,82 @@ def report(*args):
         \centering
         \includegraphics[width=\linewidth]{hotelling_plot.png}
         \end{minipage}
-        \caption{The pairwise projection of spectra on the reduced 3D subspace.}
+        \caption{Outlier detection plots: (a) and (b) correspond, respectively, to the Hotelling and influence plots}
         \label{hotelling_and_influence}
         \end{figure}
        """
 
         latex_report += r"""Following the exploratory data analysis, a subset sampling method, consisting of"""
-        if 'random' in to_report:
-            latex_report += r""" selecting the sample with the least euclidian distance to the center of each data cluster identified by {"""+to_report[3]+ r"""},"""
         if 'center' in to_report:
-            latex_report += r""" fitting a second clustering model, specifically kmeans, to each data cluster and selecting
-            3 samples or less from each subcluster (if a subcluster contains less than 3 samples, then all samples included
-            in this subcluster are selected),"
+            latex_report += r""" selecting {"""+to_report[7]+ r"""} samples, each from a distinct cluster, with the least Euclidean distance to the center of the cluster identified by {"""+to_report[3]+ r"""} and to which the sample belongs."""
+        if 'random' in to_report:
+            latex_report += r""" fitting a second clustering model, specifically k-means, to each individual data cluster and selecting {"""+to_report[7]+ r"""}
+            samples or fewer from each subcluster (if a subcluster contains fewer than 3 samples, then all samples included
+            in this subcluster are selected), was applied."""
+
+        latex_report += r"""The subset of selected samples is identified to be representative and is suggested to be used for robust NIR calibration development,
+        i.e., to be analyzed by adequate reference analytical procedures (generally requiring destructive sample preparation)."""
 
-            the center was applied to select representative samples to be used for robust NIR calibration developement
-            , i.e, to be analyzed by adequate reference analytical procedures (generally requiring destructive sample preparation)"""
+    elif 'Predictive model development' in to_report:
+        latex_report += r"""\paragraph{}To develop a robust NIR calibration that formally correlates the spectral signature of the samples in the NIR region
+        with the corresponding reference data obtained by analyzing the samples using a suitable reference analytical procedure,
+        a pipeline consisting of consecutively performing spectral signal correction followed by multivariable predictive modelling was applied.
+        Signal correction was performed by """
+        if 'No_transformation' not in to_report:
+            latex_report += r"""normalizing the raw spectra using {"""+signal_preprocess[to_report[3]]+ r"""}, then """
+
+        if to_report[3] != "No_derivation":
+            latex_report += r"""taking the {"""+to_report[2]+ r"""}-order derivative of the {"""+to_report[4]+ r"""}-order Savitzky-Golay (SG)
+            polynomial estimated over a moving window of {"""+to_report[5]+ r"""} data points"""
+        latex_report += r""". Subsequently, the obtained data was split into two subsets using the Kennard-Stone (KS) algorithm: a calibration (Cal) and a validation
+        (Val) subset; the former, consisting of 80\% of the data, was used for multivariable calibration development, while the latter, consisting of
+        the remaining 20\% of the data, was used for evaluating the predictive performance and the generalizability of the developed calibration."""
+        latex_report += r""" To optimally select the hyperparameters of the model and of the signal preprocessing methods, prevent the model from overfitting the data,
+        and optimize the predictive performance of the model, 5-fold Cross-Validation (CV) was performed."""
+        latex_report += r"""\paragraph{} Fig. 5 and Table 6 display a descriptive summary of the input data, train set, and test set."""
+
+        latex_report += r"""
+        \begin{figure}[h]
+        \centering
+        \includegraphics[width=1\linewidth]{Histogram.png}
+        \caption{KDE plot visualizing the distribution of the target variable in the whole dataset, the training subset, and the testing subset}
+        \label{fig:Histogram}
+        \end{figure}
+        """ + df0.style.format("${:.2f}$").to_latex(position_float = 'centering', hrules = True,
+        caption = 'Descriptive statistics of the target variable, subsets used to develop and validate the predictive model',
+        label= 'reg_perf') + r""""""
+
+
+        latex_report += r"""Predictive model development was performed using the {"""+reg_algo[to_report[6]]+ r"""} regression method."""
+        if "Full-PLSR" in to_report:
+            latex_report += r"""The most important and influential spectral regions in the model were visualized in Fig. 5."""
+        elif "Locally Weighted PLSR" in to_report:
+            pass
+        elif "Interval-PLSR" in to_report:
+            latex_report += r"""Three intervals were selected by the TPE-iPLS."""
+
+        latex_report += r"""The calibration, CV, and prediction performances achieved by the developed model were evaluated
+        by measuring its scores on a set of agnostic statistical metrics widely used to evaluate NIR calibration models,
+        specifically the Root Mean Squared Error (RMSE), the Ratio of Performance to Deviation (RPD), and the Ratio of
+        Performance to InterQuartile distance (RPIQ). A table summarizing the model performance is shown below (Table 4).\par""" + """
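Reviewer note: the 2D score-plot export loop in `1-samples_selection.py` resolves `axis1`..`axis3` with `eval`. An eval-free sketch of the same loop; `tcr` (scores dataframe), `labels`, and the axis variables mirror the page's names and are assumed to be in scope, and writing PNGs still requires the kaleido engine:

```python
# Eval-free variant of the pairwise score-plot export: index a list of axis
# column names directly instead of eval'ing variable names.
from itertools import combinations
import plotly.express as px

axes = [axis1, axis2, axis3]  # the three score-plot axis column names
for i, j in combinations(range(3), 2):
    fig = px.scatter(tcr, x=axes[i], y=axes[j],
                     color=labels if list(labels) else None)
    fig.write_image(f'./Report/Figures/scores_pc{i+1}_pc{j+1}.png')
```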
        """ + df1.style.format("${:.2f}$").to_latex(position_float = 'centering', hrules = True, caption = 'Model performances summary', label= 'reg_perf') + r""""""
+
+        if "Full-PLSR" in to_report:
+            latex_report += r""" To identify the important variables in the model, the Variable Importance in Projection (VIP) test was applied, and the selected variables were
+            visualized in Fig. 8. \par
+
+            \begin{figure}[h]
+            \centering
+            \includegraphics[width=1\linewidth]{Variable_importance.png}
+            \caption{Visualizing the important spectral regions identified in the PLS model on the raw and preprocessed average spectrum}
+            \label{fig:Variable_importance}
+            \end{figure}
+            """
+
+        latex_report += r"""After numerically analyzing the performance of the model, a visual investigation (Figs. 7 and 8) of the goodness of the model fit was performed to identify potential
+        issues such as a pattern that has not been captured by the model, or outliers.\par
+        """
 
     latex_report += r"""
     \fontsize{8}{9}\selectfont
diff --git a/src/form_data.json b/src/form_data.json
new file mode 100644
index 0000000000000000000000000000000000000000..d53ad6fd77167794b213e4bacb16327ab2458e39
--- /dev/null
+++ b/src/form_data.json
@@ -0,0 +1 @@
+{"meta_project": "Life of Brian", "meta_sample_species": "Life of Brian", "meta_sample_category": "Soil", "meta_sample_pretreatment": "Powder", "meta_machine_ID": "Life of Brian", "meta_sample_sub_category": "Green leave", "meta_sample_humidity": "Dry", "meta_scan_place": "Pace"}
\ No newline at end of file
diff --git a/src/pages/1-samples_selection.py b/src/pages/1-samples_selection.py
index 37b4492731ffb0175317d11227ccb9f442387466..60e2de22c3a4ec06709230aed96b8e1fb564eee1 100644
--- a/src/pages/1-samples_selection.py
+++ b/src/pages/1-samples_selection.py
@@ -62,6 +62,9 @@ labels = []
 color_palette = None
 dr_model = None # dimensionality reduction model
 cl_model = None # clustering model
+selection = None
+selection_number = None
+custom_color_palette = None # guards the scores-plot branch reached when no clustering was run
 
 # loader for datafile
 data_file = col1.file_uploader("Load NIRS Data", type=["csv","dx"], help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5)
@@ -202,11 +204,13 @@ if not t.empty:
         # all_labels, hdbscan_score, clu_centers = optimized_hdbscan.HDBSCAN_scores_
         all_labels, clu_centers = optimized_hdbscan.HDBSCAN_scores_
         labels = [f'cluster#{i+1}' if i !=-1 else 'Non clustered' for i in all_labels]
+        ncluster = len(clu_centers)
 
     # 3- Affinity propagation
     elif clus_method == cluster_methods[3]:
         cl_model = AP(X = tcr)
         data, labels, clu_centers = cl_model.fit_optimal_
+        ncluster = len(clu_centers)
 
     if clus_method == cluster_methods[2]:
         #clustered = np.where(np.array(labels) != 'Non clustered')[0]
@@ -227,6 +231,8 @@ selected_samples_idx = []
 
 if labels:
+    num_clusters = len(np.unique(labels))
+    custom_color_palette = px.colors.qualitative.Plotly[:num_clusters]
     if clus_method:
         selection = scores.radio('Select samples selection strategy:', options = selec_strategy, index = default_sample_selection_option, key=102)
@@ -297,11 +303,8 @@ if not t.empty:
             st.write('Scores plot')
             # scores plot with clustering
             if list(labels) and meta_data.empty:
-                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = labels)
+                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = labels, color_discrete_sequence = custom_color_palette)
                 sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = labels, ax = ax1)
-
-
-
             # scores plot with metadata
             elif len(list(labels)) == 0 and not meta_data.empty:
                 filter = md_df_st_.columns
@@ -341,34 +344,33 @@ if not t.empty:
                    sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = list(map(str.lower,md_df_st_[col])), ax = ax3)
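Reviewer note: the LWPLSR branch below materializes one train/test split per fold before exporting the arrays to Julia. A compact sketch of that indexing logic; the arrays are placeholders and `KF_CV` comes from `src/Class_Mod/DATA_HANDLING.py`:

```python
# Per-fold split sketch: for each fold, the test rows are the stored indices
# and the train rows are everything else (np.delete).
import numpy as np

x_train_np = np.random.random((30, 100))  # placeholder train spectra
y_train_np = np.random.random(30)         # placeholder train target
folds = KF_CV.CV(x_train_np, y_train_np, n_folds=3)

splits = {name: (np.delete(x_train_np, idx, axis=0),  # train X
                 np.delete(y_train_np, idx, axis=0),  # train y
                 x_train_np[idx],                     # test X
                 y_train_np[idx])                     # test y
          for name, idx in folds.items()}
```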
 
             else:
-                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3)
+                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = labels if list(labels) else None, color_discrete_sequence = custom_color_palette)
                 sns.scatterplot(data = tcr, x = axis1, y =axis2 , ax = ax1)
 
             fig.update_traces(marker=dict(size=4))
 
             if selected_samples_idx:
                 tt = tcr.iloc[selected_samples_idx,:]
-                fig.add_scatter3d(x = tt.loc[:,axis1], y = tt.loc[:,axis2],
-                                  z = tt.loc[:,axis3], mode ='markers', marker = dict(size = 5, color = 'black'),
+                fig.add_scatter3d(x = tt.loc[:,axis1], y = tt.loc[:,axis2], z = tt.loc[:,axis3],
+                                  mode ='markers', marker = dict(size = 5, color = 'black'),
                                   name = 'selected samples')
-            st.plotly_chart(fig, use_container_width=True)
+            st.plotly_chart(fig, use_container_width = True)
 
             if labels:
-                num_clusters = len(np.unique(labels))
-
-                custom_color_palette = px.colors.qualitative.Plotly[:num_clusters]
-
-                # Créer et exporter le graphique Axe1-Axe2 en PNG
+                # export 2D scores plot
                 comb = [i for i in combinations([1,2,3], 2)]
                 subcap = ['a','b','c']
                 for i in range(len(comb)):
-                    fig_axe1_axe2 = px.scatter(tcr, x=eval(f'axis{str(comb[i][0])}'), y=eval(f'axis{str(comb[i][1])}'),
-                                               color=labels if list(labels) else None,
-                                               color_discrete_sequence= custom_color_palette)
-                    fig_axe1_axe2.update_layout(font=dict(size=23))
-                    fig_axe1_axe2.add_annotation(text= f'({subcap[i]})', align='center', showarrow= False, xref='paper', yref='paper', x=-0.13, y= 1,
-                                                 font= dict(color= "black", size= 35), bgcolor ='white', borderpad= 2, bordercolor= 'black', borderwidth= 3)
-                    fig_axe1_axe2.update_traces(marker=dict(size= 10), showlegend= False)
-                    fig_axe1_axe2.write_image(f'./Report/Figures/scores_pc{str(comb[i][0])}_pc{str(comb[i][1])}.png')
+                    fig_export = px.scatter(tcr, x = eval(f'axis{str(comb[i][0])}'), y = eval(f'axis{str(comb[i][1])}'),
+                                            color = labels if list(labels) else None,
+                                            color_discrete_sequence = custom_color_palette)
+                    fig_export.add_scatter(x = tt.loc[:,eval(f'axis{str(comb[i][0])}')], y = tt.loc[:,eval(f'axis{str(comb[i][1])}')],
+                                           mode ='markers', marker = dict(size = 5, color = 'black'),
+                                           name = 'selected samples')
+                    fig_export.update_layout(font=dict(size=23))
+                    fig_export.add_annotation(text= f'({subcap[i]})', align='center', showarrow= False, xref='paper', yref='paper', x=-0.13, y= 1,
+                                              font= dict(color= "black", size= 35), bgcolor ='white', borderpad= 2, bordercolor= 'black', borderwidth= 3)
+                    fig_export.update_traces(marker=dict(size= 10), showlegend= False)
+                    fig_export.write_image(f'./Report/Figures/scores_pc{str(comb[i][0])}_pc{str(comb[i][1])}.png')
@@ -378,9 +380,6 @@ if not spectra.empty:
         st.write('Loadings plot')
         p = dr_model.loadings_
         freq = pd.DataFrame(colnames, index=p.index)
-
-
-
         if test =='.dx':
             if meta_data.loc[:,'xunits'][0] == '1/cm':
                 freq.columns = ['Wavenumber (1/cm)']
@@ -390,7 +389,6 @@ if not spectra.empty:
                 freq.columns = ['Wavelength (nm)']
                 xlab = 'Wavelength (nm)'
                 inv = None
-
         else:
             freq.columns = ['Wavelength/Wavenumber']
             xlab = 'Wavelength/Wavenumber'
@@ -476,8 +474,10 @@ if not spectra.empty:
                 annotation.font.size = 35
             fig.update_layout(font=dict(size=23), width=800, height=600)
             fig.update_traces(marker=dict(size= 10), showlegend= False)
+            fig.add_annotation(text= '(a)', align='center', showarrow= False, xref='paper', yref='paper', x=-0.125, y= 1,
+                               font= dict(color= "black", size= 35), bgcolor ='white', borderpad= 2, bordercolor= 'black', borderwidth= 3)
             fig.write_image('./Report/figures/influence_plot.png', engine = 'kaleido')
 
         with hotelling:
             st.write('T²-Hotelling vs Q-residuals plot')
             # Hotelling
@@ -518,6 +518,8 @@ if not spectra.empty:
                 annotation.font.size = 35
             fig.update_layout(font=dict(size=23), width=800, height=600)
             fig.update_traces(marker=dict(size= 10), showlegend= False)
+            fig.add_annotation(text= '(b)', align='center', showarrow= False, xref='paper', yref='paper', x=-0.125, y= 1,
+                               font= dict(color= "black", size= 35), bgcolor ='white', borderpad= 2, bordercolor= 'black', borderwidth= 3)
 
             fig.write_image("./Report/figures/hotelling_plot.png", format="png")
 
@@ -528,5 +530,5 @@ nb_clu = str(sam1.shape[0])
 if data_file:
     with st.container():
         if st.button("Download report"):
-            latex_report = report.report('sample_selection', data_file.name, dim_red_method, clus_method, Nb_ech, ncluster, selection, nb_clu,tcr, sam)
-            report.compile_latex()
+            latex_report = report.report('Representative subset selection', data_file.name, dim_red_method, clus_method, Nb_ech, ncluster, selection, selection_number, nb_clu, tcr, sam)
+            report.compile_latex()
\ No newline at end of file
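Reviewer note: the LWPLSR branch above rebuilds a model-like object from the Julia JSON output with `type('obj', (object,), {...})`, a lightweight stand-in for the `Regmodel` API. A tiny illustration of the pattern; the payload fields and values here are made up for the example:

```python
# Dynamic result-object sketch: build a class whose attributes mimic Regmodel.
import pandas as pd

payload = {'model': 'lwplsr',
           'best_lwplsr_params': {'n_neighbors': 20},  # hypothetical parameter
           'pred_data_train': {'y1': {'0': 0.12, '1': 0.34}}}
Reg = type('obj', (object,), {'model_': payload['model'],
                              'best_hyperparams_': payload['best_lwplsr_params'],
                              'pred_data_': [pd.json_normalize(payload['pred_data_train'])]})
print(Reg.model_, Reg.best_hyperparams_)
```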
diff --git a/src/pages/2-model_creation.py b/src/pages/2-model_creation.py
index 6ba279e338d2f1558044a188624b4bf0ca007d76..43b0e5fae3142a9dddae711701f9d374ae0ae654 100644
--- a/src/pages/2-model_creation.py
+++ b/src/pages/2-model_creation.py
@@ -17,8 +17,6 @@ if os.path.exists(repertoire_a_vider):
         elif os.path.isdir(chemin_fichier):
             os.rmdir(chemin_fichier)
 
-json_sp = pd.DataFrame()
-
 local_css(css_file / "style_model.css")
 
 ####################################### page Design #######################################
@@ -40,17 +38,13 @@ M9 = st.container()
 M9.write("-- Save the model --")
 ##############################################################################################
 
-reg_algo = ["","Full-PLSR", "Locally Weighted PLSR", "Interval-PLSR"]
-
 ####################################### ###########################################
 files_format = ['.csv', '.dx']
 file = M00.radio('Select files format:', options = files_format)
-
-### Data
 spectra = pd.DataFrame()
 y = pd.DataFrame()
-
-
+regression_algo = None
+Reg = None
 # load .csv file
 if file == files_format[0]:
     xcal_csv = M00.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
@@ -61,6 +55,8 @@ if file == files_format[0]:
                                options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+xcal_csv.name))), key=1)
         if hdrx == "yes": col = 0
         else: col = False
+    else:
+        M00.warning('Insert your spectral data file here!')
 
     ycal_csv = M00.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
     if ycal_csv:
@@ -68,8 +64,11 @@ if file == files_format[0]:
         hdry = M00.radio("samples name (Y file)?: ", options=["no", "yes"], key=3)
         if hdry == "yes": col = 0
         else: col = False
+    else:
+        M00.warning('Insert your target data file here!')
 
     if xcal_csv and ycal_csv:
+        file_name = str(xcal_csv.name) + ' and ' + str(ycal_csv.name)
         xfile = pd.read_csv(xcal_csv, decimal='.', sep=sepx, index_col=col, header=0)
         yfile = pd.read_csv(ycal_csv, decimal='.', sep=sepy, index_col=col)
@@ -93,17 +92,15 @@ if file == files_format[0]:
                     spectra = pd.DataFrame
 
             else:
-                M1.warning('Tune decimal and separator parameters')
-
-
-
-
-
+                M00.error('Error: the data has not been loaded successfully; please consider tuning the decimal and separator parameters!')
 
 ## Load .dx file
 elif file == files_format[1]:
     data_file = M00.file_uploader("Select Data", type=".dx", help=" :mushroom: select a dx file")
-    if data_file:
+    if not data_file:
+        M00.warning('Load your file here!')
+    else:
+        file_name = str(data_file.name)
         with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
             tmp.write(data_file.read())
             tmp_path = tmp.name
@@ -115,7 +112,7 @@ elif file == files_format[1]:
             y = chem_data.loc[:,yname].loc[measured]
             spectra = spectra.loc[measured]
         else:
-            M00.warning('Warning: Chemical data are not included in your file !', icon="⚠️")
+            M00.warning('Warning: your file includes no target variables to model!', icon="⚠️")
         os.unlink(tmp_path)
 
 ### split the data
@@ -142,7 +139,7 @@ if not spectra.empty and not y.empty:
     ax1.margins(0)
     plt.tight_layout()
     M0.pyplot(fig) ######## Loaded graph
-    fig.savefig("./Report/figures/Spectre_mod.png")
+    fig.savefig("./Report/figures/spectra_plot.png")
     fig, ax2 = plt.subplots(figsize = (12,3))
     sns.histplot(y, color="deeppink", kde = True,label="y",ax = ax2, fill=True)
     sns.histplot(y_train, color="blue", kde = True,label="y (train)",ax = ax2, fill=True)
@@ -152,31 +149,38 @@ if not spectra.empty and not y.empty:
     plt.tight_layout()
     M0.pyplot(fig)
 
-    fig.savefig("./Report/figures/histo.png")
+    fig.savefig("./Report/figures/Histogram.png")
     M0.write('Loaded data summary')
-    M0.write(pd.DataFrame([desc_stats(y_train),desc_stats(y_test),desc_stats(y)], index =['Train', 'Test', 'Total'] ).round(2))
-    LoDaSum=pd.DataFrame([desc_stats(y_train),desc_stats(y_test),desc_stats(y)], index =['Train', 'Test', 'Total'] ).round(2)
+    M0.write(pd.DataFrame([desc_stats(y_train), desc_stats(y_test), desc_stats(y)], index = ['train', 'test', 'total']).round(2))
+    stats = pd.DataFrame([desc_stats(y_train), desc_stats(y_test), desc_stats(y)], index = ['train', 'test', 'total']).round(2)
     ####################################### Insight into the loaded data
 
-    #######################################
-    regression_algo = M1.selectbox("Choose the algorithm for regression", options=reg_algo, key = 12)
+    ####################################### Model creation ###################################################
+    reg_algo = ["", "Full-PLSR", "Locally Weighted PLSR", "Interval-PLSR"]
+    regression_algo = M1.selectbox("Choose the algorithm for regression", options = reg_algo, key = 12, placeholder = "Choose an option")
 
+    # split the train data into nb_folds folds for cross-validation
+    nb_folds = 3
+    folds = KF_CV.CV(X_train, y_train, nb_folds)
+
+    if not regression_algo:
+        M1.warning('Choose a modelling algorithm from the dropdown list!')
 
     if regression_algo == reg_algo[1]:
         # Train model with model function from application_functions.py
-        Reg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter=10)
+        Reg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter=1)
         reg_model = Reg.model_
         #M2.dataframe(Pin.pred_data_)
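Reviewer note: the derivative/polyorder labelling in the following hunk hard-codes ordinal suffixes; it could be factored into a small helper. This is an illustrative refactor only, not repository code:

```python
# Hypothetical ordinal-suffix helper for the preprocessing-parameter labels.
def ordinal(n: int) -> str:
    if n == 0:
        return '0'
    suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n if n < 20 else n % 10, 'th')
    return f"{n}{suffix}"

assert [ordinal(i) for i in range(5)] == ['0', '1st', '2nd', '3rd', '4th']
```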
open("data/params/Preprocessing.json", "w") as outfile: json.dump(Reg.best_hyperparams_, outfile) -########## +# ########## M1.write("-- Model performance --") - M1.dataframe(metrics(c = [y_train, yc], t = [y_test, yt], method='regression').scores_) - + if regression_algo != "Locally Weighted PLSR": + M1.dataframe(metrics(c = [y_train, yc], t = [y_test, yt], method='regression').scores_) + else: + M1.dataframe(metrics(t = [y_test, yt], method='regression').scores_) model_per=pd.DataFrame(metrics(c = [y_train, yc], t = [y_test, yt], method='regression').scores_) #from st_circular_progress import CircularProgress #my_circular_progress = CircularProgress(label = 'Performance',value = 50, key = 'my performance', @@ -349,16 +376,34 @@ if not spectra.empty and not y.empty: #my_circular_progress.st_circular_progress() #my_circular_progress.update_value(progress=20) - a=reg_plot([y_train, y_test],[yc, yt], train_idx = train_index, test_idx = test_index) + if regression_algo != "Locally Weighted PLSR": + a = reg_plot([y_train, y_test],[yc, yt], train_idx = train_index, test_idx = test_index) + else: + a = reg_plot([y_train, y_test],[yc, yt], train_idx = train_index, test_idx = test_index) M7.pyplot(a) - plt.savefig('./Report/figures/Predictedvs.png') + plt.savefig('./Report/figures/measured_vs_predicted.png') + prep_para = Reg.best_hyperparams_ + if regression_algo != "Locally Weighted PLSR": + prep_para.pop('n_components') + for i in ['deriv','polyorder']: + if Reg.best_hyperparams_[i] == 0: + prep_para[i] = '0' + elif Reg.best_hyperparams_[i] == 1: + prep_para[i] = '1st' + elif Reg.best_hyperparams_[i] > 1: + prep_para[i] = f"{Reg.best_hyperparams_[i]}nd" + + if regression_algo != "Locally Weighted PLSR": + residual_plot = resid_plot([y_train, y_test], [yc, yt], train_idx=train_index, test_idx=test_index) + else: + residual_plot = resid_plot([y_train, y_test], [yc, yt], train_idx=train_index, test_idx=test_index) - residual_plot = resid_plot([y_train, y_test], [yc, yt], train_idx=train_index, test_idx=test_index) M8.pyplot(residual_plot) - plt.savefig('./Report/figures/residual_plot.png') - - rega = Reg.selected_features_ ##### ADD FEATURES IMPORTANCE PLOT + plt.savefig('./Report/figures/residuals_plot.png') + + if regression_algo != "Locally Weighted PLSR": + rega = Reg.selected_features_ ##### ADD FEATURES IMPORTANCE PLOT #model_export = M1.selectbox("Choose way to export", options=["pickle", "joblib"], key=20) model_name = M9.text_input('Give it a name') @@ -397,28 +442,15 @@ if not spectra.empty and not y.empty: ] ) st.page_link('pages\\3-prediction.py', label = 'Keep on keepin\' on to predict your values !') - -## Load .dx file -Ac_Km = ['histo.png', 'Spectre_mod.png','Predictions_V.png','Allinone.png','Predictedvs.png','residual_plot.png'] -with st.container(): - if st.button("Download the report"): - if regression_algo == reg_algo[1]: - latex_report = report.report(LoDaSum, 'model',Ac_Km,a_Test,json_sp,model_per,'full_plsr',cv99) - report.compile_latex() - else: - pass - - else: - pass + -if not spectra.empty and not y.empty: +if not spectra.empty and not y.empty and regression_algo: if regression_algo in reg_algo[1:] and Reg is not None: fig, (ax1, ax2) = plt.subplots(2,1, figsize = (12, 4), sharex=True) ax1.plot(colnames, np.mean(X_train, axis = 0), color = 'black', label = 'Average spectrum (Raw)') - ax2.plot(colnames, np.mean(Reg.pretreated_spectra_ , axis = 0), color = 'black', label = 'Average spectrum (pretreated)') - - + if regression_algo != "Locally Weighted PLSR": + 
ax2.plot(colnames, np.mean(Reg.pretreated_spectra_ , axis = 0), color = 'black', label = 'Average spectrum (pretreated)') ax2.set_xlabel('Wavelenghts') plt.tight_layout() @@ -436,12 +468,31 @@ if not spectra.empty and not y.empty: eval(f'ax{i+1}').axvspan(min, max, color='#00ff00', alpha=0.5, lw=0) if regression_algo == reg_algo[1]: - ax1.scatter(colnames[np.array(Reg.sel_ratio_.index)], np.mean(X_train, axis = 0).ravel()[np.array(Reg.sel_ratio_.index)], + ax1.scatter(colnames[np.array(Reg.sel_ratio_.index)], np.mean(X_train, axis = 0)[np.array(Reg.sel_ratio_.index)], color = 'red', label = 'Important variables') - ax2.scatter(colnames[Reg.sel_ratio_.index], np.mean(Reg.pretreated_spectra_, axis = 0).ravel()[np.array(Reg.sel_ratio_.index)], + ax2.scatter(colnames[Reg.sel_ratio_.index], np.mean(Reg.pretreated_spectra_, axis = 0)[np.array(Reg.sel_ratio_.index)], color = 'red', label = 'Important variables') ax1.legend() ax2.legend() - M2.write('-- Visualization of the spectral regions used for model creation -- ') - M2.pyplot(fig) \ No newline at end of file + M2.write('-- Visualization of the spectral regions used for model creation --') + fig.savefig("./Report/figures/Variable_importance.png") + M2.pyplot(fig) + +## Load .dx file +if Reg is not None: + with st.container(): + if st.button("Download the report"): + if regression_algo == reg_algo[1]: + latex_report = report.report('Predictive model development', file_name, stats, list(Reg.best_hyperparams_.values()), regression_algo, model_per, cv_results) + report.compile_latex() + if regression_algo is None: + st.warning('Data processing has not been performed or finished yet!', icon = "âš ï¸") + else: + pass + + else: + pass + + + \ No newline at end of file