diff --git a/src/Class_Mod/DATA_HANDLING.py b/src/Class_Mod/DATA_HANDLING.py
index 56af54af8fa7f40d237e2febd6972590a1cb72d1..84cb46e0ec254a9526cf0eca39333f405b198655 100644
--- a/src/Class_Mod/DATA_HANDLING.py
+++ b/src/Class_Mod/DATA_HANDLING.py
@@ -80,4 +80,33 @@ def Snv(X):
     return pd.DataFrame(c.T, index=X.index, columns= X.columns)
 
 def Non(X):
-    return X
\ No newline at end of file
+    return X
+
+
+######################################## Cross val split ############################
+########################## Kennard-Stone split method for generating CV-folds ######################################
+def CV(x, y, n_folds:int):
+    test_folds = {}
+    folds_name = [f'cv{i+1}' for i in range(n_folds)]
+    kf = ks.KFold(n_splits=n_folds, device='cpu')
+
+    for i in range(n_folds):
+        d = []
+        for _, i_test in kf.split(x, y):
+            d.append(i_test)
+        test_folds[folds_name[i]] = d[i]
+    return test_folds
+
+################################### Cross validate a model and return predictions/fold and samples idx #######################################################
+def cross_val_predictor(model, x, y, nfolds):
+    x = np.array(x)
+    y = np.array(y)
+
+    yp = {}
+    folds = CV(x=x, y=y, n_folds=nfolds)  ### Test index
+    key = list(folds.keys())
+
+    for i in range(nfolds):
+        model.fit(np.delete(x, folds[key[i]], axis=0), np.delete(y, folds[key[i]], axis=0))
+        yp[key[i]] = model.predict(x[folds[key[i]]])  #### predictions/fold
+    return yp, folds
\ No newline at end of file
diff --git a/src/Class_Mod/Evaluation_Metrics.py b/src/Class_Mod/Evaluation_Metrics.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebe94c0fa613913c426ce373bb649f7e5315ae8b
--- /dev/null
+++ b/src/Class_Mod/Evaluation_Metrics.py
@@ -0,0 +1,53 @@
+from Packages import *
+
+class metrics:
+    def __init__(self, c:Optional[List] = None, cv:Optional[List] = None, t:Optional[List] = None, method = 'regression')-> pd.DataFrame:
+        phase = [c, cv, t]
+        index = np.array(["train", "cv", "test"])
+        notnone = [i for i in range(3) if phase[i] is not None]
+        met_index = index[notnone]
+        methods = ['regression', 'classification']
+        perf = {}
+        for i in notnone:
+            if method == 'regression':
+                perf[index[i]] = metrics.reg_(phase[i][0], phase[i][1])
+
+            elif method == 'classification':
+                perf[index[i]] = metrics.class_(phase[i][0], phase[i][1])
+
+        self.ret = pd.DataFrame(perf).T
+
+    @staticmethod
+    def reg_(meas, pred):
+        meas = np.array(meas).reshape(-1)
+        pred = np.array(pred).reshape(-1)  # flatten so (n, 1) predictions align with (n,) measurements
+        xbar = np.mean(meas)  # the average of measured values
+        e = np.subtract(meas, pred)
+        e2 = e**2  # the squared errors
+
+        # Sums of squares:
+        # TOTAL
+        sst = np.sum((meas - xbar)**2)
+        # RESIDUAL
+        ssr = np.sum(e2)
+        # REGRESSION OR MODEL
+        ssm = np.sum((pred - xbar)**2)
+
+
+        # Compute statistical metrics
+        metr = {}
+        metr['r'] = np.corrcoef(meas, pred)[0, 1]
+        metr['r2'] = 1-ssr/sst
+        metr['rmse'] = np.sqrt(np.mean(e2))
+        metr['mae'] = np.mean(np.abs(e))
+        metr['rpd'] = np.std(meas)/np.sqrt(np.mean(e2))
+        metr['rpiq'] = (np.quantile(meas, .75) - np.quantile(meas, .25))/np.sqrt(np.mean(e2))
+        return metr
+
+    @staticmethod
+    def class_(meas, pred):
+        pass
+
+    @property
+    def scores_(self):
+        return self.ret
\ No newline at end of file
diff --git a/src/Class_Mod/PLSR_.py b/src/Class_Mod/PLSR_.py
index 1ccd3d08ddf767e1a7a1a917f7c219d701849a33..062f17026d3ee8a6db86b869537b6989b39cc729 100644
--- a/src/Class_Mod/PLSR_.py
+++ b/src/Class_Mod/PLSR_.py
@@ -1,6 +1,6 @@
 from Packages import *
 from Class_Mod.Miscellaneous import *
-from Class_Mod.Regression_metrics import metrics
+from Class_Mod.Evaluation_Metrics import metrics
 
 class PinardPlsr:
     def __init__(self, x_train, y_train, x_test, y_test):
@@ -45,21 +45,6 @@ class PinardPlsr:
 
     def model_(self):
         return self.trained
 
-    @property
-    def metrics_(self):
-        metc = metrics(self.y_train, self.yc)
-        metc = metc.evaluate_
-
-        metcv = metrics(self.y_train, self.ycv)
-        metcv = metcv.evaluate_
-
-        mett = metrics( self.y_test, self.yt)
-        mett = mett.evaluate_
-
-        met = pd.concat([metc, metcv, mett], axis = 0)
-        met.index = ['calib','cv','test']
-        return met
-
     @property
     def pred_data_(self):
diff --git a/src/Class_Mod/PLSR_Preprocess.py b/src/Class_Mod/PLSR_Preprocess.py
index 72aa9841093da02e3bafc84371f9cbd3ce4e12f5..7904ef6d8033a6fc23acd0ac0e7703a60987acef 100644
--- a/src/Class_Mod/PLSR_Preprocess.py
+++ b/src/Class_Mod/PLSR_Preprocess.py
@@ -94,21 +94,7 @@ class PlsProcess:
     @property
     def model_(self):
         return self.model
-    @property
-    def metrics_(self):
-        metc = metrics(self.y_train, self.yc)
-        metc = metc.evaluate_
-
-        metcv = metrics(self.y_train, self.ycv)
-        metcv = metcv.evaluate_
-
-        mett = metrics( self.y_test, self.yt)
-        mett = mett.evaluate_
-
-        met = pd.concat([metc, metcv, mett], axis = 0)
-        met.index = ['calib','cv','test']
-        return met
-
+
     @property
     def pred_data_(self):
         return self.yc, self.ycv, self.yt
\ No newline at end of file
diff --git a/src/Class_Mod/Regression_metrics.py b/src/Class_Mod/Regression_metrics.py
deleted file mode 100644
index 5ec8cacb36eb7eecd515311aa3ec0f2f4120be24..0000000000000000000000000000000000000000
--- a/src/Class_Mod/Regression_metrics.py
+++ /dev/null
@@ -1,31 +0,0 @@
-from Packages import *
-class metrics:
-    def __init__(self, meas, pred):
-        if isinstance(meas, pd.DataFrame) or isinstance(meas, pd.Series):
-            self.meas = np.array(meas).reshape(-1)
-            self.pred = np.array(pred).reshape(-1)
-
-    @property
-    def evaluate_(self):
-        xbar = np.mean(self.meas) # the average of measured values
-        e = np.subtract(self.meas, self.pred)
-        e2 = e**2# the squared error
-
-        # Sum of squared:
-        # TOTAL
-        sst = np.sum((self.meas- xbar)**2)
-        # RESIDUAL
-        ssr = np.sum(e2)
-        # REGRESSION OR MODEL
-        ssm = np.sum(self.pred - xbar)
-
-
-        # Compute statistical metrics
-        metr = pd.DataFrame()
-        metr['r'] = [np.corrcoef(self.meas, self.pred)[0,1]]
-        metr['r2'] = [1-ssr/sst]
-        metr['rmse'] = [np.sqrt(np.mean(e2))]
-        metr['mae'] = [np.mean(np.abs(e2))]
-        metr['rpd'] = [np.std(self.meas)/np.sqrt(np.mean(e2))]
-        metr['rpiq'] = [(np.quantile(self.meas,.75)-np.quantile(self.meas,.25))/np.sqrt(np.mean(e2))]
-        return metr.round(3)
\ No newline at end of file
diff --git a/src/Class_Mod/SK_PLSR_.py b/src/Class_Mod/SK_PLSR_.py
index 7b9de1cbf2f1e776d91b97d700ab58b2893ec079..f08082f99fade1759f9837fb8ae059742f0456a3 100644
--- a/src/Class_Mod/SK_PLSR_.py
+++ b/src/Class_Mod/SK_PLSR_.py
@@ -1,64 +1,121 @@
 from Packages import *
 from Class_Mod.Miscellaneous import *
-from Class_Mod.Regression_metrics import metrics
-
-
-
+from Class_Mod.Evaluation_Metrics import metrics
+from Class_Mod.DATA_HANDLING import Snv
 class PlsR:
+    SCORE = 100000000
+
     def __init__(self, x_train, y_train, x_test, y_test):
+        self.PLS_params = {}
+        a = [0, 1, 2]        # candidate Savitzky-Golay polynomial orders
+        if min(a) == 0:      # derivative candidates are capped by the smallest candidate
+            b = [0]          # polyorder, so that deriv never exceeds polyorder
+        elif min(a) == 1:
+            b = [0, 1]
+        elif min(a) == 2:
+            b = [0, 1, 2]
+
+        self.PLS_params['Preprocess'] = {'Scatter':hp.choice('Scatter',['Snv', None]),
+                                         'window_length_sg':hp.choice('window_length_sg', [9, 13, 17, 21]),
+                                         'polyorder_sg':hp.choice('polyorder_sg',a),
+                                         'deriv_sg':hp.choice('deriv_sg', b)}
+
+        self.PLS_params['n_components'] = hp.choice("n_components", list(np.arange(1,21)))
+
         self.x_train = x_train
         self.x_test = x_test
         self.y_train = y_train
         self.y_test = y_test
-
-        self.trained = PLSRegression(n_components= self._optimize(), scale = False)
-        self.trained.fit(self.x_train, self.y_train)
+        self.p = self.x_train.shape[1]
+
+        trials = Trials()
+        best_params = fmin(fn=self.objective,
+                           space=self.PLS_params,
+                           algo=tpe.suggest,   # Tree of Parzen Estimators (TPE), a Bayesian approach
+                           max_evals=100,
+                           trials=trials,
+                           verbose=0)
+        #####################################################################################################
+        if self.best['Preprocess']['Scatter'] is None:
+            xtrain = self.x_train
+            xtest = self.x_test
+        elif self.best['Preprocess']['Scatter'] == 'Snv':
+            xtrain = Snv(self.x_train)
+            xtest = Snv(self.x_test)
+
+        x_train = savgol_filter(xtrain, window_length = self.best['Preprocess']['window_length_sg'],
+                                polyorder = self.best['Preprocess']['polyorder_sg'],
+                                deriv=self.best['Preprocess']['deriv_sg'])
+
+        x_test = savgol_filter(xtest, window_length = self.best['Preprocess']['window_length_sg'],
+                               polyorder = self.best['Preprocess']['polyorder_sg'],
+                               deriv=self.best['Preprocess']['deriv_sg'])
+
-        self.yc = pd.DataFrame(self.trained.predict(self.x_train)) # make predictions on test data and assign to Y_preds variable
-        self.ycv = pd.DataFrame(cross_val_predict(self.trained, self.x_train, self.y_train, cv = 3)) # make predictions on test data and assign to Y_preds variable
-        self.yt = pd.DataFrame(self.trained.predict(self.x_test)) # make predictions on test data and assign to Y_preds variable
+        ######################################################################################################
+        self.trained = PLSRegression(n_components= self.best['n_components'], scale = False)
+        self.trained.fit(x_train, self.y_train)
 
-    def _optimize(self):
-        nlv = 21
-        rmse = np.ones(21)
-        rmse[0] = 0.002
-        lv = {}
-        ratio = []
-        for i in range(1,nlv):
-            m = PLSRegression(n_components= i, scale = False)
-            ycv = cross_val_predict(m, self.x_train, self.y_train, cv = 5)
-            rmse[i] = mean_squared_error(self.y_train, ycv)
+        self.yc = pd.DataFrame(self.trained.predict(x_train))   # predictions on the calibration (training) data
+        self.ycv = pd.DataFrame(cross_val_predict(self.trained, x_train, self.y_train, cv = 3))   # cross-validated predictions on the training data
+        self.yt = pd.DataFrame(self.trained.predict(x_test))   # predictions on the test data
+        #######################################################################################################
+
+    def objective(self, params):
+        ws = params['Preprocess']['window_length_sg']
+        po = params['Preprocess']['polyorder_sg']
+        dr = params['Preprocess']['deriv_sg']
 
-            ratio.append(((rmse[i-1]-rmse[i])/rmse[i-1])*100)
-        return np.argmax(ratio)+1
+        if params['Preprocess']['Scatter'] is None:
+            xtrain = self.x_train
+            xtest = self.x_test
+        elif params['Preprocess']['Scatter'] == 'Snv':
+            xtrain = Snv(self.x_train)
+            xtest = Snv(self.x_test)
 
-    ################################################################################################################
+        x_train = savgol_filter(xtrain, window_length = params['Preprocess']['window_length_sg'],
+                                polyorder = params['Preprocess']['polyorder_sg'],
+                                deriv=params['Preprocess']['deriv_sg'])
+
+        x_test = savgol_filter(xtest, window_length = params['Preprocess']['window_length_sg'],
+                               polyorder = params['Preprocess']['polyorder_sg'],
+                               deriv=params['Preprocess']['deriv_sg'])
+
+        m = PLSRegression( n_components= params['n_components'], scale = False )
+        m.fit(x_train, self.y_train)
 
-    ################################################################################################################
+        yc = m.predict(x_train)
+        ycv = cross_val_predict( m, x_train, self.y_train, cv = 5)
+        yt = m.predict(x_test)
+
+        rmsec = mean_squared_error(self.y_train, yc)
+        rmsecv = mean_squared_error(self.y_train, ycv)
+        rmset = mean_squared_error(self.y_test, yt)
+
+        SCORE = (rmsecv/rmset) + (rmsecv/rmsec) + (rmset/rmsec)
+        if SCORE < PlsR.SCORE:
+            PlsR.SCORE = SCORE
+            self.best = params
+        print(SCORE)
+        print(params)
+        return SCORE
+
     @property
     def model_(self):
         return self.trained
-
-    @property
-    def metrics_(self):
-        metc = metrics(self.y_train, self.yc)
-        metc = metc.evaluate_
-
-        metcv = metrics(self.y_train, self.ycv)
-        metcv = metcv.evaluate_
-
-        mett = metrics( self.y_test, self.yt)
-        mett = mett.evaluate_
-
-        met = pd.concat([metc, metcv, mett], axis = 0)
-        met.index = ['calib','cv','test']
-        return met
+    @property
+    def best_hyperparams(self):
+        self.best
+        self.b = {'Scatter':self.best['Preprocess']['Scatter'], 'Saitzky-Golay derivative parameters':{'polyorder':self.best['Preprocess']['polyorder_sg'],
+                                                                                                       'deriv':self.best['Preprocess']['deriv_sg'],
+                                                                                                       'window_length':self.best['Preprocess']['window_length_sg']}}
+        return self.b
+
     @property
     def pred_data_(self):
-        return self.yc, self.ycv, self.yt
\ No newline at end of file
+        return self.yc, self.ycv, self.yt
diff --git a/src/Class_Mod/__init__.py b/src/Class_Mod/__init__.py
index 155f62559033d087dcb7c42c65e1bdaac7fb5aba..e9d334d53ab7739a5e4832d384016eaf1920b657 100644
--- a/src/Class_Mod/__init__.py
+++ b/src/Class_Mod/__init__.py
@@ -6,7 +6,7 @@ from .UMAP_ import Umap
 from .DATA_HANDLING import *
 from .PLSR_ import PinardPlsr
 from .LWPLSR_ import LWPLSR
-from .Regression_metrics import metrics
+from .Evaluation_Metrics import metrics
 from .VarSel import TpeIpls
 from .Miscellaneous import resid_plot, reg_plot
 from .DxReader import DxRead, read_dx
diff --git a/src/Packages.py b/src/Packages.py
index 7e8bb19e17b4ebdd58ad1d7b5ffeccaa02986807..debec3041c731c54064b7b5c1eb0f2a5ee393548 100644
--- a/src/Packages.py
+++ b/src/Packages.py
@@ -11,11 +11,13 @@ import datetime
 import numpy as np
 import pandas as pd
 from matplotlib import colors
+from typing import Optional, List
 from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
 import time
 from scipy.stats import skew, kurtosis
 from scipy.signal import savgol_filter, find_peaks_cwt, detrend
 import scipy as sc
+import kennard_stone as ks
 ### Exploratory data analysis-Dimensionality reduction
 from umap.umap_ import UMAP
 from sklearn.decomposition import PCA, NMF
diff --git a/src/data/params/Preprocessing.json b/src/data/params/Preprocessing.json
index 821db7c9c527189fe8a3a6dee1f63a20957e807e..fac056691fb6176471327aacf46b1d55a7c9f80b 100644
--- a/src/data/params/Preprocessing.json
+++ b/src/data/params/Preprocessing.json
@@ -1 +1 @@
-{"Scatter": "Non", "Saitzky-Golay derivative parameters": {"polyorder": 0, "deriv": 0, "window_length": 23}}
\ No newline at end of file
+{"Scatter": "Snv", "Saitzky-Golay derivative parameters": {"polyorder": 2, "deriv": 1, "window_length": 15}}
\ No newline at end of file
diff --git a/src/pages/2-model_creation.py b/src/pages/2-model_creation.py
index 86013f4393622706a6af041f359fd9fff7ed584e..f22968685cfc4893c0f74aed0ba575c28048776f 100644
--- a/src/pages/2-model_creation.py
+++ b/src/pages/2-model_creation.py
@@ -210,7 +210,7 @@ if not spectra.empty and not y.empty:
             json.dump(Reg.best_hyperparams, outfile)
 
         M2.write("-- Performance metrics --")
-        M2.dataframe(Reg.metrics_)
+        M2.dataframe(metrics(c = [y_train, yc], cv = [y_train, ycv], t = [y_test, yt], method='regression').scores_)
         #from st_circular_progress import CircularProgress
         #my_circular_progress = CircularProgress(label = 'Performance',value = 50, key = 'my performance',
         #                                        size = "medium", track_color = "black", color = "blue")
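
Usage note (not part of the diff): a minimal sketch of how the relocated metrics class is meant to be called, mirroring the new call site in src/pages/2-model_creation.py. The toy arrays and the model variable below are hypothetical stand-ins, and running it assumes src/ is on the Python path.

# Illustrative sketch only; mirrors metrics(c=..., t=..., method='regression').scores_
import numpy as np
from sklearn.cross_decomposition import PLSRegression
from Class_Mod.Evaluation_Metrics import metrics

# Hypothetical toy data: 50 calibration and 20 test spectra, 200 wavelengths each
x_train, y_train = np.random.rand(50, 200), np.random.rand(50)
x_test, y_test = np.random.rand(20, 200), np.random.rand(20)

model = PLSRegression(n_components=5, scale=False).fit(x_train, y_train)
yc = model.predict(x_train)   # calibration predictions
yt = model.predict(x_test)    # test-set predictions

# One row per supplied phase ('train', 'test'); columns r, r2, rmse, mae, rpd, rpiq
print(metrics(c=[y_train, yc], t=[y_test, yt], method='regression').scores_)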