From 2ec31fd74abb4906e49f3cfc657c5e09e20b1833 Mon Sep 17 00:00:00 2001 From: DIANE <abderrahim.diane@cefe.cnrs.fr> Date: Fri, 31 May 2024 12:08:06 +0200 Subject: [PATCH] =?UTF-8?q?-=20Cross=20validation=20methods=20-=20Implemen?= =?UTF-8?q?ting=20CV=20and=20visualizing=20the=20results=20-=20Organizatio?= =?UTF-8?q?n=20des=20classes=20des=20algos=20de=20r=C3=A9gression?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/Class_Mod/DATA_HANDLING.py | 36 ++++--- src/Class_Mod/RegModels.py | 185 +++++++++++++++++++++++++++++++++ src/Class_Mod/__init__.py | 3 +- src/Modules.py | 13 ++- src/pages/2-model_creation.py | 52 ++++++--- 5 files changed, 261 insertions(+), 28 deletions(-) create mode 100644 src/Class_Mod/RegModels.py diff --git a/src/Class_Mod/DATA_HANDLING.py b/src/Class_Mod/DATA_HANDLING.py index 1e8848b..ddce505 100644 --- a/src/Class_Mod/DATA_HANDLING.py +++ b/src/Class_Mod/DATA_HANDLING.py @@ -1,4 +1,5 @@ from Packages import * +from .Evaluation_Metrics import metrics ## try to automatically detect the field separator within the CSV def find_delimiter(filename): @@ -62,24 +63,17 @@ def MinMaxScale(X): sc = pd.DataFrame(sk.fit_transform(X), index = t.index, columns = t.columns) return sc - - ######################################## Spectral preprocessing - def Detrend(X): c = detrend(X, axis=-1, type='linear', bp=0, overwrite_data=False) return c -def Sg11(X): - c = savgol_filter(X, polyorder=1, deriv=1, window_length = 7) - return c - def Snv(X): xt = np.array(X).T c = (xt-xt.mean())/xt.std() return pd.DataFrame(c.T, index=X.index, columns= X.columns) -def Non(X): +def No_transformation(X): return X @@ -105,18 +99,36 @@ class KF_CV: y = np.array(y) yp = {} - folds = CV(x=x, y=y, n_folds=n_folds)### Test index + folds = KF_CV.CV(x=x, y=y, n_folds=n_folds)### Test index key = list(folds.keys()) for i in range(n_folds): model.fit(np.delete(x, folds[key[i]], axis=0), np.delete(y, folds[key[i]], axis=0)) yp[key[i]] = model.predict(x[folds[key[i]]]) #### predictions/fold - return yp, folds + + + cvcv = {} + coeff = {} + for i, Fname in enumerate(folds.keys()): + r = pd.DataFrame() + r['Predicted'] = yp[Fname] + r['Measured'] = y[folds[Fname]] + ols = LinearRegression().fit(pd.DataFrame(y[folds[Fname]]),yp[Fname].reshape(-1,1)) + r.index = folds[Fname] + r['Folds'] = [f'{Fname} (Predicted = {np.round(ols.intercept_[0], 2)} + {np.round(ols.coef_[0][0],2)} x Measured'] * r.shape[0] + cvcv[i] = r + coeff[Fname] = [ols.coef_[0][0], ols.intercept_[0]] + + data = pd.concat(cvcv, axis = 0) + data['index'] = [data.index[i][1] for i in range(data.shape[0])] + data.index = data['index'] + coeff = pd.DataFrame(coeff, index = ['Slope', 'Intercept']) + return yp, folds, data, coeff ### compute metrics for each fold @staticmethod def process(model, x, y, n_folds:int): - f, idx = KF_CV.cross_val_predictor(model, x=x,y=y, n_folds=n_folds) + f, idx,_ , _ = KF_CV.cross_val_predictor(model, x=x,y=y, n_folds=n_folds) e = {} for i in idx.keys(): e[i] = metrics().reg_(y.iloc[idx[i]],f[i]) @@ -138,7 +150,7 @@ class KF_CV: @staticmethod def ycv(model, x, y, n_folds:int): ycv = np.zeros(y.shape[0]) - f, idx = KF_CV.cross_val_predictor(model, x,y, n_folds) + f, idx,_,_ = KF_CV.cross_val_predictor(model, x,y, n_folds) for i in f.keys(): ycv[idx[i]] = f[i] return ycv diff --git a/src/Class_Mod/RegModels.py b/src/Class_Mod/RegModels.py new file mode 100644 index 0000000..fb56b48 --- /dev/null +++ b/src/Class_Mod/RegModels.py @@ -0,0 +1,185 @@ +from Packages import * +from Class_Mod import metrics, Snv, No_transformation, KF_CV + + +class Regmodel(object): + + def __init__(self, train : [pd.DataFrame, pd.DataFrame], test : [pd.DataFrame, pd.DataFrame], n_iter, add_hyperparams = None, nfolds = 5, **kwargs): + self.SCORE = 100000000 + self._xc, self._xt, self._ytrain, self._ytest = train[0], test[0], train[1], test[1] + self._nc, self._nt, self._p = train[0].shape[0], test[0].shape[0], train[0].shape[1] + self._model, self._best = None, None + self._yc, self._ycv, self._yt = None, None, None + self._cv_df = pd.DataFrame + self._nfolds = nfolds + self.bands = pd.DataFrame() + self.important_features = pd.DataFrame() + self._hyper_params = {'polyorder': hp.choice('polyorder', [0, 1, 2]), + 'deriv': hp.choice('deriv', [0, 1, 2]), + 'window_length': hp.choice('window_length', [15, 21, 27, 33]), + 'scatter': hp.choice('scatter', ['Snv', 'No_transformation'])} + if add_hyperparams is not None: + self._hyper_params.update(add_hyperparams) + self._best = None + + trials = Trials() + best_params = fmin(fn=self.objective, + space=self._hyper_params, + algo=tpe.suggest, # Tree of Parzen Estimators’ (tpe) which is a Bayesian approach + max_evals=n_iter, + trials=trials, + verbose=1) + + @property + def train_data_(self): + return [self._xc, self._ytrain] + + @property + def test_data_(self): + return [self._xt, self._ytest] + + @property + def get_params_(self): + return self._hyper_params + + def objective(self, params): + pass + + @property + def best_hyperparams(self): + return self._best + + @property + def model_(self): + return self._model + + @property + def pred_data_(self): + return self._yc, self._yt + + @property + def cv_data_(self): + return self._ycv + + @property + def CV_results_(self): + return self._cv_df + @property + def important_features_(self): + return self.important_features + +########################################### ######################################### +class Plsr(Regmodel): + def __init__(self, train: [pd.DataFrame, pd.DataFrame], test: [pd.DataFrame, pd.DataFrame], n_iter = 10): + super().__init__(train, test, n_iter, add_hyperparams = {'n_components': hp.randint('n_components', 2,20)}) + ### parameters in common + + def objective(self, params): + x0 = [self._xc, self._xt] + + x1 = [eval(str(params['scatter'])+"(x0[i])") for i in range(2)] + + a, b, c = params['deriv'], params['polyorder'], params['window_length'] + if a > b or b > c: + if self._best is not None: + a, b, c = self._best['deriv'], self._best['polyorder'], self._best['window_length'] + + else: + a, b, c = 0, 0, 1 + + params['deriv'], params['polyorder'], params['window_length'] = a, b, c + x2 = [savgol_filter(x1[i], polyorder=params['polyorder'], deriv=params['deriv'], window_length = params['window_length']) for i in range(2)] + + Model = PLSRegression(scale = False, n_components = params['n_components']) + self._cv_df = KF_CV().process(model = Model, x = x2[0], y = self._ytrain, n_folds = self._nfolds) + self._cv_df['Average'] = self._cv_df.mean(axis = 1) + self._cv_df['S'] = self._cv_df.std(axis = 1) + self._cv_df['CV(%)'] = self._cv_df['S'] * 100 / self._cv_df['Average'] + self._cv_df = self._cv_df.T.round(2) + score = self._cv_df.loc["CV(%)",'rmse'] + + Model = PLSRegression(scale = False, n_components = params['n_components']) + Model.fit(x2[0], self._ytrain) + + if self.SCORE > score: + self.SCORE = score + self._ycv = KF_CV().cross_val_predictor(model = Model, x = x2[0], y = self._ytrain, n_folds = self._nfolds) + self._yc = Model.predict(x2[0]) + self._yt = Model.predict(x2[1]) + self._model = Model + self._best = params + self.x2 = x2[0] + return score + + + ############################################ ######################################### +class TpeIpls(Regmodel): + def __init__(self, train: [pd.DataFrame, pd.DataFrame], test: [pd.DataFrame, pd.DataFrame], n_iter = 10, n_intervall = 5): + self.n_intervall = n_intervall + self.n_arrets = self.n_intervall*2 + self.bands = pd.DataFrame() + self.bands.index = ['from', 'to'] + + r = {'n_components': hp.randint('n_components', 2,20)} + r.update({f'v{i}': hp.randint(f'v{i}', 0, train[0].shape[1]) for i in range(1,self.n_arrets+1)}) + + super().__init__(train, test, n_iter, add_hyperparams = r) + ### parameters in common + + def objective(self, params): + ### wevelengths index + self.idx = [params[f'v{i}'] for i in range(1,self.n_arrets+1)] + self.idx.sort() + arrays = [np.arange(self.idx[2*i],self.idx[2*i+1]+1) for i in range(self.n_intervall)] + id = np.unique(np.concatenate(arrays, axis=0), axis=0) + + # ## Preprocessing + x0 = [self._xc, self._xt] + x1 = [eval(str(params['scatter'])+"(x0[i])") for i in range(2)] + + a, b, c = params['deriv'], params['polyorder'], params['window_length'] + if a > b or b > c: + if self._best is not None: + a, b, c = self._best['deriv'], self._best['polyorder'], self._best['window_length'] + + else: + a, b, c = 0, 0, 1 + + params['deriv'], params['polyorder'], params['window_length'] = a, b, c + x2 = [savgol_filter(x1[i], polyorder=params['polyorder'], deriv=params['deriv'], window_length = params['window_length']) for i in range(2)] + # print(x2) + + # ## Modelling + Model = PLSRegression(scale = False, n_components = params['n_components']) + self._cv_df = KF_CV().process(model = Model, x = x2[0][:,id], y = self._ytrain, n_folds = self._nfolds) + self._cv_df['Average'] = self._cv_df.mean(axis = 1) + self._cv_df['S'] = self._cv_df.std(axis = 1) + self._cv_df['CV(%)'] = self._cv_df['S'] * 100 / self._cv_df['Average'] + self._cv_df = self._cv_df.T.round(2) + score = self._cv_df.loc['CV(%)','rmse'] + + Model = PLSRegression(scale = False, n_components = params['n_components']) + Model.fit(x2[0][:,id], self._ytrain) + + if self.SCORE > score: + self.SCORE = score + self._ycv = KF_CV().cross_val_predictor(model = Model, x = x2[0][:,id], y = self._ytrain, n_folds = self._nfolds) + self._yc = Model.predict(x2[0][:,id]) + self._yt = Model.predict(x2[1][:,id]) + self._model = Model + self._best = params + self.x2 = x2[0][:,id] + self.segments = arrays + + for i in range(len(self.segments)): + self.bands[f'band{i+1}'] = [self.segments[i][0], self.segments[i][self.segments[i].shape[0]-1]] + self.bands.index = ['from','to'] + + return score + + ############################################ ######################################### + +class Pcr(Regmodel): + def __init__(self, train: [pd.DataFrame, pd.DataFrame], test: [pd.DataFrame, pd.DataFrame], n_iter = 10, n_val = 5): + super.__init__() + {f'pc{i}': hp.randint(f'pc{i+1}', 0, train[0].shape[1]) for i in range(self.n_val)} \ No newline at end of file diff --git a/src/Class_Mod/__init__.py b/src/Class_Mod/__init__.py index e9d334d..82330cb 100644 --- a/src/Class_Mod/__init__.py +++ b/src/Class_Mod/__init__.py @@ -7,7 +7,7 @@ from .DATA_HANDLING import * from .PLSR_ import PinardPlsr from .LWPLSR_ import LWPLSR from .Evaluation_Metrics import metrics -from .VarSel import TpeIpls +#from .VarSel import TpeIpls from .Miscellaneous import resid_plot, reg_plot from .DxReader import DxRead, read_dx from .HDBSCAN_Clustering import Hdbscan @@ -15,3 +15,4 @@ from .SK_PLSR_ import PlsR from .PLSR_Preprocess import PlsProcess from .NMF_ import Nmf from .Ap import AP +from .RegModels import Plsr, TpeIpls \ No newline at end of file diff --git a/src/Modules.py b/src/Modules.py index 21c64b5..db36ca7 100644 --- a/src/Modules.py +++ b/src/Modules.py @@ -1,9 +1,18 @@ from Packages import * -from Class_Mod import PlsR, LinearPCA, Umap, find_col_index, PinardPlsr, Nmf, AP +from Class_Mod import Plsr, LinearPCA, Umap, find_col_index, PinardPlsr, Nmf, AP from Class_Mod import LWPLSR, list_files, metrics, TpeIpls, reg_plot, resid_plot, Sk_Kmeans, DxRead, Hdbscan, read_dx, PlsProcess from Class_Mod.DATA_HANDLING import * from Class_Mod.Miscellaneous import prediction, download_results, plot_spectra, local_css from style.header import add_header from Report import report css_file = Path("style/") -local_css(css_file / "style.css") \ No newline at end of file + +local_css(css_file / "style.css") + +# path = os.path.dirname(os.path.abspath(__file__)).replace('\\','/') +# d1 = path.find('/') +# css_file = path[:d1]+'/style' +# st.session_state["interface"] = st.session_state.get('interface') +# if st.session_state["interface"] == 'simple': +# hide_pages("Predictions") +# local_css(css_file +"/style.css") diff --git a/src/pages/2-model_creation.py b/src/pages/2-model_creation.py index d3b560c..54ee0f9 100644 --- a/src/pages/2-model_creation.py +++ b/src/pages/2-model_creation.py @@ -10,6 +10,10 @@ add_header() st.session_state["interface"] = st.session_state.get('interface') if st.session_state["interface"] == 'simple': hide_pages("Predictions") + +#path = os.path.dirname(os.path.abspath(__file__)).replace('\\','/') +#css_file = path[:path.find('/pages')]+'/style' +#local_css(css_file +"/style_model.css") local_css(css_file / "style_model.css") ####################################### page Design ####################################### @@ -19,13 +23,17 @@ st.header("I - Data visualization", divider='blue') M0, M00 = st.columns([1, .4]) st.header("II - Model creation", divider='blue') M1, M2, M3 = st.columns([2,2,2]) +st.header("Cross_Validation") +cv1, cv2 = st.columns([2,2]) +cv3 = st.container() + st.header("III - Model Diagnosis", divider='blue') M7, M8 = st.columns([2,2]) M7.write('Predicted vs Measured values') M8.write('Residuals plot') M9 = st.container() M9.write("-- Save the model --") - ###################################################################### + ############################################################################################## reg_algo = ["","Full-PLSR", "Locally Weighted PLSR", "Interval-PLSR"] ####################################### ########################################### @@ -144,8 +152,7 @@ if not spectra.empty and not y.empty: regression_algo = M1.selectbox("Choose the algorithm for regression", options=reg_algo, key = 12) if regression_algo == reg_algo[1]: # Train model with model function from application_functions.py - Reg = PlsProcess(x_train = X_train, x_test = X_test, y_train = y_train, y_test = y_test, scale = False, Kfold=3) - Reg.tune(n_iter=500) + Reg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter=10) reg_model = Reg.model_ #M2.dataframe(Pin.pred_data_) elif regression_algo == reg_algo[2]: @@ -166,7 +173,7 @@ if not spectra.empty and not y.empty: Reg = type('obj', (object,), {'model' : pd.json_normalize(Reg_json['model']), 'pred_data_' : [pd.json_normalize(Reg_json[i]) for i in pred]}) for i in range(len(pred)): Reg.pred_data_[i] = Reg.pred_data_[i].T.reset_index().drop(columns = ['index']) - if i is not 4: + if i != 4: Reg.pred_data_[i].index = list(y_train.index) else: Reg.pred_data_[i].index = list(y_test.index) @@ -176,16 +183,15 @@ if not spectra.empty and not y.empty: it = M1.number_input(label='Enter the number of iterations', min_value=50, max_value=1000, value=100) progress_text = "The model is being created. Please wait." - Reg = TpeIpls(x_train = X_train, x_test=X_test, y_train = y_train, y_test = y_test, scale = False, Kfold = 3, n_intervall = s) + Reg = TpeIpls(train = [X_train, y_train], test=[X_test, y_test], n_intervall = s, n_iter=it) pro = M1.progress(0, text="The model is being created. Please wait!") - rega = Reg.BandSelect(n_iter = it) pro.empty() M1.progress(100, text = "The model has successfully been created!") time.sleep(1) reg_model = Reg.model_ M3.write('-- Spectral regions used for model creation --') - wls = rega[0] - M3.table(wls) + intervalls = Reg.bands.T + M3.table(intervalls) fig, ax = plt.subplots(figsize = (12, 6)) X_train.mean().plot(ax = ax) for i in range(s): @@ -193,12 +199,13 @@ if not spectra.empty and not y.empty: num = {'u','i','f','c'} if np.array(X_train.columns).dtype.kind in num: plt.plot(X_train.columns, X_train.mean(), color = 'black') - ax.axvspan(X_train.columns[rega[0]['from'][i]], X_train.columns[rega[0]['to'][i]], color='#2a52be', alpha=0.5, lw=0) + ax.axvspan(X_train.columns[intervalls['from'][i]], X_train.columns[intervalls['to'][i]], + color='#2a52be', alpha=0.5, lw=0) plt.tight_layout() plt.margins(x = 0) else: - plt.plot(np.arange(X_train.shape[1]), X_train.mean()) - ax.axvspan(rega[0]['from'][i], rega[0]['to'][i], color='#2a52be', alpha=0.5, lw=0) + plt.plot(np.arange(X_train.shape[1]), X_train.mean(), color = 'black') + ax.axvspan(intervalls['from'][i], intervalls['to'][i], color='#2a52be', alpha=0.5, lw=0) plt.tight_layout() plt.margins(x = 0) @@ -211,8 +218,26 @@ if not spectra.empty and not y.empty: ################# Model analysis ############ if regression_algo in reg_algo[1:]: + cv2.write('-- Cross-Validation Summary--') + cv2.write(Reg.CV_results_) + cv2.write('-- Out-of-Fold Predictions Visualization (All in one) --') + + fig1 = px.scatter(Reg.cv_data_[2], x ='Measured', y = 'Predicted' , trendline='ols', color='Folds', symbol="Folds", + color_discrete_sequence=px.colors.qualitative.G10) + fig1.add_shape(type='line', x0 = .95 * min(Reg.cv_data_[2].loc[:,'Measured']), x1 = 1.05 * max(Reg.cv_data_[2].loc[:,'Measured']), y0 = .95 * min(Reg.cv_data_[2].loc[:,'Measured']), y1 = 1.05 * max(Reg.cv_data_[2].loc[:,'Measured']), line = dict(color='black', dash = "dash")) + fig1.update_traces(marker_size=7, showlegend=False) + + cv2.plotly_chart(fig1) + + fig0 = px.scatter(Reg.cv_data_[2], x ='Measured', y = 'Predicted' , trendline='ols', color='Folds', symbol="Folds", facet_col = 'Folds',facet_col_wrap=1, + color_discrete_sequence=px.colors.qualitative.G10, text='index', width=800, height=1000) + fig0.update_traces(marker_size=8, showlegend=False) + + cv1.write('-- Out-of-Fold Predictions Visualization (Separate plots) --') + cv1.plotly_chart(fig0) + yc = Reg.pred_data_[0] - yt = Reg.pred_data_[2] + yt = Reg.pred_data_[1] #if M2.write('-- Spectral preprocessing info --') @@ -231,7 +256,8 @@ if not spectra.empty and not y.empty: M7.pyplot(reg_plot([y_train, y_test],[yc, yt], train_idx = train_index, test_idx = test_index)) M8.pyplot(resid_plot([y_train, y_test],[yc, yt], train_idx = train_index, test_idx = test_index)) - + + # rega = Reg.important_features_ ##### ADD FEATURES IMPORTANCE PLOT #model_export = M1.selectbox("Choose way to export", options=["pickle", "joblib"], key=20) model_name = M9.text_input('Give it a name') -- GitLab