diff --git a/src/pages/2-model_creation.py b/src/pages/2-model_creation.py index 3ab2543ddb6487cdab66fedc473eef5282d3d270..c9a338934b7c364a4aa7ae1866ada17a16e3aade 100644 --- a/src/pages/2-model_creation.py +++ b/src/pages/2-model_creation.py @@ -289,7 +289,7 @@ if not x_block.empty and not y.empty: with c4: # select type of supervised modelling problem mode = c4.radio("The nature of the target variable :", - options=['Continuous', 'Categorical']) + options=['Continuous', 'Categorical'], disabled =True) hash_ = ObjectHash(current=hash_, add=mode) match st.session_state["interface"]: @@ -352,123 +352,129 @@ if not x_block.empty and not y.empty: # Training set preparation for cross-validation(CV) with c5: # Model columns nb_folds = 3 - @st.cache_data def RequestingModelCreation(change): + from utils.regress import Plsr + pre = Plsr(train=[X_train, y_train], test=[ + X_test, y_test], n_iter=40, cv=nb_folds) global Model match model_type: case 'PLS': - from utils.regress import Plsr - Model = Plsr(train=[X_train, y_train], test=[ - X_test, y_test], n_iter=100, cv=nb_folds) + Model = pre case 'TPE-iPLS': from utils.regress import TpeIpls Model = TpeIpls(train=[X_train, y_train], test=[ - X_test, y_test], n_intervall=internum, n_iter=iternum, cv=nb_folds) - + X_test, y_test], n_intervall=internum, n_iter=iternum, cv=nb_folds, bestglobalparams = pre.best_hyperparams_) + Model.best_fit() + case 'LW-PLS': - # split train data into nb_folds for cross_validation - folds = KF_CV.CV(X_train, y_train, nb_folds) - # export data to csv for Julia train/test - global x_train_np, y_train_np, x_test_np, y_test_np - data_to_work_with = ['x_train_np', - 'y_train_np', 'x_test_np', 'y_test_np'] - x_train_np, y_train_np, x_test_np, y_test_np = X_train.to_numpy( - ), y_train.to_numpy(), X_test.to_numpy(), y_test.to_numpy() - # Cross-Validation calculation - d = {} - for i in range(nb_folds): - d["xtr_fold{0}".format(i+1)], d["ytr_fold{0}".format(i+1)], d["xte_fold{0}".format(i+1)], d["yte_fold{0}".format(i+1)] = np.delete(x_train_np, folds[list( - folds)[i]], axis=0), np.delete(y_train_np, folds[list(folds)[i]], axis=0), x_train_np[folds[list(folds)[i]]], y_train_np[folds[list(folds)[i]]] - data_to_work_with.append("xtr_fold{0}".format(i+1)) - data_to_work_with.append("ytr_fold{0}".format(i+1)) - data_to_work_with.append("xte_fold{0}".format(i+1)) - data_to_work_with.append("yte_fold{0}".format(i+1)) - # check best pre-treatment with a global PLSR model - from utils.regress import Plsr - pre = Plsr(train=[X_train, y_train], test=[ - X_test, y_test], n_iter=5) - temp_path = Path('temp/') - with open(temp_path / "lwplsr_preTreatments.json", "w+") as outfile: - json.dump(pre.best_hyperparams_, outfile) - # export Xtrain, Xtest, Ytrain, Ytest and all CV folds to temp folder as csv files - for i in data_to_work_with: - if 'fold' in i: - j = d[i] - else: - j = globals()[i] - np.savetxt(temp_path / str(i + ".csv"), - j, delimiter=",") - open(temp_path / 'model', 'w').close() - # run Julia Jchemo as subprocess - import subprocess - subprocess_path = Path("utils/") - subprocess.run( - [str(sys.executable), subprocess_path / "lwplsr_call.py"]) - # retrieve json results from Julia JChemo - try: - with open(temp_path / "lwplsr_outputs.json", "r") as outfile: - Reg_json = json.load(outfile) - # delete csv files - for i in data_to_work_with: - os.unlink(temp_path / str(i + ".csv")) - # delete json file after import - os.unlink(temp_path / "lwplsr_outputs.json") - os.unlink(temp_path / "lwplsr_preTreatments.json") - os.unlink(temp_path / 'model') - # format result data into Reg object - # keys of the dict - pred = ['pred_data_train', 'pred_data_test'] - for i in range(nb_folds): - # add cv folds keys to pred - pred.append("CV" + str(i+1)) - - from utils.regress import LwplsObject - Model = LwplsObject(Reg_json=Reg_json, pred=pred) - Model.CV_results_ = DataFrame() - Model.cv_data_ = {'YpredCV': {}, 'idxCV': {}} - # set indexes to Model.pred_data (train, test, folds idx) - for i in range(len(pred)): - Model.pred_data_[i] = Model.pred_data_[ - i].T.reset_index().drop(columns=['index']) - if i == 0: # data_train - Model.pred_data_[i].index = list(y_train.index) - Model.pred_data_[i] = Model.pred_data_[ - i].iloc[:, 0] - elif i == 1: # data_test - Model.pred_data_[i].index = list(y_test.index) - Model.pred_data_[i] = Model.pred_data_[ - i].iloc[:, 0] - else: - # CVi - Model.pred_data_[i].index = folds[list(folds)[ - i-2]] - Model.cv_data_[ - 'YpredCV']['Fold' + str(i-1)] = np.array(Model.pred_data_[i]).reshape(-1) - Model.cv_data_[ - 'idxCV']['Fold' + str(i-1)] = np.array(folds[list(folds)[i-2]]).reshape(-1) - - Model.CV_results_ = KF_CV.metrics_cv(y=y_train, ypcv=Model.cv_data_[ - 'YpredCV'], folds=folds)[1] - # cross validation results print - Model.best_hyperparams_print = Model.best_hyperparams_ - # plots - Model.cv_data_ = KF_CV().meas_pred_eq(y=np.array(y_train), - ypcv=Model.cv_data_['YpredCV'], folds=folds) - Model.pretreated_spectra_ = pre.pretreated_spectra_ - - Model.best_hyperparams_print = { - **pre.best_hyperparams_, **Model.best_hyperparams_} - Model.best_hyperparams_ = { - **pre.best_hyperparams_, **Model.best_hyperparams_} - - Model.__hash__ = ObjectHash( - current=hash_, add=Model.best_hyperparams_print) - except FileNotFoundError: - Model = None - for i in data_to_work_with: - os.unlink(temp_path / str(i + ".csv")) + from utils.regress import LWPLS + Model = LWPLS(train = [X_train, y_train], test = [X_test, y_test], n_iter = 10, cv = nb_folds, bestglobalparams = pre.best_hyperparams_) + Model.best_fit() + + # The snippet of code below was first used to communicate with Julia for developing lwplsr() LWPLS modelling, but just lately, lwplsr() xas written in Python and utilized instead. + # case 'LW-PLS': + # # split train data into nb_folds for cross_validation + # folds = KF_CV.CV(X_train, y_train, nb_folds) + # # export data to csv for Julia train/test + # global x_train_np, y_train_np, x_test_np, y_test_np + # data_to_work_with = ['x_train_np', + # 'y_train_np', 'x_test_np', 'y_test_np'] + # x_train_np, y_train_np, x_test_np, y_test_np = X_train.to_numpy( + # ), y_train.to_numpy(), X_test.to_numpy(), y_test.to_numpy() + # # Cross-Validation calculation + # d = {} + # for i in range(nb_folds): + # d["xtr_fold{0}".format(i+1)], d["ytr_fold{0}".format(i+1)], d["xte_fold{0}".format(i+1)], d["yte_fold{0}".format(i+1)] = np.delete(x_train_np, folds[list( + # folds)[i]], axis=0), np.delete(y_train_np, folds[list(folds)[i]], axis=0), x_train_np[folds[list(folds)[i]]], y_train_np[folds[list(folds)[i]]] + # data_to_work_with.append("xtr_fold{0}".format(i+1)) + # data_to_work_with.append("ytr_fold{0}".format(i+1)) + # data_to_work_with.append("xte_fold{0}".format(i+1)) + # data_to_work_with.append("yte_fold{0}".format(i+1)) + # # check best pre-treatment with a global PLSR model + # from utils.regress import Plsr + # pre = Plsr(train=[X_train, y_train], test=[X_test, y_test], n_iter=5) + # temp_path = Path('temp/') + # with open(temp_path / "lwplsr_preTreatments.json", "w+") as outfile: + # json.dump(pre.best_hyperparams_, outfile) + # # export Xtrain, Xtest, Ytrain, Ytest and all CV folds to temp folder as csv files + # for i in data_to_work_with: + # if 'fold' in i: + # j = d[i] + # else: + # j = globals()[i] + # np.savetxt(temp_path / str(i + ".csv"), + # j, delimiter=",") + # open(temp_path / 'model', 'w').close() + # # run Julia Jchemo as subprocess + # import subprocess + # subprocess_path = Path("utils/") + # subprocess.run( + # [str(sys.executable), subprocess_path / "lwplsr_call.py"]) + # # retrieve json results from Julia JChemo + # try: + # with open(temp_path / "lwplsr_outputs.json", "r") as outfile: + # Reg_json = json.load(outfile) + # # delete csv files + # for i in data_to_work_with: + # os.unlink(temp_path / str(i + ".csv")) + # # delete json file after import + # os.unlink(temp_path / "lwplsr_outputs.json") + # os.unlink(temp_path / "lwplsr_preTreatments.json") + # os.unlink(temp_path / 'model') + # # format result data into Reg object + # # keys of the dict + # pred = ['pred_data_train', 'pred_data_test'] + # for i in range(nb_folds): + # # add cv folds keys to pred + # pred.append("CV" + str(i+1)) + + # from utils.regress import LwplsObject + # Model = LwplsObject(Reg_json=Reg_json, pred=pred) + # Model.CV_results_ = DataFrame() + # Model.cv_data_ = {'YpredCV': {}, 'idxCV': {}} + # # set indexes to Model.pred_data (train, test, folds idx) + # for i in range(len(pred)): + # Model.pred_data_[i] = Model.pred_data_[ + # i].T.reset_index().drop(columns=['index']) + # if i == 0: # data_train + # Model.pred_data_[i].index = list(y_train.index) + # Model.pred_data_[i] = Model.pred_data_[ + # i].iloc[:, 0] + # elif i == 1: # data_test + # Model.pred_data_[i].index = list(y_test.index) + # Model.pred_data_[i] = Model.pred_data_[ + # i].iloc[:, 0] + # else: + # # CVi + # Model.pred_data_[i].index = folds[list(folds)[ + # i-2]] + # Model.cv_data_[ + # 'YpredCV']['Fold' + str(i-1)] = np.array(Model.pred_data_[i]).reshape(-1) + # Model.cv_data_[ + # 'idxCV']['Fold' + str(i-1)] = np.array(folds[list(folds)[i-2]]).reshape(-1) + + # Model.CV_results_ = KF_CV.metrics_cv(y=y_train, ypcv=Model.cv_data_[ + # 'YpredCV'], folds=folds)[1] + # # cross validation results print + # Model.best_hyperparams_print = Model.best_hyperparams_ + # # plots + # Model.cv_data_ = KF_CV().meas_pred_eq(y=np.array(y_train), + # ypcv=Model.cv_data_['YpredCV'], folds=folds) + # Model.pretreated_spectra_ = pre.pretreated_spectra_ + + # Model.best_hyperparams_print = { + # **pre.best_hyperparams_, **Model.best_hyperparams_} + # Model.best_hyperparams_ = { + # **pre.best_hyperparams_, **Model.best_hyperparams_} + + # Model.__hash__ = ObjectHash( + # current=hash_, add=Model.best_hyperparams_print) + # except FileNotFoundError: + # Model = None + # for i in data_to_work_with: + # os.unlink(temp_path / str(i + ".csv")) case "": Model = None @@ -546,6 +552,7 @@ if model_type: ax3.grid() ax3.set_xlabel('Wavelenghts/Wavenumbers') ax3.set_ylabel('Vip') + case 'TPE-iPLS': fig, (ax1, ax2, ax3) = plt.subplots( 3, 1, figsize=(12, 4), sharex=True) @@ -570,6 +577,7 @@ if model_type: ax3.set_ylabel('Vip') ax3.grid() ax3.set_xlabel('Wavelenghts/Wavenumbers') + case 'LW-PLS': fig, (ax1, ax2) = plt.subplots( 2, 1, figsize=(12, 4), sharex=True) @@ -682,6 +690,7 @@ if model_type: prep_para[i] = str(modelling.best_hyperparams_[i])+'nd' # reg plot and residuals plot + yc = y_train if model_type == "LW-PLS" else yc measured_vs_predicted = reg_plot([y_train, y_test], [ yc, yt], train_idx=train_index, test_idx=test_index, trainplot=False if model_type == "LW-PLS" else True) residuals_plot = resid_plot([y_train, y_test], [yc, yt], train_idx=train_index, diff --git a/src/pages/3-prediction.py b/src/pages/3-prediction.py index 971974b4200ce9d9ad12335f56f72328e3b9dda7..dadd3539640fb4d3dc44171fe131093ae665bf42 100644 --- a/src/pages/3-prediction.py +++ b/src/pages/3-prediction.py @@ -140,9 +140,10 @@ with c1: x_block.columns = x_block.columns.astype(str) yname = system_data['data']['target'].name st.info("Loaded model to predict " + yname) - shared_elements = set(system_data['predictors_']).intersection(x_block.columns) - if len(shared_elements)==len(system_data['predictors_']): - pred_data = x_block.loc[:,system_data['predictors_']] + shared_elements = set( + system_data['predictors_']).intersection(x_block.columns) + if len(shared_elements) == len(system_data['predictors_']): + pred_data = x_block.loc[:, system_data['predictors_']] else: st.error( 'The names of the features (columns) in the training set and the prediction set are not identical. Thus, prediction cannot be performed.') @@ -264,55 +265,63 @@ if not preprocessed.empty: else: st.write('Model was fitted on '+str(nvar) + 'but prediction data has '+str(preprocesseddf.shape[1])) - case 'LW-PLS': + case 'LW-PLS': try: - temp_path = Path('temp/') - # export data to csv for Julia train/pred - # with pretreatments - spectra = preprocess_spectra( - system_data['data']['raw-spectra'], change=hash_) - x_pred = preprocessed - rownames = x_pred.index.to_list() - y = system_data['data']['target'] - data_to_work_with = [ - 'spectra_np', 'y_np', 'x_pred_np'] - spectra_np, y_np, x_pred_np = spectra[1].to_numpy( - ), y.to_numpy(), x_pred.to_numpy() - # export spectra, y, x_pred to temp folder as csv files - for i in data_to_work_with: - j = globals()[i] - np.savetxt( - temp_path / str(i + ".csv"), j, delimiter=",") - # export best LWPLSR params - with open(temp_path / "lwplsr_best_params.json", "w+") as outfile: - json.dump(system_data['lwpls_params'], outfile) - # create empty file to specify LWPLSR_Call.py that we want predictions - open(temp_path / 'predict', 'w').close() - # # run Julia Jchemo as subprocess - import subprocess - subprocess_path = Path("utils/") - subprocess.run( - [str(sys.executable), subprocess_path / "LWPLSR_Call.py"]) - # retrieve json results from Julia JChemo - try: - with open(temp_path / "lwplsr_outputs.json", "r") as outfile: - modelling_json = json.load(outfile) - # delete csv files - for i in data_to_work_with: - os.unlink(temp_path / str(i + ".csv")) - os.unlink(temp_path / 'predict') - # delete json file after import - os.unlink(temp_path / "lwplsr_outputs.json") - os.unlink( - temp_path / "lwplsr_best_params.json") - # keys of the json dict - result = DataFrame(modelling_json['y_pred']) - result.index = rownames - result.columns = ['Results'] - except FileNotFoundError as e: - for i in data_to_work_with: - os.unlink(temp_path / str(i + ".csv")) - os.unlink(temp_path / 'predict') + _, spectra = preprocess_spectra( + system_data['data']['raw-spectra'].iloc[system_data['data']['training_data_idx'], :], change=hash_) + from utils.lwplsr_julia_converted import lwpls + result = DataFrame(lwpls(Xtrain=np.array(spectra), Xtest=np.array(preprocessed), + ytrain=np.array(system_data['data']['target'].iloc[system_data['data']['training_data_idx']]), + globalplsVL=system_data['model_']['globalplsVL'], metric=system_data['model_']['dist'], + h=system_data['model_']['h'], k=system_data['model_']['k'], + localplsVL=system_data['model_']['localplsVL'], center=True, scale=False, sklearn=True), index =preprocessed.index) + # temp_path = Path('temp/') + # # export data to csv for Julia train/pred + # # with pretreatments + # spectra = preprocess_spectra( + # system_data['data']['raw-spectra'], change=hash_) + # x_pred = preprocessed + # rownames = x_pred.index.to_list() + # y = system_data['data']['target'] + # data_to_work_with = [ + # 'spectra_np', 'y_np', 'x_pred_np'] + # spectra_np, y_np, x_pred_np = spectra[1].to_numpy( + # ), y.to_numpy(), x_pred.to_numpy() + # # export spectra, y, x_pred to temp folder as csv files + # for i in data_to_work_with: + # j = globals()[i] + # np.savetxt( + # temp_path / str(i + ".csv"), j, delimiter=",") + # # export best LWPLSR params + # with open(temp_path / "lwplsr_best_params.json", "w+") as outfile: + # json.dump(system_data['lwpls_params'], outfile) + # # create empty file to specify LWPLSR_Call.py that we want predictions + # open(temp_path / 'predict', 'w').close() + # # # run Julia Jchemo as subprocess + # import subprocess + # subprocess_path = Path("utils/") + # subprocess.run( + # [str(sys.executable), subprocess_path / "LWPLSR_Call.py"]) + # # retrieve json results from Julia JChemo + # try: + # with open(temp_path / "lwplsr_outputs.json", "r") as outfile: + # modelling_json = json.load(outfile) + # # delete csv files + # for i in data_to_work_with: + # os.unlink(temp_path / str(i + ".csv")) + # os.unlink(temp_path / 'predict') + # # delete json file after import + # os.unlink(temp_path / "lwplsr_outputs.json") + # os.unlink( + # temp_path / "lwplsr_best_params.json") + # # keys of the json dict + # result = DataFrame(modelling_json['y_pred']) + # result.index = rownames + # result.columns = ['Results'] + # except FileNotFoundError as e: + # for i in data_to_work_with: + # os.unlink(temp_path / str(i + ".csv")) + # os.unlink(temp_path / 'predict') except: st.error('Error during LWPLSR predictions') @@ -335,15 +344,14 @@ if not result.empty: st.info('descriptive statistics for the model output') st.write(DataFrame(desc_stats(result))) - ################################# Download results ################################# - with st.container(): + ################################# Download results ################################# + with st.container(): if not result.empty: @st.cache_data(show_spinner=False) def preparing_results_for_downloading(change): with open(Path('report/results/dataset/')/predfile.name, "wb") as f: f.write(predfile.getvalue()) - rawspectraplot.savefig( './report/results/figures/raw_spectra.png') prepspectraplot.savefig( @@ -379,4 +387,4 @@ if not result.empty: HandleItems.delete_files(keep=['.py', '.pyc', '.bib']) except: - pass \ No newline at end of file + pass diff --git a/src/utils/data_handling.py b/src/utils/data_handling.py index c84abd96f523f7d051861d8efc49be83524ea929..1c427b58543d856a4f93b067c5371b3b7c06fdaa 100644 --- a/src/utils/data_handling.py +++ b/src/utils/data_handling.py @@ -372,7 +372,7 @@ class KF_CV: DataFrame(y[folds[Fname]]), ypcv[Fname].reshape(-1, 1)) r.index = folds[Fname] r['Folds'] = [str(Fname)+'(Predicted = '+str(np.round(ols.intercept_[0], 2)) + - str(np.round(ols.coef_[0][0], 2))+' x Measured'] * r.shape[0] + str(np.round(ols.coef_[0][0], 2))+' x Measured'+ ')'] * r.shape[0] cvcv[i] = r coeff[Fname] = [ols.coef_[0][0], ols.intercept_[0]] diff --git a/src/utils/lwplsr_julia_converted.py b/src/utils/lwplsr_julia_converted.py index 2935312b0dab29e711d91d17388fa5100091d1f3..ae62221ee5ddebf81ef248cae1453ec73fe94d4c 100644 --- a/src/utils/lwplsr_julia_converted.py +++ b/src/utils/lwplsr_julia_converted.py @@ -1,6 +1,6 @@ import numpy as np import numpy.typing as npt -from weighted_ikpls import PLS +from .weighted_ikpls import PLS def mad(X, zmed, axis=1, keepdims=True): """ diff --git a/src/utils/regress.py b/src/utils/regress.py index eea1971120e6252d21b7ea2b53bbbe68f1666d20..9ac3a696ed27ba918d30ac0ef6d99f3f7aa6f863 100644 --- a/src/utils/regress.py +++ b/src/utils/regress.py @@ -1,3 +1,5 @@ +from .lwplsr_julia_converted import lwpls +import streamlit as st import numpy as np from pandas import DataFrame from utils.eval_metrics import metrics @@ -9,7 +11,7 @@ from utils.data_handling import Snv, No_transformation, KF_CV, sel_ratio class Regmodel(object): - def __init__(self, train, test, n_iter, add_hyperparams = None, nfolds = 3, **kwargs): + def __init__(self, train, test, n_iter, add_hyperparams=None, remove_hyperparams=None, nfolds=3, **kwargs): self.SCORE = 100000000 self._xc, self._xt, self._ytrain, self._ytest = train[0], test[0], train[1], test[1] @@ -19,29 +21,39 @@ class Regmodel(object): self._cv_df = DataFrame() self._sel_ratio = DataFrame() self._nfolds = nfolds - self._selected_bands = DataFrame(index = ['from', 'to']) + self._selected_bands = DataFrame(index=['from', 'to']) self.important_features = None - self._hyper_params = {'polyorder': hp.choice('polyorder', [0, 1, 2]), - 'deriv': hp.choice('deriv', [0, 1, 2]), - 'window_length': hp.choice('window_length', [15, 21, 27, 33]), - 'normalization': hp.choice('normalization', ['Snv', 'No_transformation'])} + if self._xc.shape[1] > 1000: + a = [15, 21, 27, 33] + else: + a = [5, 7, 9] + self._hyper_params = {'polyorder': hp.choice('polyorder', [2]), + 'deriv': hp.choice('deriv', [0, 1]), + # [15, 21, 27, 33] + 'window_length': hp.choice('window_length', [9]), + 'normalization': hp.choice('normalization', ['No_transformation'])} + if remove_hyperparams is not None: + for i in remove_hyperparams: + self._hyper_params.pop(i, None) + if add_hyperparams is not None: self._hyper_params.update(add_hyperparams) self._best = None trials = Trials() best_params = fmin(fn=self.objective, - space=self._hyper_params, - algo=tpe.suggest, # Tree of Parzen Estimators’ (tpe) which is a Bayesian approach - max_evals=n_iter, - trials=trials, - verbose=1) - + space=self._hyper_params, + # Tree of Parzen Estimators’ (tpe) which is a Bayesian approach + algo=tpe.suggest, + max_evals=n_iter, + trials=trials, + verbose=1) + @property def train_data_(self): return [self._xc, self._ytrain] - + @property def test_data_(self): return [self._xt, self._ytest] @@ -51,60 +63,70 @@ class Regmodel(object): return self.pretreated @property - def get_params_(self):### This method return the search space where the optimization algorithm will search for optimal subset of hyperparameters - return self._hyper_params - + def get_params_(self): # This method return the search space where the optimization algorithm will search for optimal subset of hyperparameters + return self._hyper_params + def objective(self, params): - pass - + pass + @property - def best_hyperparams_(self): ### This method returns the subset of selected hyperparametes + # This method returns the subset of selected hyperparametes + def best_hyperparams_(self): return self._best + @property - def best_hyperparams_print(self):### This method returns a sentence telling what signal preprocessing method was applied + # This method returns a sentence telling what signal preprocessing method was applied + def best_hyperparams_print(self): if self._best['normalization'] == 'Snv': a = 'Standard Normal Variate (SNV)' elif self._best['normalization'] == 'No_transformation': a = " No transformation was performed" - bb,cc,dd = str(self._best['window_length']), str(self._best['polyorder']),str(self._best['deriv']) - SG = '- Savitzky-Golay derivative parameters: \n(Window_length:'+bb+';polynomial order:'+ cc+'; Derivative order : '+ dd + bb, cc, dd = str(self._best['window_length']), str( + self._best['polyorder']), str(self._best['deriv']) + SG = '- Savitzky-Golay derivative parameters: \n(Window_length:' + \ + bb+';polynomial order:' + cc+'; Derivative order : ' + dd Norm = '- Spectral Normalization: \n'+a return SG+"\n"+Norm - + @property - def model_(self): # This method returns the developed model + def model_(self): # This method returns the developed model return self._model - + @property - def pred_data_(self): ## this method returns the predicted data in training and testing steps + def pred_data_(self): # this method returns the predicted data in training and testing steps return self._yc, self._yt - + @property - def cv_data_(self): ## Cross validation data + def cv_data_(self): # Cross validation data return self._ycv - + @property def CV_results_(self): return self._cv_df + @property def important_features_(self): return self.important_features + @property def selected_features_(self): return self._selected_bands - + @property def sel_ratio_(self): return self._sel_ratio - + ########################################### PLSR ######################################### + + class Plsr(Regmodel): - def __init__(self, train, test, n_iter = 10, cv = 3): - super().__init__(train, test, n_iter, nfolds = cv, add_hyperparams = {'n_components': hp.randint('n_components', 1,20)}) - ### parameters in common - + def __init__(self, train, test, n_iter=10, cv=3): + super().__init__(train, test, n_iter, nfolds=cv, add_hyperparams={ + 'n_components': hp.randint('n_components', 1, 20)}) + # parameters in common + def objective(self, params): params['n_components'] = int(params['n_components']) x0 = [self._xc, self._xt] @@ -121,126 +143,213 @@ class Plsr(Regmodel): else: a, b, c = 0, 0, 1 - params['deriv'], params['polyorder'], params['window_length'] = a, b, c - x2 = [savgol_filter(x1[i], polyorder=params['polyorder'], deriv=params['deriv'], window_length = params['window_length']) for i in range(2)] - - model = PLSRegression(scale = False, n_components = params['n_components']) - folds = KF_CV().CV(x = x2[0], y = np.array(self._ytrain), n_folds = self._nfolds) - yp = KF_CV().cross_val_predictor(model = model, folds = folds, x = x2[0], y = np.array(self._ytrain)) - self._cv_df = KF_CV().metrics_cv(y = np.array(self._ytrain), ypcv = yp, folds =folds)[1] - - score = self._cv_df.loc["cv",'rmse'] - - Model = PLSRegression(scale = False, n_components = params['n_components']) + params['deriv'], params['polyorder'], params['window_length'] = a, b, c + x2 = [savgol_filter(x1[i], polyorder=params['polyorder'], deriv=params['deriv'], + window_length=params['window_length']) for i in range(2)] + + model = PLSRegression(scale=False, n_components=params['n_components']) + folds = KF_CV().CV(x=x2[0], y=np.array( + self._ytrain), n_folds=self._nfolds) + yp = KF_CV().cross_val_predictor( + model=model, folds=folds, x=x2[0], y=np.array(self._ytrain)) + self._cv_df = KF_CV().metrics_cv( + y=np.array(self._ytrain), ypcv=yp, folds=folds)[1] + + score = self._cv_df.loc["mean", 'rmse'] / \ + np.max([0.01, self._cv_df.loc["mean", 'r2']]) + + Model = PLSRegression(scale=False, n_components=params['n_components']) Model.fit(x2[0], self._ytrain) if self.SCORE > score: self.SCORE = score - self._ycv = KF_CV().meas_pred_eq(y = np.array(self._ytrain), ypcv=yp, folds=folds) + self._ycv = KF_CV().meas_pred_eq(y=np.array(self._ytrain), ypcv=yp, folds=folds) self._yc = Model.predict(x2[0]) self._yt = Model.predict(x2[1]) self._model = Model - for key,value in params.items(): - try: params[key] = int(value) - except (TypeError, ValueError): params[key] = value + for key, value in params.items(): + try: + params[key] = int(value) + except (TypeError, ValueError): + params[key] = value self._best = params self.pretreated = DataFrame(x2[0]) self._sel_ratio = sel_ratio(Model, x2[0]) return score - ############################################ iplsr ######################################### + + class TpeIpls(Regmodel): - def __init__(self, train, test, n_iter = 10, n_intervall = 5, cv = 3): + def __init__(self, train, test, n_iter=10, n_intervall=5, cv=3, bestglobalparams=None): + self.glob = bestglobalparams + self._best = {} + self.folds = KF_CV().CV(x=np.array( + train[0]), y=np.array(train[1]), n_folds=3) + x1 = [eval(str(self.glob['normalization'])+'(train[0])'), + eval(str(self.glob['normalization'])+'(test[0])')] + self.x2 = [savgol_filter(x1[i], polyorder=self.glob['polyorder'], deriv=self.glob['deriv'], + window_length=self.glob['window_length']) for i in range(2)] self.n_intervall = n_intervall self.n_arrets = self.n_intervall*2 - - - r = {'n_components': hp.randint('n_components', 1,20)} - r.update({'v'+str(i): hp.randint('v'+str(i), 0, train[0].shape[1]) for i in range(1,self.n_arrets+1)}) - - super().__init__(train, test, n_iter, add_hyperparams = r, nfolds = cv) - - ### parameters in common - - def objective(self, params): - ### wevelengths index - self.idx = [params['v'+str(i)] for i in range(1,self.n_arrets+1)] - self.idx.sort() - arrays = [np.arange(self.idx[2*i],self.idx[2*i+1]+1) for i in range(self.n_intervall)] - id = np.unique(np.concatenate(arrays, axis=0), axis=0) - ### Preprocessing - x0 = [self._xc, self._xt] - # x1 = [eval(str(params['normalization'])+"(x0[i])") for i in range(2)] - x1 = [] - x1.append(eval(str(params['normalization'])+'(x0[0])')) - x1.append(eval(str(params['normalization'])+'(x0[1])')) + add = {'n_components': hp.randint('n_components', 1, 20)} + add.update({'v'+str(i): hp.randint('v'+str(i), 0, + train[0].shape[1]) for i in range(1, self.n_arrets+1)}) + super().__init__(train, test, n_iter, nfolds=cv, add_hyperparams=add) - a, b, c = params['deriv'], params['polyorder'], params['window_length'] - if a > b or b > c: - if self._best is not None: - a, b, c = self._best['deriv'], self._best['polyorder'], self._best['window_length'] + # parameters in common - else: - a, b, c = 0, 0, 1 - - params['deriv'], params['polyorder'], params['window_length'] = a, b, c - x2 = [savgol_filter(x1[i], polyorder=params['polyorder'], deriv=params['deriv'], window_length = params['window_length']) for i in range(2)] - - - prepared_data = [x2[i][:,id] for i in range(2)] + def objective(self, params): + # wevelengths index + self.idx = [params['v'+str(i)] for i in range(1, self.n_arrets+1)] + self.idx.sort() + arrays = [np.arange(self.idx[2*i], self.idx[2*i+1]+1) + for i in range(self.n_intervall)] + id = np.unique(np.concatenate(arrays, axis=0), axis=0) + prepared_data = [self.x2[i][:, id] for i in range(2)] - - ### Modelling - folds = KF_CV().CV(x = prepared_data[0], y = np.array(self._ytrain), n_folds = self._nfolds) + # Modelling + folds = KF_CV().CV(x=prepared_data[0], y=np.array( + self._ytrain), n_folds=self._nfolds) try: - model = PLSRegression(scale = False, n_components = params['n_components']) - yp = KF_CV().cross_val_predictor(model = model, folds = folds, x = prepared_data[0], y = np.array(self._ytrain)) - self._cv_df = KF_CV().metrics_cv(y = np.array(self._ytrain), ypcv = yp, folds =folds)[1] + model = PLSRegression( + scale=False, n_components=params['n_components']) + yp = KF_CV().cross_val_predictor(model=model, folds=folds, + x=prepared_data[0], y=np.array(self._ytrain)) + self._cv_df = KF_CV().metrics_cv( + y=np.array(self._ytrain), ypcv=yp, folds=folds)[1] except: params["n_components"] = 1 - model = PLSRegression(scale = False, n_components = params["n_components"]) - yp = KF_CV().cross_val_predictor(model = model, folds = folds, x = prepared_data[0], y = np.array(self._ytrain)) - self._cv_df = KF_CV().metrics_cv(y = np.array(self._ytrain), ypcv = yp, folds =folds)[1] + model = PLSRegression( + scale=False, n_components=params["n_components"]) + yp = KF_CV().cross_val_predictor(model=model, folds=folds, + x=prepared_data[0], y=np.array(self._ytrain)) + self._cv_df = KF_CV().metrics_cv( + y=np.array(self._ytrain), ypcv=yp, folds=folds)[1] - - score = self._cv_df.loc['cv','rmse'] - - Model = PLSRegression(scale = False, n_components = model.n_components) - Model.fit(prepared_data[0], self._ytrain) + score = self._cv_df.loc["mean", 'rmse'] / \ + np.max([0.01, self._cv_df.loc["mean", 'r2']]) if self.SCORE > score: self.SCORE = score - self._ycv = KF_CV().meas_pred_eq(y = np.array(self._ytrain), ypcv=yp, folds=folds) - - self._yc = Model.predict(prepared_data[0]) - self._yt = Model.predict(prepared_data[1]) - self._model = Model - for key,value in params.items(): - try: params[key] = int(value) - except (TypeError, ValueError): params[key] = value self._best = params - - self.pretreated = DataFrame(x2[0]) - limits = np.ones(len(arrays)*2) - for i in range(len(arrays)): - limits[2*i], limits[2*i+1] = arrays[i][0], arrays[i][arrays[i].shape[0]-1] - - self.limits = limits.astype(int) - + self.arrays = arrays + self.prepared_data = prepared_data + self.model = model return score - + + def best_fit(self): + Model = PLSRegression( + scale=False, n_components=self.model.n_components) + Model.fit(self.prepared_data[0], self._ytrain) + + self._yc = Model.predict(self.prepared_data[0]) + yp = KF_CV().cross_val_predictor(model=Model, folds=self.folds, + x=self.prepared_data[0], y=np.array(self._ytrain)) + self._ycv = KF_CV().meas_pred_eq(y=np.array( + self._ytrain), ypcv=yp, folds=self.folds) + self._yt = Model.predict(self.prepared_data[1]) + self._model = Model + + for key, value in self._best.items(): + try: + self._best[key] = int(value) + except (TypeError, ValueError): + self._best[key] = value + + self.pretreated = DataFrame(self.x2[0]) + limits = np.ones(len(self.arrays)*2) + for i in range(len(self.arrays)): + limits[2*i], limits[2*i + + 1] = self.arrays[i][0], self.arrays[i][self.arrays[i].shape[0]-1] + + self.limits = limits.astype(int) ########################################### LWPLSR ######################################### + class LwplsObject: - def __init__(self, Reg_json = None, pred = None): + def __init__(self, Reg_json=None, pred=None): if Reg_json is not None and pred is not None: from pandas import json_normalize self.model_ = Reg_json['model'] self.best_hyperparams_ = Reg_json['best_lwplsr_params'] self.pred_data_ = [json_normalize(Reg_json[i]) for i in pred] - - ############################################ Pcr ######################################### \ No newline at end of file + + ############################################ Pcr ######################################### + + +class LWPLS(Regmodel): + def __init__(self, train, test, n_iter=10, cv=3, bestglobalparams=None): + self.glob = bestglobalparams + self._best = {} + add = { + 'localplsVL': hp.randint('localplsVL', 2, bestglobalparams['n_components']), + 'dist': hp.choice('dist', ['euc', 'mah']), + 'h': hp.randint('h', 1, 3)} + self.folds = KF_CV().CV(x=np.array( + train[0]), y=np.array(train[1]), n_folds=3) + + x1 = [eval(str(self.glob['normalization'])+'(train[0])'), + eval(str(self.glob['normalization'])+'(test[0])')] + self.x2 = [savgol_filter(x1[i], polyorder=self.glob['polyorder'], deriv=self.glob['deriv'], + window_length=self.glob['window_length']) for i in range(2)] + super().__init__(train, test, n_iter, nfolds=cv, + add_hyperparams=add, remove_hyperparams=None) + + def objective(self, params): + yp = {} + for i in self.folds.keys(): + yp[i] = lwpls(Xtrain=np.delete(np.array(self.x2[0]), self.folds[i], axis=0), + ytrain=np.delete( + np.array(self._ytrain), self.folds[i], axis=0), + Xtest=np.array(self.x2[0])[self.folds[i]], + globalplsVL=self.glob['n_components'], metric=params['dist'], h=params['h'], k=200, + localplsVL=params['localplsVL'], center=True, scale=False, sklearn=True).ravel() + + self._cv_df = KF_CV().metrics_cv(y=np.array( + self._ytrain), ypcv=yp, folds=self.folds)[1] + score = self._cv_df.loc["mean", 'rmse'] / \ + np.max([0.01, self._cv_df.loc["mean", 'r2']]) + + if self.SCORE > score: + self.SCORE = score + self._best = params + return score + + def best_fit(self): + from .lwplsr_julia_converted import lwpls + yp = {} + for i in self.folds.keys(): + yp[i] = lwpls(Xtrain=np.delete(np.array(self.x2[0]), self.folds[i], axis=0), + ytrain=np.delete( + np.array(self._ytrain), self.folds[i], axis=0), + Xtest=np.array(self.x2[0])[self.folds[i]], + globalplsVL=self.glob['n_components'], metric=self._best['dist'], h=self._best['h'], k=200, + localplsVL=self._best['localplsVL'], center=True, scale=False, sklearn=True).ravel() + + self._ycv = KF_CV().meas_pred_eq(y=np.array( + self._ytrain), ypcv=yp, folds=self.folds) + self._yt = lwpls(Xtrain=np.array(self.x2[0]), + ytrain=np.array(self._ytrain), + Xtest=np.array(self.x2[1]), + globalplsVL=self.glob['n_components'], metric=self._best['dist'], h=self._best['h'], k=200, + localplsVL=self._best['localplsVL'], center=True, scale=False, sklearn=True).ravel() + self.pretreated = DataFrame(self.x2[0]) + self._model = "LW-PLS" + for key, value in self._best.items(): + self._best[key] = int(value) if isinstance(value, np.int64) else float( + value) if isinstance(value, np.float64) else value + + self._model = {'globalplsVL': self.glob['n_components'], + 'localplsVL': self._best['localplsVL'], + 'dist': self._best['dist'], + 'k': 200, + 'h': self._best['h']} + self._best = {'normalization':self.glob['normalization'], + 'polyorder':self.glob['polyorder'], + 'window_length':self.glob['window_length'], + 'deriv':self.glob['deriv'], + 'globalplsVL': self.glob['n_components']}