From 7c9546c2dc502a971836f9f2e77b90ab9be5439d Mon Sep 17 00:00:00 2001 From: barthes <nicolas.barthes@cefe.cnrs.fr> Date: Tue, 25 Jun 2024 12:15:29 +0200 Subject: [PATCH] LWPLSR subprocess documentation --- .gitignore | 1 + src/Class_Mod/LWPLSR_.py | 39 ++++++++++++++++-------------------- src/Class_Mod/LWPLSR_Call.py | 11 ++++++++-- 3 files changed, 27 insertions(+), 24 deletions(-) diff --git a/.gitignore b/.gitignore index 0c742b4..927355c 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ src/Report/*.zip src/Report/figures/*.pdf src/Report/figures/*.png config/config.json +data/params/Preprocessing.json \ No newline at end of file diff --git a/src/Class_Mod/LWPLSR_.py b/src/Class_Mod/LWPLSR_.py index a7bd379..da661d6 100644 --- a/src/Class_Mod/LWPLSR_.py +++ b/src/Class_Mod/LWPLSR_.py @@ -7,33 +7,28 @@ class LWPLSR: Returns: self.scores (DataFrame): various metrics and scores - self.predicted_results_on_train (DataFrame): - self.predicted_results_on_test (DataFrame): + self.predicted_results (Dictionary): Dict containing all predicted results (train, test, cross-validation) self.mod (Julia model): the prepared model """ def __init__(self, dataset): """Initiate the LWPLSR and prepare data for Julia computing.""" - - # self.x_train, self.y_train, self.x_test, self.y_test = [dataset[i] for i in range(len(dataset))] + # get train / test data from dataset self.x_train, self.y_train, self.x_test, self.y_test = [dataset[i] for i in range(4)] + # calculate number of KFolds and get CV data from dataset self.nb_fold = int((len(dataset)-4)/4) for i in range(self.nb_fold): setattr(self, "xtr_fold"+str(i+1), dataset[i+7]) setattr(self, "ytr_fold"+str(i+1), dataset[i+13]) setattr(self, "xte_fold"+str(i+1), dataset[i+4]) - # setattr(self, "yte_fold"+str(i+1), dataset[i+10]) setattr(jl, "xtr_fold"+str(i+1), dataset[i+7]) setattr(jl, "ytr_fold"+str(i+1), dataset[i+13]) setattr(jl, "xte_fold"+str(i+1), dataset[i+4]) - # setattr(jl, "yte_fold"+str(i+1), dataset[i+10]) - # prepare to send dataframes to julia and Jchemo + # prepare to send dataframes to julia and Jchemo (with the jl. prefix) jl.x_train, jl.y_train, jl.x_test, jl.y_test = self.x_train, self.y_train, self.x_test, self.y_test # initialize vars from the class y_shape = self.y_test.shape - self.predicted_results_on_test = pd.DataFrame - self.predicted_results_on_train = pd.DataFrame self.pred_test = np.zeros(shape=(y_shape[0], 1)) self.pred_train = np.zeros(shape=(y_shape[0], 1)) self.mod = "" @@ -52,7 +47,7 @@ class LWPLSR: Returns: self.mod (Julia model): the prepared model """ - # launch Julia Jchemo lwplsr + # launch Julia Jchemo lwplsr and convert DataFrames from Python Pandas DataFrame to Julia DataFrame jl.seval(""" using DataFrames using Pandas @@ -63,7 +58,7 @@ class LWPLSR: y_test |> Pandas.DataFrame |> DataFrames.DataFrame """) print('LWPLSR - tuning') - # set tuning parameters + # set tuning parameters to test jl.seval(""" nlvdis = [5; 10; 15] ; metric = [:eucl; :mah] h = [1; 2; 6; Inf] ; k = [30; 80; 200] @@ -83,21 +78,22 @@ class LWPLSR: ncal = ntrain - nval """) - # Create LWPLSR model and tune + # Create LWPLSR model and tune with GridScore jl.seval(""" mod = Jchemo.model(Jchemo.lwplsr) res = gridscore(mod, Xcal, ycal, Xval, yval; score = Jchemo.rmsep, pars, nlv, verbose = false) u = findall(res.y1 .== minimum(res.y1))[1] #best parameters combination """) + # save best lwplsr parameters self.best_lwplsr_params = {'nlvdis' : jl.res.nlvdis[jl.u], 'metric' : str(jl.res.metric[jl.u]), 'h' : jl.res.h[jl.u], 'k' : jl.res.k[jl.u], 'nlv' : jl.res.nlv[jl.u]} print('best lwplsr params ' + str(self.best_lwplsr_params)) - print('LWPLSR - best params ok') - # calculate LWPLSR model with best parameters + # run LWPLSR model with best parameters jl.seval(""" mod = Jchemo.model(Jchemo.lwplsr; nlvdis = res.nlvdis[u], metric = res.metric[u], h = res.h[u], k = res.k[u], nlv = res.nlv[u]) # Fit model Jchemo.fit!(mod, x_train, y_train) """) + # save Julia Jchemo model self.mod = jl.mod def Jchemo_lwplsr_predict(self): @@ -128,14 +124,13 @@ class LWPLSR: print('LWPLSR - end') def Jchemo_lwplsr_cv(self): - """Send data to Julia to predict with lwplsr. + """Send Cross-Validation data to Julia to fit & predict with lwplsr. Args: - self.mod (Julia model): the prepared model + self.best_lwplsr_params: the best parameters to use (from tuning) for CV self.xtr_fold1 (DataFrame): self.ytr_fold1 (DataFrame): self.xte_fold1 (DataFrame): - self.yte_fold1 (DataFrame): Returns: self.pred_cv (Julia DataFrame): predicted values on x_train with Cross-Validation @@ -144,7 +139,7 @@ class LWPLSR: jl.Xtr = getattr(self, "xtr_fold"+str(i+1)) jl.Ytr = getattr(self, "ytr_fold"+str(i+1)) jl.Xte = getattr(self, "xte_fold"+str(i+1)) - # jl.Yte = getattr(self, "yte_fold"+str(i+1)) + # convert Python Pandas DataFrame to Julia DataFrame jl.seval(""" using DataFrames using Pandas @@ -153,6 +148,7 @@ class LWPLSR: Ytr |> Pandas.DataFrame |> DataFrames.DataFrame Xte |> Pandas.DataFrame |> DataFrames.DataFrame """) + # set lwplsr parameters as the best one from tuning jl.nlvdis = int(self.best_lwplsr_params['nlvdis']) jl.metric = self.best_lwplsr_params['metric'] jl.h = self.best_lwplsr_params['h'] @@ -169,15 +165,14 @@ class LWPLSR: res = Jchemo.predict(mod_cv, Xte) res.pred """) + # save predicted values for each KFold in the predicted_results dictionary self.predicted_results["CV" + str(i+1)] = pd.DataFrame(pred_cv) @property def pred_data_(self): # convert predicted data from x_test to Pandas DataFrame - self.predicted_results_on_test = pd.DataFrame(self.pred_test) - self.predicted_results_on_train = pd.DataFrame(self.pred_train) - self.predicted_results["pred_data_train"] = self.predicted_results_on_train - self.predicted_results["pred_data_test"] = self.predicted_results_on_test + self.predicted_results["pred_data_train"] = pd.DataFrame(self.pred_train) + self.predicted_results["pred_data_test"] = pd.DataFrame(self.pred_test) return self.predicted_results @property diff --git a/src/Class_Mod/LWPLSR_Call.py b/src/Class_Mod/LWPLSR_Call.py index 49c674c..0070091 100644 --- a/src/Class_Mod/LWPLSR_Call.py +++ b/src/Class_Mod/LWPLSR_Call.py @@ -7,35 +7,42 @@ import os # loading the lwplsr_inputs.json temp_path = Path("temp/") data_to_work_with = ['x_train_np', 'y_train_np', 'x_test_np', 'y_test_np'] +# check data for cross-validation depending on KFold number temp_files_list = os.listdir(temp_path) nb_fold = 0 for i in temp_files_list: if 'fold' in i: + # add CV file name to data_to_work_with data_to_work_with.append(str(i)[:-4]) + # and count the number of KFold nb_fold += 1 +# Import data from csv files in the temp/ folder dataset = [] for i in data_to_work_with: dataset.append(np.genfromtxt(temp_path / str(i + ".csv"), delimiter=',')) print('CSV imported') +# launch LWPLSR Class from LWPLSR_.py in Class_Mod print('start model creation') Reg = LWPLSR(dataset) print('model created. \nnow fit') LWPLSR.Jchemo_lwplsr_fit(Reg) print('now predict') LWPLSR.Jchemo_lwplsr_predict(Reg) - print('now CV') LWPLSR.Jchemo_lwplsr_cv(Reg) - +# Export results in a json file to bring data back to 2-model_creation.py and streamlit interface print('export to json') pred = ['pred_data_train', 'pred_data_test'] +# add KFold results to predicted data for i in range(int(nb_fold/4)): pred.append("CV" + str(i+1)) json_export = {} for i in pred: json_export[i] = Reg.pred_data_[i].to_dict() +# add the lwplsr global model to the json json_export['model'] = str(Reg.model_) +# add the best parameters for the lwplsr obtained from GridScore tuning json_export['best_lwplsr_params'] = Reg.best_lwplsr_params_ with open(temp_path / "lwplsr_outputs.json", "w+") as outfile: json.dump(json_export, outfile) -- GitLab