From 5b74352eafa17c702555d0f4ff298d3ae6ae397b Mon Sep 17 00:00:00 2001 From: barthes <nicolas.barthes@cefe.cnrs.fr> Date: Tue, 17 Sep 2024 15:19:35 +0200 Subject: [PATCH] optimized LWPLSR predictions using best hyperparams from model creation ; also added automatic feature selection during loading of a zip model to be consistent with the model created. --- src/pages/2-model_creation.py | 12 +++--- src/pages/3-prediction.py | 10 +++-- src/utils/LWPLSR_.py | 69 +++++++++++++++++++---------------- src/utils/LWPLSR_Call.py | 6 +-- 4 files changed, 54 insertions(+), 43 deletions(-) diff --git a/src/pages/2-model_creation.py b/src/pages/2-model_creation.py index 354b418..a342638 100644 --- a/src/pages/2-model_creation.py +++ b/src/pages/2-model_creation.py @@ -340,6 +340,7 @@ if not spectra.empty and not y.empty: # delete json file after import os.unlink(temp_path / "lwplsr_outputs.json") os.unlink(temp_path / "lwplsr_preTreatments.json") + os.unlink(temp_path / 'model') # format result data into Reg object pred = ['pred_data_train', 'pred_data_test']### keys of the dict for i in range(nb_folds): @@ -653,17 +654,16 @@ if Reg: Reg.best_hyperparams_['window_length'], Reg.best_hyperparams_['deriv']]}} if model_type == 'TPE-iPLS': # export selected wavelengths - pklfile['selected-wls'] = {'idx':Reg.selected_features_.T , "wls":intervalls_with_cols } - else: + pklfile['selected-wls'] = {'idx':Reg.selected_features_.T , "wls":intervalls_with_cols } + elif model_type == 'LW-PLS': # export LWPLS best model parameters + pklfile['selected-wls'] = {'idx':None, "wls":None } + pklfile['lwpls_params'] = Reg.best_hyperparams_ + else: pklfile['selected-wls'] = {'idx':None, "wls":None } with open('./report/out/file_system.pkl', "wb") as pkl: dump(pklfile, pkl) - - - - return change preparing_results_for_downloading(change = hash_) diff --git a/src/pages/3-prediction.py b/src/pages/3-prediction.py index 8542bfd..7c5e4cf 100644 --- a/src/pages/3-prediction.py +++ b/src/pages/3-prediction.py @@ -102,7 +102,7 @@ with c2: pkl = find_pkl_files(root_dir=temp_dir) system_file = [path for path in pkl if 'file_system' in path] - if len(system_file) ==1 : + if len(system_file) == 1: with open(system_file[0], 'rb') as fi: system_data = load(fi) @@ -142,7 +142,7 @@ with c2: chem_data, spectra, meta_data, _ = dx_loader(change = hash_) st.success("The data have been loaded successfully", icon="✅") if chem_data.to_numpy().shape[1]>0: - yname = st.selectbox('Select target', options=chem_data.columns) + yname = st.selectbox('Select target', options=chem_data.columns, index=chem_data.columns.to_list().index(system_data['data']['target'].name)) measured = chem_data.loc[:,yname] == 0 y = chem_data.loc[:,yname].loc[measured] pred_data = spectra.loc[measured] @@ -253,7 +253,6 @@ if not pred_data.empty:# Load the model with joblib spectra = preprocess_spectra(system_data['data']['raw-spectra'], change= hash_) x_pred = preprocessed rownames = x_pred.index.to_list() - # send best_lwplsr_parameters to LWPLSR_Call.py !!!! y = system_data['data']['target'] data_to_work_with = ['spectra_np', 'y_np', 'x_pred_np'] spectra_np, y_np, x_pred_np = spectra[1].to_numpy(), y.to_numpy(), x_pred.to_numpy() @@ -261,6 +260,10 @@ if not pred_data.empty:# Load the model with joblib for i in data_to_work_with: j = globals()[i] np.savetxt(temp_path / str(i + ".csv"), j, delimiter=",") + # export best LWPLSR params + with open(temp_path / "lwplsr_best_params.json", "w+") as outfile: + json.dump(system_data['lwpls_params'], outfile) + # create empty file to specify LWPLSR_Call.py that we want predictions open(temp_path / 'predict', 'w').close() # # run Julia Jchemo as subprocess import subprocess @@ -275,6 +278,7 @@ if not pred_data.empty:# Load the model with joblib os.unlink(temp_path / 'predict') # delete json file after import os.unlink(temp_path / "lwplsr_outputs.json") + os.unlink(temp_path / "lwplsr_best_params.json") # format result data into Reg object result = DataFrame(Reg_json['y_pred']) ### keys of the json dict result.index = rownames diff --git a/src/utils/LWPLSR_.py b/src/utils/LWPLSR_.py index 17e0773..1f1af55 100644 --- a/src/utils/LWPLSR_.py +++ b/src/utils/LWPLSR_.py @@ -42,6 +42,7 @@ class LWPLSR: self.x_spectra, self.y, self.x_pred = [dataset[i] for i in range(3)] # prepare to send dataframes to julia and Jchemo (with the jl. prefix) jl.x_spectra, jl.y, jl.x_pred = self.x_spectra, self.y, self.x_pred + self.preT = preT self.predicted_results = {} @@ -152,39 +153,45 @@ class LWPLSR: x_pred |> Pandas.DataFrame |> DataFrames.DataFrame """) # LWPLSR tuning - print('LWPLSR - tuning') - # set tuning parameters to test - jl.seval(""" - nlvdis = [5; 10; 15] ; metric = [:eucl; :mah] - h = [1; 2; 6; Inf] ; k = [30; 80; 200] - nlv = 5:15 - pars = Jchemo.mpar(nlvdis = nlvdis, metric = metric, h = h, k = k) - """) - # split Train data into Cal/Val for tuning - jl.seval(""" - pct = .3 - ntrain = Jchemo.nro(x_spectra) - nval = Int(round(pct * ntrain)) - s = Jchemo.samprand(ntrain, nval) - Xcal = x_spectra[s.train, :] - ycal = y[s.train] - Xval = x_spectra[s.test, :] - yval = y[s.test] - ncal = ntrain - nval - """) - - # Create LWPLSR model and tune with GridScore - jl.seval(""" - mod = Jchemo.model(Jchemo.lwplsr) - res = gridscore(mod, Xcal, ycal, Xval, yval; score = Jchemo.rmsep, pars, nlv, verbose = false) - u = findall(res.y1 .== minimum(res.y1))[1] #best parameters combination - """) - # save best lwplsr parameters - self.best_lwplsr_params = {'nlvdis' : jl.res.nlvdis[jl.u], 'metric' : str(jl.res.metric[jl.u]), 'h' : jl.res.h[jl.u], 'k' : jl.res.k[jl.u], 'nlv' : jl.res.nlv[jl.u]} - print('best lwplsr params ' + str(self.best_lwplsr_params)) + print('LWPLSR - no tuning, using best parameters from model creation') + # # set tuning parameters to test + # jl.seval(""" + # nlvdis = [5; 10; 15] ; metric = [:eucl; :mah] + # h = [1; 2; 6; Inf] ; k = [30; 80; 200] + # nlv = 5:15 + # pars = Jchemo.mpar(nlvdis = nlvdis, metric = metric, h = h, k = k) + # """) + # # split Train data into Cal/Val for tuning + # jl.seval(""" + # pct = .3 + # ntrain = Jchemo.nro(x_spectra) + # nval = Int(round(pct * ntrain)) + # s = Jchemo.samprand(ntrain, nval) + # Xcal = x_spectra[s.train, :] + # ycal = y[s.train] + # Xval = x_spectra[s.test, :] + # yval = y[s.test] + # ncal = ntrain - nval + # """) + # + # # Create LWPLSR model and tune with GridScore + # jl.seval(""" + # mod = Jchemo.model(Jchemo.lwplsr) + # # res = gridscore(mod, Xcal, ycal, Xval, yval; score = Jchemo.rmsep, pars, nlv, verbose = false) + # # u = findall(res.y1 .== minimum(res.y1))[1] #best parameters combination + # # """) + # # save best lwplsr parameters + # self.best_lwplsr_params = {'nlvdis' : jl.res.nlvdis[jl.u], 'metric' : str(jl.res.metric[jl.u]), 'h' : jl.res.h[jl.u], 'k' : jl.res.k[jl.u], 'nlv' : jl.res.nlv[jl.u]} + # print('best lwplsr params ' + str(self.best_lwplsr_params)) + # import best params from model creation + jl.nlvdis = self.preT['nlvdis'] + jl.metric = self.preT['metric'] + jl.h = self.preT['h'] + jl.k = self.preT['k'] + jl.nlv = self.preT['nlv'] # run LWPLSR model with best parameters jl.seval(""" - mod = Jchemo.model(Jchemo.lwplsr; nlvdis = res.nlvdis[u], metric = res.metric[u], h = res.h[u], k = res.k[u], nlv = res.nlv[u]) + mod = Jchemo.model(Jchemo.lwplsr; nlvdis = nlvdis, metric = Symbol(metric), h = h, k = k, nlv = nlv) # Fit model Jchemo.fit!(mod, x_spectra, y) """) diff --git a/src/utils/LWPLSR_Call.py b/src/utils/LWPLSR_Call.py index 2d3491a..5480f87 100644 --- a/src/utils/LWPLSR_Call.py +++ b/src/utils/LWPLSR_Call.py @@ -60,7 +60,9 @@ elif 'predict' in temp_files_list: for i in data_to_work_with: dataset.append(np.genfromtxt(temp_path / str(i + ".csv"), delimiter=',')) print('CSV imported') - preT = False + with open(temp_path / "lwplsr_best_params.json", "r") as outfile: + preT = json.load(outfile) + print('LWPLSR best parameters imported') # launch LWPLSR Class from LWPLSR_.py in utils print('start model creation') Reg = LWPLSR(dataset, preT, 'Prediction') @@ -76,7 +78,5 @@ elif 'predict' in temp_files_list: json_export[i] = Reg.predict_pred_data_[i].to_dict() # add the lwplsr global model to the json json_export['model'] = str(Reg.model_) - # add the best parameters for the lwplsr obtained from GridScore tuning - json_export['best_lwplsr_params'] = Reg.best_lwplsr_params_ with open(temp_path / "lwplsr_outputs.json", "w+") as outfile: json.dump(json_export, outfile) -- GitLab