From 5b74352eafa17c702555d0f4ff298d3ae6ae397b Mon Sep 17 00:00:00 2001
From: barthes <nicolas.barthes@cefe.cnrs.fr>
Date: Tue, 17 Sep 2024 15:19:35 +0200
Subject: [PATCH] Optimize LWPLSR predictions using the best hyperparameters
 from model creation; also add automatic feature selection when loading a
 zip model, to be consistent with the model created.

---
 src/pages/2-model_creation.py | 12 +++---
 src/pages/3-prediction.py     | 10 +++--
 src/utils/LWPLSR_.py          | 69 +++++++++++++++++++----------------
 src/utils/LWPLSR_Call.py      |  6 +--
 4 files changed, 54 insertions(+), 43 deletions(-)

diff --git a/src/pages/2-model_creation.py b/src/pages/2-model_creation.py
index 354b418..a342638 100644
--- a/src/pages/2-model_creation.py
+++ b/src/pages/2-model_creation.py
@@ -340,6 +340,7 @@ if not spectra.empty and not y.empty:
                         # delete json file after import
                         os.unlink(temp_path / "lwplsr_outputs.json")
                         os.unlink(temp_path / "lwplsr_preTreatments.json")
+                        os.unlink(temp_path / 'model')
                         # format result data into Reg object
                         pred = ['pred_data_train', 'pred_data_test']### keys of the dict
                         for i in range(nb_folds):
@@ -653,17 +654,16 @@ if Reg:
                                                                                                                                                Reg.best_hyperparams_['window_length'],
                                                                                                                                                Reg.best_hyperparams_['deriv']]}}
             if model_type == 'TPE-iPLS': # export selected wavelengths
-                    pklfile['selected-wls'] = {'idx':Reg.selected_features_.T , "wls":intervalls_with_cols }
-            else: 
+                pklfile['selected-wls'] = {'idx':Reg.selected_features_.T , "wls":intervalls_with_cols }
+            elif model_type == 'LW-PLS': # export LWPLS best model parameters
+                pklfile['selected-wls'] = {'idx':None, "wls":None }
+                pklfile['lwpls_params'] = Reg.best_hyperparams_
+            else:
                 pklfile['selected-wls'] = {'idx':None, "wls":None }
                     
             with open('./report/out/file_system.pkl', "wb") as pkl:
                 dump(pklfile, pkl)
 
-
-
-
-
             return change
         preparing_results_for_downloading(change = hash_)
         
diff --git a/src/pages/3-prediction.py b/src/pages/3-prediction.py
index 8542bfd..7c5e4cf 100644
--- a/src/pages/3-prediction.py
+++ b/src/pages/3-prediction.py
@@ -102,7 +102,7 @@ with c2:
         pkl = find_pkl_files(root_dir=temp_dir)
 
         system_file = [path for path in pkl if 'file_system' in path]
-        if len(system_file) ==1 :
+        if len(system_file) == 1:
             with open(system_file[0], 'rb') as fi:
                 system_data = load(fi)
 
@@ -142,7 +142,7 @@ with c2:
                     chem_data, spectra, meta_data, _ = dx_loader(change = hash_)
                     st.success("The data have been loaded successfully", icon="✅")
                     if chem_data.to_numpy().shape[1]>0:
-                        yname = st.selectbox('Select target', options=chem_data.columns)
+                        yname = st.selectbox('Select target', options=chem_data.columns, index=chem_data.columns.to_list().index(system_data['data']['target'].name))
                         measured = chem_data.loc[:,yname] == 0
                         y = chem_data.loc[:,yname].loc[measured]
                         pred_data = spectra.loc[measured]
@@ -253,7 +253,6 @@ if not pred_data.empty:# Load the model with joblib
                             spectra = preprocess_spectra(system_data['data']['raw-spectra'], change= hash_)
                             x_pred = preprocessed
                             rownames = x_pred.index.to_list()
-            # send best_lwplsr_parameters to LWPLSR_Call.py !!!!
                             y = system_data['data']['target']
                             data_to_work_with = ['spectra_np', 'y_np', 'x_pred_np']
                             spectra_np, y_np, x_pred_np = spectra[1].to_numpy(), y.to_numpy(), x_pred.to_numpy()
@@ -261,6 +260,10 @@ if not pred_data.empty:# Load the model with joblib
                             for i in data_to_work_with:
                                 j = globals()[i]
                                 np.savetxt(temp_path / str(i + ".csv"), j, delimiter=",")
+                            # export best LWPLSR params
+                            with open(temp_path / "lwplsr_best_params.json", "w+") as outfile:
+                                json.dump(system_data['lwpls_params'], outfile)
+                            # create an empty 'predict' flag file to tell LWPLSR_Call.py that we want predictions
                             open(temp_path / 'predict', 'w').close()
                             # # run Julia Jchemo as subprocess
                             import subprocess
@@ -275,6 +278,7 @@ if not pred_data.empty:# Load the model with joblib
                                     os.unlink(temp_path / 'predict')
                                 # delete json file after import
                                 os.unlink(temp_path / "lwplsr_outputs.json")
+                                os.unlink(temp_path / "lwplsr_best_params.json")
                                 # format result data into Reg object
                                 result = DataFrame(Reg_json['y_pred'])  ### keys of the json dict
                                 result.index = rownames
diff --git a/src/utils/LWPLSR_.py b/src/utils/LWPLSR_.py
index 17e0773..1f1af55 100644
--- a/src/utils/LWPLSR_.py
+++ b/src/utils/LWPLSR_.py
@@ -42,6 +42,7 @@ class LWPLSR:
             self.x_spectra, self.y, self.x_pred = [dataset[i] for i in range(3)]
             # prepare to send dataframes to julia and Jchemo (with the jl. prefix)
             jl.x_spectra, jl.y, jl.x_pred = self.x_spectra, self.y, self.x_pred
+            self.preT = preT
             self.predicted_results = {}
 
 
@@ -152,39 +153,45 @@ class LWPLSR:
             x_pred |> Pandas.DataFrame |> DataFrames.DataFrame
             """)
         # LWPLSR tuning
-        print('LWPLSR - tuning')
-        # set tuning parameters to test
-        jl.seval("""
-            nlvdis = [5; 10; 15] ; metric = [:eucl; :mah] 
-            h = [1; 2; 6; Inf] ; k = [30; 80; 200]  
-            nlv = 5:15
-            pars = Jchemo.mpar(nlvdis = nlvdis, metric = metric, h = h, k = k)
-            """)
-        # split Train data into Cal/Val for tuning
-        jl.seval("""
-            pct = .3
-            ntrain = Jchemo.nro(x_spectra)
-            nval = Int(round(pct * ntrain))
-            s = Jchemo.samprand(ntrain, nval)
-            Xcal = x_spectra[s.train, :]
-            ycal = y[s.train]
-            Xval = x_spectra[s.test, :]
-            yval = y[s.test]
-            ncal = ntrain - nval 
-            """)
-
-        # Create LWPLSR model and tune with GridScore
-        jl.seval("""
-            mod = Jchemo.model(Jchemo.lwplsr)
-            res = gridscore(mod, Xcal, ycal, Xval, yval; score = Jchemo.rmsep, pars, nlv, verbose = false)
-            u = findall(res.y1 .== minimum(res.y1))[1] #best parameters combination
-            """)
-        # save best lwplsr parameters
-        self.best_lwplsr_params = {'nlvdis' : jl.res.nlvdis[jl.u], 'metric' : str(jl.res.metric[jl.u]), 'h' : jl.res.h[jl.u], 'k' : jl.res.k[jl.u], 'nlv' : jl.res.nlv[jl.u]}
-        print('best lwplsr params ' + str(self.best_lwplsr_params))
+        print('LWPLSR - no tuning, using best parameters from model creation')
+        # # set tuning parameters to test
+        # jl.seval("""
+        #     nlvdis = [5; 10; 15] ; metric = [:eucl; :mah]
+        #     h = [1; 2; 6; Inf] ; k = [30; 80; 200]
+        #     nlv = 5:15
+        #     pars = Jchemo.mpar(nlvdis = nlvdis, metric = metric, h = h, k = k)
+        #     """)
+        # # split Train data into Cal/Val for tuning
+        # jl.seval("""
+        #     pct = .3
+        #     ntrain = Jchemo.nro(x_spectra)
+        #     nval = Int(round(pct * ntrain))
+        #     s = Jchemo.samprand(ntrain, nval)
+        #     Xcal = x_spectra[s.train, :]
+        #     ycal = y[s.train]
+        #     Xval = x_spectra[s.test, :]
+        #     yval = y[s.test]
+        #     ncal = ntrain - nval
+        #     """)
+        #
+        # # Create LWPLSR model and tune with GridScore
+        # jl.seval("""
+        #     mod = Jchemo.model(Jchemo.lwplsr)
+        #     res = gridscore(mod, Xcal, ycal, Xval, yval; score = Jchemo.rmsep, pars, nlv, verbose = false)
+        #     u = findall(res.y1 .== minimum(res.y1))[1] #best parameters combination
+        #     """)
+        # # save best lwplsr parameters
+        # self.best_lwplsr_params = {'nlvdis' : jl.res.nlvdis[jl.u], 'metric' : str(jl.res.metric[jl.u]), 'h' : jl.res.h[jl.u], 'k' : jl.res.k[jl.u], 'nlv' : jl.res.nlv[jl.u]}
+        # print('best lwplsr params ' + str(self.best_lwplsr_params))
+        # import best params from model creation
+        jl.nlvdis = self.preT['nlvdis']
+        jl.metric = self.preT['metric']
+        jl.h = self.preT['h']
+        jl.k = self.preT['k']
+        jl.nlv = self.preT['nlv']
         # run LWPLSR model with best parameters
         jl.seval("""
-            mod = Jchemo.model(Jchemo.lwplsr; nlvdis = res.nlvdis[u], metric = res.metric[u], h = res.h[u], k = res.k[u], nlv = res.nlv[u])
+            mod = Jchemo.model(Jchemo.lwplsr; nlvdis = nlvdis, metric = Symbol(metric), h = h, k = k, nlv = nlv)
             # Fit model
             Jchemo.fit!(mod, x_spectra, y)
             """)
diff --git a/src/utils/LWPLSR_Call.py b/src/utils/LWPLSR_Call.py
index 2d3491a..5480f87 100644
--- a/src/utils/LWPLSR_Call.py
+++ b/src/utils/LWPLSR_Call.py
@@ -60,7 +60,9 @@ elif 'predict' in temp_files_list:
     for i in data_to_work_with:
         dataset.append(np.genfromtxt(temp_path / str(i + ".csv"), delimiter=','))
     print('CSV imported')
-    preT = False
+    with open(temp_path / "lwplsr_best_params.json", "r") as outfile:
+        preT = json.load(outfile)
+    print('LWPLSR best parameters imported')
     # launch LWPLSR Class from LWPLSR_.py in utils
     print('start model creation')
     Reg = LWPLSR(dataset, preT, 'Prediction')
@@ -76,7 +78,5 @@ elif 'predict' in temp_files_list:
         json_export[i] = Reg.predict_pred_data_[i].to_dict()
     # add the lwplsr global model to the json
     json_export['model'] = str(Reg.model_)
-    # add the best parameters for the lwplsr obtained from GridScore tuning
-    json_export['best_lwplsr_params'] = Reg.best_lwplsr_params_
     with open(temp_path / "lwplsr_outputs.json", "w+") as outfile:
         json.dump(json_export, outfile)
-- 
GitLab