From 7c9546c2dc502a971836f9f2e77b90ab9be5439d Mon Sep 17 00:00:00 2001
From: barthes <nicolas.barthes@cefe.cnrs.fr>
Date: Tue, 25 Jun 2024 12:15:29 +0200
Subject: [PATCH] LWPLSR subprocess documentation

---
 .gitignore                   |  1 +
 src/Class_Mod/LWPLSR_.py     | 39 ++++++++++++++++--------------------
 src/Class_Mod/LWPLSR_Call.py | 11 ++++++++--
 3 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/.gitignore b/.gitignore
index 0c742b4..927355c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,4 @@ src/Report/*.zip
 src/Report/figures/*.pdf
 src/Report/figures/*.png
 config/config.json
+data/params/Preprocessing.json
\ No newline at end of file
diff --git a/src/Class_Mod/LWPLSR_.py b/src/Class_Mod/LWPLSR_.py
index a7bd379..da661d6 100644
--- a/src/Class_Mod/LWPLSR_.py
+++ b/src/Class_Mod/LWPLSR_.py
@@ -7,33 +7,28 @@ class LWPLSR:
 
     Returns:
         self.scores (DataFrame): various metrics and scores
-        self.predicted_results_on_train (DataFrame):
-        self.predicted_results_on_test (DataFrame):
+        self.predicted_results (Dictionary): Dict containing all predicted results (train, test, cross-validation)
         self.mod (Julia model): the prepared model
     """
     def __init__(self, dataset):
         """Initiate the LWPLSR and prepare data for Julia computing."""
-
-        # self.x_train, self.y_train, self.x_test, self.y_test = [dataset[i] for i in range(len(dataset))]
+        # get train / test data from dataset
         self.x_train, self.y_train, self.x_test, self.y_test = [dataset[i] for i in range(4)]
+        # derive the number of KFolds (4 arrays per fold after the 4 train/test arrays) and get the CV data from dataset
         self.nb_fold = int((len(dataset)-4)/4)
         for i in range(self.nb_fold):
             setattr(self, "xtr_fold"+str(i+1), dataset[i+7])
             setattr(self, "ytr_fold"+str(i+1), dataset[i+13])
             setattr(self, "xte_fold"+str(i+1), dataset[i+4])
-            # setattr(self, "yte_fold"+str(i+1), dataset[i+10])
             setattr(jl, "xtr_fold"+str(i+1), dataset[i+7])
             setattr(jl, "ytr_fold"+str(i+1), dataset[i+13])
             setattr(jl, "xte_fold"+str(i+1), dataset[i+4])
-            # setattr(jl, "yte_fold"+str(i+1), dataset[i+10])
 
-        # prepare to send dataframes to julia and Jchemo
+        # prepare to send dataframes to julia and Jchemo (with the jl. prefix)
         jl.x_train, jl.y_train, jl.x_test, jl.y_test = self.x_train, self.y_train, self.x_test, self.y_test
 
         # initialize vars from the class
         y_shape = self.y_test.shape
-        self.predicted_results_on_test = pd.DataFrame
-        self.predicted_results_on_train = pd.DataFrame
         self.pred_test = np.zeros(shape=(y_shape[0], 1))
         self.pred_train = np.zeros(shape=(y_shape[0], 1))
         self.mod = ""
@@ -52,7 +47,7 @@ class LWPLSR:
         Returns:
             self.mod (Julia model): the prepared model
         """
-        # launch Julia Jchemo lwplsr
+        # launch Julia Jchemo lwplsr and convert DataFrames from Python Pandas DataFrame to Julia DataFrame
         jl.seval("""
         using DataFrames
         using Pandas
@@ -63,7 +58,7 @@ class LWPLSR:
         y_test |> Pandas.DataFrame |> DataFrames.DataFrame
         """)
         print('LWPLSR - tuning')
-        # set tuning parameters
+        # set tuning parameters to test
         jl.seval("""
         nlvdis = [5; 10; 15] ; metric = [:eucl; :mah] 
         h = [1; 2; 6; Inf] ; k = [30; 80; 200]  
@@ -83,21 +78,22 @@ class LWPLSR:
         ncal = ntrain - nval 
         """)
 
-        # Create LWPLSR model and tune
+        # Create LWPLSR model and tune with GridScore
         jl.seval("""
         mod = Jchemo.model(Jchemo.lwplsr)
         res = gridscore(mod, Xcal, ycal, Xval, yval; score = Jchemo.rmsep, pars, nlv, verbose = false)
         u = findall(res.y1 .== minimum(res.y1))[1] #best parameters combination
         """)
+        # save best lwplsr parameters
         self.best_lwplsr_params = {'nlvdis' : jl.res.nlvdis[jl.u], 'metric' : str(jl.res.metric[jl.u]), 'h' : jl.res.h[jl.u], 'k' : jl.res.k[jl.u], 'nlv' : jl.res.nlv[jl.u]}
         print('best lwplsr params ' + str(self.best_lwplsr_params))
-        print('LWPLSR - best params ok')
-        # calculate LWPLSR model with best parameters
+        # build the LWPLSR model with the best parameters and fit it on the training data
         jl.seval("""
         mod = Jchemo.model(Jchemo.lwplsr; nlvdis = res.nlvdis[u], metric = res.metric[u], h = res.h[u], k = res.k[u], nlv = res.nlv[u])
         # Fit model
         Jchemo.fit!(mod, x_train, y_train)
         """)
+        # save Julia Jchemo model
         self.mod = jl.mod
 
     def Jchemo_lwplsr_predict(self):
@@ -128,14 +124,13 @@ class LWPLSR:
         print('LWPLSR - end')
 
     def Jchemo_lwplsr_cv(self):
-        """Send data to Julia to predict with lwplsr.
+        """Send Cross-Validation data to Julia to fit & predict with lwplsr.
 
         Args:
-            self.mod (Julia model): the prepared model
+            self.best_lwplsr_params: the best parameters to use (from tuning) for CV
             self.xtr_fold1 (DataFrame):
             self.ytr_fold1 (DataFrame):
             self.xte_fold1 (DataFrame):
-            self.yte_fold1 (DataFrame):
 
         Returns:
             self.pred_cv (Julia DataFrame): predicted values on x_train with Cross-Validation
@@ -144,7 +139,7 @@ class LWPLSR:
             jl.Xtr = getattr(self, "xtr_fold"+str(i+1))
             jl.Ytr = getattr(self, "ytr_fold"+str(i+1))
             jl.Xte = getattr(self, "xte_fold"+str(i+1))
-            # jl.Yte = getattr(self, "yte_fold"+str(i+1))
+            # convert Python Pandas DataFrame to Julia DataFrame
             jl.seval("""
             using DataFrames
             using Pandas
@@ -153,6 +148,7 @@ class LWPLSR:
             Ytr |> Pandas.DataFrame |> DataFrames.DataFrame
             Xte |> Pandas.DataFrame |> DataFrames.DataFrame
             """)
+            # set the lwplsr parameters to the best ones found during tuning
             jl.nlvdis = int(self.best_lwplsr_params['nlvdis'])
             jl.metric = self.best_lwplsr_params['metric']
             jl.h = self.best_lwplsr_params['h']
@@ -169,15 +165,14 @@ class LWPLSR:
             res = Jchemo.predict(mod_cv, Xte)
             res.pred
             """)
+            # save predicted values for each KFold in the predicted_results dictionary
             self.predicted_results["CV" + str(i+1)] = pd.DataFrame(pred_cv)
 
     @property
     def pred_data_(self):
         # convert predicted data from x_test to Pandas DataFrame
-        self.predicted_results_on_test = pd.DataFrame(self.pred_test)
-        self.predicted_results_on_train = pd.DataFrame(self.pred_train)
-        self.predicted_results["pred_data_train"] = self.predicted_results_on_train
-        self.predicted_results["pred_data_test"] = self.predicted_results_on_test
+        self.predicted_results["pred_data_train"] = pd.DataFrame(self.pred_train)
+        self.predicted_results["pred_data_test"] = pd.DataFrame(self.pred_test)
         return self.predicted_results
 
     @property
diff --git a/src/Class_Mod/LWPLSR_Call.py b/src/Class_Mod/LWPLSR_Call.py
index 49c674c..0070091 100644
--- a/src/Class_Mod/LWPLSR_Call.py
+++ b/src/Class_Mod/LWPLSR_Call.py
@@ -7,35 +7,42 @@ import os
 # loading the lwplsr_inputs.json
 temp_path = Path("temp/")
 data_to_work_with = ['x_train_np', 'y_train_np', 'x_test_np', 'y_test_np']
+# collect the cross-validation files found in temp/ (how many there are depends on the KFold number)
 temp_files_list = os.listdir(temp_path)
 nb_fold = 0
 for i in temp_files_list:
     if 'fold' in i:
+        # add CV file name to data_to_work_with
         data_to_work_with.append(str(i)[:-4])
+        # and count the CV files (nb_fold is divided by 4 further down to get the number of KFolds)
         nb_fold += 1
+# Import data from csv files in the temp/ folder
 dataset = []
 for i in data_to_work_with:
     dataset.append(np.genfromtxt(temp_path / str(i + ".csv"), delimiter=','))
 print('CSV imported')
+# launch LWPLSR Class from LWPLSR_.py in Class_Mod
 print('start model creation')
 Reg = LWPLSR(dataset)
 print('model created. \nnow fit')
 LWPLSR.Jchemo_lwplsr_fit(Reg)
 print('now predict')
 LWPLSR.Jchemo_lwplsr_predict(Reg)
-
 print('now CV')
 LWPLSR.Jchemo_lwplsr_cv(Reg)
 
-
+# Export results in a json file to bring data back to 2-model_creation.py and streamlit interface
 print('export to json')
 pred = ['pred_data_train', 'pred_data_test']
+# add KFold results to predicted data
 for i in range(int(nb_fold/4)):
     pred.append("CV" + str(i+1))
 json_export = {}
 for i in pred:
     json_export[i] = Reg.pred_data_[i].to_dict()
+# add the lwplsr global model to the json
 json_export['model'] = str(Reg.model_)
+# add the best parameters for the lwplsr obtained from GridScore tuning
 json_export['best_lwplsr_params'] = Reg.best_lwplsr_params_
 with open(temp_path / "lwplsr_outputs.json", "w+") as outfile:
     json.dump(json_export, outfile)
-- 
GitLab