prepare LWPLSR for CV

f0bb9226 · Nicolas Barthes · d49f8a07 · f0bb9226 · f0bb9226 · f0bb9226
Commit f0bb9226 authored 10 months ago by Nicolas Barthes
--- a/src/Class_Mod/LWPLSR_.py
+++ b/src/Class_Mod/LWPLSR_.py
 from juliacall import Main as jl
 import numpy as np
 import pandas as pd
+from sklearn.model_selection import KFold
 class LWPLSR:
+    """The lwpls regression model from Jchemo (M. Lesnoff)
+    Returns:
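+        self.scores (DataFrame): various metrics and scores (currently disabled: metrics_ is commented out)
+        self.predicted_results_on_train (DataFrame):
+        self.predicted_results_on_cv1, _cv2, _cv3 (DataFrame): predictions for x_test_cv1, x_test_cv2, x_test_cv3 (replace the former single predicted_results_on_cv)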
+        self.predicted_results_on_test (DataFrame):
+        self.mod (Julia model): the prepared model
    """
-    The lwpls regression model from Jchemo (M. Lesnoff)
+    def __init__(self, x_train, y_train, x_test, y_test, x_train_cv1, y_train_cv1, x_test_cv1, y_test_cv1, x_train_cv2, y_train_cv2, x_test_cv2, y_test_cv2, x_train_cv3, y_train_cv3, x_test_cv3, y_test_cv3):
-    """
-    def __init__(self, x_train, y_train, x_test, y_test):
        """Initiate the LWPLSR and prepare data for Julia computing."""
        self.x_train, self.y_train, self.x_test, self.y_test = x_train, y_train, x_test, y_test
+        self.x_train_cv1, self.y_train_cv1, self.x_test_cv1, self.y_test_cv1 = x_train_cv1, y_train_cv1, x_test_cv1, y_test_cv1
+        self.x_train_cv2, self.y_train_cv2, self.x_test_cv2, self.y_test_cv2 = x_train_cv2, y_train_cv2, x_test_cv2, y_test_cv2
+        self.x_train_cv3, self.y_train_cv3, self.x_test_cv3, self.y_test_cv3 = x_train_cv3, y_train_cv3, x_test_cv3, y_test_cv3
        # prepare to send dataframes to julia and Jchemo
        jl.x_train, jl.y_train, jl.x_test, jl.y_test = self.x_train, self.y_train, self.x_test, self.y_test
-        # Pre-treatment of x_train and x_test
+        jl.x_train_cv1, jl.y_train_cv1, jl.x_test_cv1, jl.y_test_cv1 = self.x_train_cv1, self.y_train_cv1, self.x_test_cv1, self.y_test_cv1
-        # jl.seval("""
+        jl.x_train_cv2, jl.y_train_cv2, jl.x_test_cv2, jl.y_test_cv2 = self.x_train_cv2, self.y_train_cv2, self.x_test_cv2, self.y_test_cv2
-        # # using DataFrames
+        jl.x_train_cv3, jl.y_train_cv3, jl.x_test_cv3, jl.y_test_cv3= self.x_train_cv3, self.y_train_cv3, self.x_test_cv3, self.y_test_cv3
-        # # using Pandas
+        # optimize lwplsr parameters with Jchemo
-        # using Jchemo
-        # mod1 = Jchemo.model(snv; centr = true, scal = true)
-        # mod2 = Jchemo.model(savgol; npoint = 15, deriv = 1, degree = 2)
-        # mod = Jchemo.pip(mod1, mod2)
-        # Jchemo.fit!(mod, x_train)
-        # x_train = Jchemo.transf(mod1, x_train)
-        # Jchemo.fit!(mod, x_test)
-        # x_test = Jchemo.transf(mod1, x_test)
-        # """)
        # jl.seval("""
        # ntrain = nro(x_train)
        # segm = segmkf(ntrain, 4; rep = 5)
@@ -36,17 +38,22 @@ class LWPLSR:
        # initialize vars from the class
        y_shape = y_test.shape
-        self.scores = pd.DataFrame
+        y_shape_cv1 = y_test_cv1.shape
+        y_shape_cv2 = y_test_cv2.shape
+        y_shape_cv3 = y_test_cv3.shape
+        # self.scores = pd.DataFrame
        self.predicted_results_on_test = pd.DataFrame
        self.predicted_results_on_train = pd.DataFrame
        self.predicted_results_on_cv = pd.DataFrame
        self.pred_test = np.zeros(shape=(y_shape[0], 1))
        self.pred_train = np.zeros(shape=(y_shape[0], 1))
-        self.pred_cv = np.zeros(shape=(y_shape[0], 1))
+        self.pred_cv1 = np.zeros(shape=(y_shape_cv1[0], 1))
+        self.pred_cv2 = np.zeros(shape=(y_shape_cv2[0], 1))
+        self.pred_cv3 = np.zeros(shape=(y_shape_cv3[0], 1))
        self.mod = ""
    def Jchemo_lwplsr_fit(self):
-        """Send data to Julia to compute lwplsr.
+        """Send data to Julia to fit lwplsr.
        Args:
            self.jl.x_train (DataFrame):
@@ -55,8 +62,7 @@ class LWPLSR:
            self.jl.y_test (DataFrame):
        Returns:
-            self.scores (DataFrame): various metrics and scores
+            self.mod (Julia model): the prepared model
-            self.predicted_results_on_test (DataFrame):
        """
        # launch Julia Jchemo lwplsr
        jl.seval("""
@@ -67,6 +73,18 @@ class LWPLSR:
        y_train |> Pandas.DataFrame |> DataFrames.DataFrame
        x_test |> Pandas.DataFrame |> DataFrames.DataFrame
        y_test |> Pandas.DataFrame |> DataFrames.DataFrame
+        x_train_cv1 |> Pandas.DataFrame |> DataFrames.DataFrame
+        y_train_cv1 |> Pandas.DataFrame |> DataFrames.DataFrame
+        x_test_cv1 |> Pandas.DataFrame |> DataFrames.DataFrame
+        y_test_cv1 |> Pandas.DataFrame |> DataFrames.DataFrame
+        x_train_cv2 |> Pandas.DataFrame |> DataFrames.DataFrame
+        y_train_cv2 |> Pandas.DataFrame |> DataFrames.DataFrame
+        x_test_cv2 |> Pandas.DataFrame |> DataFrames.DataFrame
+        y_test_cv2 |> Pandas.DataFrame |> DataFrames.DataFrame
+        x_train_cv3 |> Pandas.DataFrame |> DataFrames.DataFrame
+        y_train_cv3 |> Pandas.DataFrame |> DataFrames.DataFrame
+        x_test_cv3 |> Pandas.DataFrame |> DataFrames.DataFrame
+        y_test_cv3 |> Pandas.DataFrame |> DataFrames.DataFrame
        """)
        # Create LWPLSR model and fit
        jl.seval("""
@@ -76,6 +94,18 @@ class LWPLSR:
        # Fit model
        Jchemo.fit!(mod, x_train, y_train)
        """)
+        # CV models: fit one independent copy of the model on each fold's training split
+        jl.seval("""
+        nlvdis = 5 ; metric = :mah
+        h = 1 ; k = 200 ; nlv = 15 #; scal = true
+        # deepcopy is required: `mod_cvN = mod` only aliases mod, so each fit! would overwrite the other folds and mod itself
+        mod_cv1 = deepcopy(mod)
+        mod_cv2 = deepcopy(mod)
+        mod_cv3 = deepcopy(mod)
+        Jchemo.fit!(mod_cv1, x_train_cv1, y_train_cv1)
+        Jchemo.fit!(mod_cv2, x_train_cv2, y_train_cv2)
+        Jchemo.fit!(mod_cv3, x_train_cv3, y_train_cv3)
+        """)
        # jl.seval("""
        # mod = Jchemo.model(Jchemo.lwplsr)
@@ -87,77 +117,107 @@ class LWPLSR:
        self.mod = jl.mod
    def Jchemo_lwplsr_predict(self):
+        """Run Jchemo.predict in the Julia session and store each `res.pred` on self.
+        Args:
+            (none besides self; the inputs below are read from the Julia session, not from self)
+            mod, mod_cv1, mod_cv2, mod_cv3 (Julia globals): models, not created in this method
+            x_test, x_train (Julia globals): predicted with mod
+            x_test_cv1, x_test_cv2, x_test_cv3 (Julia globals): predicted with the matching mod_cvN
+            NOTE(review): presumably Jchemo_lwplsr_fit must have run first - confirm
+        Returns:
+            self.pred_test: `res.pred` of the prediction on x_test
+            self.pred_train: `res.pred` of the prediction on x_train
+            self.pred_cv1, self.pred_cv2, self.pred_cv3: `res.pred` of the predictions on x_test_cv1..3
+        """
        # Predictions on x_test and store in self.pred
        self.pred_test = jl.seval("""
+        println("start test predict")
        res = Jchemo.predict(mod, x_test)
        res.pred
        """)
        self.pred_train = jl.seval("""
+        println("start train predict")
        res = Jchemo.predict(mod, x_train)
        res.pred
        """)
-        self.pred_cv = self.pred_train
+        self.pred_cv1 = jl.seval("""
+        println("start test_cv1 predict")
+        res = Jchemo.predict(mod_cv1, x_test_cv1)
+        res.pred
+        """)
+        self.pred_cv2 = jl.seval("""
+        println("start test_cv2 predict")
+        res = Jchemo.predict(mod_cv2, x_test_cv2)
+        res.pred
+        """)
+        self.pred_cv3 = jl.seval("""
+        println("start test_cv3 predict")
+        res = Jchemo.predict(mod_cv3, x_test_cv3)
+        res.pred
+        """)
    @property
    def pred_data_(self):
        # convert predicted data from x_test to Pandas DataFrame
        self.predicted_results_on_test = pd.DataFrame(self.pred_test)
        self.predicted_results_on_train = pd.DataFrame(self.pred_train)
-        # self.predicted_results_on_cv = pd.DataFrame(self.pred_cv)
+        self.predicted_results_on_cv1 = pd.DataFrame(self.pred_cv1)
-        self.predicted_results_on_cv = pd.DataFrame(self.pred_train)
+        self.predicted_results_on_cv2 = pd.DataFrame(self.pred_cv2)
-        return self.predicted_results_on_train, self.predicted_results_on_cv, self.predicted_results_on_test
+        self.predicted_results_on_cv3 = pd.DataFrame(self.pred_cv3)
+        return self.predicted_results_on_train, self.predicted_results_on_cv1, self.predicted_results_on_cv2, self.predicted_results_on_cv3, self.predicted_results_on_test
    @property
    def model_(self):
        return self.mod
-    @property
+    # @property
-    def metrics_(self):
+    # def metrics_(self):
-        jl.pred_test = self.pred_test
+    #     jl.pred_test = self.pred_test
-        jl.seval("""
+    #     jl.seval("""
-        using Jchemo
+    #     using Jchemo
-        """)
+    #     """)
-        scorermsep_test = jl.seval("""
+    #     scorermsep_test = jl.seval("""
-            first(Jchemo.rmsep(pred_test, y_test))
+    #         first(Jchemo.rmsep(pred_test, y_test))
-            """)
+    #         """)
-        scoremr2_test = jl.seval("""
+    #     scoremr2_test = jl.seval("""
-            first(Jchemo.r2(pred_test, y_test))
+    #         first(Jchemo.r2(pred_test, y_test))
-            """)
+    #         """)
-        scorerpd_test = jl.seval("""
+    #     scorerpd_test = jl.seval("""
-            first(Jchemo.rpd(pred_test, y_test))
+    #         first(Jchemo.rpd(pred_test, y_test))
-            """)
+    #         """)
-        scoremsep_test = jl.seval("""
+    #     scoremsep_test = jl.seval("""
-            first(Jchemo.sep(pred_test, y_test))
+    #         first(Jchemo.sep(pred_test, y_test))
-            """)
+    #         """)
-        jl.pred_train = self.pred_train
+    #     jl.pred_train = self.pred_train
-        scorermsep_train = jl.seval("""
+    #     scorermsep_train = jl.seval("""
-            first(Jchemo.rmsep(pred_train, y_train))
+    #         first(Jchemo.rmsep(pred_train, y_train))
-            """)
+    #         """)
-        scoremr2_train = jl.seval("""
+    #     scoremr2_train = jl.seval("""
-            first(Jchemo.r2(pred_train, y_train))
+    #         first(Jchemo.r2(pred_train, y_train))
-            """)
+    #         """)
-        scorerpd_train = jl.seval("""
+    #     scorerpd_train = jl.seval("""
-            first(Jchemo.rpd(pred_train, y_train))
+    #         first(Jchemo.rpd(pred_train, y_train))
-            """)
+    #         """)
-        scoremsep_train = jl.seval("""
+    #     scoremsep_train = jl.seval("""
-            first(Jchemo.sep(pred_train, y_train))
+    #         first(Jchemo.sep(pred_train, y_train))
-            """)
+    #         """)
-        jl.pred_cv = self.pred_cv
+    #     jl.pred_cv = self.pred_cv
-        scorermsep_cv = jl.seval("""
+    #     scorermsep_cv = jl.seval("""
-            first(Jchemo.rmsep(pred_cv, y_train))
+    #         first(Jchemo.rmsep(pred_cv, y_train))
-            """)
+    #         """)
-        scoremr2_cv = jl.seval("""
+    #     scoremr2_cv = jl.seval("""
-            first(Jchemo.r2(pred_cv, y_train))
+    #         first(Jchemo.r2(pred_cv, y_train))
-            """)
+    #         """)
-        scorerpd_cv = jl.seval("""
+    #     scorerpd_cv = jl.seval("""
-            first(Jchemo.rpd(pred_cv, y_train))
+    #         first(Jchemo.rpd(pred_cv, y_train))
-            """)
+    #         """)
-        scoremsep_cv = jl.seval("""
+    #     scoremsep_cv = jl.seval("""
-            first(Jchemo.sep(pred_cv, y_train))
+    #         first(Jchemo.sep(pred_cv, y_train))
-            """)
+    #         """)
+    #
+    #
-        self.scores = pd.DataFrame([[scoremr2_test, scorermsep_test, scoremsep_test, scorerpd_test]], columns=['r2', 'rmsep', 'msep', 'rpd'], index=['test'])
+    #     self.scores = pd.DataFrame([[scoremr2_test, scorermsep_test, scoremsep_test, scorerpd_test]], columns=['r2', 'rmsep', 'msep', 'rpd'], index=['test'])
-        self.scores = pd.concat([self.scores, pd.DataFrame([[scoremr2_train, scorermsep_train, scoremsep_train, scorerpd_train]], columns=['r2', 'rmsep', 'msep', 'rpd'], index = ["train"]), pd.DataFrame([[scoremr2_cv, scorermsep_cv, scoremsep_cv, scorerpd_cv]], columns=['r2', 'rmsep', 'msep', 'rpd'], index = ["cv"])])#
+    #     self.scores = pd.concat([self.scores, pd.DataFrame([[scoremr2_train, scorermsep_train, scoremsep_train, scorerpd_train]], columns=['r2', 'rmsep', 'msep', 'rpd'], index = ["train"]), pd.DataFrame([[scoremr2_cv, scorermsep_cv, scoremsep_cv, scorerpd_cv]], columns=['r2', 'rmsep', 'msep', 'rpd'], index = ["cv"])])#
-        return self.scores
+    #     return self.scores
--- a/src/Class_Mod/LWPLSR_Call.py
+++ b/src/Class_Mod/LWPLSR_Call.py
@@ -5,23 +5,24 @@ from LWPLSR_ import LWPLSR
 # loading the lwplsr_inputs.json
 temp_path = Path("temp/")
-for i in ['x_train_np', 'y_train_np', 'x_test_np', 'y_test_np']:
+for i in ['x_train_np', 'y_train_np', 'x_test_np', 'y_test_np']:#,'x_train_np_cv1', 'y_train_np_cv1', 'x_test_np_cv1', 'y_test_np_cv1', 'x_train_np_cv2', 'y_train_np_cv2', 'x_test_np_cv2', 'y_test_np_cv2', 'x_train_np_cv3', 'y_train_np_cv3', 'x_test_np_cv3', 'y_test_np_cv3',]:
    globals()[i] = np.genfromtxt(temp_path / str(i + ".csv"), delimiter=',')
 print('CSV imported')
 print('start model creation')
-Reg = LWPLSR(x_train_np, y_train_np, x_test_np, y_test_np)
+Reg = LWPLSR(x_train_np, y_train_np, x_test_np, y_test_np, x_train_np_cv1, y_train_np_cv1, x_test_np_cv1, y_test_np_cv1, x_train_np_cv2, y_train_np_cv2, x_test_np_cv2, y_test_np_cv2, x_train_np_cv3, y_train_np_cv3, x_test_np_cv3, y_test_np_cv3)
 print('model created. \n now fit')
 LWPLSR.Jchemo_lwplsr_fit(Reg)
 print('now predict')
 LWPLSR.Jchemo_lwplsr_predict(Reg)
 json_export = {}
-data_to_export = ['model', 'pred_data', 'metrics']
 json_export['pred_data_train'] = Reg.pred_data_[0].to_dict()
-json_export['pred_data_cv'] = Reg.pred_data_[1].to_dict()
+json_export['pred_data_cv1'] = Reg.pred_data_[1].to_dict()
-json_export['pred_data_test'] = Reg.pred_data_[2].to_dict()
+json_export['pred_data_cv2'] = Reg.pred_data_[2].to_dict()
-json_export['metrics'] = Reg.metrics_.to_dict()
+json_export['pred_data_cv3'] = Reg.pred_data_[3].to_dict()
+json_export['pred_data_test'] = Reg.pred_data_[4].to_dict()
 json_export['model'] = str(Reg.model_)
+# json_export['metrics'] = Reg.metrics_.to_dict()
 with open(temp_path / "lwplsr_outputs.json", "w+") as outfile:
    json.dump(json_export, outfile)
 print(Reg.metrics_)
\ No newline at end of file
--- a/src/pages/2-model_creation.py
+++ b/src/pages/2-model_creation.py
@@ -140,8 +140,9 @@ if not spectra.empty and not y.empty:
        reg_model = Reg.model_
        #M2.dataframe(Pin.pred_data_)
    elif regression_algo == reg_algo[2]:
-        data_to_work_with = ['x_train_np', 'y_train_np', 'x_test_np', 'y_test_np']
+        data_to_work_with = ['x_train_np', 'y_train_np', 'x_test_np', 'y_test_np']#,'x_train_np_cv1', 'y_train_np_cv1', 'x_test_np_cv1', 'y_test_np_cv1', 'x_train_np_cv2', 'y_train_np_cv2', 'x_test_np_cv2', 'y_test_np_cv2', 'x_train_np_cv3', 'y_train_np_cv3', 'x_test_np_cv3', 'y_test_np_cv3',]
        x_train_np, y_train_np, x_test_np, y_test_np = X_train.to_numpy(), y_train.to_numpy(), X_test.to_numpy(), y_test.to_numpy()
+        # x_train_np_cv1, y_train_np_cv1, x_test_np_cv1, y_test_np_cv1, x_train_np_cv2, y_train_np_cv2, x_test_np_cv2, y_test_np_cv2, x_train_np_cv3, y_train_np_cv3, x_test_np_cv3, y_test_np_cv3, = X_train_cv1.to_numpy(), y_train_cv1.to_numpy(), X_test_cv1.to_numpy(), y_test_cv1.to_numpy(), X_train_cv2.to_numpy(), y_train_cv2.to_numpy(), X_test_cv2.to_numpy(), y_test_cv2.to_numpy(), X_train_cv3.to_numpy(), y_train_cv3.to_numpy(), X_test_cv3.to_numpy(), y_test_cv3.to_numpy()
        temp_path = Path('temp/')
        for i in data_to_work_with: np.savetxt(temp_path / str(i + ".csv"), vars()[i], delimiter=",")
        import subprocess
@@ -151,13 +152,17 @@ if not spectra.empty and not y.empty:
                Reg_json = json.load(outfile)
                for i in data_to_work_with: os.unlink(temp_path / str(i + ".csv"))
        os.unlink(temp_path / "lwplsr_outputs.json")
-        Reg = type('obj', (object,), {'metrics_' : pd.json_normalize(Reg_json['metrics']), 'pred_data_' : [pd.json_normalize(Reg_json['pred_data_train']), pd.json_normalize(Reg_json['pred_data_cv']),pd.json_normalize(Reg_json['pred_data_test'])]})
+        Reg = type('obj', (object,), {'model' : pd.json_normalize(Reg_json['model']), 'pred_data_' : [pd.json_normalize(Reg_json['pred_data_train']), pd.json_normalize(Reg_json['pred_data_cv1']), pd.json_normalize(Reg_json['pred_data_cv2']), pd.json_normalize(Reg_json['pred_data_cv3']), pd.json_normalize(Reg_json['pred_data_test'])]})
        Reg.pred_data_[0] = Reg.pred_data_[0].T.reset_index().drop(columns = ['index'])
        Reg.pred_data_[0].index = list(y_train.index)
-        Reg.pred_data_[1] = Reg.pred_data_[1].T.reset_index().drop(columns = ['index'])
+        # Reg.pred_data_[1] = Reg.pred_data_[1].T.reset_index().drop(columns = ['index'])
-        Reg.pred_data_[1].index = list(y_train.index)
+        # Reg.pred_data_[1].index = list(y_train_cv1.index)
-        Reg.pred_data_[2] = Reg.pred_data_[2].T.reset_index().drop(columns = ['index'])
+        #  Reg.pred_data_[2] = Reg.pred_data_[2].T.reset_index().drop(columns = ['index'])
-        Reg.pred_data_[2].index = list(y_test.index)
+        # Reg.pred_data_[2].index = list(y_train_cv2.index)
+        #  Reg.pred_data_[3] = Reg.pred_data_[3].T.reset_index().drop(columns = ['index'])
+        # Reg.pred_data_[3].index = list(y_train_cv3.index)
+        Reg.pred_data_[4] = Reg.pred_data_[4].T.reset_index().drop(columns = ['index'])
+        Reg.pred_data_[4].index = list(y_test.index)
    elif regression_algo == reg_algo[3]:
        s = M1.number_input(label='Enter the maximum number of intervals', min_value=1, max_value=6, value=3)