From 7f8b71508379a6f8a8877ea3077fa53177ab1a26 Mon Sep 17 00:00:00 2001
From: DIANE <abderrahim.diane@cefe.cnrs.fr>
Date: Tue, 25 Jun 2024 16:45:40 +0200
Subject: [PATCH] Add K-fold cross-validation to LWPLSR model creation

---
 src/Class_Mod/LWPLSR_.py      | 81 ++++++++++++++++++++++++++------
 src/Class_Mod/LWPLSR_Call.py  | 15 +++++-
 src/pages/2-model_creation.py | 87 ++++++++++++++++++++++++++---------
 3 files changed, 148 insertions(+), 35 deletions(-)

diff --git a/src/Class_Mod/LWPLSR_.py b/src/Class_Mod/LWPLSR_.py
index 2e3d40b..a7bd379 100644
--- a/src/Class_Mod/LWPLSR_.py
+++ b/src/Class_Mod/LWPLSR_.py
@@ -14,7 +14,18 @@ class LWPLSR:
     def __init__(self, dataset):
         """Initiate the LWPLSR and prepare data for Julia computing."""
 
-        self.x_train, self.y_train, self.x_test, self.y_test = [dataset[i] for i in range(len(dataset))]
+        # Layout expected in dataset: the 4 train/test arrays first, then the fold arrays grouped
+        # by kind in this order: xte_fold*, xtr_fold*, yte_fold*, ytr_fold* (nb_fold of each).
+        self.x_train, self.y_train, self.x_test, self.y_test = [dataset[i] for i in range(4)]
+        self.nb_fold = (len(dataset) - 4) // 4
+        for i in range(self.nb_fold):
+            # offsets are derived from nb_fold instead of being hard-coded for exactly 3 folds
+            xte, xtr = dataset[4 + i], dataset[4 + self.nb_fold + i]
+            ytr = dataset[4 + 3 * self.nb_fold + i]  # yte_fold* (offset 2 * nb_fold) is not used
+            for target in (self, jl):  # keep the arrays on the instance and expose them to Julia
+                setattr(target, "xtr_fold" + str(i + 1), xtr)
+                setattr(target, "ytr_fold" + str(i + 1), ytr)
+                setattr(target, "xte_fold" + str(i + 1), xte)
 
         # prepare to send dataframes to julia and Jchemo
         jl.x_train, jl.y_train, jl.x_test, jl.y_test = self.x_train, self.y_train, self.x_test, self.y_test
@@ -23,20 +34,20 @@ class LWPLSR:
         y_shape = self.y_test.shape
         self.predicted_results_on_test = pd.DataFrame
         self.predicted_results_on_train = pd.DataFrame
-        self.predicted_results_on_cv = pd.DataFrame
         self.pred_test = np.zeros(shape=(y_shape[0], 1))
         self.pred_train = np.zeros(shape=(y_shape[0], 1))
         self.mod = ""
         self.best_lwplsr_params = np.zeros(shape=(5, 1))
+        self.predicted_results = {}
 
     def Jchemo_lwplsr_fit(self):
         """Send data to Julia to fit lwplsr.
 
         Args:
-            self.jl.x_train (DataFrame):
-            self.jl.y_train (DataFrame):
-            self.jl.x_test (DataFrame):
-            self.jl.y_test (DataFrame):
+            self.x_train (DataFrame):
+            self.y_train (DataFrame):
+            self.x_test (DataFrame):
+            self.y_test (DataFrame):
 
         Returns:
             self.mod (Julia model): the prepared model
@@ -79,7 +90,7 @@ class LWPLSR:
         u = findall(res.y1 .== minimum(res.y1))[1] #best parameters combination
         """)
         self.best_lwplsr_params = {'nlvdis' : jl.res.nlvdis[jl.u], 'metric' : str(jl.res.metric[jl.u]), 'h' : jl.res.h[jl.u], 'k' : jl.res.k[jl.u], 'nlv' : jl.res.nlv[jl.u]}
-        print('best lwplsr params' + str(self.best_lwplsr_params))
+        print('best lwplsr params ' + str(self.best_lwplsr_params))
         print('LWPLSR - best params ok')
         # calculate LWPLSR model with best parameters
         jl.seval("""
@@ -94,15 +105,14 @@ class LWPLSR:
 
         Args:
             self.mod (Julia model): the prepared model
-            self.jl.x_train (DataFrame):
-            self.jl.y_train (DataFrame):
-            self.jl.x_test (DataFrame):
-            self.jl.y_test (DataFrame):
+            self.x_train (DataFrame):
+            self.y_train (DataFrame):
+            self.x_test (DataFrame):
+            self.y_test (DataFrame):
 
         Returns:
             self.pred_test (Julia DataFrame): predicted values on x_test
             self.pred_train (Julia DataFrame): predicted values on x_train
-            self.pred_cv (Julia DataFrame): predicted values on x_train with Cross-Validation
         """
         # Predictions on x_test and store in self.pred
         self.pred_test = jl.seval("""
@@ -117,13 +127,58 @@ class LWPLSR:
         """)
         print('LWPLSR - end')
 
+    def Jchemo_lwplsr_cv(self):
+        """Run the K-fold cross-validation in Julia with the best LWPLSR parameters.
+
+        For each fold, a new Jchemo lwplsr model is fitted on the fold's training
+        arrays and then used to predict the fold's held-out samples.
+
+        Args:
+            self.best_lwplsr_params (dict): tuned parameters; must be computed before this call
+            self.xtr_foldN, self.ytr_foldN, self.xte_foldN: fold arrays, N in 1..self.nb_fold
+
+        Returns:
+            None. Each fold's predictions are stored in self.predicted_results["CVN"] (DataFrame).
+        """
+        # the tuned hyper-parameters are identical for every fold: send them to Julia only once
+        jl.nlvdis = int(self.best_lwplsr_params['nlvdis'])
+        jl.metric = self.best_lwplsr_params['metric']
+        jl.h = self.best_lwplsr_params['h']
+        jl.k = int(self.best_lwplsr_params['k'])
+        jl.nlv = int(self.best_lwplsr_params['nlv'])
+        for i in range(self.nb_fold):
+            jl.Xtr = getattr(self, "xtr_fold"+str(i+1))
+            jl.Ytr = getattr(self, "ytr_fold"+str(i+1))
+            jl.Xte = getattr(self, "xte_fold"+str(i+1))
+            jl.seval("""
+            using DataFrames
+            using Pandas
+            using Jchemo
+            Xtr |> Pandas.DataFrame |> DataFrames.DataFrame
+            Ytr |> Pandas.DataFrame |> DataFrames.DataFrame
+            Xte |> Pandas.DataFrame |> DataFrames.DataFrame
+            """)
+            jl.seval("""
+            println("LWPLSR - start CV mod")
+            mod_cv = Jchemo.model(Jchemo.lwplsr; nlvdis = nlvdis, metric = Symbol(metric), h = h, k = k, nlv = nlv)
+            # Fit model
+            Jchemo.fit!(mod_cv, Xtr, Ytr)
+            """)
+            pred_cv = jl.seval("""
+            println("LWPLSR - start CV predict")
+            res = Jchemo.predict(mod_cv, Xte)
+            res.pred
+            """)
+            self.predicted_results["CV" + str(i+1)] = pd.DataFrame(pred_cv)
 
     @property
     def pred_data_(self):
         # convert predicted data from x_test to Pandas DataFrame
         self.predicted_results_on_test = pd.DataFrame(self.pred_test)
         self.predicted_results_on_train = pd.DataFrame(self.pred_train)
-        return self.predicted_results_on_train, self.predicted_results_on_test
+        self.predicted_results["pred_data_train"] = self.predicted_results_on_train
+        self.predicted_results["pred_data_test"] = self.predicted_results_on_test
+        return self.predicted_results
 
     @property
     def model_(self):
diff --git a/src/Class_Mod/LWPLSR_Call.py b/src/Class_Mod/LWPLSR_Call.py
index f8445d4..49c674c 100644
--- a/src/Class_Mod/LWPLSR_Call.py
+++ b/src/Class_Mod/LWPLSR_Call.py
@@ -2,10 +2,17 @@ import numpy as np
 from pathlib import Path
 import json
 from LWPLSR_ import LWPLSR
+import os
 
 # loading the lwplsr_inputs.json
 temp_path = Path("temp/")
 data_to_work_with = ['x_train_np', 'y_train_np', 'x_test_np', 'y_test_np']
+# os.listdir() returns names in arbitrary order, but LWPLSR picks the fold arrays by their
+# position in dataset: list the fold files in sorted order so that the positions are stable
+temp_files_list = sorted(os.listdir(temp_path))
+fold_files = [f for f in temp_files_list if 'fold' in f and f.endswith('.csv')]
+data_to_work_with.extend(f[:-4] for f in fold_files)
+nb_fold = len(fold_files)  # number of fold files (4 files per CV fold)
 dataset = []
 for i in data_to_work_with:
     dataset.append(np.genfromtxt(temp_path / str(i + ".csv"), delimiter=','))
@@ -17,11 +24,17 @@ LWPLSR.Jchemo_lwplsr_fit(Reg)
 print('now predict')
 LWPLSR.Jchemo_lwplsr_predict(Reg)
 
+print('now CV')
+LWPLSR.Jchemo_lwplsr_cv(Reg)
+
+
 print('export to json')
 pred = ['pred_data_train', 'pred_data_test']
+for i in range(int(nb_fold/4)):
+    pred.append("CV" + str(i+1))
 json_export = {}
 for i in pred:
-    json_export[i] = Reg.pred_data_[pred.index(i)].to_dict()
+    json_export[i] = Reg.pred_data_[i].to_dict()
 json_export['model'] = str(Reg.model_)
 json_export['best_lwplsr_params'] = Reg.best_lwplsr_params_
 with open(temp_path / "lwplsr_outputs.json", "w+") as outfile:
diff --git a/src/pages/2-model_creation.py b/src/pages/2-model_creation.py
index 855d47e..a56a13e 100644
--- a/src/pages/2-model_creation.py
+++ b/src/pages/2-model_creation.py
@@ -1,4 +1,5 @@
 # import streamlit
+import pandas as pd
 from Packages import *
 st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
 from Modules import *
@@ -124,7 +125,7 @@ if not spectra.empty and not y.empty:
         colnames = spectra.columns
     else:
         colnames = np.arange(spectra.shape[1])
-    
+
 
     #rd_seed = M1.slider("Customize Train-test split", min_value=1, max_value=100, value=42, format="%i")
     # Split data into training and test sets using the kennard_stone method and correlation metric, 25% of data is used for testing
@@ -133,9 +134,9 @@ if not spectra.empty and not y.empty:
     # Assign data to training and test sets
     X_train, y_train = pd.DataFrame(spectra.iloc[train_index,:]), y.iloc[train_index]
     X_test, y_test = pd.DataFrame(spectra.iloc[test_index,:]), y.iloc[test_index]
-    
 
-        #### insight on loaded data
+
+    #### insight on loaded data
     fig, ax1 = plt.subplots( figsize = (12,3))
     spectra.T.plot(legend=False, ax = ax1, linestyle = '--')
     ax1.set_ylabel('Signal intensity')
@@ -168,29 +169,73 @@ if not spectra.empty and not y.empty:
         reg_model = Reg.model_
         #M2.dataframe(Pin.pred_data_)
     elif regression_algo == reg_algo[2]:
-        # export data to csv for Julia
+        info = M1.info('Starting LWPLSR model creation... Please wait a few minutes.')
+        # export data to csv for Julia train/test
         data_to_work_with = ['x_train_np', 'y_train_np', 'x_test_np', 'y_test_np']
         x_train_np, y_train_np, x_test_np, y_test_np = X_train.to_numpy(), y_train.to_numpy(), X_test.to_numpy(), y_test.to_numpy()
+        # Cross-Validation calculation
+        nb_folds = 3
+        st.write('KFold for Cross-Validation = ' + str(nb_folds))
+        # split train data into nb_folds
+        folds = KF_CV.CV(x_train_np, y_train_np, nb_folds)
+        d = {}
+        for i in range(nb_folds):
+            # rows selected by the i-th entry of folds are held out; all other rows are the train part
+            held_out = folds[list(folds)[i]]
+            d[f"xtr_fold{i+1}"], d[f"ytr_fold{i+1}"] = np.delete(x_train_np, held_out, axis=0), np.delete(y_train_np, held_out, axis=0)
+            d[f"xte_fold{i+1}"], d[f"yte_fold{i+1}"] = x_train_np[held_out], y_train_np[held_out]
+            data_to_work_with.extend(f"{kind}_fold{i+1}" for kind in ("xtr", "ytr", "xte", "yte"))
+        # export Xtrain, Xtest, Ytrain, Ytest and all CV folds to temp folder as csv files
         temp_path = Path('temp/')
-        for i in data_to_work_with: np.savetxt(temp_path / str(i + ".csv"), vars()[i], delimiter=",")
-        # run Julia Jchemo
+        # the fold arrays are stored in d; the four train/test arrays are added under their
+        # variable names so that every exported array is found through a single mapping
+        arrays_by_name = dict(d, x_train_np=x_train_np, y_train_np=y_train_np, x_test_np=x_test_np, y_test_np=y_test_np)
+        for i in data_to_work_with:
+            j = arrays_by_name[i]
+            np.savetxt(temp_path / str(i + ".csv"), j, delimiter=",")
+        # run Julia Jchemo as subprocess
         import subprocess
         subprocess_path = Path("Class_Mod/")
         subprocess.run([f"{sys.executable}", subprocess_path / "LWPLSR_Call.py"])
         # retrieve json results from Julia JChemo
-        with open(temp_path / "lwplsr_outputs.json", "r") as outfile:
-            Reg_json = json.load(outfile)
-            for i in data_to_work_with: os.unlink(temp_path / str(i + ".csv"))
-        os.unlink(temp_path / "lwplsr_outputs.json")
-        pred = ['pred_data_train', 'pred_data_test']
-        Reg = type('obj', (object,), {'model' : Reg_json['model'], 'best_lwplsr_params' : Reg_json['best_lwplsr_params'], 'pred_data_' : [pd.json_normalize(Reg_json[i]) for i in pred]})
-        for i in range(len(pred)):
-            Reg.pred_data_[i] = Reg.pred_data_[i].T.reset_index().drop(columns = ['index'])
-            if i != 1: # if not pred_data_test
-                Reg.pred_data_[i].index = list(y_train.index)
-            else:
-                Reg.pred_data_[i].index = list(y_test.index)
-
+        try:
+            with open(temp_path / "lwplsr_outputs.json", "r") as outfile:
+                Reg_json = json.load(outfile)
+        except FileNotFoundError:  # the subprocess did not write any result file
+            Reg_json = None
+        # delete the csv files even after a failed run, so no stale fold file is left in temp/
+        for i in data_to_work_with:
+            if os.path.exists(temp_path / str(i + ".csv")): os.unlink(temp_path / str(i + ".csv"))
+        if Reg_json is None:
+            # Display error message on the interface if modeling is wrong
+            info.empty()
+            M1.warning('- ERROR during model creation -')
+            Reg = None
+        else:
+            # delete json file after import
+            os.unlink(temp_path / "lwplsr_outputs.json")
+            # format result data into Reg object
+            pred = ['pred_data_train', 'pred_data_test'] + ["CV" + str(i+1) for i in range(nb_folds)]
+            Reg = type('obj', (object,), {'model' : Reg_json['model'], 'best_lwplsr_params' : Reg_json['best_lwplsr_params'], 'pred_data_' : [pd.json_normalize(Reg_json[i]) for i in pred]})
+            Reg.CV_results_ = pd.DataFrame()
+            Reg.cv_data_ = {'YpredCV' : {}, 'idxCV' : {}}
+            # set indexes to Reg.pred_data (train, test, folds idx)
+            for i in range(len(pred)):
+                Reg.pred_data_[i] = Reg.pred_data_[i].T.reset_index().drop(columns = ['index'])
+                if i == 0: # data_train
+                    Reg.pred_data_[i].index = list(y_train.index)
+                elif i == 1: # data_test
+                    Reg.pred_data_[i].index = list(y_test.index)
+                else: # CVi: pred[2] is fold 1, hence the i-2 / i-1 shifts below
+                    Reg.pred_data_[i].index = folds[list(folds)[i-2]]
+                    Reg.CV_results_ = pd.concat([Reg.CV_results_, Reg.pred_data_[i]])
+                    Reg.cv_data_['YpredCV']['Fold' + str(i-1)] = Reg.pred_data_[i]
+                    Reg.cv_data_['idxCV']['Fold' + str(i-1)] = folds[list(folds)[i-2]]
+            Reg.CV_results_.sort_index(inplace = True)
+            Reg.CV_results_.columns = ['Ypredicted_CV']
+            # Display end of modeling message on the interface
+            info.empty()
+            M1.success('Model created!')
     elif regression_algo == reg_algo[3]:
         s = M1.number_input(label='Enter the maximum number of intervals', min_value=1, max_value=6, value=3)
         it = M1.number_input(label='Enter the number of iterations', min_value=2, max_value=10, value=3)
@@ -219,7 +264,7 @@ if not spectra.empty and not y.empty:
 
 
         ################# Model analysis ############
-    if regression_algo in reg_algo[1:]:
+    if regression_algo in reg_algo[1:] and Reg is not None:
         #M2.write('-- Pretreated data (train) visualization and important spectral regions in the model --   ')
 
         fig, (ax1, ax2) = plt.subplots(2,1, figsize = (12, 6))
@@ -365,7 +410,7 @@ if not spectra.empty and not y.empty:
 
 
 if not spectra.empty and not y.empty and regression_algo:
-    if regression_algo in reg_algo[1:]:
+    if regression_algo in reg_algo[1:] and Reg is not None:
         fig, (ax1, ax2) = plt.subplots(2,1, figsize = (12, 4), sharex=True)
         ax1.plot(colnames, np.mean(X_train, axis = 0), color = 'black', label = 'Average spectrum (Raw)')
         ax2.plot(colnames, np.mean(Reg.pretreated_spectra_ , axis = 0), color = 'black', label = 'Average spectrum (pretreated)')
-- 
GitLab