Merge branch 'master' of https://src.koda.cnrs.fr/cefe/pace/nirs_workflow

5728ea85 · DIANE · bb9f8a44 · b03b645d · 5728ea85 · 5728ea85
Commit 5728ea85 authored 10 months ago by DIANE
--- a/src/Class_Mod/DATA_HANDLING.py
+++ b/src/Class_Mod/DATA_HANDLING.py
@@ -80,6 +80,7 @@ def No_transformation(X):
 ######################################## Cross val split ############################
 class KF_CV:
    ### method for generating test sets index
+    ### KFCV(dict) returns a testset indices/Fold 
    @staticmethod
    def CV(x, y, n_folds:int):
        test_folds = {}
@@ -90,30 +91,45 @@ class KF_CV:
            for _, i_test in kf.split(x, y):
                d.append(i_test)
            test_folds[folds_name[i]] = d[i]        
-        return test_folds
+        return test_folds ## returns a tuple where keys are the name of each fold, and the corresponding values is a 1d numpy array filled with indices of test set
    
    ### Cross validate the model and return the predictions and samples index
    @staticmethod
-    def cross_val_predictor(model, x, y, n_folds:int):
+    def cross_val_predictor(model, folds, x, y):
+        """ model: the object to be cross-validated,
+          folds: a dict where keys are the name of each fold, and the corresponding values are 1d numpy arrays filled with indices of the test set (from CV method)
+          x and y: the data used for CV"""
        x = np.array(x)
        y = np.array(y)

        yp = {}
-        folds = KF_CV.CV(x=x, y=y, n_folds=n_folds)### Test index
        key = list(folds.keys())
+        n_folds = len(folds.keys())

        for i in range(n_folds):
            model.fit(np.delete(x, folds[key[i]], axis=0), np.delete(y, folds[key[i]], axis=0))
            yp[key[i]] = model.predict(x[folds[key[i]]]) #### predictions/fold
-        
-
+        return yp # returns a dict whose keys are the names of the folds and whose values are the predicted Y/fold
+    @staticmethod
+    def meas_pred_eq(y, ypcv, folds):
+        """" y: the target variable,
+          ypcv: a tuple where keys are the name of each fold, and the corresponding values is a 1d numpy array filled with predictions/fold (from cross_val_predictor method)
+          folds: a tuple where keys are the name of each fold, and the corresponding values is a 1d numpy array filled with indices of test set(from CV method)
+          x and y: the data used for CV
+          
+        returns:
+        two dataframe:
+        - a n x 4 dataframe containing measured values, predicted values, ols reg equation, and index (n is the total number of samples)
+        -  a 2 x k dataframe containing ols regression coefficients(k is the number of folds)
+        """
        cvcv = {}
        coeff = {}
+        y = np.array(y)
        for i, Fname in enumerate(folds.keys()):
            r = pd.DataFrame()
-            r['Predicted'] = yp[Fname]
+            r['Predicted'] = ypcv[Fname]
            r['Measured'] = y[folds[Fname]]
-            ols = LinearRegression().fit(pd.DataFrame(y[folds[Fname]]),yp[Fname].reshape(-1,1))
+            ols = LinearRegression().fit(pd.DataFrame(y[folds[Fname]]), ypcv[Fname].reshape(-1,1))
            r.index = folds[Fname]
            r['Folds'] = [f'{Fname} (Predicted = {np.round(ols.intercept_[0], 2)} + {np.round(ols.coef_[0][0],2)} x Measured'] * r.shape[0]
            cvcv[i] = r
@@ -123,37 +139,47 @@ class KF_CV:
        data['index'] = [data.index[i][1] for i in range(data.shape[0])]
        data.index = data['index']
        coeff = pd.DataFrame(coeff, index = ['Slope', 'Intercept'])    
-        return yp, folds, data, coeff
-
-    ### compute metrics for each fold
+        return data, coeff ## returns  values predicted in cross validation, ,coefficients of regression
+    
    @staticmethod
-    def process(model, x, y, n_folds:int):
-        f, idx,_ , _ = KF_CV.cross_val_predictor(model, x=x,y=y, n_folds=n_folds)
+    def metrics_cv(y, ypcv, folds):  # per-fold regression metrics plus their mean/sd/cv summary
+        y = np.array(y)
        e = {}
-        for i in idx.keys():
-            e[i] = metrics().reg_(y.iloc[idx[i]],f[i])
+        for i in folds.keys():
+            e[i] = metrics().reg_(y[folds[i]],ypcv[i])  # measured values of the fold vs. its predictions
        r = pd.DataFrame(e)
-        return r
+        r_print = r.copy()  # copy so that r keeps only the per-fold columns
+        r_print['mean'] = r.mean(axis = 1)
+        r_print['sd'] = r.std(axis = 1)
+        r_print['cv'] = 100*r.std(axis = 1)/r.mean(axis = 1)  # relative standard deviation, in %
+        return r.T, r_print.T  # (scores/fold, scores/fold with extra mean, sd and cv rows)
    
-    ### bias and variance
+    ### compute metrics for each fold
    @staticmethod
-    def cv_scores(model, x, y, n_folds:int):
-        x = KF_CV.process(model, x, y, n_folds)
-        mean = x.mean(axis = 1)
-        sd = x.std(axis = 1)
-        rsd = sd*100/mean
-        data = pd.concat([mean, sd, rsd], axis = 1).round(2)
-        data.columns = ['mean', 'sd', 'cv(%)']
-        return data
+    def cv_scores(y, ypcv, folds):
+        """ Takes as input the Y vector, the dict of predicted values/fold (from cross_val_predictor method), and the dict of test indices/fold (from CV method)
+        and returns two dataframes: the first contains the metric scores/fold, the second is the same with additional mean, sd, and cv (relative sd, in %) rows
+        """
+        y = np.array(y)
+        e = {}
+        for i in folds.keys():
+            e[i] = metrics().reg_(y[folds[i]],ypcv[i])
+        r = pd.DataFrame(e)
+        r_print = r.copy()  # copy: adding columns through an alias would also change r and bias sd/cv
+        r_print['mean'] = r.mean(axis = 1)
+        r_print['sd'] = r.std(axis = 1)
+        r_print['cv'] = 100*r.std(axis = 1)/r.mean(axis = 1)
+        return r.T, r_print.T
    
-    ### Return ycv
-    @staticmethod
-    def ycv(model, x, y, n_folds:int):
-        ycv = np.zeros(y.shape[0])
-        f, idx,_,_ = KF_CV.cross_val_predictor(model, x,y, n_folds)
-        for i in f.keys():
-            ycv[idx[i]] = f[i]            
-        return ycv
+    
+    # ### Return ycv
+    # @staticmethod
+    # def ycv(model, x, y, n_folds:int):
+    #     ycv = np.zeros(y.shape[0])
+    #     f, idx,_,_ = KF_CV.cross_val_predictor(model, x,y, n_folds)
+    #     for i in f.keys():
+    #         ycv[idx[i]] = f[i]            
+    #     return ycv


 ### Selectivity ratio

--- a/src/Class_Mod/Hash.py
+++ b/src/Class_Mod/Hash.py
+from Packages import *
+
+def create_hash(spectra):
+    """Return the MD5 hex digest of the string form of ``spectra``."""
+    # the digest is computed over the encoded text of str(spectra)
+    digest = hashlib.md5(str(spectra).encode())
+    return digest.hexdigest()
+
+def check_hash(hash):
+    """Check whether ``hash`` is already recorded in the hash file, and record it if not.
+
+    The lookup and the append are done in pure Python instead of shelling out to a
+    bundled ``grep.exe`` / ``echo``, so the function does not depend on the platform
+    shell or on parsing the text form of a subprocess result.
+
+    Args:
+        hash: hash string to look up (e.g. the hex digest returned by ``create_hash``).
+
+    Returns:
+        'existing hash' if a line of the hash file contains ``hash``,
+        'hash added' if it was absent and has been appended,
+        'error while adding the new hash' if the hash file could not be written.
+    """
+    hash_file = Path("src/data/hash/") / "hash.txt"
+    hash = str(hash)
+    try:
+        with open(hash_file, encoding="utf-8") as known_hashes:
+            # substring match per line, as the former `grep -c` lookup did
+            if any(hash in line for line in known_hashes):
+                return 'existing hash'
+    except OSError:
+        # missing or unreadable hash file: treat the hash as not recorded yet
+        pass
+    try:
+        with open(hash_file, "a", encoding="utf-8") as known_hashes:
+            known_hashes.write(hash + "\n")
+    except OSError:
+        return 'error while adding the new hash'
+    return 'hash added'
\ No newline at end of file
--- a/src/Class_Mod/LWPLSR_.py
+++ b/src/Class_Mod/LWPLSR_.py
@@ -7,33 +7,28 @@ class LWPLSR:

    Returns:
        self.scores (DataFrame): various metrics and scores
-        self.predicted_results_on_train (DataFrame):
-        self.predicted_results_on_test (DataFrame):
+        self.predicted_results (Dictionary): Dict containing all predicted results (train, test, cross-validation)
        self.mod (Julia model): the prepared model
    """
    def __init__(self, dataset):
        """Initiate the LWPLSR and prepare data for Julia computing."""
-
-        # self.x_train, self.y_train, self.x_test, self.y_test = [dataset[i] for i in range(len(dataset))]
+        # get train / test data from dataset
        self.x_train, self.y_train, self.x_test, self.y_test = [dataset[i] for i in range(4)]
+        # calculate number of KFolds and get CV data from dataset
        self.nb_fold = int((len(dataset)-4)/4)
        for i in range(self.nb_fold):
            setattr(self, "xtr_fold"+str(i+1), dataset[i+7])
            setattr(self, "ytr_fold"+str(i+1), dataset[i+13])
            setattr(self, "xte_fold"+str(i+1), dataset[i+4])
-            # setattr(self, "yte_fold"+str(i+1), dataset[i+10])
            setattr(jl, "xtr_fold"+str(i+1), dataset[i+7])
            setattr(jl, "ytr_fold"+str(i+1), dataset[i+13])
            setattr(jl, "xte_fold"+str(i+1), dataset[i+4])
-            # setattr(jl, "yte_fold"+str(i+1), dataset[i+10])

-        # prepare to send dataframes to julia and Jchemo
+        # prepare to send dataframes to julia and Jchemo (with the jl. prefix)
        jl.x_train, jl.y_train, jl.x_test, jl.y_test = self.x_train, self.y_train, self.x_test, self.y_test

        # initialize vars from the class
        y_shape = self.y_test.shape
-        self.predicted_results_on_test = pd.DataFrame
-        self.predicted_results_on_train = pd.DataFrame
        self.pred_test = np.zeros(shape=(y_shape[0], 1))
        self.pred_train = np.zeros(shape=(y_shape[0], 1))
        self.mod = ""
@@ -52,7 +47,7 @@ class LWPLSR:
        Returns:
            self.mod (Julia model): the prepared model
        """
-        # launch Julia Jchemo lwplsr
+        # launch Julia Jchemo lwplsr and convert DataFrames from Python Pandas DataFrame to Julia DataFrame
        jl.seval("""
        using DataFrames
        using Pandas
@@ -63,7 +58,7 @@ class LWPLSR:
        y_test |> Pandas.DataFrame |> DataFrames.DataFrame
        """)
        print('LWPLSR - tuning')
-        # set tuning parameters
+        # set tuning parameters to test
        jl.seval("""
        nlvdis = [5; 10; 15] ; metric = [:eucl; :mah] 
        h = [1; 2; 6; Inf] ; k = [30; 80; 200]  
@@ -83,21 +78,22 @@ class LWPLSR:
        ncal = ntrain - nval 
        """)

-        # Create LWPLSR model and tune
+        # Create LWPLSR model and tune with GridScore
        jl.seval("""
        mod = Jchemo.model(Jchemo.lwplsr)
        res = gridscore(mod, Xcal, ycal, Xval, yval; score = Jchemo.rmsep, pars, nlv, verbose = false)
        u = findall(res.y1 .== minimum(res.y1))[1] #best parameters combination
        """)
+        # save best lwplsr parameters
        self.best_lwplsr_params = {'nlvdis' : jl.res.nlvdis[jl.u], 'metric' : str(jl.res.metric[jl.u]), 'h' : jl.res.h[jl.u], 'k' : jl.res.k[jl.u], 'nlv' : jl.res.nlv[jl.u]}
        print('best lwplsr params ' + str(self.best_lwplsr_params))
-        print('LWPLSR - best params ok')
-        # calculate LWPLSR model with best parameters
+        # run LWPLSR model with best parameters
        jl.seval("""
        mod = Jchemo.model(Jchemo.lwplsr; nlvdis = res.nlvdis[u], metric = res.metric[u], h = res.h[u], k = res.k[u], nlv = res.nlv[u])
        # Fit model
        Jchemo.fit!(mod, x_train, y_train)
        """)
+        # save Julia Jchemo model
        self.mod = jl.mod

    def Jchemo_lwplsr_predict(self):
@@ -128,14 +124,13 @@ class LWPLSR:
        print('LWPLSR - end')

    def Jchemo_lwplsr_cv(self):
-        """Send data to Julia to predict with lwplsr.
+        """Send Cross-Validation data to Julia to fit & predict with lwplsr.

        Args:
-            self.mod (Julia model): the prepared model
+            self.best_lwplsr_params: the best parameters to use (from tuning) for CV
            self.xtr_fold1 (DataFrame):
            self.ytr_fold1 (DataFrame):
            self.xte_fold1 (DataFrame):
-            self.yte_fold1 (DataFrame):

        Returns:
            self.pred_cv (Julia DataFrame): predicted values on x_train with Cross-Validation
@@ -144,7 +139,7 @@ class LWPLSR:
            jl.Xtr = getattr(self, "xtr_fold"+str(i+1))
            jl.Ytr = getattr(self, "ytr_fold"+str(i+1))
            jl.Xte = getattr(self, "xte_fold"+str(i+1))
-            # jl.Yte = getattr(self, "yte_fold"+str(i+1))
+            # convert Python Pandas DataFrame to Julia DataFrame
            jl.seval("""
            using DataFrames
            using Pandas
@@ -153,6 +148,7 @@ class LWPLSR:
            Ytr |> Pandas.DataFrame |> DataFrames.DataFrame
            Xte |> Pandas.DataFrame |> DataFrames.DataFrame
            """)
+            # set lwplsr parameters as the best one from tuning
            jl.nlvdis = int(self.best_lwplsr_params['nlvdis'])
            jl.metric = self.best_lwplsr_params['metric']
            jl.h = self.best_lwplsr_params['h']
@@ -169,15 +165,14 @@ class LWPLSR:
            res = Jchemo.predict(mod_cv, Xte)
            res.pred
            """)
+            # save predicted values for each KFold in the predicted_results dictionary
            self.predicted_results["CV" + str(i+1)] = pd.DataFrame(pred_cv)

    @property
    def pred_data_(self):
        # convert predicted data from x_test to Pandas DataFrame
-        self.predicted_results_on_test = pd.DataFrame(self.pred_test)
-        self.predicted_results_on_train = pd.DataFrame(self.pred_train)
-        self.predicted_results["pred_data_train"] = self.predicted_results_on_train
-        self.predicted_results["pred_data_test"] = self.predicted_results_on_test
+        self.predicted_results["pred_data_train"] = pd.DataFrame(self.pred_train)
+        self.predicted_results["pred_data_test"] = pd.DataFrame(self.pred_test)
        return self.predicted_results

    @property

--- a/src/Class_Mod/LWPLSR_Call.py
+++ b/src/Class_Mod/LWPLSR_Call.py
@@ -7,35 +7,42 @@ import os
 # loading the lwplsr_inputs.json
 temp_path = Path("temp/")
 data_to_work_with = ['x_train_np', 'y_train_np', 'x_test_np', 'y_test_np']
+# check data for cross-validation depending on KFold number
 temp_files_list = os.listdir(temp_path)
 nb_fold = 0
 for i in temp_files_list:
    if 'fold' in i:
+        # add CV file name to data_to_work_with
        data_to_work_with.append(str(i)[:-4])
+        # and count the number of KFold
        nb_fold += 1
+# Import data from csv files in the temp/ folder
 dataset = []
 for i in data_to_work_with:
    dataset.append(np.genfromtxt(temp_path / str(i + ".csv"), delimiter=','))
 print('CSV imported')
+# launch LWPLSR Class from LWPLSR_.py in Class_Mod
 print('start model creation')
 Reg = LWPLSR(dataset)
 print('model created. \nnow fit')
 LWPLSR.Jchemo_lwplsr_fit(Reg)
 print('now predict')
 LWPLSR.Jchemo_lwplsr_predict(Reg)
-
 print('now CV')
 LWPLSR.Jchemo_lwplsr_cv(Reg)

-
+# Export results in a json file to bring data back to 2-model_creation.py and streamlit interface
 print('export to json')
 pred = ['pred_data_train', 'pred_data_test']
+# add KFold results to predicted data
 for i in range(int(nb_fold/4)):
    pred.append("CV" + str(i+1))
 json_export = {}
 for i in pred:
    json_export[i] = Reg.pred_data_[i].to_dict()
+# add the lwplsr global model to the json
 json_export['model'] = str(Reg.model_)
+# add the best parameters for the lwplsr obtained from GridScore tuning
 json_export['best_lwplsr_params'] = Reg.best_lwplsr_params_
 with open(temp_path / "lwplsr_outputs.json", "w+") as outfile:
    json.dump(json_export, outfile)
--- a/src/Class_Mod/RegModels.py
+++ b/src/Class_Mod/RegModels.py
@@ -115,19 +115,23 @@ class Plsr(Regmodel):
        x2 = [savgol_filter(x1[i], polyorder=params['polyorder'], deriv=params['deriv'], window_length = params['window_length']) for i in range(2)]

        Model = PLSRegression(scale = False, n_components = params['n_components'])
-        self._cv_df = KF_CV().process(model = Model, x = x2[0], y = self._ytrain, n_folds = self._nfolds)
-        self._cv_df['Average'] = self._cv_df.mean(axis = 1)
-        self._cv_df['S'] = self._cv_df.std(axis = 1)
-        self._cv_df['CV(%)'] = self._cv_df['S'] * 100 / self._cv_df['Average']
-        self._cv_df = self._cv_df.T.round(2)
-        score = self._cv_df.loc["CV(%)",'rmse']
+        # self._cv_df = KF_CV().process(model = Model, x = x2[0], y = self._ytrain, n_folds = self._nfolds)
+        # self._cv_df['Average'] = self._cv_df.mean(axis = 1)
+        # self._cv_df['S'] = self._cv_df.std(axis = 1)
+        # self._cv_df['CV(%)'] = self._cv_df['S'] * 100 / self._cv_df['Average']
+        # self._cv_df = self._cv_df.T.round(2)
+        folds = KF_CV().CV(x = x2[0], y = np.array(self._ytrain), n_folds = self._nfolds)
+        yp = KF_CV().cross_val_predictor(model = Model, folds = folds, x = x2[0], y = np.array(self._ytrain))
+        self._cv_df = KF_CV().metrics_cv(y = np.array(self._ytrain), ypcv = yp, folds =folds)[1]
+                
+        score = self._cv_df.loc["cv",'rmse']
        
        Model = PLSRegression(scale = False, n_components = params['n_components'])
        Model.fit(x2[0], self._ytrain)

        if self.SCORE > score:
            self.SCORE = score
-            self._ycv = KF_CV().cross_val_predictor(model = Model, x = x2[0], y = self._ytrain, n_folds = self._nfolds)
+            self._ycv = KF_CV().meas_pred_eq(y = np.array(self._ytrain), ypcv=yp, folds=folds)
            self._yc = Model.predict(x2[0])
            self._yt = Model.predict(x2[1])
            self._model = Model
@@ -179,26 +183,29 @@ class TpeIpls(Regmodel):
        # print(x2)
        
        # ## Modelling
+        folds = KF_CV().CV(x = x2[0], y = np.array(self._ytrain), n_folds = self._nfolds)
        try:
            Model = PLSRegression(scale = False, n_components = params['n_components'])
-            self._cv_df = KF_CV().process(model = Model, x = x2[0][:,id], y = self._ytrain, n_folds = self._nfolds)
+            yp = KF_CV().cross_val_predictor(model = Model, folds = folds, x = x2[0], y = np.array(self._ytrain))
+            self._cv_df = KF_CV().metrics_cv(y = np.array(self._ytrain), ypcv = yp, folds =folds)[1]
        except ValueError as ve:
            params["n_components"] = 1
            Model = PLSRegression(scale = False, n_components = params['n_components'])
-            self._cv_df = KF_CV().process(model = Model, x = x2[0][:,id], y = self._ytrain, n_folds = self._nfolds)
-
-        self._cv_df['Average'] = self._cv_df.mean(axis = 1)
-        self._cv_df['S'] = self._cv_df.std(axis = 1)
-        self._cv_df['CV(%)'] = self._cv_df['S'] * 100 / self._cv_df['Average']
-        self._cv_df = self._cv_df.T.round(2)
-        score = self._cv_df.loc['CV(%)','rmse']
+            yp = KF_CV().cross_val_predictor(model = Model, folds = folds, x = x2[0], y = np.array(self._ytrain))
+            self._cv_df = KF_CV().metrics_cv(y = np.array(self._ytrain), ypcv = yp, folds =folds)[1]
+        # self._cv_df['Average'] = self._cv_df.mean(axis = 1)
+        # self._cv_df['S'] = self._cv_df.std(axis = 1)
+        # self._cv_df['CV(%)'] = self._cv_df['S'] * 100 / self._cv_df['Average']
+        # self._cv_df = self._cv_df.T.round(2)
+        score = self._cv_df.loc['cv','rmse']
        
        Model = PLSRegression(scale = False, n_components = params['n_components'])
        Model.fit(x2[0][:,id], self._ytrain)

        if self.SCORE > score:
            self.SCORE = score
-            self._ycv = KF_CV().cross_val_predictor(model = Model, x = x2[0][:,id], y = self._ytrain, n_folds = self._nfolds)
+            self._ycv = KF_CV().meas_pred_eq(y = np.array(self._ytrain), ypcv=yp, folds=folds)
+            
            self._yc = Model.predict(x2[0][:,id])
            self._yt = Model.predict(x2[1][:,id])
            self._model = Model

--- a/src/pages/2-model_creation.py
+++ b/src/pages/2-model_creation.py
@@ -38,18 +38,11 @@ M9 = st.container()
 M9.write("-- Save the model --")
    ##############################################################################################

-reg_algo = ["","Full-PLSR", "Locally Weighted PLSR", "Interval-PLSR"]
-regression_algo = None
-      #######################################        ###########################################

 files_format = ['.csv', '.dx']
 file = M00.radio('Select files format:', options = files_format)
-
-### Data
 spectra = pd.DataFrame()
 y = pd.DataFrame()
-
-
 # load .csv file
 if file == files_format[0]:
    xcal_csv = M00.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
@@ -60,6 +53,8 @@ if file == files_format[0]:
                                options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+xcal_csv.name))), key=1)
        if hdrx == "yes": col = 0
        else: col = False
+    else:
+        M00.warning('Insert your spectral data file here!')
        
    ycal_csv = M00.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
    if ycal_csv:
@@ -67,6 +62,8 @@ if file == files_format[0]:
        hdry = M00.radio("samples name (Y file)?: ", options=["no", "yes"], key=3)
        if hdry == "yes": col = 0
        else: col = False
+    else:
+        M00.warning('Insert your target data file here!')
    
    if xcal_csv and ycal_csv:
        file_name = str(xcal_csv.name) +' and '+ str(ycal_csv.name)
@@ -93,17 +90,14 @@ if file == files_format[0]:
                spectra = pd.DataFrame

        else:
-            M1.warning('Tune decimal and separator parameters')
-
-        
-
-
-        
+            M00.error('Error: The data has not been loaded successfully, please consider tuning the decimal and separator !')

 ## Load .dx file
 elif file == files_format[1]:
    data_file = M00.file_uploader("Select Data", type=".dx", help=" :mushroom: select a dx file")
-    if data_file:
+    if not data_file:
+        M00.warning('Load your file here!')
+    else :
        file_name = str(data_file.name)
        with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
            tmp.write(data_file.read())
@@ -116,7 +110,7 @@ elif file == files_format[1]:
                y = chem_data.loc[:,yname].loc[measured]
                spectra = spectra.loc[measured]
            else:
-                M00.warning('Warning: Chemical data are not included in your file !', icon="⚠️")
+                M00.warning('Warning: your file includes no target variables to model !', icon="⚠️")
        os.unlink(tmp_path)

 ### split the data
@@ -157,27 +151,36 @@ if not spectra.empty and not y.empty:


    M0.write('Loaded data summary')
-    M0.write(pd.DataFrame([desc_stats(y_train),desc_stats(y_test),desc_stats(y)], index =['Train', 'Test', 'Total'] ).round(2))
-    stats=pd.DataFrame([desc_stats(y_train),desc_stats(y_test),desc_stats(y)], index =['Train', 'Test', 'Total'] ).round(2)
+    M0.write(pd.DataFrame([desc_stats(y_train),desc_stats(y_test),desc_stats(y)], index =['train', 'test', 'total'] ).round(2))
+    stats=pd.DataFrame([desc_stats(y_train),desc_stats(y_test),desc_stats(y)], index =['train', 'test', 'total'] ).round(2)
    ####################################### Insight into the loaded data

-    #######################################
+
+    ####################################### Model creation ###################################################
+    reg_algo = ["","Full-PLSR", "Locally Weighted PLSR", "Interval-PLSR"]
+    regression_algo = None
+    Reg = None
    regression_algo = M1.selectbox("Choose the algorithm for regression", options= reg_algo, key = 12, placeholder ="Choose an option")
+    # split train data into nb_folds for cross_validation
+    nb_folds = 3
+    folds = KF_CV.CV(X_train, y_train, nb_folds)
+
+    if not regression_algo:
+        M1.warning('Choose a modelling algorithm from the dropdown list !')
    if regression_algo == reg_algo[1]:
        # Train model with model function from application_functions.py
        Reg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter=1)
        reg_model = Reg.model_
        #M2.dataframe(Pin.pred_data_)
+
    elif regression_algo == reg_algo[2]:
        info = M1.info('Starting LWPLSR model creation... Please wait a few minutes.')
        # export data to csv for Julia train/test
        data_to_work_with = ['x_train_np', 'y_train_np', 'x_test_np', 'y_test_np']
        x_train_np, y_train_np, x_test_np, y_test_np = X_train.to_numpy(), y_train.to_numpy(), X_test.to_numpy(), y_test.to_numpy()
        # Cross-Validation calculation
-        nb_folds = 3
-        st.write('KFold for Cross-Validation = ' + str(nb_folds))
-        # split train data into nb_folds
-        folds = KF_CV.CV(x_train_np, y_train_np, nb_folds)
+        
+        st.write('KFold for Cross-Validation = ' + str(nb_folds))        
        d = {}
        for i in range(nb_folds):
            d["xtr_fold{0}".format(i+1)], d["ytr_fold{0}".format(i+1)], d["xte_fold{0}".format(i+1)], d["yte_fold{0}".format(i+1)] = np.delete(x_train_np, folds[list(folds)[i]], axis=0), np.delete(y_train_np, folds[list(folds)[i]], axis=0), x_train_np[folds[list(folds)[i]]], y_train_np[folds[list(folds)[i]]]
@@ -203,39 +206,60 @@ if not spectra.empty and not y.empty:
                Reg_json = json.load(outfile)
                # delete csv files
                for i in data_to_work_with: os.unlink(temp_path / str(i + ".csv"))
-            # delete json file after import
+            # # delete json file after import
            os.unlink(temp_path / "lwplsr_outputs.json")
            # format result data into Reg object
-            pred = ['pred_data_train', 'pred_data_test']
+            pred = ['pred_data_train', 'pred_data_test']### keys of the dict
            for i in range(nb_folds):
-                pred.append("CV" + str(i+1))
-            Reg = type('obj', (object,), {'model' : Reg_json['model'], 'best_lwplsr_params' : Reg_json['best_lwplsr_params'], 'pred_data_' : [pd.json_normalize(Reg_json[i]) for i in pred]})
+                pred.append("CV" + str(i+1)) ### add cv folds keys to pred
+
+            Reg = type('obj', (object,), {'model_' : Reg_json['model'], 'best_hyperparams_' : Reg_json['best_lwplsr_params'],
+                                          'pred_data_' : [pd.json_normalize(Reg_json[i]) for i in pred]})
+   
            Reg.CV_results_ = pd.DataFrame()
            Reg.cv_data_ = {'YpredCV' : {}, 'idxCV' : {}}
-            # set indexes to Reg.pred_data (train, test, folds idx)
+            # # set indexes to Reg.pred_data (train, test, folds idx)
            for i in range(len(pred)):
                Reg.pred_data_[i] = Reg.pred_data_[i].T.reset_index().drop(columns = ['index'])
                if i == 0: # data_train
+                    # Reg.pred_data_[i] = np.array(Reg.pred_data_[i])
                    Reg.pred_data_[i].index = list(y_train.index)
+                    Reg.pred_data_[i] = Reg.pred_data_[i].iloc[:,0]
                elif i == 1: # data_test
+                    # Reg.pred_data_[i] = np.array(Reg.pred_data_[i])
                    Reg.pred_data_[i].index = list(y_test.index)
-                else: # CVi
+                    Reg.pred_data_[i] = Reg.pred_data_[i].iloc[:,0]
+                else:
+                    # CVi
                    Reg.pred_data_[i].index = folds[list(folds)[i-2]]
-                    Reg.CV_results_ = pd.concat([Reg.CV_results_, Reg.pred_data_[i]])
-                    Reg.cv_data_['YpredCV']['Fold' + str(i-1)] = Reg.pred_data_[i]
-                    Reg.cv_data_['idxCV']['Fold' + str(i-1)] = folds[list(folds)[i-2]]
-            Reg.CV_results_.sort_index(inplace = True)
-            Reg.CV_results_.columns = ['Ypredicted_CV']
-            # if you want to display Reg.cv_data_ containing by fold YpredCV and idxCV
-            # cv2.json(Reg.cv_data_)
-            # Display end of modeling message on the interface
-            info.empty()
+                    # Reg.CV_results_ = pd.concat([Reg.CV_results_, Reg.pred_data_[i]])
+                    Reg.cv_data_['YpredCV']['Fold' + str(i-1)] = np.array(Reg.pred_data_[i]).reshape(-1)
+                    Reg.cv_data_['idxCV']['Fold' + str(i-1)] = np.array(folds[list(folds)[i-2]]).reshape(-1)
+            #Reg.cv_data_['idxCV'] and folds contains the same data
+            
+            Reg.CV_results_= KF_CV.metrics_cv(y = y_train, ypcv = Reg.cv_data_['YpredCV'], folds = folds)[1]
+        #     #### cross validation results print
+            Reg.best_hyperparams_print = Reg.best_hyperparams_
+        #     ## plots
+            Reg.cv_data_ = KF_CV().meas_pred_eq(y = np.array(y_train), ypcv= Reg.cv_data_['YpredCV'], folds=folds)
+            # st.write(Reg.cv_data_ )
+        #     # Reg.CV_results_.sort_index(inplace = True)
+        #     # Reg.CV_results_.columns = ['Ypredicted_CV']
+        #     # if you want to display Reg.cv_data_ containing, by fold, YpredCV and idxCV
+        #     # cv2.json(Reg.cv_data_)
+        #     # Display end of modeling message on the interface
+        #     info.empty()
            M1.success('Model created!')
        except FileNotFoundError as e:
            # Display error message on the interface if modeling is wrong
            info.empty()
            M1.warning('- ERROR during model creation -')
            Reg = None
+
+#######################
+
+
+            
    elif regression_algo == reg_algo[3]:
        s = M1.number_input(label='Enter the maximum number of intervals', min_value=1, max_value=6, value=3)
        it = M1.number_input(label='Enter the number of iterations', min_value=2, max_value=10, value=3)
@@ -263,7 +287,8 @@ if not spectra.empty and not y.empty:



-        ################# Model analysis ############
+#         ###############################################################################################################DDDVVVVVVVVVV
+#        ################# Model analysis ############
    if regression_algo in reg_algo[1:] and Reg is not None:
        #M2.write('-- Pretreated data (train) visualization and important spectral regions in the model --   ')

@@ -311,19 +336,20 @@ if not spectra.empty and not y.empty:
        cv_results=pd.DataFrame(Reg.CV_results_)
        cv2.write('-- Out-of-Fold Predictions Visualization (All in one) --')

-        fig1 = px.scatter(Reg.cv_data_[2], x ='Measured', y = 'Predicted' , trendline='ols', color='Folds', symbol="Folds", 
+        fig1 = px.scatter(Reg.cv_data_[0], x ='Measured', y = 'Predicted' , trendline='ols', color='Folds', symbol="Folds", 
                 color_discrete_sequence=px.colors.qualitative.G10)
-        fig1.add_shape(type='line', x0 = .95 * min(Reg.cv_data_[2].loc[:,'Measured']), x1 = 1.05 * max(Reg.cv_data_[2].loc[:,'Measured']), y0 = .95 * min(Reg.cv_data_[2].loc[:,'Measured']), y1 = 1.05 * max(Reg.cv_data_[2].loc[:,'Measured']), line = dict(color='black', dash = "dash"))
+        fig1.add_shape(type='line', x0 = .95 * min(Reg.cv_data_[0].loc[:,'Measured']), x1 = 1.05 * max(Reg.cv_data_[0].loc[:,'Measured']),
+                        y0 = .95 * min(Reg.cv_data_[0].loc[:,'Measured']), y1 = 1.05 * max(Reg.cv_data_[0].loc[:,'Measured']), line = dict(color='black', dash = "dash"))
        fig1.update_traces(marker_size=7, showlegend=False)
        cv2.plotly_chart(fig1, use_container_width=True)
-        fig0 = px.scatter(Reg.cv_data_[2], x ='Measured', y = 'Predicted' , trendline='ols', color='Folds', symbol="Folds", facet_col = 'Folds',facet_col_wrap=1,
+        fig0 = px.scatter(Reg.cv_data_[0], x ='Measured', y = 'Predicted' , trendline='ols', color='Folds', symbol="Folds", facet_col = 'Folds',facet_col_wrap=1,
                 color_discrete_sequence=px.colors.qualitative.G10, text='index', width=800, height=1000)
        fig0.update_traces(marker_size=8, showlegend=False)
-        fig0.write_image("./Report/figures/Allinone.png")
+        fig0.write_image("./Report/figures/meas_vs_pred_cv_onebyone.png")

        cv1.write('-- Out-of-Fold Predictions Visualization (Separate plots) --')
        cv1.plotly_chart(fig0, use_container_width=True)
-        fig1.write_image("./Report/figures/Predictions_V.png")
+        fig1.write_image("./Report/figures/meas_vs_pred_cv_all.png")

        
        yc = Reg.pred_data_[0]
@@ -337,10 +363,12 @@ if not spectra.empty and not y.empty:
            json.dump(Reg.best_hyperparams_, outfile)
        
        
-##########
+# ##########
        M1.write("-- Model performance --")
-        M1.dataframe(metrics(c = [y_train, yc], t = [y_test, yt], method='regression').scores_)
-
+        if regression_algo != "Locally Weighted PLSR":  # every algorithm except LWPLSR: score both the calibration (c) and test (t) pairs
+            M1.dataframe(metrics(c = [y_train, yc], t = [y_test, yt], method='regression').scores_)
+        else:  # LWPLSR: only the test pair is passed to metrics()
+            M1.dataframe(metrics(t = [y_test, yt], method='regression').scores_)  # NOTE(review): model_per on the next line still passes c=[y_train, yc] for every algorithm - confirm that is intended for LWPLSR
        model_per=pd.DataFrame(metrics(c = [y_train, yc], t = [y_test, yt], method='regression').scores_)
        #from st_circular_progress import CircularProgress
        #my_circular_progress = CircularProgress(label = 'Performance',value = 50, key = 'my performance',
@@ -348,26 +376,34 @@ if not spectra.empty and not y.empty:
        
        #my_circular_progress.st_circular_progress()
        #my_circular_progress.update_value(progress=20)
-        a = reg_plot([y_train, y_test],[yc, yt], train_idx = train_index, test_idx = test_index)
+        if regression_algo != "Locally Weighted PLSR":
+            a = reg_plot([y_train, y_test],[yc, yt], train_idx = train_index, test_idx = test_index)
+        else:  # NOTE(review): this branch is identical to the one above - confirm whether LWPLSR needs different arguments
+            a = reg_plot([y_train, y_test],[yc, yt], train_idx = train_index, test_idx = test_index)

        M7.pyplot(a)
-        plt.savefig('./Report/figures/Predictedvs.png')
+        plt.savefig('./Report/figures/measured_vs_predicted.png')
        prep_para = Reg.best_hyperparams_
-        prep_para.pop('n_components')
-
-        for i in ['deriv','polyorder']:
-            if Reg.best_hyperparams_[i] == 0:
-                prep_para[i] = '0'
-            elif Reg.best_hyperparams_[i] == 1:
-                prep_para[i] = '1st'
-            elif Reg.best_hyperparams_[i] > 1:
-                prep_para[i] = f"{Reg.best_hyperparams_[i]}nd"
+        if regression_algo != "Locally Weighted PLSR":  # n_components / deriv / polyorder are only touched for the non-LWPLSR models
+            prep_para.pop('n_components')  # prep_para was bound to Reg.best_hyperparams_ without a copy, so this also removes the key there (unless that attribute returns a fresh dict - confirm)
+            for i in ['deriv','polyorder']:  # replace the numeric value with a display label: '0', '1st', '2nd', '3rd', '4th', ...
+                if Reg.best_hyperparams_[i] == 0:
+                    prep_para[i] = '0'
+                elif Reg.best_hyperparams_[i] == 1:
+                    prep_para[i] = '1st'
+                elif Reg.best_hyperparams_[i] > 1:
+                    prep_para[i] = f"{Reg.best_hyperparams_[i]}" + {2: 'nd', 3: 'rd'}.get(Reg.best_hyperparams_[i], 'th')  # was always "nd", which labelled 3 as "3nd"
+        
+        if regression_algo != "Locally Weighted PLSR":
+            residual_plot = resid_plot([y_train, y_test], [yc, yt], train_idx=train_index, test_idx=test_index)
+        else:  # NOTE(review): yt is passed as both the train and the test prediction here - confirm this is intended
+            residual_plot = resid_plot([y_train, y_test], [yt, yt], train_idx=train_index, test_idx=test_index)

-        residual_plot = resid_plot([y_train, y_test], [yc, yt], train_idx=train_index, test_idx=test_index)
        M8.pyplot(residual_plot)
-        plt.savefig('./Report/figures/residual_plot.png')
-
-        rega = Reg.selected_features_  ##### ADD FEATURES IMPORTANCE PLOT
+        plt.savefig('./Report/figures/residuals_plot.png')
+        
+        if regression_algo != "Locally Weighted PLSR":  # rega is not assigned in this hunk when LWPLSR is selected
+            rega = Reg.selected_features_  ##### ADD FEATURES IMPORTANCE PLOT
            
            #model_export = M1.selectbox("Choose way to export", options=["pickle", "joblib"], key=20)
        model_name = M9.text_input('Give it a name')
@@ -413,7 +449,8 @@ if not spectra.empty and not y.empty and regression_algo:
    if regression_algo in reg_algo[1:] and Reg is not None:
        fig, (ax1, ax2) = plt.subplots(2,1, figsize = (12, 4), sharex=True)
        ax1.plot(colnames, np.mean(X_train, axis = 0), color = 'black', label = 'Average spectrum (Raw)')
-        ax2.plot(colnames, np.mean(Reg.pretreated_spectra_ , axis = 0), color = 'black', label = 'Average spectrum (pretreated)')
+        if regression_algo != "Locally Weighted PLSR":  # LWPLSR skips the pretreated-spectrum trace; presumably Reg.pretreated_spectra_ is not available for that model (confirm)
+            ax2.plot(colnames, np.mean(Reg.pretreated_spectra_ , axis = 0), color = 'black', label = 'Average spectrum (pretreated)')  # mean over axis 0 of the pretreated spectra, drawn on the lower axes
        ax2.set_xlabel('Wavelenghts')
        plt.tight_layout()
        
@@ -443,16 +480,19 @@ if not spectra.empty and not y.empty and regression_algo:
        M2.pyplot(fig)

 ## Load .dx file
+if Reg is not None:  # the report button is only built once a Reg object exists
+    with st.container():
+        if st.button("Download the report"):
+            if regression_algo == reg_algo[1]:  # the LaTeX report is generated only when the chosen algorithm is reg_algo[1]
+                        latex_report = report.report('Predictive model development', file_name, stats, list(Reg.best_hyperparams_.values()), regression_algo, model_per, cv_results)
+                        report.compile_latex()
+            if regression_algo is None:  # NOTE(review): this check now sits under "Reg is not None" - confirm it can still be true here
+                st.warning('Data processing has not been performed or finished yet!', icon = "⚠️")
+            else:  # no action for any other algorithm
+                pass

-with st.container():
-    if st.button("Download the report"):
-        if regression_algo == reg_algo[1]:
-                    latex_report = report.report('Predictive model development', file_name, stats, list(Reg.best_hyperparams_.values()), regression_algo, model_per, cv_results)
-                    report.compile_latex()
-        if regression_algo is None:
-            st.warning('Data processing has not been performed or finished yet!', icon = "⚠️")
        else:
            pass

-    else:
-        pass
+
+ 
\ No newline at end of file