From 7f8b71508379a6f8a8877ea3077fa53177ab1a26 Mon Sep 17 00:00:00 2001
From: DIANE <abderrahim.diane@cefe.cnrs.fr>
Date: Tue, 25 Jun 2024 16:45:40 +0200
Subject: [PATCH] test

---
 src/Class_Mod/LWPLSR_.py      | 81 ++++++++++++++++++++++++++------
 src/Class_Mod/LWPLSR_Call.py  | 15 +++++-
 src/pages/2-model_creation.py | 87 ++++++++++++++++++++++---------
 3 files changed, 148 insertions(+), 35 deletions(-)

diff --git a/src/Class_Mod/LWPLSR_.py b/src/Class_Mod/LWPLSR_.py
index 2e3d40b..a7bd379 100644
--- a/src/Class_Mod/LWPLSR_.py
+++ b/src/Class_Mod/LWPLSR_.py
@@ -14,7 +14,18 @@ class LWPLSR:
 
     def __init__(self, dataset):
         """Initiate the LWPLSR and prepare data for Julia computing."""
-        self.x_train, self.y_train, self.x_test, self.y_test = [dataset[i] for i in range(len(dataset))]
+        # self.x_train, self.y_train, self.x_test, self.y_test = [dataset[i] for i in range(len(dataset))]
+        self.x_train, self.y_train, self.x_test, self.y_test = [dataset[i] for i in range(4)]
+        self.nb_fold = int((len(dataset)-4)/4)
+        # dataset[4:] holds the CV fold arrays appended by LWPLSR_Call.py in sorted filename order
+        # (xte_fold*, xtr_fold*, yte_fold*, ytr_fold*); the offsets 4, 7 and 13 below assume 3 folds
+        for i in range(self.nb_fold):
+            setattr(self, "xtr_fold"+str(i+1), dataset[i+7])
+            setattr(self, "ytr_fold"+str(i+1), dataset[i+13])
+            setattr(self, "xte_fold"+str(i+1), dataset[i+4])
+            # setattr(self, "yte_fold"+str(i+1), dataset[i+10])
+            setattr(jl, "xtr_fold"+str(i+1), dataset[i+7])
+            setattr(jl, "ytr_fold"+str(i+1), dataset[i+13])
+            setattr(jl, "xte_fold"+str(i+1), dataset[i+4])
+            # setattr(jl, "yte_fold"+str(i+1), dataset[i+10])
 
         # prepare to send dataframes to julia and Jchemo
         jl.x_train, jl.y_train, jl.x_test, jl.y_test = self.x_train, self.y_train, self.x_test, self.y_test
@@ -23,20 +34,20 @@ class LWPLSR:
         y_shape = self.y_test.shape
         self.predicted_results_on_test = pd.DataFrame
         self.predicted_results_on_train = pd.DataFrame
-        self.predicted_results_on_cv = pd.DataFrame
         self.pred_test = np.zeros(shape=(y_shape[0], 1))
         self.pred_train = np.zeros(shape=(y_shape[0], 1))
         self.mod = ""
         self.best_lwplsr_params = np.zeros(shape=(5, 1))
+        self.predicted_results = {}
 
     def Jchemo_lwplsr_fit(self):
         """Send data to Julia to fit lwplsr.

         Args:
-            self.jl.x_train (DataFrame):
-            self.jl.y_train (DataFrame):
-            self.jl.x_test (DataFrame):
-            self.jl.y_test (DataFrame):
+            self.x_train (DataFrame):
+            self.y_train (DataFrame):
+            self.x_test (DataFrame):
+            self.y_test (DataFrame):
 
         Returns:
             self.mod (Julia model): the prepared model
@@ -79,7 +90,7 @@ class LWPLSR:
             u = findall(res.y1 .== minimum(res.y1))[1] #best parameters combination
             """)
         self.best_lwplsr_params = {'nlvdis' : jl.res.nlvdis[jl.u], 'metric' : str(jl.res.metric[jl.u]), 'h' : jl.res.h[jl.u], 'k' : jl.res.k[jl.u], 'nlv' : jl.res.nlv[jl.u]}
-        print('best lwplsr params' + str(self.best_lwplsr_params))
+        print('best lwplsr params ' + str(self.best_lwplsr_params))
         print('LWPLSR - best params ok')
         # calculate LWPLSR model with best parameters
         jl.seval("""
@@ -94,15 +105,14 @@ class LWPLSR:
 
         Args:
             self.mod (Julia model): the prepared model
-            self.jl.x_train (DataFrame):
-            self.jl.y_train (DataFrame):
-            self.jl.x_test (DataFrame):
-            self.jl.y_test (DataFrame):
+            self.x_train (DataFrame):
+            self.y_train (DataFrame):
+            self.x_test (DataFrame):
+            self.y_test (DataFrame):
 
         Returns:
             self.pred_test (Julia DataFrame): predicted values on x_test
             self.pred_train (Julia DataFrame): predicted values on x_train
-            self.pred_cv (Julia DataFrame): predicted values on x_train with Cross-Validation
         """
         # Predictions on x_test and store in self.pred
         self.pred_test = jl.seval("""
@@ -117,13 +127,58 @@ class LWPLSR:
             """)
         print('LWPLSR - end')
 
+    def Jchemo_lwplsr_cv(self):
+        """Cross-validate lwplsr in Julia: fit one model per fold with the best parameters, then predict its held-out fold.
+
+        Args:
+            self.best_lwplsr_params (dict): best parameters found by Jchemo_lwplsr_fit
+            self.xtr_fold1 ... xtr_foldN (DataFrame): training spectra of each fold
+            self.ytr_fold1 ... ytr_foldN (DataFrame): training responses of each fold
+            self.xte_fold1 ... xte_foldN (DataFrame): held-out spectra of each fold
+
+        Returns:
+            self.predicted_results (dict of DataFrame): predicted values of each held-out fold, stored under the keys "CV1", "CV2", ...
+        """
+        for i in range(self.nb_fold):
+            jl.Xtr = getattr(self, "xtr_fold"+str(i+1))
+            jl.Ytr = getattr(self, "ytr_fold"+str(i+1))
+            jl.Xte = getattr(self, "xte_fold"+str(i+1))
+            # jl.Yte = getattr(self, "yte_fold"+str(i+1))
+            jl.seval("""
+            using DataFrames
+            using Pandas
+            using Jchemo
+            Xtr |> Pandas.DataFrame |> DataFrames.DataFrame
+            Ytr |> Pandas.DataFrame |> DataFrames.DataFrame
+            Xte |> Pandas.DataFrame |> DataFrames.DataFrame
+            """)
+            jl.nlvdis = int(self.best_lwplsr_params['nlvdis'])
+            jl.metric = self.best_lwplsr_params['metric']
+            jl.h = self.best_lwplsr_params['h']
+            jl.k = int(self.best_lwplsr_params['k'])
+            jl.nlv = int(self.best_lwplsr_params['nlv'])
+            jl.seval("""
+            println("LWPLSR - start CV mod")
+            mod_cv = Jchemo.model(Jchemo.lwplsr; nlvdis = nlvdis, metric = Symbol(metric), h = h, k = k, nlv = nlv)
+            # Fit model
+            Jchemo.fit!(mod_cv, Xtr, Ytr)
+            """)
+            pred_cv = jl.seval("""
+            println("LWPLSR - start CV predict")
+            res = Jchemo.predict(mod_cv, Xte)
+            res.pred
+            """)
+            self.predicted_results["CV" + str(i+1)] = pd.DataFrame(pred_cv)
 
     @property
     def pred_data_(self):
         # convert predicted data from x_test to Pandas DataFrame
         self.predicted_results_on_test = pd.DataFrame(self.pred_test)
         self.predicted_results_on_train = pd.DataFrame(self.pred_train)
-        return self.predicted_results_on_train, self.predicted_results_on_test
+        self.predicted_results["pred_data_train"] = self.predicted_results_on_train
+        self.predicted_results["pred_data_test"] = self.predicted_results_on_test
+        return self.predicted_results
 
     @property
     def model_(self):
diff --git a/src/Class_Mod/LWPLSR_Call.py b/src/Class_Mod/LWPLSR_Call.py
index f8445d4..49c674c 100644
--- a/src/Class_Mod/LWPLSR_Call.py
+++ b/src/Class_Mod/LWPLSR_Call.py
@@ -2,10 +2,17 @@ import numpy as np
 from pathlib import Path
 import json
 from LWPLSR_ import LWPLSR
+import os
 
 # loading the lwplsr_inputs.json
 temp_path = Path("temp/")
 data_to_work_with = ['x_train_np', 'y_train_np', 'x_test_np', 'y_test_np']
+# collect the CV fold files exported by pages/2-model_creation.py; sort them so their
+# order stays deterministic (LWPLSR.__init__ relies on that order to recover each fold)
+temp_files_list = sorted(os.listdir(temp_path))
+nb_fold = 0
+for i in temp_files_list:
+    if 'fold' in i:
+        data_to_work_with.append(str(i)[:-4])  # filename without the ".csv" extension
+        nb_fold += 1
 dataset = []
 for i in data_to_work_with:
     dataset.append(np.genfromtxt(temp_path / str(i + ".csv"), delimiter=','))
@@ -17,11 +24,17 @@ LWPLSR.Jchemo_lwplsr_fit(Reg)
 print('now predict')
 LWPLSR.Jchemo_lwplsr_predict(Reg)
+print('now CV')
+LWPLSR.Jchemo_lwplsr_cv(Reg)
+
+
 print('export to json')
 pred = ['pred_data_train', 'pred_data_test']
+# nb_fold counts the fold files (4 per fold), so nb_fold/4 is the number of CV folds
+for i in range(int(nb_fold/4)):
+    pred.append("CV" + str(i+1))
 json_export = {}
 for i in pred:
-    json_export[i] = Reg.pred_data_[pred.index(i)].to_dict()
+    json_export[i] = Reg.pred_data_[i].to_dict()
 json_export['model'] = str(Reg.model_)
 json_export['best_lwplsr_params'] = Reg.best_lwplsr_params_
 with open(temp_path / "lwplsr_outputs.json", "w+") as outfile:
diff --git a/src/pages/2-model_creation.py b/src/pages/2-model_creation.py
index 855d47e..a56a13e 100644
--- a/src/pages/2-model_creation.py
+++ b/src/pages/2-model_creation.py
@@ -1,4 +1,5 @@
 # import streamlit
+import pandas as pd
 from Packages import *
 st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
 from Modules import *
@@ -124,7 +125,7 @@ if not spectra.empty and not y.empty:
         colnames = spectra.columns
     else:
         colnames = np.arange(spectra.shape[1])
-
+
     #rd_seed = M1.slider("Customize Train-test split", min_value=1, max_value=100, value=42, format="%i")
     # Split data into training and test sets using the kennard_stone method and correlation metric, 25% of data is used for testing
@@ -133,9 +134,9 @@ if not spectra.empty and not y.empty:
     # Assign data to training and test sets
     X_train, y_train = pd.DataFrame(spectra.iloc[train_index,:]), y.iloc[train_index]
     X_test, y_test = pd.DataFrame(spectra.iloc[test_index,:]), y.iloc[test_index]
-
-    #### insight on loaded data
+
+    #### insight on loaded data
     fig, ax1 = plt.subplots( figsize = (12,3))
     spectra.T.plot(legend=False, ax = ax1, linestyle = '--')
     ax1.set_ylabel('Signal intensity')
@@ -168,29 +169,73 @@ if not spectra.empty and not y.empty:
         reg_model = Reg.model_
         #M2.dataframe(Pin.pred_data_)
     elif regression_algo == reg_algo[2]:
-        # export data to csv for Julia
+        info = M1.info('Starting LWPLSR model creation... Please wait a few minutes.')
+        # export data to csv for Julia train/test
         data_to_work_with = ['x_train_np', 'y_train_np', 'x_test_np', 'y_test_np']
         x_train_np, y_train_np, x_test_np, y_test_np = X_train.to_numpy(), y_train.to_numpy(), X_test.to_numpy(), y_test.to_numpy()
+        # Cross-Validation calculation
+        nb_folds = 3
+        st.write('KFold for Cross-Validation = ' + str(nb_folds))
+        # split train data into nb_folds
+        folds = KF_CV.CV(x_train_np, y_train_np, nb_folds)
+        d = {}
+        for i in range(nb_folds):
+            d["xtr_fold{0}".format(i+1)], d["ytr_fold{0}".format(i+1)], d["xte_fold{0}".format(i+1)], d["yte_fold{0}".format(i+1)] = np.delete(x_train_np, folds[list(folds)[i]], axis=0), np.delete(y_train_np, folds[list(folds)[i]], axis=0), x_train_np[folds[list(folds)[i]]], y_train_np[folds[list(folds)[i]]]
+            data_to_work_with.append("xtr_fold{0}".format(i+1))
+            data_to_work_with.append("ytr_fold{0}".format(i+1))
+            data_to_work_with.append("xte_fold{0}".format(i+1))
+            data_to_work_with.append("yte_fold{0}".format(i+1))
+        # export Xtrain, Xtest, Ytrain, Ytest and all CV folds to temp folder as csv files
         temp_path = Path('temp/')
-        for i in data_to_work_with: np.savetxt(temp_path / str(i + ".csv"), vars()[i], delimiter=",")
-        # run Julia Jchemo
+        for i in data_to_work_with:
+            if 'fold' in i:
+                j = d[i]
+            else:
+                j = globals()[i]
+            np.savetxt(temp_path / str(i + ".csv"), j, delimiter=",")
+        # run Julia Jchemo as subprocess
         import subprocess
         subprocess_path = Path("Class_Mod/")
         subprocess.run([f"{sys.executable}", subprocess_path / "LWPLSR_Call.py"])
         # retrieve json results from Julia JChemo
-        with open(temp_path / "lwplsr_outputs.json", "r") as outfile:
-            Reg_json = json.load(outfile)
-        for i in data_to_work_with: os.unlink(temp_path / str(i + ".csv"))
-        os.unlink(temp_path / "lwplsr_outputs.json")
-        pred = ['pred_data_train', 'pred_data_test']
-        Reg = type('obj', (object,), {'model' : Reg_json['model'], 'best_lwplsr_params' : Reg_json['best_lwplsr_params'], 'pred_data_' : [pd.json_normalize(Reg_json[i]) for i in pred]})
-        for i in range(len(pred)):
-            Reg.pred_data_[i] = Reg.pred_data_[i].T.reset_index().drop(columns = ['index'])
-            if i != 1: # if not pred_data_test
-                Reg.pred_data_[i].index = list(y_train.index)
-            else:
-                Reg.pred_data_[i].index = list(y_test.index)
-
+        try:
+            with open(temp_path / "lwplsr_outputs.json", "r") as outfile:
+                Reg_json = json.load(outfile)
+            # delete csv files
+            for i in data_to_work_with: os.unlink(temp_path / str(i + ".csv"))
+            # delete json file after import
+            os.unlink(temp_path / "lwplsr_outputs.json")
+            # format result data into Reg object
+            pred = ['pred_data_train', 'pred_data_test']
+            for i in range(nb_folds):
+                pred.append("CV" + str(i+1))
+            Reg = type('obj', (object,), {'model' : Reg_json['model'], 'best_lwplsr_params' : Reg_json['best_lwplsr_params'], 'pred_data_' : [pd.json_normalize(Reg_json[i]) for i in pred]})
+            Reg.CV_results_ = pd.DataFrame()
+            Reg.cv_data_ = {'YpredCV' : {}, 'idxCV' : {}}
+            # set indexes to Reg.pred_data (train, test, folds idx)
+            for i in range(len(pred)):
+                Reg.pred_data_[i] = Reg.pred_data_[i].T.reset_index().drop(columns = ['index'])
+                if i == 0: # data_train
+                    Reg.pred_data_[i].index = list(y_train.index)
+                elif i == 1: # data_test
+                    Reg.pred_data_[i].index = list(y_test.index)
+                else: # CVi
+                    Reg.pred_data_[i].index = folds[list(folds)[i-2]]
+                    Reg.CV_results_ = pd.concat([Reg.CV_results_, Reg.pred_data_[i]])
+                    Reg.cv_data_['YpredCV']['Fold' + str(i-1)] = Reg.pred_data_[i]
+                    Reg.cv_data_['idxCV']['Fold' + str(i-1)] = folds[list(folds)[i-2]]
+            Reg.CV_results_.sort_index(inplace = True)
+            Reg.CV_results_.columns = ['Ypredicted_CV']
+            # to display Reg.cv_data_ (YpredCV and idxCV per fold), uncomment the next line
+            # cv2.json(Reg.cv_data_)
+            # display end-of-modeling message on the interface
+            info.empty()
+            M1.success('Model created!')
+        except FileNotFoundError as e:
+            # display an error message on the interface if modeling failed (no output file found)
+            info.empty()
+            M1.warning('- ERROR during model creation -')
+            Reg = None
     elif regression_algo == reg_algo[3]:
         s = M1.number_input(label='Enter the maximum number of intervals', min_value=1, max_value=6, value=3)
         it = M1.number_input(label='Enter the number of iterations', min_value=2, max_value=10, value=3)
@@ -219,7 +264,7 @@ if not spectra.empty and not y.empty:
 
         ################# Model analysis ############
 
-    if regression_algo in reg_algo[1:]:
+    if regression_algo in reg_algo[1:] and Reg is not None:
         #M2.write('-- Pretreated data (train) visualization and important spectral regions in the model -- ')
         fig, (ax1, ax2) = plt.subplots(2,1, figsize = (12, 6))
@@ -365,7 +410,7 @@ if not spectra.empty and not y.empty:
 
 if not spectra.empty and not y.empty and regression_algo:
-    if regression_algo in reg_algo[1:]:
+    if regression_algo in reg_algo[1:] and Reg is not None:
         fig, (ax1, ax2) = plt.subplots(2,1, figsize = (12, 4), sharex=True)
         ax1.plot(colnames, np.mean(X_train, axis = 0), color = 'black', label = 'Average spectrum (Raw)')
         ax2.plot(colnames, np.mean(Reg.pretreated_spectra_ , axis = 0), color = 'black', label = 'Average spectrum (pretreated)')
--
GitLab
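
Note on the fold hand-off (not part of the patch): the link between 2-model_creation.py and LWPLSR_.py is purely positional. The page writes one xtr/ytr/xte/yte CSV file per fold, LWPLSR_Call.py appends every file whose name contains 'fold' to dataset, and LWPLSR.__init__ recovers each block by offset. The sketch below illustrates that convention with plain numpy; kfcv_stand_in is a made-up replacement for the repo's KF_CV.CV (its internals are not in the diff) and only assumes it returns a dict mapping fold names to held-out row indices, which is how the patch indexes folds.

    import numpy as np

    def kfcv_stand_in(x, y, nb_folds):
        """Stand-in for KF_CV.CV (assumption, not the repo code): fold name -> held-out row indices. y is unused here."""
        idx = np.arange(x.shape[0])
        return {"fold" + str(i + 1): part for i, part in enumerate(np.array_split(idx, nb_folds))}

    x_train_np = np.random.rand(30, 5)
    y_train_np = np.random.rand(30)
    nb_folds = 3
    folds = kfcv_stand_in(x_train_np, y_train_np, nb_folds)

    # same construction as the patch: train = all rows except the fold, test = the fold itself
    files = {}
    for i, name in enumerate(folds):
        test_idx = folds[name]
        files["xtr_fold" + str(i + 1)] = np.delete(x_train_np, test_idx, axis=0)
        files["ytr_fold" + str(i + 1)] = np.delete(y_train_np, test_idx, axis=0)
        files["xte_fold" + str(i + 1)] = x_train_np[test_idx]
        files["yte_fold" + str(i + 1)] = y_train_np[test_idx]

    # LWPLSR_Call.py rebuilds `dataset` as [x_train, y_train, x_test, y_test] + fold files in
    # sorted filename order, which is why LWPLSR.__init__ reads xte at 4+i, xtr at 7+i, ytr at 13+i
    print(sorted(files))  # ['xte_fold1', 'xte_fold2', 'xte_fold3', 'xtr_fold1', ..., 'ytr_fold3']

With 3 folds, the sorted listing yields xte_fold1..3, xtr_fold1..3, yte_fold1..3, ytr_fold1..3, which is exactly the layout the hard-coded offsets 4, 7 and 13 in LWPLSR.__init__ expect.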
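
The JSON hand-back is positional as well: LWPLSR_Call.py serializes each prediction DataFrame with .to_dict(), and the page rebuilds it with pd.json_normalize followed by a transpose and re-indexing. A minimal sketch of that round trip (the index values at the end are invented for illustration, standing in for list(y_train.index)):

    import json
    import numpy as np
    import pandas as pd

    # what LWPLSR_Call.py writes: each prediction DataFrame serialized with .to_dict()
    pred_train = pd.DataFrame(np.array([[0.1], [0.2], [0.3]]))   # one prediction column
    payload = {"pred_data_train": pred_train.to_dict()}          # {0: {0: 0.1, 1: 0.2, 2: 0.3}}
    text = json.dumps(payload)                                   # JSON turns the int keys into strings

    # what 2-model_creation.py does on the way back
    Reg_json = json.loads(text)
    df = pd.json_normalize(Reg_json["pred_data_train"])          # 1 row, columns '0.0', '0.1', '0.2'
    df = df.T.reset_index().drop(columns=['index'])              # back to one column of predictions
    df.index = [101, 102, 103]                                   # e.g. list(y_train.index)
    print(df)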