diff --git a/src/Class_Mod/DATA_HANDLING.py b/src/Class_Mod/DATA_HANDLING.py
index 892c0c0854533b346a4e2363a61408c4d114a4ae..17e4dcb44161db3710fec2ccf94ad8363e35cfbc 100644
--- a/src/Class_Mod/DATA_HANDLING.py
+++ b/src/Class_Mod/DATA_HANDLING.py
@@ -80,6 +80,7 @@ def No_transformation(X):
 ######################################## Cross val split ############################
 class KF_CV:
     ### method for generating test sets index
+    ### CV returns a dict of test-set indices, one 1d numpy array per fold
     @staticmethod
     def CV(x, y, n_folds:int):
         test_folds = {}
@@ -90,30 +91,45 @@ class KF_CV:
         for _, i_test in kf.split(x, y):
             d.append(i_test)
             test_folds[folds_name[i]] = d[i]
-        return test_folds
+        return test_folds ## returns a dict whose keys are the fold names and whose values are 1d numpy arrays of test-set indices

     ### Cross validate the model and return the predictions and samples index
     @staticmethod
-    def cross_val_predictor(model, x, y, n_folds:int):
+    def cross_val_predictor(model, folds, x, y):
+        """ model: the object to be cross-validated,
+        folds: a dict whose keys are the fold names and whose values are 1d numpy arrays of test-set indices (from the CV method),
+        x and y: the data used for CV"""
         x = np.array(x)
         y = np.array(y)
         yp = {}
-        folds = KF_CV.CV(x=x, y=y, n_folds=n_folds)### Test index
         key = list(folds.keys())
+        n_folds = len(folds.keys())

         for i in range(n_folds):
             model.fit(np.delete(x, folds[key[i]], axis=0), np.delete(y, folds[key[i]], axis=0))
             yp[key[i]] = model.predict(x[folds[key[i]]]) #### predictions/fold
-
-
+        return yp # returns a dict whose keys are the fold names and whose values are the predicted y values for that fold

+    @staticmethod
+    def meas_pred_eq(y, ypcv, folds):
+        """ y: the target variable,
+        ypcv: a dict whose keys are the fold names and whose values are 1d numpy arrays of predictions per fold (from the cross_val_predictor method),
+        folds: a dict whose keys are the fold names and whose values are 1d numpy arrays of test-set indices (from the CV method)
+
+        returns:
+        two dataframes:
+        - an n x 4 dataframe containing measured values, predicted values, the OLS regression equation, and the sample index (n is the total number of samples)
+        - a 2 x k dataframe containing the OLS regression coefficients (k is the number of folds)
+        """
         cvcv = {}
         coeff = {}
+        y = np.array(y)
         for i, Fname in enumerate(folds.keys()):
             r = pd.DataFrame()
-            r['Predicted'] = yp[Fname]
+            r['Predicted'] = ypcv[Fname]
             r['Measured'] = y[folds[Fname]]
-            ols = LinearRegression().fit(pd.DataFrame(y[folds[Fname]]),yp[Fname].reshape(-1,1))
+            ols = LinearRegression().fit(pd.DataFrame(y[folds[Fname]]), ypcv[Fname].reshape(-1,1))
             r.index = folds[Fname]
             r['Folds'] = [f'{Fname} (Predicted = {np.round(ols.intercept_[0], 2)} + {np.round(ols.coef_[0][0],2)} x Measured'] * r.shape[0]
             cvcv[i] = r
@@ -123,37 +139,47 @@
         data['index'] = [data.index[i][1] for i in range(data.shape[0])]
         data.index = data['index']
         coeff = pd.DataFrame(coeff, index = ['Slope', 'Intercept'])
-        return yp, folds, data, coeff
-
-    ### compute metrics for each fold
+        return data, coeff ## returns the measured vs predicted values from cross-validation and the regression coefficients

+    ### compute metrics for each fold, plus their mean, sd, and cv across folds
     @staticmethod
-    def process(model, x, y, n_folds:int):
-        f, idx,_ , _ = KF_CV.cross_val_predictor(model, x=x,y=y, n_folds=n_folds)
+    def metrics_cv(y, ypcv, folds):
+        y = np.array(y)
         e = {}
-        for i in idx.keys():
-            e[i] = metrics().reg_(y.iloc[idx[i]],f[i])
+        for i in folds.keys():
+            e[i] = metrics().reg_(y[folds[i]], ypcv[i])
         r = pd.DataFrame(e)
-        return r
+        r_print = r.copy()
+        r_print['mean'] = r.mean(axis = 1)
+        r_print['sd'] = r.std(axis = 1)
+        r_print['cv'] = 100*r.std(axis = 1)/r.mean(axis = 1)
+        return r.T, r_print.T

-    ### bias and variance
+    ### compute metrics for each fold
     @staticmethod
-    def cv_scores(model, x, y, n_folds:int):
-        x = KF_CV.process(model, x, y, n_folds)
-        mean = x.mean(axis = 1)
-        sd = x.std(axis = 1)
-        rsd = sd*100/mean
-        data = pd.concat([mean, sd, rsd], axis = 1).round(2)
-        data.columns = ['mean', 'sd', 'cv(%)']
-        return data
+    def cv_scores(y, ypcv, folds):
+        """ Takes as input the y vector, the dict of predicted values per fold (from the cross_val_predictor method), and the dict of test-set indices per fold (from the CV method),
+        and returns two dataframes: the first contains the metrics scores per fold, the second is the same but with additional mean, sd, and cv rows
+        """
+        y = np.array(y)
+        e = {}
+        for i in folds.keys():
+            e[i] = metrics().reg_(y[folds[i]], ypcv[i])
+        r = pd.DataFrame(e)
+        r_print = r.copy()
+        r_print['mean'] = r.mean(axis = 1)
+        r_print['sd'] = r.std(axis = 1)
+        r_print['cv'] = 100*r.std(axis = 1)/r.mean(axis = 1)
+        return r.T, r_print.T

-    ### Return ycv
-    @staticmethod
-    def ycv(model, x, y, n_folds:int):
-        ycv = np.zeros(y.shape[0])
-        f, idx,_,_ = KF_CV.cross_val_predictor(model, x,y, n_folds)
-        for i in f.keys():
-            ycv[idx[i]] = f[i]
-        return ycv
+
+    # ### Return ycv
+    # @staticmethod
+    # def ycv(model, x, y, n_folds:int):
+    #     ycv = np.zeros(y.shape[0])
+    #     f, idx,_,_ = KF_CV.cross_val_predictor(model, x,y, n_folds)
+    #     for i in f.keys():
+    #         ycv[idx[i]] = f[i]
+    #     return ycv


 ### Selectivity ratio
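The refactor above splits the old process/cv_scores pipeline into explicit steps that all share one folds dict. A minimal sketch of the new call chain, assuming dummy random data and that numpy, scikit-learn, and this module's own metrics class are importable:

    import numpy as np
    from sklearn.cross_decomposition import PLSRegression

    x = np.random.rand(60, 100)          # 60 spectra x 100 wavelengths (dummy data)
    y = np.random.rand(60)

    folds = KF_CV.CV(x, y, n_folds=3)    # e.g. {'Fold1': idx_array, 'Fold2': ..., 'Fold3': ...}
    yp = KF_CV.cross_val_predictor(PLSRegression(n_components=5), folds, x, y)
    per_fold, summary = KF_CV.cv_scores(y, yp, folds)  # metrics per fold + mean/sd/cv rows
    data, coeff = KF_CV.meas_pred_eq(y, yp, folds)     # per-sample table + per-fold OLS slope/intercept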
diff --git a/src/Class_Mod/Hash.py b/src/Class_Mod/Hash.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a96cbd6009e356e4b3011f35778de27d3e58a9c
--- /dev/null
+++ b/src/Class_Mod/Hash.py
@@ -0,0 +1,26 @@
+from Packages import *
+
+def create_hash(spectra):
+    # hash the stringified spectra with the md5 hash function
+    hash_func = hashlib.md5()
+    spectra = str(spectra)
+    encoded_spectra = spectra.encode()
+    hash_func.update(encoded_spectra)
+    hash = hash_func.hexdigest()
+    return hash
+
+def check_hash(hash):
+    # path to the hash file and to the grep/echo binaries bundled for Windows
+    subprocess_path = Path("src/data/hash/")
+    # grep for the hash in the hash file; grep -c exits with 0 when the hash is found
+    nb_hash = subprocess.run([subprocess_path / 'grep.exe', '-c', hash, subprocess_path / "hash.txt"], shell=True)
+    # if the hash is already present
+    if nb_hash.returncode == 0:
+        return 'existing hash'
+    # if the hash is not present, append it to the file with echo
+    else:
+        add_hash = subprocess.run(['echo', str(hash) + '>>', subprocess_path / "hash.txt"], shell=True)
+        if add_hash.returncode == 0:
+            return 'hash added'
+        else:
+            return 'error while adding the new hash'
\ No newline at end of file
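check_hash leans on a bundled grep.exe and shell redirection, which is Windows-specific and fragile to quoting. For comparison, a portable sketch of the same look-up-then-append contract in plain Python (a hypothetical helper, not the module's code; same return strings and the same hash.txt location):

    from pathlib import Path

    def check_hash_py(hash_value: str) -> str:
        # look the hash up in hash.txt; append it when it is not there yet
        hash_file = Path("src/data/hash/hash.txt")
        try:
            lines = hash_file.read_text().splitlines() if hash_file.exists() else []
            if hash_value in lines:
                return 'existing hash'
            with hash_file.open('a') as f:
                f.write(hash_value + '\n')
            return 'hash added'
        except OSError:
            return 'error while adding the new hash'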
diff --git a/src/Class_Mod/LWPLSR_.py b/src/Class_Mod/LWPLSR_.py
index a7bd37980855274ec8c93ec5f0e188116188385f..da661d6be6fea9c1627cfc0e8df17da25dc7e627 100644
--- a/src/Class_Mod/LWPLSR_.py
+++ b/src/Class_Mod/LWPLSR_.py
@@ -7,33 +7,28 @@ class LWPLSR:

     Returns:
         self.scores (DataFrame): various metrics and scores
-        self.predicted_results_on_train (DataFrame):
-        self.predicted_results_on_test (DataFrame):
+        self.predicted_results (Dictionary): Dict containing all predicted results (train, test, cross-validation)
         self.mod (Julia model): the prepared model
     """
     def __init__(self, dataset):
         """Initiate the LWPLSR and prepare data for Julia computing."""
-
-        # self.x_train, self.y_train, self.x_test, self.y_test = [dataset[i] for i in range(len(dataset))]
+        # get train / test data from dataset
         self.x_train, self.y_train, self.x_test, self.y_test = [dataset[i] for i in range(4)]
+        # calculate the number of KFolds and get the CV data from dataset
        self.nb_fold = int((len(dataset)-4)/4)
         for i in range(self.nb_fold):
             setattr(self, "xtr_fold"+str(i+1), dataset[i+7])
             setattr(self, "ytr_fold"+str(i+1), dataset[i+13])
             setattr(self, "xte_fold"+str(i+1), dataset[i+4])
-            # setattr(self, "yte_fold"+str(i+1), dataset[i+10])
             setattr(jl, "xtr_fold"+str(i+1), dataset[i+7])
             setattr(jl, "ytr_fold"+str(i+1), dataset[i+13])
             setattr(jl, "xte_fold"+str(i+1), dataset[i+4])
-            # setattr(jl, "yte_fold"+str(i+1), dataset[i+10])

-        # prepare to send dataframes to julia and Jchemo
+        # prepare to send dataframes to julia and Jchemo (with the jl. prefix)
         jl.x_train, jl.y_train, jl.x_test, jl.y_test = self.x_train, self.y_train, self.x_test, self.y_test

         # initialize vars from the class
         y_shape = self.y_test.shape
-        self.predicted_results_on_test = pd.DataFrame
-        self.predicted_results_on_train = pd.DataFrame
         self.pred_test = np.zeros(shape=(y_shape[0], 1))
         self.pred_train = np.zeros(shape=(y_shape[0], 1))
         self.mod = ""
@@ -52,7 +47,7 @@ class LWPLSR:
         Returns:
             self.mod (Julia model): the prepared model
         """
-        # launch Julia Jchemo lwplsr
+        # launch Julia Jchemo lwplsr and convert the DataFrames from Python Pandas DataFrame to Julia DataFrame
         jl.seval("""
         using DataFrames
         using Pandas
@@ -63,7 +58,7 @@ class LWPLSR:
         y_test |> Pandas.DataFrame |> DataFrames.DataFrame
         """)
         print('LWPLSR - tuning')
-        # set tuning parameters
+        # set tuning parameters to test
         jl.seval("""
         nlvdis = [5; 10; 15] ; metric = [:eucl; :mah]
         h = [1; 2; 6; Inf] ; k = [30; 80; 200]
@@ -83,21 +78,22 @@ class LWPLSR:
         ncal = ntrain - nval
         """)
-        # Create LWPLSR model and tune
+        # Create LWPLSR model and tune with GridScore
         jl.seval("""
         mod = Jchemo.model(Jchemo.lwplsr)
         res = gridscore(mod, Xcal, ycal, Xval, yval; score = Jchemo.rmsep, pars, nlv, verbose = false)
         u = findall(res.y1 .== minimum(res.y1))[1] #best parameters combination
         """)
+        # save best lwplsr parameters
         self.best_lwplsr_params = {'nlvdis' : jl.res.nlvdis[jl.u], 'metric' : str(jl.res.metric[jl.u]), 'h' : jl.res.h[jl.u], 'k' : jl.res.k[jl.u], 'nlv' : jl.res.nlv[jl.u]}
         print('best lwplsr params ' + str(self.best_lwplsr_params))
-        print('LWPLSR - best params ok')
-        # calculate LWPLSR model with best parameters
+        # run LWPLSR model with best parameters
         jl.seval("""
         mod = Jchemo.model(Jchemo.lwplsr; nlvdis = res.nlvdis[u], metric = res.metric[u], h = res.h[u], k = res.k[u], nlv = res.nlv[u])
         # Fit model
         Jchemo.fit!(mod, x_train, y_train)
         """)
+        # save Julia Jchemo model
         self.mod = jl.mod

     def Jchemo_lwplsr_predict(self):
@@ -128,14 +124,13 @@ class LWPLSR:
         print('LWPLSR - end')

     def Jchemo_lwplsr_cv(self):
-        """Send data to Julia to predict with lwplsr.
+        """Send Cross-Validation data to Julia to fit & predict with lwplsr.

         Args:
-            self.mod (Julia model): the prepared model
+            self.best_lwplsr_params: the best parameters (from tuning) to use for CV
             self.xtr_fold1 (DataFrame):
             self.ytr_fold1 (DataFrame):
             self.xte_fold1 (DataFrame):
-            self.yte_fold1 (DataFrame):

         Returns:
             self.pred_cv (Julia DataFrame): predicted values on x_train with Cross-Validation
         """
@@ -144,7 +139,7 @@ class LWPLSR:
             jl.Xtr = getattr(self, "xtr_fold"+str(i+1))
             jl.Ytr = getattr(self, "ytr_fold"+str(i+1))
             jl.Xte = getattr(self, "xte_fold"+str(i+1))
-            # jl.Yte = getattr(self, "yte_fold"+str(i+1))
+            # convert the Python Pandas DataFrames to Julia DataFrames
             jl.seval("""
             using DataFrames
             using Pandas
@@ -153,6 +148,7 @@ class LWPLSR:
             Ytr |> Pandas.DataFrame |> DataFrames.DataFrame
             Xte |> Pandas.DataFrame |> DataFrames.DataFrame
             """)
+            # set the lwplsr parameters to the best ones from tuning
             jl.nlvdis = int(self.best_lwplsr_params['nlvdis'])
             jl.metric = self.best_lwplsr_params['metric']
             jl.h = self.best_lwplsr_params['h']
@@ -169,15 +165,14 @@ class LWPLSR:
                 res = Jchemo.predict(mod_cv, Xte)
                 res.pred
             """)
+            # save the predicted values of each KFold in the predicted_results dictionary
             self.predicted_results["CV" + str(i+1)] = pd.DataFrame(pred_cv)

     @property
     def pred_data_(self):
         # convert predicted data from x_test to Pandas DataFrame
-        self.predicted_results_on_test = pd.DataFrame(self.pred_test)
-        self.predicted_results_on_train = pd.DataFrame(self.pred_train)
-        self.predicted_results["pred_data_train"] = self.predicted_results_on_train
-        self.predicted_results["pred_data_test"] = self.predicted_results_on_test
+        self.predicted_results["pred_data_train"] = pd.DataFrame(self.pred_train)
+        self.predicted_results["pred_data_test"] = pd.DataFrame(self.pred_test)
         return self.predicted_results

     @property
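Since __init__ addresses dataset purely by position, here is an illustrative map of the layout it expects for nb_fold = 3 (len(dataset) = 4 + 4*3 = 16). The indices come from the constructor above; the names are placeholders, and the ordering assumes the fold files enumerate alphabetically as the offsets imply:

    # layout of `dataset` as consumed by LWPLSR.__init__ for nb_fold = 3
    layout = [
        'x_train', 'y_train', 'x_test', 'y_test',   # 0..3   train/test split
        'xte_fold1', 'xte_fold2', 'xte_fold3',      # 4..6   dataset[i+4]  -> xte_fold{i+1}
        'xtr_fold1', 'xtr_fold2', 'xtr_fold3',      # 7..9   dataset[i+7]  -> xtr_fold{i+1}
        'yte_fold1', 'yte_fold2', 'yte_fold3',      # 10..12 dataset[i+10], currently unused (commented out)
        'ytr_fold1', 'ytr_fold2', 'ytr_fold3',      # 13..15 dataset[i+13] -> ytr_fold{i+1}
    ]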
diff --git a/src/Class_Mod/LWPLSR_Call.py b/src/Class_Mod/LWPLSR_Call.py
index 49c674cdae90369fad0a5db1757cc722f7d17b4a..007009125fbab59c62d4a587ab62e5cc065b54fd 100644
--- a/src/Class_Mod/LWPLSR_Call.py
+++ b/src/Class_Mod/LWPLSR_Call.py
@@ -7,35 +7,42 @@ import os
 # loading the lwplsr_inputs.json
 temp_path = Path("temp/")
 data_to_work_with = ['x_train_np', 'y_train_np', 'x_test_np', 'y_test_np']
+# check for cross-validation data, depending on the KFold number
 temp_files_list = os.listdir(temp_path)
 nb_fold = 0
 for i in temp_files_list:
     if 'fold' in i:
+        # add the CV file name to data_to_work_with
         data_to_work_with.append(str(i)[:-4])
+        # and count the number of fold files (4 files per fold)
         nb_fold += 1
+# import the data from the csv files in the temp/ folder
 dataset = []
 for i in data_to_work_with:
     dataset.append(np.genfromtxt(temp_path / str(i + ".csv"), delimiter=','))
 print('CSV imported')
+# launch the LWPLSR class from LWPLSR_.py in Class_Mod
 print('start model creation')
 Reg = LWPLSR(dataset)
 print('model created. \nnow fit')
 LWPLSR.Jchemo_lwplsr_fit(Reg)
 print('now predict')
 LWPLSR.Jchemo_lwplsr_predict(Reg)
-
 print('now CV')
 LWPLSR.Jchemo_lwplsr_cv(Reg)
-
+# export the results in a json file to bring the data back to 2-model_creation.py and the streamlit interface
 print('export to json')
 pred = ['pred_data_train', 'pred_data_test']
+# add the KFold results to the predicted data
 for i in range(int(nb_fold/4)):
     pred.append("CV" + str(i+1))
 json_export = {}
 for i in pred:
     json_export[i] = Reg.pred_data_[i].to_dict()
+# add the lwplsr global model to the json
 json_export['model'] = str(Reg.model_)
+# add the best parameters for the lwplsr obtained from GridScore tuning
 json_export['best_lwplsr_params'] = Reg.best_lwplsr_params_
 with open(temp_path / "lwplsr_outputs.json", "w+") as outfile:
     json.dump(json_export, outfile)
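For reference, a sketch of the shape of temp/lwplsr_outputs.json after this script runs with 3 folds. The keys come from the export loop above; the numeric values, the nested dict orientation (pandas .to_dict() output), and the model string are purely illustrative:

    lwplsr_outputs = {
        "pred_data_train": {"0": {"0": 12.3, "1": 11.8}},   # Reg.pred_data_['pred_data_train'].to_dict()
        "pred_data_test":  {"0": {"0": 12.1}},
        "CV1": {"0": {"0": 12.6}},                          # one entry per KFold
        "CV2": {"0": {"0": 11.9}},
        "CV3": {"0": {"0": 12.4}},
        "model": "<str(Reg.model_)>",
        "best_lwplsr_params": {"nlvdis": 5, "metric": "eucl", "h": 2.0, "k": 80, "nlv": 10},
    }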
diff --git a/src/Class_Mod/RegModels.py b/src/Class_Mod/RegModels.py
index 056253d9148483608628553d00b195f6808b57f4..ce07a07e6bf541d8e078dfe12846d96d4868e28a 100644
--- a/src/Class_Mod/RegModels.py
+++ b/src/Class_Mod/RegModels.py
@@ -115,19 +115,23 @@ class Plsr(Regmodel):
         x2 = [savgol_filter(x1[i], polyorder=params['polyorder'], deriv=params['deriv'], window_length = params['window_length']) for i in range(2)]

         Model = PLSRegression(scale = False, n_components = params['n_components'])
-        self._cv_df = KF_CV().process(model = Model, x = x2[0], y = self._ytrain, n_folds = self._nfolds)
-        self._cv_df['Average'] = self._cv_df.mean(axis = 1)
-        self._cv_df['S'] = self._cv_df.std(axis = 1)
-        self._cv_df['CV(%)'] = self._cv_df['S'] * 100 / self._cv_df['Average']
-        self._cv_df = self._cv_df.T.round(2)
-        score = self._cv_df.loc["CV(%)",'rmse']
+        # self._cv_df = KF_CV().process(model = Model, x = x2[0], y = self._ytrain, n_folds = self._nfolds)
+        # self._cv_df['Average'] = self._cv_df.mean(axis = 1)
+        # self._cv_df['S'] = self._cv_df.std(axis = 1)
+        # self._cv_df['CV(%)'] = self._cv_df['S'] * 100 / self._cv_df['Average']
+        # self._cv_df = self._cv_df.T.round(2)
+        folds = KF_CV().CV(x = x2[0], y = np.array(self._ytrain), n_folds = self._nfolds)
+        yp = KF_CV().cross_val_predictor(model = Model, folds = folds, x = x2[0], y = np.array(self._ytrain))
+        self._cv_df = KF_CV().metrics_cv(y = np.array(self._ytrain), ypcv = yp, folds = folds)[1]
+
+        score = self._cv_df.loc["cv",'rmse']

         Model = PLSRegression(scale = False, n_components = params['n_components'])
         Model.fit(x2[0], self._ytrain)

         if self.SCORE > score:
             self.SCORE = score
-            self._ycv = KF_CV().cross_val_predictor(model = Model, x = x2[0], y = self._ytrain, n_folds = self._nfolds)
+            self._ycv = KF_CV().meas_pred_eq(y = np.array(self._ytrain), ypcv = yp, folds = folds)
             self._yc = Model.predict(x2[0])
             self._yt = Model.predict(x2[1])
             self._model = Model
@@ -179,26 +183,29 @@ class TpeIpls(Regmodel):
             # print(x2)

         #     ## Modelling
+        folds = KF_CV().CV(x = x2[0], y = np.array(self._ytrain), n_folds = self._nfolds)
         try:
             Model = PLSRegression(scale = False, n_components = params['n_components'])
-            self._cv_df = KF_CV().process(model = Model, x = x2[0][:,id], y = self._ytrain, n_folds = self._nfolds)
+            yp = KF_CV().cross_val_predictor(model = Model, folds = folds, x = x2[0][:,id], y = np.array(self._ytrain))
+            self._cv_df = KF_CV().metrics_cv(y = np.array(self._ytrain), ypcv = yp, folds = folds)[1]
         except ValueError as ve:
             params["n_components"] = 1
             Model = PLSRegression(scale = False, n_components = params['n_components'])
-            self._cv_df = KF_CV().process(model = Model, x = x2[0][:,id], y = self._ytrain, n_folds = self._nfolds)
-
-        self._cv_df['Average'] = self._cv_df.mean(axis = 1)
-        self._cv_df['S'] = self._cv_df.std(axis = 1)
-        self._cv_df['CV(%)'] = self._cv_df['S'] * 100 / self._cv_df['Average']
-        self._cv_df = self._cv_df.T.round(2)
-        score = self._cv_df.loc['CV(%)','rmse']
+            yp = KF_CV().cross_val_predictor(model = Model, folds = folds, x = x2[0][:,id], y = np.array(self._ytrain))
+            self._cv_df = KF_CV().metrics_cv(y = np.array(self._ytrain), ypcv = yp, folds = folds)[1]
+        # self._cv_df['Average'] = self._cv_df.mean(axis = 1)
+        # self._cv_df['S'] = self._cv_df.std(axis = 1)
+        # self._cv_df['CV(%)'] = self._cv_df['S'] * 100 / self._cv_df['Average']
+        # self._cv_df = self._cv_df.T.round(2)
+        score = self._cv_df.loc['cv','rmse']

         Model = PLSRegression(scale = False, n_components = params['n_components'])
         Model.fit(x2[0][:,id], self._ytrain)

         if self.SCORE > score:
             self.SCORE = score
-            self._ycv = KF_CV().cross_val_predictor(model = Model, x = x2[0][:,id], y = self._ytrain, n_folds = self._nfolds)
+            self._ycv = KF_CV().meas_pred_eq(y = np.array(self._ytrain), ypcv = yp, folds = folds)
+
             self._yc = Model.predict(x2[0][:,id])
             self._yt = Model.predict(x2[1][:,id])
             self._model = Model
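Both objectives above read their score with self._cv_df.loc['cv', 'rmse'], which relies on the orientation of the second dataframe returned by KF_CV.metrics_cv: metrics as columns, folds plus the added 'mean'/'sd'/'cv' summary rows as the index. A toy illustration of that shape with hypothetical numbers and plain pandas:

    import pandas as pd

    # per-fold metric scores, as metrics().reg_ would produce them
    e = {'Fold1': {'rmse': 0.52, 'r2': 0.91},
         'Fold2': {'rmse': 0.48, 'r2': 0.93},
         'Fold3': {'rmse': 0.55, 'r2': 0.90}}
    r = pd.DataFrame(e)                 # rows: metrics, columns: folds
    r_print = r.copy()
    r_print['mean'] = r.mean(axis=1)
    r_print['sd'] = r.std(axis=1)
    r_print['cv'] = 100 * r.std(axis=1) / r.mean(axis=1)
    print(r_print.T.loc['cv', 'rmse'])  # the relative sd of rmse, used as `score`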
diff --git a/src/pages/2-model_creation.py b/src/pages/2-model_creation.py
index a56a13e8b7fc389a407cca1bc63ffc2254bb5453..12d2c3c395cc120d0d09cbbd35446ac156fae990 100644
--- a/src/pages/2-model_creation.py
+++ b/src/pages/2-model_creation.py
@@ -38,18 +38,11 @@ M9 = st.container()
 M9.write("-- Save the model --")

 ##############################################################################################
-reg_algo = ["","Full-PLSR", "Locally Weighted PLSR", "Interval-PLSR"]
-regression_algo = None
-
 #######################################        ###########################################
 files_format = ['.csv', '.dx']
 file = M00.radio('Select files format:', options = files_format)
-
-### Data
 spectra = pd.DataFrame()
 y = pd.DataFrame()
-
-
 # load .csv file
 if file == files_format[0]:
     xcal_csv = M00.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
@@ -60,6 +53,8 @@ if file == files_format[0]:
                                 options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+xcal_csv.name))), key=1)
         if hdrx == "yes": col = 0
         else: col = False
+    else:
+        M00.warning('Insert your spectral data file here!')

     ycal_csv = M00.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
     if ycal_csv:
@@ -67,6 +62,8 @@ if file == files_format[0]:
         hdry = M00.radio("samples name (Y file)?: ", options=["no", "yes"], key=3)
         if hdry == "yes": col = 0
         else: col = False
+    else:
+        M00.warning('Insert your target data file here!')

     if xcal_csv and ycal_csv:
         file_name = str(xcal_csv.name) +' and '+ str(ycal_csv.name)
@@ -93,17 +90,14 @@ if file == files_format[0]:
             spectra = pd.DataFrame

     else:
-        M1.warning('Tune decimal and separator parameters')
-
-
-
-
-
+        M00.error('Error: the data has not been loaded successfully, please consider tuning the decimal and separator parameters!')

 ## Load .dx file
 elif file == files_format[1]:
     data_file = M00.file_uploader("Select Data", type=".dx", help=" :mushroom: select a dx file")
-    if data_file:
+    if not data_file:
+        M00.warning('Load your file here!')
+    else:
         file_name = str(data_file.name)
         with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
             tmp.write(data_file.read())
@@ -116,7 +110,7 @@ elif file == files_format[1]:
             y = chem_data.loc[:,yname].loc[measured]
             spectra = spectra.loc[measured]
         else:
-            M00.warning('Warning: Chemical data are not included in your file !', icon="⚠️")
+            M00.warning('Warning: your file includes no target variable to model!', icon="⚠️")
     os.unlink(tmp_path)

 ### split the data
@@ -157,27 +151,36 @@ if not spectra.empty and not y.empty:

     M0.write('Loaded data summary')
-    M0.write(pd.DataFrame([desc_stats(y_train),desc_stats(y_test),desc_stats(y)], index =['Train', 'Test', 'Total'] ).round(2))
-    stats=pd.DataFrame([desc_stats(y_train),desc_stats(y_test),desc_stats(y)], index =['Train', 'Test', 'Total'] ).round(2)
+    M0.write(pd.DataFrame([desc_stats(y_train),desc_stats(y_test),desc_stats(y)], index =['train', 'test', 'total'] ).round(2))
+    stats = pd.DataFrame([desc_stats(y_train),desc_stats(y_test),desc_stats(y)], index =['train', 'test', 'total'] ).round(2)
     ####################################### Insight into the loaded data

-    #######################################
+    ####################################### Model creation ###################################################
+    reg_algo = ["","Full-PLSR", "Locally Weighted PLSR", "Interval-PLSR"]
+    regression_algo = None
+    Reg = None
     regression_algo = M1.selectbox("Choose the algorithm for regression", options= reg_algo, key = 12, placeholder ="Choose an option")
+    # split the train data into nb_folds for cross-validation
+    nb_folds = 3
+    folds = KF_CV.CV(X_train, y_train, nb_folds)
+
+    if not regression_algo:
+        M1.warning('Choose a modelling algorithm from the dropdown list!')
     if regression_algo == reg_algo[1]:
         # Train model with model function from application_functions.py
         Reg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter=1)
         reg_model = Reg.model_
         #M2.dataframe(Pin.pred_data_)
+
     elif regression_algo == reg_algo[2]:
         info = M1.info('Starting LWPLSR model creation... Please wait a few minutes.')
         # export data to csv for Julia train/test
         data_to_work_with = ['x_train_np', 'y_train_np', 'x_test_np', 'y_test_np']
         x_train_np, y_train_np, x_test_np, y_test_np = X_train.to_numpy(), y_train.to_numpy(), X_test.to_numpy(), y_test.to_numpy()
         # Cross-Validation calculation
-        nb_folds = 3
-        st.write('KFold for Cross-Validation = ' + str(nb_folds))
-        # split train data into nb_folds
-        folds = KF_CV.CV(x_train_np, y_train_np, nb_folds)
+
+        st.write('KFold for Cross-Validation = ' + str(nb_folds))
         d = {}
         for i in range(nb_folds):
             d["xtr_fold{0}".format(i+1)], d["ytr_fold{0}".format(i+1)], d["xte_fold{0}".format(i+1)], d["yte_fold{0}".format(i+1)] = np.delete(x_train_np, folds[list(folds)[i]], axis=0), np.delete(y_train_np, folds[list(folds)[i]], axis=0), x_train_np[folds[list(folds)[i]]], y_train_np[folds[list(folds)[i]]]
@@ -203,39 +206,60 @@
             Reg_json = json.load(outfile)
             # delete csv files
             for i in data_to_work_with: os.unlink(temp_path / str(i + ".csv"))
-            # delete json file after import
+            # delete the json file after import
             os.unlink(temp_path / "lwplsr_outputs.json")
             # format result data into Reg object
-            pred = ['pred_data_train', 'pred_data_test']
+            pred = ['pred_data_train', 'pred_data_test'] ### keys of the dict
             for i in range(nb_folds):
-                pred.append("CV" + str(i+1))
-            Reg = type('obj', (object,), {'model' : Reg_json['model'], 'best_lwplsr_params' : Reg_json['best_lwplsr_params'], 'pred_data_' : [pd.json_normalize(Reg_json[i]) for i in pred]})
+                pred.append("CV" + str(i+1)) ### add the cv folds keys to pred
+
+            Reg = type('obj', (object,), {'model_' : Reg_json['model'], 'best_hyperparams_' : Reg_json['best_lwplsr_params'],
+                                          'pred_data_' : [pd.json_normalize(Reg_json[i]) for i in pred]})
+
+            Reg.CV_results_ = pd.DataFrame()
             Reg.cv_data_ = {'YpredCV' : {}, 'idxCV' : {}}
             # set indexes to Reg.pred_data (train, test, folds idx)
             for i in range(len(pred)):
                 Reg.pred_data_[i] = Reg.pred_data_[i].T.reset_index().drop(columns = ['index'])
                 if i == 0: # data_train
+                    # Reg.pred_data_[i] = np.array(Reg.pred_data_[i])
                     Reg.pred_data_[i].index = list(y_train.index)
+                    Reg.pred_data_[i] = Reg.pred_data_[i].iloc[:,0]
                 elif i == 1: # data_test
+                    # Reg.pred_data_[i] = np.array(Reg.pred_data_[i])
                     Reg.pred_data_[i].index = list(y_test.index)
-                else: # CVi
+                    Reg.pred_data_[i] = Reg.pred_data_[i].iloc[:,0]
+                else:
+                    # CVi
                     Reg.pred_data_[i].index = folds[list(folds)[i-2]]
-                    Reg.CV_results_ = pd.concat([Reg.CV_results_, Reg.pred_data_[i]])
-                    Reg.cv_data_['YpredCV']['Fold' + str(i-1)] = Reg.pred_data_[i]
-                    Reg.cv_data_['idxCV']['Fold' + str(i-1)] = folds[list(folds)[i-2]]
-            Reg.CV_results_.sort_index(inplace = True)
-            Reg.CV_results_.columns = ['Ypredicted_CV']
-            # if you want to display Reg.cv_data_ containing by fold YpredCV and idxCV
-            # cv2.json(Reg.cv_data_)
-            # Display end of modeling message on the interface
-            info.empty()
+                    # Reg.CV_results_ = pd.concat([Reg.CV_results_, Reg.pred_data_[i]])
+                    Reg.cv_data_['YpredCV']['Fold' + str(i-1)] = np.array(Reg.pred_data_[i]).reshape(-1)
+                    Reg.cv_data_['idxCV']['Fold' + str(i-1)] = np.array(folds[list(folds)[i-2]]).reshape(-1)
+                    # Reg.cv_data_['idxCV'] and folds contain the same data
+
+            Reg.CV_results_ = KF_CV.metrics_cv(y = y_train, ypcv = Reg.cv_data_['YpredCV'], folds = folds)[1]
+            #### cross-validation results print
+            Reg.best_hyperparams_print = Reg.best_hyperparams_
+            ## plots
+            Reg.cv_data_ = KF_CV().meas_pred_eq(y = np.array(y_train), ypcv = Reg.cv_data_['YpredCV'], folds = folds)
+            # st.write(Reg.cv_data_)
+            # Reg.CV_results_.sort_index(inplace = True)
+            # Reg.CV_results_.columns = ['Ypredicted_CV']
+            # if you want to display Reg.cv_data_ containing, by fold, YpredCV and idxCV
+            # cv2.json(Reg.cv_data_)
+            # Display end of modeling message on the interface
+            # info.empty()
             M1.success('Model created!')
         except FileNotFoundError as e:
             # Display error message on the interface if modeling is wrong
             info.empty()
             M1.warning('- ERROR during model creation -')
             Reg = None
+
     elif regression_algo == reg_algo[3]:
         s = M1.number_input(label='Enter the maximum number of intervals', min_value=1, max_value=6, value=3)
         it = M1.number_input(label='Enter the number of iterations', min_value=2, max_value=10, value=3)
@@ -263,7 +287,8 @@



-        ################# Model analysis ############
+# ##############################################################################################################
+# ################# Model analysis ############
 if regression_algo in reg_algo[1:] and Reg is not None:
     #M2.write('-- Pretreated data (train) visualization and important spectral regions in the model -- ')
@@ -311,19 +336,20 @@
         cv_results=pd.DataFrame(Reg.CV_results_)

         cv2.write('-- Out-of-Fold Predictions Visualization (All in one) --')
-        fig1 = px.scatter(Reg.cv_data_[2], x ='Measured', y = 'Predicted' , trendline='ols', color='Folds', symbol="Folds",
+        fig1 = px.scatter(Reg.cv_data_[0], x ='Measured', y = 'Predicted' , trendline='ols', color='Folds', symbol="Folds",
                           color_discrete_sequence=px.colors.qualitative.G10)
-        fig1.add_shape(type='line', x0 = .95 * min(Reg.cv_data_[2].loc[:,'Measured']), x1 = 1.05 * max(Reg.cv_data_[2].loc[:,'Measured']), y0 = .95 * min(Reg.cv_data_[2].loc[:,'Measured']), y1 = 1.05 * max(Reg.cv_data_[2].loc[:,'Measured']), line = dict(color='black', dash = "dash"))
+        fig1.add_shape(type='line', x0 = .95 * min(Reg.cv_data_[0].loc[:,'Measured']), x1 = 1.05 * max(Reg.cv_data_[0].loc[:,'Measured']),
+                       y0 = .95 * min(Reg.cv_data_[0].loc[:,'Measured']), y1 = 1.05 * max(Reg.cv_data_[0].loc[:,'Measured']), line = dict(color='black', dash = "dash"))
         fig1.update_traces(marker_size=7, showlegend=False)
         cv2.plotly_chart(fig1, use_container_width=True)

-        fig0 = px.scatter(Reg.cv_data_[2], x ='Measured', y = 'Predicted' , trendline='ols', color='Folds', symbol="Folds", facet_col = 'Folds',facet_col_wrap=1,
+        fig0 = px.scatter(Reg.cv_data_[0], x ='Measured', y = 'Predicted' , trendline='ols', color='Folds', symbol="Folds", facet_col = 'Folds',facet_col_wrap=1,
                           color_discrete_sequence=px.colors.qualitative.G10, text='index', width=800, height=1000)
         fig0.update_traces(marker_size=8, showlegend=False)
-        fig0.write_image("./Report/figures/Allinone.png")
+        fig0.write_image("./Report/figures/meas_vs_pred_cv_onebyone.png")
         cv1.write('-- Out-of-Fold Predictions Visualization (Separate plots) --')
         cv1.plotly_chart(fig0, use_container_width=True)
-        fig1.write_image("./Report/figures/Predictions_V.png")
+        fig1.write_image("./Report/figures/meas_vs_pred_cv_all.png")


         yc = Reg.pred_data_[0]
@@ -337,10 +363,12 @@
             json.dump(Reg.best_hyperparams_, outfile)

-##########
+# ##########
     M1.write("-- Model performance --")
-    M1.dataframe(metrics(c = [y_train, yc], t = [y_test, yt], method='regression').scores_)
-
+    if regression_algo != "Locally Weighted PLSR":
+        M1.dataframe(metrics(c = [y_train, yc], t = [y_test, yt], method='regression').scores_)
+    else:
+        M1.dataframe(metrics(t = [y_test, yt], method='regression').scores_)
     model_per=pd.DataFrame(metrics(c = [y_train, yc], t = [y_test, yt], method='regression').scores_)
     #from st_circular_progress import CircularProgress
     #my_circular_progress = CircularProgress(label = 'Performance',value = 50, key = 'my performance',
@@ -348,26 +376,34 @@
     #my_circular_progress.st_circular_progress()
     #my_circular_progress.update_value(progress=20)

-    a = reg_plot([y_train, y_test],[yc, yt], train_idx = train_index, test_idx = test_index)
+    if regression_algo != "Locally Weighted PLSR":
+        a = reg_plot([y_train, y_test],[yc, yt], train_idx = train_index, test_idx = test_index)
+    else:
+        a = reg_plot([y_train, y_test],[yc, yt], train_idx = train_index, test_idx = test_index)

     M7.pyplot(a)
-    plt.savefig('./Report/figures/Predictedvs.png')
+    plt.savefig('./Report/figures/measured_vs_predicted.png')

     prep_para = Reg.best_hyperparams_
-    prep_para.pop('n_components')
-
-    for i in ['deriv','polyorder']:
-        if Reg.best_hyperparams_[i] == 0:
-            prep_para[i] = '0'
-        elif Reg.best_hyperparams_[i] == 1:
-            prep_para[i] = '1st'
-        elif Reg.best_hyperparams_[i] > 1:
-            prep_para[i] = f"{Reg.best_hyperparams_[i]}nd"
+    if regression_algo != "Locally Weighted PLSR":
+        prep_para.pop('n_components')
+        for i in ['deriv','polyorder']:
+            if Reg.best_hyperparams_[i] == 0:
+                prep_para[i] = '0'
+            elif Reg.best_hyperparams_[i] == 1:
+                prep_para[i] = '1st'
+            elif Reg.best_hyperparams_[i] > 1:
+                prep_para[i] = f"{Reg.best_hyperparams_[i]}nd"

-    residual_plot = resid_plot([y_train, y_test], [yc, yt], train_idx=train_index, test_idx=test_index)
+    if regression_algo != "Locally Weighted PLSR":
+        residual_plot = resid_plot([y_train, y_test], [yc, yt], train_idx=train_index, test_idx=test_index)
+    else:
+        residual_plot = resid_plot([y_train, y_test], [yt, yt], train_idx=train_index, test_idx=test_index)

     M8.pyplot(residual_plot)
-    plt.savefig('./Report/figures/residual_plot.png')
-
-    rega = Reg.selected_features_ ##### ADD FEATURES IMPORTANCE PLOT
+    plt.savefig('./Report/figures/residuals_plot.png')
+
+    if regression_algo != "Locally Weighted PLSR":
+        rega = Reg.selected_features_ ##### ADD FEATURES IMPORTANCE PLOT

     #model_export = M1.selectbox("Choose way to export", options=["pickle", "joblib"], key=20)
     model_name = M9.text_input('Give it a name')
@@ -413,7 +449,8 @@ if not spectra.empty and not y.empty and regression_algo:
     if regression_algo in reg_algo[1:] and Reg is not None:
         fig, (ax1, ax2) = plt.subplots(2,1, figsize = (12, 4), sharex=True)
         ax1.plot(colnames, np.mean(X_train, axis = 0), color = 'black', label = 'Average spectrum (Raw)')
-        ax2.plot(colnames, np.mean(Reg.pretreated_spectra_ , axis = 0), color = 'black', label = 'Average spectrum (pretreated)')
+        if regression_algo != "Locally Weighted PLSR":
+            ax2.plot(colnames, np.mean(Reg.pretreated_spectra_ , axis = 0), color = 'black', label = 'Average spectrum (pretreated)')
         ax2.set_xlabel('Wavelenghts')
         plt.tight_layout()

@@ -443,16 +480,19 @@ if not spectra.empty and not y.empty and regression_algo:
         M2.pyplot(fig)

 ## Load .dx file
+if Reg is not None:
+    with st.container():
+        if st.button("Download the report"):
+            if regression_algo == reg_algo[1]:
+                latex_report = report.report('Predictive model development', file_name, stats, list(Reg.best_hyperparams_.values()), regression_algo, model_per, cv_results)
+                report.compile_latex()
+            if regression_algo is None:
+                st.warning('Data processing has not been performed or finished yet!', icon = "⚠️")
+            else:
+                pass
-
-with st.container():
-    if st.button("Download the report"):
-        if regression_algo == reg_algo[1]:
-            latex_report = report.report('Predictive model development', file_name, stats, list(Reg.best_hyperparams_.values()), regression_algo, model_per, cv_results)
-            report.compile_latex()
-        if regression_algo is None:
-            st.warning('Data processing has not been performed or finished yet!', icon = "⚠️")
         else:
             pass
-    else:
-        pass
+
\ No newline at end of file