Skip to content
Snippets Groups Projects
Commit 5728ea85 authored by DIANE's avatar DIANE
Browse files
parents bb9f8a44 b03b645d
Branches
No related tags found
No related merge requests found
......@@ -80,6 +80,7 @@ def No_transformation(X):
######################################## Cross val split ############################
class KF_CV:
### method for generating test sets index
### KFCV(dict) returns a testset indices/Fold
@staticmethod
def CV(x, y, n_folds:int):
test_folds = {}
......@@ -90,30 +91,45 @@ class KF_CV:
for _, i_test in kf.split(x, y):
d.append(i_test)
test_folds[folds_name[i]] = d[i]
return test_folds
return test_folds ## returns a tuple where keys are the name of each fold, and the corresponding values is a 1d numpy array filled with indices of test set
### Cross validate the model and return the predictions and samples index
@staticmethod
def cross_val_predictor(model, x, y, n_folds:int):
def cross_val_predictor(model, folds, x, y):
"""" model: the object to be cross-validated,
folds: a tuple where keys are the name of each fold, and the corresponding values is a 1d numpy array filled with indices of test set(from CV method)
x and y: the data used for CV"""
x = np.array(x)
y = np.array(y)
yp = {}
folds = KF_CV.CV(x=x, y=y, n_folds=n_folds)### Test index
key = list(folds.keys())
n_folds = len(folds.keys())
for i in range(n_folds):
model.fit(np.delete(x, folds[key[i]], axis=0), np.delete(y, folds[key[i]], axis=0))
yp[key[i]] = model.predict(x[folds[key[i]]]) #### predictions/fold
return yp # returns a tuple with keys are names of folds and the corresponding values are the predicted Y/fold
@staticmethod
def meas_pred_eq(y, ypcv, folds):
"""" y: the target variable,
ypcv: a tuple where keys are the name of each fold, and the corresponding values is a 1d numpy array filled with predictions/fold (from cross_val_predictor method)
folds: a tuple where keys are the name of each fold, and the corresponding values is a 1d numpy array filled with indices of test set(from CV method)
x and y: the data used for CV
returns:
two dataframe:
- a n x 4 dataframe containing measured values, predicted values, ols reg equation, and index (n is the total number of samples)
- a 2 x k dataframe containing ols regression coefficients(k is the number of folds)
"""
cvcv = {}
coeff = {}
y = np.array(y)
for i, Fname in enumerate(folds.keys()):
r = pd.DataFrame()
r['Predicted'] = yp[Fname]
r['Predicted'] = ypcv[Fname]
r['Measured'] = y[folds[Fname]]
ols = LinearRegression().fit(pd.DataFrame(y[folds[Fname]]),yp[Fname].reshape(-1,1))
ols = LinearRegression().fit(pd.DataFrame(y[folds[Fname]]), ypcv[Fname].reshape(-1,1))
r.index = folds[Fname]
r['Folds'] = [f'{Fname} (Predicted = {np.round(ols.intercept_[0], 2)} + {np.round(ols.coef_[0][0],2)} x Measured'] * r.shape[0]
cvcv[i] = r
......@@ -123,37 +139,47 @@ class KF_CV:
data['index'] = [data.index[i][1] for i in range(data.shape[0])]
data.index = data['index']
coeff = pd.DataFrame(coeff, index = ['Slope', 'Intercept'])
return yp, folds, data, coeff
### compute metrics for each fold
return data, coeff ## returns values predicted in cross validation, ,coefficients of regression
@staticmethod
def process(model, x, y, n_folds:int):
f, idx,_ , _ = KF_CV.cross_val_predictor(model, x=x,y=y, n_folds=n_folds)
def metrics_cv(y, ypcv, folds):
y = np.array(y)
e = {}
for i in idx.keys():
e[i] = metrics().reg_(y.iloc[idx[i]],f[i])
for i in folds.keys():
e[i] = metrics().reg_(y[folds[i]],ypcv[i])
r = pd.DataFrame(e)
return r
r_print = r.copy()
r_print['mean'] = r.mean(axis = 1)
r_print['sd'] = r.std(axis = 1)
r_print['cv'] = 100*r.std(axis = 1)/r.mean(axis = 1)
return r.T, r_print.T
### bias and variance
### compute metrics for each fold
@staticmethod
def cv_scores(model, x, y, n_folds:int):
x = KF_CV.process(model, x, y, n_folds)
mean = x.mean(axis = 1)
sd = x.std(axis = 1)
rsd = sd*100/mean
data = pd.concat([mean, sd, rsd], axis = 1).round(2)
data.columns = ['mean', 'sd', 'cv(%)']
return data
def cv_scores(y, ypcv, folds):
    """Compute regression metrics per cross-validation fold, plus a summary.

    Args:
        y: the target vector; converted with ``np.array`` before indexing.
        ypcv: mapping fold name -> predicted values for that fold
            (as produced by ``cross_val_predictor``).
        folds: mapping fold name -> test-set indices of that fold
            (as produced by ``CV``).

    Returns:
        A pair of DataFrames ``(r.T, r_print.T)``:
        - the first holds the metric scores of each fold only;
        - the second holds the same scores with additional ``mean``, ``sd``
          and ``cv`` (100 * sd / mean) entries computed across the folds.
    """
    y = np.array(y)
    # NOTE(review): metrics().reg_ is defined elsewhere in the project; it is
    # presumably returning one set of regression scores per call - confirm.
    e = {name: metrics().reg_(y[folds[name]], ypcv[name]) for name in folds.keys()}
    r = pd.DataFrame(e)
    # Work on a copy: with a plain alias (``r_print = r``) the summary columns
    # would be added to ``r`` as well, so the later std/mean would be computed
    # over the already-added 'mean' column and the first returned frame would
    # no longer contain per-fold scores only.
    r_print = r.copy()
    r_print['mean'] = r.mean(axis=1)
    r_print['sd'] = r.std(axis=1)
    r_print['cv'] = 100 * r.std(axis=1) / r.mean(axis=1)
    return r.T, r_print.T
### Return ycv
@staticmethod
def ycv(model, x, y, n_folds:int):
ycv = np.zeros(y.shape[0])
f, idx,_,_ = KF_CV.cross_val_predictor(model, x,y, n_folds)
for i in f.keys():
ycv[idx[i]] = f[i]
return ycv
# ### Return ycv
# @staticmethod
# def ycv(model, x, y, n_folds:int):
# ycv = np.zeros(y.shape[0])
# f, idx,_,_ = KF_CV.cross_val_predictor(model, x,y, n_folds)
# for i in f.keys():
# ycv[idx[i]] = f[i]
# return ycv
### Selectivity ratio
......
from Packages import *
def create_hash(spectra):
    """Return the MD5 hex digest of the string representation of *spectra*.

    The input is converted with ``str()`` and encoded with the default
    ``str.encode()`` encoding before hashing, so two objects with the same
    string representation get the same digest.

    Args:
        spectra: any object; only its ``str()`` form is hashed.

    Returns:
        The 32-character hexadecimal MD5 digest.
    """
    # MD5 is used here as a fingerprint of the data, not for security.
    encoded_spectra = str(spectra).encode()
    return hashlib.md5(encoded_spectra).hexdigest()
def check_hash(hash):
    """Look up *hash* in the hash registry file and register it if missing.

    The registry is the text file ``src/data/hash/hash.txt`` (relative to the
    current working directory), one hash per line. The lookup and the append
    are done with plain file I/O instead of spawning ``grep.exe`` / ``echo``
    through a shell, so the function works on every platform and the hash
    value is never interpreted by a shell.

    Args:
        hash: the hash to look for (converted with ``str()``).

    Returns:
        ``'existing hash'`` if a line of the registry contains the hash,
        ``'hash added'`` if it was absent and has been appended,
        ``'error while adding the new hash'`` if the registry could not be
        written.
    """
    hash_file = Path("src/data/hash/") / "hash.txt"
    wanted = str(hash)

    # Search step. A missing or unreadable registry simply means "not found",
    # mirroring the previous behaviour where a failing grep led to the append.
    found = False
    try:
        with open(hash_file, encoding="utf-8") as registry:
            # substring test per line, as `grep -c` did
            found = any(wanted in line for line in registry)
    except OSError:
        found = False
    if found:
        return 'existing hash'

    # Append step: creates the file if needed; fails if the folder is missing.
    try:
        with open(hash_file, "a", encoding="utf-8") as registry:
            registry.write(wanted + "\n")
    except OSError:
        return 'error while adding the new hash'
    return 'hash added'
\ No newline at end of file
......@@ -7,33 +7,28 @@ class LWPLSR:
Returns:
self.scores (DataFrame): various metrics and scores
self.predicted_results_on_train (DataFrame):
self.predicted_results_on_test (DataFrame):
self.predicted_results (Dictionary): Dict containing all predicted results (train, test, cross-validation)
self.mod (Julia model): the prepared model
"""
def __init__(self, dataset):
"""Initiate the LWPLSR and prepare data for Julia computing."""
# self.x_train, self.y_train, self.x_test, self.y_test = [dataset[i] for i in range(len(dataset))]
# get train / test data from dataset
self.x_train, self.y_train, self.x_test, self.y_test = [dataset[i] for i in range(4)]
# calculate number of KFolds and get CV data from dataset
self.nb_fold = int((len(dataset)-4)/4)
for i in range(self.nb_fold):
setattr(self, "xtr_fold"+str(i+1), dataset[i+7])
setattr(self, "ytr_fold"+str(i+1), dataset[i+13])
setattr(self, "xte_fold"+str(i+1), dataset[i+4])
# setattr(self, "yte_fold"+str(i+1), dataset[i+10])
setattr(jl, "xtr_fold"+str(i+1), dataset[i+7])
setattr(jl, "ytr_fold"+str(i+1), dataset[i+13])
setattr(jl, "xte_fold"+str(i+1), dataset[i+4])
# setattr(jl, "yte_fold"+str(i+1), dataset[i+10])
# prepare to send dataframes to julia and Jchemo
# prepare to send dataframes to julia and Jchemo (with the jl. prefix)
jl.x_train, jl.y_train, jl.x_test, jl.y_test = self.x_train, self.y_train, self.x_test, self.y_test
# initialize vars from the class
y_shape = self.y_test.shape
self.predicted_results_on_test = pd.DataFrame
self.predicted_results_on_train = pd.DataFrame
self.pred_test = np.zeros(shape=(y_shape[0], 1))
self.pred_train = np.zeros(shape=(y_shape[0], 1))
self.mod = ""
......@@ -52,7 +47,7 @@ class LWPLSR:
Returns:
self.mod (Julia model): the prepared model
"""
# launch Julia Jchemo lwplsr
# launch Julia Jchemo lwplsr and convert DataFrames from Python Pandas DataFrame to Julia DataFrame
jl.seval("""
using DataFrames
using Pandas
......@@ -63,7 +58,7 @@ class LWPLSR:
y_test |> Pandas.DataFrame |> DataFrames.DataFrame
""")
print('LWPLSR - tuning')
# set tuning parameters
# set tuning parameters to test
jl.seval("""
nlvdis = [5; 10; 15] ; metric = [:eucl; :mah]
h = [1; 2; 6; Inf] ; k = [30; 80; 200]
......@@ -83,21 +78,22 @@ class LWPLSR:
ncal = ntrain - nval
""")
# Create LWPLSR model and tune
# Create LWPLSR model and tune with GridScore
jl.seval("""
mod = Jchemo.model(Jchemo.lwplsr)
res = gridscore(mod, Xcal, ycal, Xval, yval; score = Jchemo.rmsep, pars, nlv, verbose = false)
u = findall(res.y1 .== minimum(res.y1))[1] #best parameters combination
""")
# save best lwplsr parameters
self.best_lwplsr_params = {'nlvdis' : jl.res.nlvdis[jl.u], 'metric' : str(jl.res.metric[jl.u]), 'h' : jl.res.h[jl.u], 'k' : jl.res.k[jl.u], 'nlv' : jl.res.nlv[jl.u]}
print('best lwplsr params ' + str(self.best_lwplsr_params))
print('LWPLSR - best params ok')
# calculate LWPLSR model with best parameters
# run LWPLSR model with best parameters
jl.seval("""
mod = Jchemo.model(Jchemo.lwplsr; nlvdis = res.nlvdis[u], metric = res.metric[u], h = res.h[u], k = res.k[u], nlv = res.nlv[u])
# Fit model
Jchemo.fit!(mod, x_train, y_train)
""")
# save Julia Jchemo model
self.mod = jl.mod
def Jchemo_lwplsr_predict(self):
......@@ -128,14 +124,13 @@ class LWPLSR:
print('LWPLSR - end')
def Jchemo_lwplsr_cv(self):
"""Send data to Julia to predict with lwplsr.
"""Send Cross-Validation data to Julia to fit & predict with lwplsr.
Args:
self.mod (Julia model): the prepared model
self.best_lwplsr_params: the best parameters to use (from tuning) for CV
self.xtr_fold1 (DataFrame):
self.ytr_fold1 (DataFrame):
self.xte_fold1 (DataFrame):
self.yte_fold1 (DataFrame):
Returns:
self.pred_cv (Julia DataFrame): predicted values on x_train with Cross-Validation
......@@ -144,7 +139,7 @@ class LWPLSR:
jl.Xtr = getattr(self, "xtr_fold"+str(i+1))
jl.Ytr = getattr(self, "ytr_fold"+str(i+1))
jl.Xte = getattr(self, "xte_fold"+str(i+1))
# jl.Yte = getattr(self, "yte_fold"+str(i+1))
# convert Python Pandas DataFrame to Julia DataFrame
jl.seval("""
using DataFrames
using Pandas
......@@ -153,6 +148,7 @@ class LWPLSR:
Ytr |> Pandas.DataFrame |> DataFrames.DataFrame
Xte |> Pandas.DataFrame |> DataFrames.DataFrame
""")
# set lwplsr parameters as the best one from tuning
jl.nlvdis = int(self.best_lwplsr_params['nlvdis'])
jl.metric = self.best_lwplsr_params['metric']
jl.h = self.best_lwplsr_params['h']
......@@ -169,15 +165,14 @@ class LWPLSR:
res = Jchemo.predict(mod_cv, Xte)
res.pred
""")
# save predicted values for each KFold in the predicted_results dictionary
self.predicted_results["CV" + str(i+1)] = pd.DataFrame(pred_cv)
@property
def pred_data_(self):
# convert predicted data from x_test to Pandas DataFrame
self.predicted_results_on_test = pd.DataFrame(self.pred_test)
self.predicted_results_on_train = pd.DataFrame(self.pred_train)
self.predicted_results["pred_data_train"] = self.predicted_results_on_train
self.predicted_results["pred_data_test"] = self.predicted_results_on_test
self.predicted_results["pred_data_train"] = pd.DataFrame(self.pred_train)
self.predicted_results["pred_data_test"] = pd.DataFrame(self.pred_test)
return self.predicted_results
@property
......
......@@ -7,35 +7,42 @@ import os
# Driver script: read the CSV inputs found in temp/, run the LWPLSR
# fit / predict / cross-validation steps, and export the results to
# temp/lwplsr_outputs.json.
temp_path = Path("temp/")
data_to_work_with = ['x_train_np', 'y_train_np', 'x_test_np', 'y_test_np']
# check data for cross-validation depending on KFold number.
# os.listdir() returns entries in arbitrary order, while LWPLSR.__init__ picks
# the fold arrays out of `dataset` by position (xte, xtr, yte, ytr blocks, i.e.
# alphabetical file-name order). Sorting makes that order deterministic.
temp_files_list = sorted(os.listdir(temp_path))
nb_fold = 0
for file_name in temp_files_list:
    if 'fold' in file_name and file_name.endswith('.csv'):
        # add CV file name (without its .csv extension) to data_to_work_with
        data_to_work_with.append(Path(file_name).stem)
        # and count the number of KFold files (4 files per fold)
        nb_fold += 1
# Import data from csv files in the temp/ folder
dataset = [
    np.genfromtxt(temp_path / (name + ".csv"), delimiter=',')
    for name in data_to_work_with
]
print('CSV imported')
# launch LWPLSR Class from LWPLSR_.py in Class_Mod
print('start model creation')
Reg = LWPLSR(dataset)
print('model created. \nnow fit')
Reg.Jchemo_lwplsr_fit()
print('now predict')
Reg.Jchemo_lwplsr_predict()
print('now CV')
Reg.Jchemo_lwplsr_cv()
# Export results in a json file to bring data back to 2-model_creation.py and streamlit interface
print('export to json')
pred = ['pred_data_train', 'pred_data_test']
# add KFold results to predicted data (each fold contributes 4 csv files)
for fold_number in range(nb_fold // 4):
    pred.append("CV" + str(fold_number + 1))
# read the property once instead of once per exported key
predicted_results = Reg.pred_data_
json_export = {key: predicted_results[key].to_dict() for key in pred}
# add the lwplsr global model to the json
json_export['model'] = str(Reg.model_)
# add the best parameters for the lwplsr obtained from GridScore tuning
json_export['best_lwplsr_params'] = Reg.best_lwplsr_params_
with open(temp_path / "lwplsr_outputs.json", "w") as outfile:
    json.dump(json_export, outfile)
......@@ -115,19 +115,23 @@ class Plsr(Regmodel):
x2 = [savgol_filter(x1[i], polyorder=params['polyorder'], deriv=params['deriv'], window_length = params['window_length']) for i in range(2)]
Model = PLSRegression(scale = False, n_components = params['n_components'])
self._cv_df = KF_CV().process(model = Model, x = x2[0], y = self._ytrain, n_folds = self._nfolds)
self._cv_df['Average'] = self._cv_df.mean(axis = 1)
self._cv_df['S'] = self._cv_df.std(axis = 1)
self._cv_df['CV(%)'] = self._cv_df['S'] * 100 / self._cv_df['Average']
self._cv_df = self._cv_df.T.round(2)
score = self._cv_df.loc["CV(%)",'rmse']
# self._cv_df = KF_CV().process(model = Model, x = x2[0], y = self._ytrain, n_folds = self._nfolds)
# self._cv_df['Average'] = self._cv_df.mean(axis = 1)
# self._cv_df['S'] = self._cv_df.std(axis = 1)
# self._cv_df['CV(%)'] = self._cv_df['S'] * 100 / self._cv_df['Average']
# self._cv_df = self._cv_df.T.round(2)
folds = KF_CV().CV(x = x2[0], y = np.array(self._ytrain), n_folds = self._nfolds)
yp = KF_CV().cross_val_predictor(model = Model, folds = folds, x = x2[0], y = np.array(self._ytrain))
self._cv_df = KF_CV().metrics_cv(y = np.array(self._ytrain), ypcv = yp, folds =folds)[1]
score = self._cv_df.loc["cv",'rmse']
Model = PLSRegression(scale = False, n_components = params['n_components'])
Model.fit(x2[0], self._ytrain)
if self.SCORE > score:
self.SCORE = score
self._ycv = KF_CV().cross_val_predictor(model = Model, x = x2[0], y = self._ytrain, n_folds = self._nfolds)
self._ycv = KF_CV().meas_pred_eq(y = np.array(self._ytrain), ypcv=yp, folds=folds)
self._yc = Model.predict(x2[0])
self._yt = Model.predict(x2[1])
self._model = Model
......@@ -179,26 +183,29 @@ class TpeIpls(Regmodel):
# print(x2)
# ## Modelling
folds = KF_CV().CV(x = x2[0], y = np.array(self._ytrain), n_folds = self._nfolds)
try:
Model = PLSRegression(scale = False, n_components = params['n_components'])
self._cv_df = KF_CV().process(model = Model, x = x2[0][:,id], y = self._ytrain, n_folds = self._nfolds)
yp = KF_CV().cross_val_predictor(model = Model, folds = folds, x = x2[0], y = np.array(self._ytrain))
self._cv_df = KF_CV().metrics_cv(y = np.array(self._ytrain), ypcv = yp, folds =folds)[1]
except ValueError as ve:
params["n_components"] = 1
Model = PLSRegression(scale = False, n_components = params['n_components'])
self._cv_df = KF_CV().process(model = Model, x = x2[0][:,id], y = self._ytrain, n_folds = self._nfolds)
self._cv_df['Average'] = self._cv_df.mean(axis = 1)
self._cv_df['S'] = self._cv_df.std(axis = 1)
self._cv_df['CV(%)'] = self._cv_df['S'] * 100 / self._cv_df['Average']
self._cv_df = self._cv_df.T.round(2)
score = self._cv_df.loc['CV(%)','rmse']
yp = KF_CV().cross_val_predictor(model = Model, folds = folds, x = x2[0], y = np.array(self._ytrain))
self._cv_df = KF_CV().metrics_cv(y = np.array(self._ytrain), ypcv = yp, folds =folds)[1]
# self._cv_df['Average'] = self._cv_df.mean(axis = 1)
# self._cv_df['S'] = self._cv_df.std(axis = 1)
# self._cv_df['CV(%)'] = self._cv_df['S'] * 100 / self._cv_df['Average']
# self._cv_df = self._cv_df.T.round(2)
score = self._cv_df.loc['cv','rmse']
Model = PLSRegression(scale = False, n_components = params['n_components'])
Model.fit(x2[0][:,id], self._ytrain)
if self.SCORE > score:
self.SCORE = score
self._ycv = KF_CV().cross_val_predictor(model = Model, x = x2[0][:,id], y = self._ytrain, n_folds = self._nfolds)
self._ycv = KF_CV().meas_pred_eq(y = np.array(self._ytrain), ypcv=yp, folds=folds)
self._yc = Model.predict(x2[0][:,id])
self._yt = Model.predict(x2[1][:,id])
self._model = Model
......
......@@ -38,18 +38,11 @@ M9 = st.container()
M9.write("-- Save the model --")
##############################################################################################
reg_algo = ["","Full-PLSR", "Locally Weighted PLSR", "Interval-PLSR"]
regression_algo = None
####################################### ###########################################
files_format = ['.csv', '.dx']
file = M00.radio('Select files format:', options = files_format)
### Data
spectra = pd.DataFrame()
y = pd.DataFrame()
# load .csv file
if file == files_format[0]:
xcal_csv = M00.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
......@@ -60,6 +53,8 @@ if file == files_format[0]:
options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+xcal_csv.name))), key=1)
if hdrx == "yes": col = 0
else: col = False
else:
M00.warning('Insert your spectral data file here!')
ycal_csv = M00.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
if ycal_csv:
......@@ -67,6 +62,8 @@ if file == files_format[0]:
hdry = M00.radio("samples name (Y file)?: ", options=["no", "yes"], key=3)
if hdry == "yes": col = 0
else: col = False
else:
M00.warning('Insert your target data file here!')
if xcal_csv and ycal_csv:
file_name = str(xcal_csv.name) +' and '+ str(ycal_csv.name)
......@@ -93,17 +90,14 @@ if file == files_format[0]:
spectra = pd.DataFrame
else:
M1.warning('Tune decimal and separator parameters')
M00.error('Error: The data has not been loaded successfully, please consider tuning the decimal and separator !')
## Load .dx file
elif file == files_format[1]:
data_file = M00.file_uploader("Select Data", type=".dx", help=" :mushroom: select a dx file")
if data_file:
if not data_file:
M00.warning('Load your file here!')
else :
file_name = str(data_file.name)
with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
tmp.write(data_file.read())
......@@ -116,7 +110,7 @@ elif file == files_format[1]:
y = chem_data.loc[:,yname].loc[measured]
spectra = spectra.loc[measured]
else:
M00.warning('Warning: Chemical data are not included in your file !', icon="⚠️")
M00.warning('Warning: your file includes no target variables to model !', icon="⚠️")
os.unlink(tmp_path)
### split the data
......@@ -157,27 +151,36 @@ if not spectra.empty and not y.empty:
M0.write('Loaded data summary')
M0.write(pd.DataFrame([desc_stats(y_train),desc_stats(y_test),desc_stats(y)], index =['Train', 'Test', 'Total'] ).round(2))
stats=pd.DataFrame([desc_stats(y_train),desc_stats(y_test),desc_stats(y)], index =['Train', 'Test', 'Total'] ).round(2)
M0.write(pd.DataFrame([desc_stats(y_train),desc_stats(y_test),desc_stats(y)], index =['train', 'test', 'total'] ).round(2))
stats=pd.DataFrame([desc_stats(y_train),desc_stats(y_test),desc_stats(y)], index =['train', 'test', 'total'] ).round(2)
####################################### Insight into the loaded data
#######################################
####################################### Model creation ###################################################
reg_algo = ["","Full-PLSR", "Locally Weighted PLSR", "Interval-PLSR"]
regression_algo = None
Reg = None
regression_algo = M1.selectbox("Choose the algorithm for regression", options= reg_algo, key = 12, placeholder ="Choose an option")
# split train data into nb_folds for cross_validation
nb_folds = 3
folds = KF_CV.CV(X_train, y_train, nb_folds)
if not regression_algo:
M1.warning('Choose a modelling algorithm from the dropdown list !')
if regression_algo == reg_algo[1]:
# Train model with model function from application_functions.py
Reg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter=1)
reg_model = Reg.model_
#M2.dataframe(Pin.pred_data_)
elif regression_algo == reg_algo[2]:
info = M1.info('Starting LWPLSR model creation... Please wait a few minutes.')
# export data to csv for Julia train/test
data_to_work_with = ['x_train_np', 'y_train_np', 'x_test_np', 'y_test_np']
x_train_np, y_train_np, x_test_np, y_test_np = X_train.to_numpy(), y_train.to_numpy(), X_test.to_numpy(), y_test.to_numpy()
# Cross-Validation calculation
nb_folds = 3
st.write('KFold for Cross-Validation = ' + str(nb_folds))
# split train data into nb_folds
folds = KF_CV.CV(x_train_np, y_train_np, nb_folds)
st.write('KFold for Cross-Validation = ' + str(nb_folds))
d = {}
for i in range(nb_folds):
d["xtr_fold{0}".format(i+1)], d["ytr_fold{0}".format(i+1)], d["xte_fold{0}".format(i+1)], d["yte_fold{0}".format(i+1)] = np.delete(x_train_np, folds[list(folds)[i]], axis=0), np.delete(y_train_np, folds[list(folds)[i]], axis=0), x_train_np[folds[list(folds)[i]]], y_train_np[folds[list(folds)[i]]]
......@@ -203,39 +206,60 @@ if not spectra.empty and not y.empty:
Reg_json = json.load(outfile)
# delete csv files
for i in data_to_work_with: os.unlink(temp_path / str(i + ".csv"))
# delete json file after import
# # delete json file after import
os.unlink(temp_path / "lwplsr_outputs.json")
# format result data into Reg object
pred = ['pred_data_train', 'pred_data_test']
pred = ['pred_data_train', 'pred_data_test']### keys of the dict
for i in range(nb_folds):
pred.append("CV" + str(i+1))
Reg = type('obj', (object,), {'model' : Reg_json['model'], 'best_lwplsr_params' : Reg_json['best_lwplsr_params'], 'pred_data_' : [pd.json_normalize(Reg_json[i]) for i in pred]})
pred.append("CV" + str(i+1)) ### add cv folds keys to pred
Reg = type('obj', (object,), {'model_' : Reg_json['model'], 'best_hyperparams_' : Reg_json['best_lwplsr_params'],
'pred_data_' : [pd.json_normalize(Reg_json[i]) for i in pred]})
Reg.CV_results_ = pd.DataFrame()
Reg.cv_data_ = {'YpredCV' : {}, 'idxCV' : {}}
# set indexes to Reg.pred_data (train, test, folds idx)
# # set indexes to Reg.pred_data (train, test, folds idx)
for i in range(len(pred)):
Reg.pred_data_[i] = Reg.pred_data_[i].T.reset_index().drop(columns = ['index'])
if i == 0: # data_train
# Reg.pred_data_[i] = np.array(Reg.pred_data_[i])
Reg.pred_data_[i].index = list(y_train.index)
Reg.pred_data_[i] = Reg.pred_data_[i].iloc[:,0]
elif i == 1: # data_test
# Reg.pred_data_[i] = np.array(Reg.pred_data_[i])
Reg.pred_data_[i].index = list(y_test.index)
else: # CVi
Reg.pred_data_[i] = Reg.pred_data_[i].iloc[:,0]
else:
# CVi
Reg.pred_data_[i].index = folds[list(folds)[i-2]]
Reg.CV_results_ = pd.concat([Reg.CV_results_, Reg.pred_data_[i]])
Reg.cv_data_['YpredCV']['Fold' + str(i-1)] = Reg.pred_data_[i]
Reg.cv_data_['idxCV']['Fold' + str(i-1)] = folds[list(folds)[i-2]]
Reg.CV_results_.sort_index(inplace = True)
Reg.CV_results_.columns = ['Ypredicted_CV']
# if you want to display Reg.cv_data_ containing by fold YpredCV and idxCV
# cv2.json(Reg.cv_data_)
# Display end of modeling message on the interface
info.empty()
# Reg.CV_results_ = pd.concat([Reg.CV_results_, Reg.pred_data_[i]])
Reg.cv_data_['YpredCV']['Fold' + str(i-1)] = np.array(Reg.pred_data_[i]).reshape(-1)
Reg.cv_data_['idxCV']['Fold' + str(i-1)] = np.array(folds[list(folds)[i-2]]).reshape(-1)
#Reg.cv_data_['idxCV'] and folds contains the same data
Reg.CV_results_= KF_CV.metrics_cv(y = y_train, ypcv = Reg.cv_data_['YpredCV'], folds = folds)[1]
# #### cross validation results print
Reg.best_hyperparams_print = Reg.best_hyperparams_
# ## plots
Reg.cv_data_ = KF_CV().meas_pred_eq(y = np.array(y_train), ypcv= Reg.cv_data_['YpredCV'], folds=folds)
# st.write(Reg.cv_data_ )
# # Reg.CV_results_.sort_index(inplace = True)
# # Reg.CV_results_.columns = ['Ypredicted_CV']
# # if you want to display Reg.cv_data_ containing, by fold, YpredCV and idxCV
# # cv2.json(Reg.cv_data_)
# # Display end of modeling message on the interface
# info.empty()
M1.success('Model created!')
except FileNotFoundError as e:
# Display error message on the interface if modeling is wrong
info.empty()
M1.warning('- ERROR during model creation -')
Reg = None
#######################
elif regression_algo == reg_algo[3]:
s = M1.number_input(label='Enter the maximum number of intervals', min_value=1, max_value=6, value=3)
it = M1.number_input(label='Enter the number of iterations', min_value=2, max_value=10, value=3)
......@@ -263,7 +287,8 @@ if not spectra.empty and not y.empty:
################# Model analysis ############
# ###############################################################################################################DDDVVVVVVVVVV
# ################# Model analysis ############
if regression_algo in reg_algo[1:] and Reg is not None:
#M2.write('-- Pretreated data (train) visualization and important spectral regions in the model -- ')
......@@ -311,19 +336,20 @@ if not spectra.empty and not y.empty:
cv_results=pd.DataFrame(Reg.CV_results_)
cv2.write('-- Out-of-Fold Predictions Visualization (All in one) --')
fig1 = px.scatter(Reg.cv_data_[2], x ='Measured', y = 'Predicted' , trendline='ols', color='Folds', symbol="Folds",
fig1 = px.scatter(Reg.cv_data_[0], x ='Measured', y = 'Predicted' , trendline='ols', color='Folds', symbol="Folds",
color_discrete_sequence=px.colors.qualitative.G10)
fig1.add_shape(type='line', x0 = .95 * min(Reg.cv_data_[2].loc[:,'Measured']), x1 = 1.05 * max(Reg.cv_data_[2].loc[:,'Measured']), y0 = .95 * min(Reg.cv_data_[2].loc[:,'Measured']), y1 = 1.05 * max(Reg.cv_data_[2].loc[:,'Measured']), line = dict(color='black', dash = "dash"))
fig1.add_shape(type='line', x0 = .95 * min(Reg.cv_data_[0].loc[:,'Measured']), x1 = 1.05 * max(Reg.cv_data_[0].loc[:,'Measured']),
y0 = .95 * min(Reg.cv_data_[0].loc[:,'Measured']), y1 = 1.05 * max(Reg.cv_data_[0].loc[:,'Measured']), line = dict(color='black', dash = "dash"))
fig1.update_traces(marker_size=7, showlegend=False)
cv2.plotly_chart(fig1, use_container_width=True)
fig0 = px.scatter(Reg.cv_data_[2], x ='Measured', y = 'Predicted' , trendline='ols', color='Folds', symbol="Folds", facet_col = 'Folds',facet_col_wrap=1,
fig0 = px.scatter(Reg.cv_data_[0], x ='Measured', y = 'Predicted' , trendline='ols', color='Folds', symbol="Folds", facet_col = 'Folds',facet_col_wrap=1,
color_discrete_sequence=px.colors.qualitative.G10, text='index', width=800, height=1000)
fig0.update_traces(marker_size=8, showlegend=False)
fig0.write_image("./Report/figures/Allinone.png")
fig0.write_image("./Report/figures/meas_vs_pred_cv_onebyone.png")
cv1.write('-- Out-of-Fold Predictions Visualization (Separate plots) --')
cv1.plotly_chart(fig0, use_container_width=True)
fig1.write_image("./Report/figures/Predictions_V.png")
fig1.write_image("./Report/figures/meas_vs_pred_cv_all.png")
yc = Reg.pred_data_[0]
......@@ -337,10 +363,12 @@ if not spectra.empty and not y.empty:
json.dump(Reg.best_hyperparams_, outfile)
##########
# ##########
M1.write("-- Model performance --")
M1.dataframe(metrics(c = [y_train, yc], t = [y_test, yt], method='regression').scores_)
if regression_algo != "Locally Weighted PLSR":
M1.dataframe(metrics(c = [y_train, yc], t = [y_test, yt], method='regression').scores_)
else:
M1.dataframe(metrics(t = [y_test, yt], method='regression').scores_)
model_per=pd.DataFrame(metrics(c = [y_train, yc], t = [y_test, yt], method='regression').scores_)
#from st_circular_progress import CircularProgress
#my_circular_progress = CircularProgress(label = 'Performance',value = 50, key = 'my performance',
......@@ -348,26 +376,34 @@ if not spectra.empty and not y.empty:
#my_circular_progress.st_circular_progress()
#my_circular_progress.update_value(progress=20)
a = reg_plot([y_train, y_test],[yc, yt], train_idx = train_index, test_idx = test_index)
if regression_algo != "Locally Weighted PLSR":
a = reg_plot([y_train, y_test],[yc, yt], train_idx = train_index, test_idx = test_index)
else:
a = reg_plot([y_train, y_test],[yc, yt], train_idx = train_index, test_idx = test_index)
M7.pyplot(a)
plt.savefig('./Report/figures/Predictedvs.png')
plt.savefig('./Report/figures/measured_vs_predicted.png')
prep_para = Reg.best_hyperparams_
prep_para.pop('n_components')
for i in ['deriv','polyorder']:
if Reg.best_hyperparams_[i] == 0:
prep_para[i] = '0'
elif Reg.best_hyperparams_[i] == 1:
prep_para[i] = '1st'
elif Reg.best_hyperparams_[i] > 1:
prep_para[i] = f"{Reg.best_hyperparams_[i]}nd"
if regression_algo != "Locally Weighted PLSR":
prep_para.pop('n_components')
for i in ['deriv','polyorder']:
if Reg.best_hyperparams_[i] == 0:
prep_para[i] = '0'
elif Reg.best_hyperparams_[i] == 1:
prep_para[i] = '1st'
elif Reg.best_hyperparams_[i] > 1:
prep_para[i] = f"{Reg.best_hyperparams_[i]}nd"
if regression_algo != "Locally Weighted PLSR":
residual_plot = resid_plot([y_train, y_test], [yc, yt], train_idx=train_index, test_idx=test_index)
else:
residual_plot = resid_plot([y_train, y_test], [yt, yt], train_idx=train_index, test_idx=test_index)
residual_plot = resid_plot([y_train, y_test], [yc, yt], train_idx=train_index, test_idx=test_index)
M8.pyplot(residual_plot)
plt.savefig('./Report/figures/residual_plot.png')
rega = Reg.selected_features_ ##### ADD FEATURES IMPORTANCE PLOT
plt.savefig('./Report/figures/residuals_plot.png')
if regression_algo != "Locally Weighted PLSR":
rega = Reg.selected_features_ ##### ADD FEATURES IMPORTANCE PLOT
#model_export = M1.selectbox("Choose way to export", options=["pickle", "joblib"], key=20)
model_name = M9.text_input('Give it a name')
......@@ -413,7 +449,8 @@ if not spectra.empty and not y.empty and regression_algo:
if regression_algo in reg_algo[1:] and Reg is not None:
fig, (ax1, ax2) = plt.subplots(2,1, figsize = (12, 4), sharex=True)
ax1.plot(colnames, np.mean(X_train, axis = 0), color = 'black', label = 'Average spectrum (Raw)')
ax2.plot(colnames, np.mean(Reg.pretreated_spectra_ , axis = 0), color = 'black', label = 'Average spectrum (pretreated)')
if regression_algo != "Locally Weighted PLSR":
ax2.plot(colnames, np.mean(Reg.pretreated_spectra_ , axis = 0), color = 'black', label = 'Average spectrum (pretreated)')
ax2.set_xlabel('Wavelenghts')
plt.tight_layout()
......@@ -443,16 +480,19 @@ if not spectra.empty and not y.empty and regression_algo:
M2.pyplot(fig)
## Load .dx file
if Reg is not None:
with st.container():
if st.button("Download the report"):
if regression_algo == reg_algo[1]:
latex_report = report.report('Predictive model development', file_name, stats, list(Reg.best_hyperparams_.values()), regression_algo, model_per, cv_results)
report.compile_latex()
if regression_algo is None:
st.warning('Data processing has not been performed or finished yet!', icon = "⚠️")
else:
pass
with st.container():
if st.button("Download the report"):
if regression_algo == reg_algo[1]:
latex_report = report.report('Predictive model development', file_name, stats, list(Reg.best_hyperparams_.values()), regression_algo, model_per, cv_results)
report.compile_latex()
if regression_algo is None:
st.warning('Data processing has not been performed or finished yet!', icon = "⚠️")
else:
pass
else:
pass
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment