Skip to content
Snippets Groups Projects
Commit 2ec31fd7 authored by DIANE's avatar DIANE
Browse files

- Cross validation methods

- Implementing CV and visualizing the results
- Organization of the regression algorithm classes
parent ac1a9cbd
No related branches found
No related tags found
No related merge requests found
from Packages import *
from .Evaluation_Metrics import metrics
## try to automatically detect the field separator within the CSV
def find_delimiter(filename):
......@@ -62,24 +63,17 @@ def MinMaxScale(X):
sc = pd.DataFrame(sk.fit_transform(X), index = t.index, columns = t.columns)
return sc
######################################## Spectral preprocessing
def Detrend(X):
    """Return X with a linear trend removed along its last axis.

    Thin wrapper around ``detrend`` called with ``type='linear'``, ``bp=0``
    and ``overwrite_data=False``.
    """
    return detrend(X, type='linear', axis=-1, bp=0, overwrite_data=False)
def Sg11(X):
    """Apply a Savitzky-Golay filter to X and return its first derivative.

    The filter is fixed to a 7-point window, polynomial order 1 and
    derivative order 1.
    """
    return savgol_filter(X, window_length=7, polyorder=1, deriv=1)
def Snv(X):
    """Standard Normal Variate: centre and scale every row of X on its own.

    Each row (one spectrum per row) has its own mean subtracted and is then
    divided by its own standard deviation. Previously the global mean and
    standard deviation of the whole table were used, which only shifted and
    rescaled all spectra by the same two constants.

    Parameters
    ----------
    X : pandas.DataFrame
        Rows are samples, columns are variables.

    Returns
    -------
    pandas.DataFrame
        Transformed values with the index and columns of ``X``.

    Notes
    -----
    A row with zero standard deviation yields NaN/inf (no special handling).
    """
    values = np.array(X)
    # keepdims keeps a (n_rows, 1) shape so the statistics broadcast per row
    row_mean = values.mean(axis=1, keepdims=True)
    row_std = values.std(axis=1, keepdims=True)
    scaled = (values - row_mean) / row_std
    return pd.DataFrame(scaled, index=X.index, columns=X.columns)
def Non(X):
def No_transformation(X):
    """Identity preprocessing step: hand back X itself, untouched."""
    unchanged = X
    return unchanged
......@@ -105,18 +99,36 @@ class KF_CV:
y = np.array(y)
yp = {}
folds = CV(x=x, y=y, n_folds=n_folds)### Test index
folds = KF_CV.CV(x=x, y=y, n_folds=n_folds)### Test index
key = list(folds.keys())
for i in range(n_folds):
model.fit(np.delete(x, folds[key[i]], axis=0), np.delete(y, folds[key[i]], axis=0))
yp[key[i]] = model.predict(x[folds[key[i]]]) #### predictions/fold
return yp, folds
cvcv = {}
coeff = {}
for i, Fname in enumerate(folds.keys()):
r = pd.DataFrame()
r['Predicted'] = yp[Fname]
r['Measured'] = y[folds[Fname]]
ols = LinearRegression().fit(pd.DataFrame(y[folds[Fname]]),yp[Fname].reshape(-1,1))
r.index = folds[Fname]
r['Folds'] = [f'{Fname} (Predicted = {np.round(ols.intercept_[0], 2)} + {np.round(ols.coef_[0][0],2)} x Measured'] * r.shape[0]
cvcv[i] = r
coeff[Fname] = [ols.coef_[0][0], ols.intercept_[0]]
data = pd.concat(cvcv, axis = 0)
data['index'] = [data.index[i][1] for i in range(data.shape[0])]
data.index = data['index']
coeff = pd.DataFrame(coeff, index = ['Slope', 'Intercept'])
return yp, folds, data, coeff
### compute metrics for each fold
@staticmethod
def process(model, x, y, n_folds:int):
f, idx = KF_CV.cross_val_predictor(model, x=x,y=y, n_folds=n_folds)
f, idx,_ , _ = KF_CV.cross_val_predictor(model, x=x,y=y, n_folds=n_folds)
e = {}
for i in idx.keys():
e[i] = metrics().reg_(y.iloc[idx[i]],f[i])
......@@ -138,7 +150,7 @@ class KF_CV:
@staticmethod
def ycv(model, x, y, n_folds:int):
    """Assemble the out-of-fold predictions into one vector aligned with y.

    Parameters
    ----------
    model : estimator passed through to ``KF_CV.cross_val_predictor``.
    x, y : predictors and target; ``y.shape[0]`` sets the output length.
    n_folds : int
        Number of cross-validation folds.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``y.shape[0]``; position ``j`` holds the
        prediction made for sample ``j`` by the fold that held it out.
    """
    out_of_fold = np.zeros(y.shape[0])
    # cross_val_predictor returns four values (predictions, fold indexes,
    # per-fold table, regression coefficients); only the first two are used.
    # The former two-value unpacking raised ValueError on that 4-tuple.
    f, idx, _, _ = KF_CV.cross_val_predictor(model, x, y, n_folds)
    for fold_name in f.keys():
        # flatten so a (k, 1) prediction array fits the 1-D slot
        out_of_fold[idx[fold_name]] = np.ravel(f[fold_name])
    return out_of_fold
from Packages import *
from Class_Mod import metrics, Snv, No_transformation, KF_CV
class Regmodel(object):
    """Base class for regression models tuned with ``fmin`` / ``tpe.suggest``.

    The constructor stores the train/test data, builds the search space and
    immediately runs the search by calling
    ``fmin(fn=self.objective, max_evals=n_iter, ...)``. Subclasses must
    override :meth:`objective`; it receives one sampled parameter dict and
    returns the loss to minimise.

    Parameters
    ----------
    train, test : list
        ``[X, y]`` pairs. ``X`` must expose ``.shape`` (rows, columns).
    n_iter : int
        Number of evaluations given to ``fmin`` as ``max_evals``.
    add_hyperparams : dict, optional
        Extra search-space entries merged into the default space.
    nfolds : int
        Number of cross-validation folds made available to subclasses.
    **kwargs
        Accepted for compatibility; not used.

    Notes
    -----
    ``hp``, ``fmin``, ``tpe`` and ``Trials`` are expected to be provided by
    ``from Packages import *`` (presumably hyperopt -- confirm).
    """

    def __init__(self, train : [pd.DataFrame, pd.DataFrame], test : [pd.DataFrame, pd.DataFrame], n_iter, add_hyperparams = None, nfolds = 5, **kwargs):
        # Best (lowest) loss seen so far; subclasses compare against it.
        self.SCORE = 100000000
        self._xc, self._xt, self._ytrain, self._ytest = train[0], test[0], train[1], test[1]
        self._nc, self._nt, self._p = train[0].shape[0], test[0].shape[0], train[0].shape[1]
        self._model, self._best = None, None
        self._yc, self._ycv, self._yt = None, None, None
        # An empty frame instance (previously the DataFrame class itself).
        self._cv_df = pd.DataFrame()
        self._nfolds = nfolds
        self.bands = pd.DataFrame()
        self.important_features = pd.DataFrame()

        # Default search space shared by every subclass.
        self._hyper_params = {'polyorder': hp.choice('polyorder', [0, 1, 2]),
                              'deriv': hp.choice('deriv', [0, 1, 2]),
                              'window_length': hp.choice('window_length', [15, 21, 27, 33]),
                              'scatter': hp.choice('scatter', ['Snv', 'No_transformation'])}
        if add_hyperparams is not None:
            self._hyper_params.update(add_hyperparams)

        # Run the search; results are recorded by objective() on the instance,
        # so the value returned by fmin is not needed here.
        trials = Trials()
        fmin(fn=self.objective,
             space=self._hyper_params,
             algo=tpe.suggest,  # Tree of Parzen Estimators (tpe), a Bayesian approach
             max_evals=n_iter,
             trials=trials,
             verbose=1)

    @property
    def train_data_(self):
        """``[X_train, y_train]`` as given to the constructor."""
        return [self._xc, self._ytrain]

    @property
    def test_data_(self):
        """``[X_test, y_test]`` as given to the constructor."""
        return [self._xt, self._ytest]

    @property
    def get_params_(self):
        """The search space dictionary handed to ``fmin``."""
        return self._hyper_params

    def objective(self, params):
        """Loss function evaluated by ``fmin``; must be overridden.

        Raises
        ------
        NotImplementedError
            Always, in the base class. Previously this silently returned
            ``None``, which only failed later inside the optimiser.
        """
        raise NotImplementedError("subclasses of Regmodel must implement objective(params)")

    @property
    def best_hyperparams(self):
        """Parameter dict stored in ``_best`` by the subclass objective."""
        return self._best

    @property
    def model_(self):
        """Model stored in ``_model`` by the subclass objective."""
        return self._model

    @property
    def pred_data_(self):
        """Tuple ``(train predictions, test predictions)``."""
        return self._yc, self._yt

    @property
    def cv_data_(self):
        """Cross-validation output stored in ``_ycv``."""
        return self._ycv

    @property
    def CV_results_(self):
        """Cross-validation summary table stored in ``_cv_df``."""
        return self._cv_df

    @property
    def important_features_(self):
        """Content of the ``important_features`` attribute."""
        return self.important_features
########################################### #########################################
class Plsr(Regmodel):
    """Full-spectrum PLS regression tuned through the ``Regmodel`` search.

    The inherited search space is extended with ``n_components``
    (``hp.randint('n_components', 2, 20)``). Each trial preprocesses the
    spectra, cross-validates a ``PLSRegression`` and returns the coefficient
    of variation (in %) of the fold RMSE as the loss.
    """

    def __init__(self, train: [pd.DataFrame, pd.DataFrame], test: [pd.DataFrame, pd.DataFrame], n_iter = 10):
        super().__init__(train, test, n_iter, add_hyperparams = {'n_components': hp.randint('n_components', 2,20)})

    def objective(self, params):
        """Evaluate one sampled parameter set and keep it if it is the best.

        Parameters
        ----------
        params : dict
            Must hold 'scatter', 'deriv', 'polyorder', 'window_length' and
            'n_components'. The three filter entries may be rewritten in
            place when the sampled combination is rejected.

        Returns
        -------
        The 'CV(%)' value of the 'rmse' column of the cross-validation
        summary (lower is better).
        """
        ### scatter correction, looked up by name instead of eval()
        scatter = {'Snv': Snv, 'No_transformation': No_transformation}[str(params['scatter'])]
        x1 = [scatter(x) for x in (self._xc, self._xt)]

        ### Savitzky-Golay settings: reject deriv > polyorder or
        ### polyorder > window_length and fall back to the best known
        ### settings, or to (0, 0, 1) when no trial has been kept yet
        a, b, c = params['deriv'], params['polyorder'], params['window_length']
        if a > b or b > c:
            if self._best is not None:
                a, b, c = self._best['deriv'], self._best['polyorder'], self._best['window_length']
            else:
                a, b, c = 0, 0, 1
        params['deriv'], params['polyorder'], params['window_length'] = a, b, c
        x2 = [savgol_filter(x, polyorder=b, deriv=a, window_length=c) for x in x1]

        ### cross-validation summary
        cv_df = KF_CV.process(model = PLSRegression(scale = False, n_components = params['n_components']),
                              x = x2[0], y = self._ytrain, n_folds = self._nfolds)
        # Both statistics are taken over the fold columns only; computing the
        # std after adding 'Average' counted the average as an extra fold.
        average = cv_df.mean(axis = 1)
        spread = cv_df.std(axis = 1)
        cv_df['Average'] = average
        cv_df['S'] = spread
        cv_df['CV(%)'] = spread * 100 / average
        cv_df = cv_df.T.round(2)
        score = cv_df.loc["CV(%)",'rmse']

        if self.SCORE > score:
            self.SCORE = score
            # Keep the summary of the best trial, not of the last one run.
            self._cv_df = cv_df
            Model = PLSRegression(scale = False, n_components = params['n_components'])
            Model.fit(x2[0], self._ytrain)
            self._yc = Model.predict(x2[0])
            self._yt = Model.predict(x2[1])
            # cross_val_predictor refits the estimator it is given on each
            # fold, so it gets its own instance and the stored model stays
            # fitted on the whole training set.
            self._ycv = KF_CV.cross_val_predictor(model = PLSRegression(scale = False, n_components = params['n_components']),
                                                  x = x2[0], y = self._ytrain, n_folds = self._nfolds)
            self._model = Model
            self._best = params
            self.x2 = x2[0]
        return score
############################################ #########################################
class TpeIpls(Regmodel):
    """Interval PLS: the search also selects which column intervals to use.

    On top of the inherited space, ``n_components`` and ``2 * n_intervall``
    integer break points ``v1 .. vN`` (each in ``[0, n_columns)``) are
    sampled. The sorted break points are paired into closed intervals and
    the PLS model is fitted on the union of those columns.

    Attributes set by the search
    ----------------------------
    idx : sorted break points of the last evaluated trial.
    segments : list of index arrays, one per interval, of the best trial.
    bands : DataFrame with rows 'from'/'to' and one column per interval.
    x2 : preprocessed training matrix restricted to the selected columns.
    """

    def __init__(self, train: [pd.DataFrame, pd.DataFrame], test: [pd.DataFrame, pd.DataFrame], n_iter = 10, n_intervall = 5):
        # These must exist before super().__init__(), which runs the search
        # and therefore calls objective().
        self.n_intervall = n_intervall
        self.n_arrets = self.n_intervall*2
        self.bands = pd.DataFrame(index = ['from', 'to'])

        r = {'n_components': hp.randint('n_components', 2,20)}
        r.update({f'v{i}': hp.randint(f'v{i}', 0, train[0].shape[1]) for i in range(1,self.n_arrets+1)})
        super().__init__(train, test, n_iter, add_hyperparams = r)

    def objective(self, params):
        """Evaluate one sampled parameter set and keep it if it is the best.

        Parameters
        ----------
        params : dict
            Must hold 'scatter', 'deriv', 'polyorder', 'window_length',
            'n_components' and 'v1' .. 'v{n_arrets}'. The three filter
            entries may be rewritten in place.

        Returns
        -------
        The 'CV(%)' value of the 'rmse' column of the cross-validation
        summary (lower is better).
        """
        ### wavelength indexes: sorted break points paired into closed intervals
        self.idx = sorted(params[f'v{i}'] for i in range(1, self.n_arrets+1))
        arrays = [np.arange(self.idx[2*i], self.idx[2*i+1]+1) for i in range(self.n_intervall)]
        cols = np.unique(np.concatenate(arrays, axis=0), axis=0)
        # NOTE(review): nothing guarantees len(cols) >= n_components; confirm
        # how PLSRegression behaves when fewer columns are selected.

        ### scatter correction, looked up by name instead of eval()
        scatter = {'Snv': Snv, 'No_transformation': No_transformation}[str(params['scatter'])]
        x1 = [scatter(x) for x in (self._xc, self._xt)]

        ### Savitzky-Golay settings: reject deriv > polyorder or
        ### polyorder > window_length and fall back to the best known
        ### settings, or to (0, 0, 1) when no trial has been kept yet
        a, b, c = params['deriv'], params['polyorder'], params['window_length']
        if a > b or b > c:
            if self._best is not None:
                a, b, c = self._best['deriv'], self._best['polyorder'], self._best['window_length']
            else:
                a, b, c = 0, 0, 1
        params['deriv'], params['polyorder'], params['window_length'] = a, b, c
        x2 = [savgol_filter(x, polyorder=b, deriv=a, window_length=c) for x in x1]
        x_train, x_test = x2[0][:, cols], x2[1][:, cols]

        ### cross-validation summary
        cv_df = KF_CV.process(model = PLSRegression(scale = False, n_components = params['n_components']),
                              x = x_train, y = self._ytrain, n_folds = self._nfolds)
        # Both statistics are taken over the fold columns only; computing the
        # std after adding 'Average' counted the average as an extra fold.
        average = cv_df.mean(axis = 1)
        spread = cv_df.std(axis = 1)
        cv_df['Average'] = average
        cv_df['S'] = spread
        cv_df['CV(%)'] = spread * 100 / average
        cv_df = cv_df.T.round(2)
        score = cv_df.loc['CV(%)','rmse']

        if self.SCORE > score:
            self.SCORE = score
            # Keep the summary of the best trial, not of the last one run.
            self._cv_df = cv_df
            Model = PLSRegression(scale = False, n_components = params['n_components'])
            Model.fit(x_train, self._ytrain)
            self._yc = Model.predict(x_train)
            self._yt = Model.predict(x_test)
            # cross_val_predictor refits the estimator it is given on each
            # fold, so it gets its own instance and the stored model stays
            # fitted on the whole training set.
            self._ycv = KF_CV.cross_val_predictor(model = PLSRegression(scale = False, n_components = params['n_components']),
                                                  x = x_train, y = self._ytrain, n_folds = self._nfolds)
            self._model = Model
            self._best = params
            self.x2 = x_train
            self.segments = arrays
            # first and last column index of every selected interval
            self.bands = pd.DataFrame({f'band{i+1}': [seg[0], seg[-1]] for i, seg in enumerate(arrays)},
                                      index = ['from', 'to'])
        return score
############################################ #########################################
class Pcr(Regmodel):
    """Principal component regression (unfinished: no objective yet).

    The search space is extended with ``n_val`` integer entries
    ``pc1 .. pc{n_val}``, each sampled in ``[0, n_columns)``.

    Fixes over the previous stub: ``super.__init__()`` was missing the call
    parentheses and the arguments, ``self.n_val`` was read without ever
    being set, the search-space dict was built and thrown away, and its keys
    (``pc{i}``) did not match the hyperopt labels (``pc{i+1}``).
    """

    def __init__(self, train: [pd.DataFrame, pd.DataFrame], test: [pd.DataFrame, pd.DataFrame], n_iter = 10, n_val = 5):
        # Must exist before super().__init__(), which runs the search.
        self.n_val = n_val
        r = {f'pc{i}': hp.randint(f'pc{i}', 0, train[0].shape[1]) for i in range(1, self.n_val+1)}
        super().__init__(train, test, n_iter, add_hyperparams = r)

    def objective(self, params):
        """Not written yet; fail loudly instead of returning ``None``."""
        raise NotImplementedError("Pcr.objective is not implemented yet")
\ No newline at end of file
......@@ -7,7 +7,7 @@ from .DATA_HANDLING import *
from .PLSR_ import PinardPlsr
from .LWPLSR_ import LWPLSR
from .Evaluation_Metrics import metrics
from .VarSel import TpeIpls
#from .VarSel import TpeIpls
from .Miscellaneous import resid_plot, reg_plot
from .DxReader import DxRead, read_dx
from .HDBSCAN_Clustering import Hdbscan
......@@ -15,3 +15,4 @@ from .SK_PLSR_ import PlsR
from .PLSR_Preprocess import PlsProcess
from .NMF_ import Nmf
from .Ap import AP
from .RegModels import Plsr, TpeIpls
\ No newline at end of file
from Packages import *
from Class_Mod import PlsR, LinearPCA, Umap, find_col_index, PinardPlsr, Nmf, AP
from Class_Mod import Plsr, LinearPCA, Umap, find_col_index, PinardPlsr, Nmf, AP
from Class_Mod import LWPLSR, list_files, metrics, TpeIpls, reg_plot, resid_plot, Sk_Kmeans, DxRead, Hdbscan, read_dx, PlsProcess
from Class_Mod.DATA_HANDLING import *
from Class_Mod.Miscellaneous import prediction, download_results, plot_spectra, local_css
from style.header import add_header
from Report import report
css_file = Path("style/")
local_css(css_file / "style.css")
\ No newline at end of file
local_css(css_file / "style.css")
# path = os.path.dirname(os.path.abspath(__file__)).replace('\\','/')
# d1 = path.find('/')
# css_file = path[:d1]+'/style'
# st.session_state["interface"] = st.session_state.get('interface')
# if st.session_state["interface"] == 'simple':
# hide_pages("Predictions")
# local_css(css_file +"/style.css")
......@@ -10,6 +10,10 @@ add_header()
st.session_state["interface"] = st.session_state.get('interface')
if st.session_state["interface"] == 'simple':
hide_pages("Predictions")
#path = os.path.dirname(os.path.abspath(__file__)).replace('\\','/')
#css_file = path[:path.find('/pages')]+'/style'
#local_css(css_file +"/style_model.css")
local_css(css_file / "style_model.css")
####################################### page Design #######################################
......@@ -19,13 +23,17 @@ st.header("I - Data visualization", divider='blue')
M0, M00 = st.columns([1, .4])
st.header("II - Model creation", divider='blue')
M1, M2, M3 = st.columns([2,2,2])
st.header("Cross_Validation")
cv1, cv2 = st.columns([2,2])
cv3 = st.container()
st.header("III - Model Diagnosis", divider='blue')
M7, M8 = st.columns([2,2])
M7.write('Predicted vs Measured values')
M8.write('Residuals plot')
M9 = st.container()
M9.write("-- Save the model --")
######################################################################
##############################################################################################
reg_algo = ["","Full-PLSR", "Locally Weighted PLSR", "Interval-PLSR"]
####################################### ###########################################
......@@ -144,8 +152,7 @@ if not spectra.empty and not y.empty:
regression_algo = M1.selectbox("Choose the algorithm for regression", options=reg_algo, key = 12)
if regression_algo == reg_algo[1]:
# Train model with model function from application_functions.py
Reg = PlsProcess(x_train = X_train, x_test = X_test, y_train = y_train, y_test = y_test, scale = False, Kfold=3)
Reg.tune(n_iter=500)
Reg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter=10)
reg_model = Reg.model_
#M2.dataframe(Pin.pred_data_)
elif regression_algo == reg_algo[2]:
......@@ -166,7 +173,7 @@ if not spectra.empty and not y.empty:
Reg = type('obj', (object,), {'model' : pd.json_normalize(Reg_json['model']), 'pred_data_' : [pd.json_normalize(Reg_json[i]) for i in pred]})
for i in range(len(pred)):
Reg.pred_data_[i] = Reg.pred_data_[i].T.reset_index().drop(columns = ['index'])
if i is not 4:
if i != 4:
Reg.pred_data_[i].index = list(y_train.index)
else:
Reg.pred_data_[i].index = list(y_test.index)
......@@ -176,16 +183,15 @@ if not spectra.empty and not y.empty:
it = M1.number_input(label='Enter the number of iterations', min_value=50, max_value=1000, value=100)
progress_text = "The model is being created. Please wait."
Reg = TpeIpls(x_train = X_train, x_test=X_test, y_train = y_train, y_test = y_test, scale = False, Kfold = 3, n_intervall = s)
Reg = TpeIpls(train = [X_train, y_train], test=[X_test, y_test], n_intervall = s, n_iter=it)
pro = M1.progress(0, text="The model is being created. Please wait!")
rega = Reg.BandSelect(n_iter = it)
pro.empty()
M1.progress(100, text = "The model has successfully been created!")
time.sleep(1)
reg_model = Reg.model_
M3.write('-- Spectral regions used for model creation --')
wls = rega[0]
M3.table(wls)
intervalls = Reg.bands.T
M3.table(intervalls)
fig, ax = plt.subplots(figsize = (12, 6))
X_train.mean().plot(ax = ax)
for i in range(s):
......@@ -193,12 +199,13 @@ if not spectra.empty and not y.empty:
num = {'u','i','f','c'}
if np.array(X_train.columns).dtype.kind in num:
plt.plot(X_train.columns, X_train.mean(), color = 'black')
ax.axvspan(X_train.columns[rega[0]['from'][i]], X_train.columns[rega[0]['to'][i]], color='#2a52be', alpha=0.5, lw=0)
ax.axvspan(X_train.columns[intervalls['from'][i]], X_train.columns[intervalls['to'][i]],
color='#2a52be', alpha=0.5, lw=0)
plt.tight_layout()
plt.margins(x = 0)
else:
plt.plot(np.arange(X_train.shape[1]), X_train.mean())
ax.axvspan(rega[0]['from'][i], rega[0]['to'][i], color='#2a52be', alpha=0.5, lw=0)
plt.plot(np.arange(X_train.shape[1]), X_train.mean(), color = 'black')
ax.axvspan(intervalls['from'][i], intervalls['to'][i], color='#2a52be', alpha=0.5, lw=0)
plt.tight_layout()
plt.margins(x = 0)
......@@ -211,8 +218,26 @@ if not spectra.empty and not y.empty:
################# Model analysis ############
if regression_algo in reg_algo[1:]:
cv2.write('-- Cross-Validation Summary--')
cv2.write(Reg.CV_results_)
cv2.write('-- Out-of-Fold Predictions Visualization (All in one) --')
fig1 = px.scatter(Reg.cv_data_[2], x ='Measured', y = 'Predicted' , trendline='ols', color='Folds', symbol="Folds",
color_discrete_sequence=px.colors.qualitative.G10)
fig1.add_shape(type='line', x0 = .95 * min(Reg.cv_data_[2].loc[:,'Measured']), x1 = 1.05 * max(Reg.cv_data_[2].loc[:,'Measured']), y0 = .95 * min(Reg.cv_data_[2].loc[:,'Measured']), y1 = 1.05 * max(Reg.cv_data_[2].loc[:,'Measured']), line = dict(color='black', dash = "dash"))
fig1.update_traces(marker_size=7, showlegend=False)
cv2.plotly_chart(fig1)
fig0 = px.scatter(Reg.cv_data_[2], x ='Measured', y = 'Predicted' , trendline='ols', color='Folds', symbol="Folds", facet_col = 'Folds',facet_col_wrap=1,
color_discrete_sequence=px.colors.qualitative.G10, text='index', width=800, height=1000)
fig0.update_traces(marker_size=8, showlegend=False)
cv1.write('-- Out-of-Fold Predictions Visualization (Separate plots) --')
cv1.plotly_chart(fig0)
yc = Reg.pred_data_[0]
yt = Reg.pred_data_[2]
yt = Reg.pred_data_[1]
#if
M2.write('-- Spectral preprocessing info --')
......@@ -231,7 +256,8 @@ if not spectra.empty and not y.empty:
M7.pyplot(reg_plot([y_train, y_test],[yc, yt], train_idx = train_index, test_idx = test_index))
M8.pyplot(resid_plot([y_train, y_test],[yc, yt], train_idx = train_index, test_idx = test_index))
# rega = Reg.important_features_ ##### ADD FEATURES IMPORTANCE PLOT
#model_export = M1.selectbox("Choose way to export", options=["pickle", "joblib"], key=20)
model_name = M9.text_input('Give it a name')
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment