Skip to content
Snippets Groups Projects
Commit 6718fc6f authored by DIANE's avatar DIANE
Browse files

- Wavelength selection successfully incorporated

- Modifications on Model creation pages
- correction of regression metrics
parent 8df9593c
No related branches found
No related tags found
No related merge requests found
......@@ -40,10 +40,10 @@ def resid_plot( meas, pred):
sns.residplot(x = meas[1], y = pred[1], color='red', label = 'CV')
sns.residplot(x = meas[2], y = pred[2], color='green', label = 'Test')
ax.set_ylabel('Residuals')
ax.set_xlabel('Predicted values')
ax.set_xlabel('Measured values')
plt.legend()
# Function that creates a download button - needs the path of the data file to save and the file name to export it as
def download_results(data, export_name):
    """Show a Streamlit download button serving the contents of a file.

    Args:
        data: Path of the file to offer for download; it is opened with
            ``open(data)`` (default text mode).
        export_name: Passed as the third positional argument of
            ``st.download_button`` (the name given to the downloaded file).
    """
    with open(data) as f:
        # Create the button exactly once: emitting the same widget twice with
        # identical arguments makes Streamlit raise a duplicate-widget error.
        st.download_button('Download Results', f, export_name)
......@@ -25,7 +25,7 @@ class PinardPlsr:
pipeline = Pipeline([
('scaler', MinMaxScaler()), # scaling the data
('preprocessing', FeatureUnion(preprocessing)), # preprocessing
('PLS', PLSRegression())])
('PLS', PLSRegression(n_components=14))])
# Estimator including y values scaling
estimator = TransformedTargetRegressor(regressor = pipeline, transformer = MinMaxScaler())
# Training
......
......@@ -17,11 +17,12 @@ class metrics:
@property
def evaluate_(self):
xbar = np.mean(self.meas) # the average of measured values
e2 = np.square(np.subtract(self.meas, self.pred))# the squared error
e = np.subtract(self.meas.ravel(), self.pred.ravel())
e2 = e**2# the squared error
# Sums of squares:
# TOTAL
sst = np.sum((self.meas-xbar)**2)
sst = np.sum((self.meas- xbar)**2)
# RESIDUAL
ssr = np.sum(e2)
# REGRESSION OR MODEL
......@@ -32,7 +33,7 @@ class metrics:
# Compute statistical metrics
metr = pd.DataFrame()
metr['r'] = [np.corrcoef(self.meas.ravel(), self.pred)[0,1]]
metr['r2'] = [ssm/sst]
metr['r2'] = [1-ssr/sst]
metr['rmse'] = [np.sqrt(np.mean(e2))]
metr['mae'] = [np.mean(np.abs(e2))]
metr['rpd'] = [np.std(self.meas)/np.sqrt(np.mean(e2))]
......
from Packages import *
from Class_Mod import metrics
class TpeIpls:
'''
This framework belongs to the family of wavelength selection algorithms. It was introduced as an improvement
......@@ -14,10 +13,10 @@ class TpeIpls:
'''Optimization algorithms can be used to find the subset of variables that optimize a certain criterion
(e.g., maximize predictive performance, minimize overfitting)'''
SCORE = 10000
SCORE = 100000000
index_export = pd.DataFrame()
def __init__(self, x_train, x_test, y_train, y_test, scale, Kfold, n_intervall):
def __init__(self, x_train, x_test, y_train, y_test,
scale, Kfold, n_intervall):
TpeIpls.SCORE = 10000
self.x_train = x_train
self.x_test = x_test
......@@ -27,13 +26,12 @@ class TpeIpls:
self.Kfold = Kfold
self.p = self.x_train.shape[1]
self.n_intervall = n_intervall
self.__n_arrets = self.n_intervall*2
self.PLS_params = {f'v{i}': hp.randint(f'v{i}', 0, self.p) for i in range(1,self.__n_arrets+1)}
self.n_arrets = self.n_intervall*2
self.PLS_params = {f'v{i}': hp.randint(f'v{i}', 0, self.p) for i in range(1,self.n_arrets+1)}
self.PLS_params['n_components'] = hp.randint("n_components", 1, 6)
def _objective(self, params):
self.idx = [params[f'v{i}'] for i in range(1,self.__n_arrets+1)]
def objective(self, params):
self.idx = [params[f'v{i}'] for i in range(1,self.n_arrets+1)]
self.idx.sort()
arrays = [np.arange(self.idx[2*i],self.idx[2*i+1]+1) for i in range(self.n_intervall)]
......@@ -65,72 +63,78 @@ class TpeIpls:
TpeIpls.SCORE = score
self.nlv = params['n_components']
print('--**-------------##---------#~###~#---------##---------------**--')
print(f'***** R²train : [{round(r2c * 100)}]**** R²cv : [{round(r2cv * 100)}]**** R²test : [{round(r2t * 100)}]*****')
print(f'***** N Predictiors : [{len(id)}] ******** NLV : [{params["n_components"]}]*****')
TpeIpls.index_export = pd.DataFrame()
TpeIpls.index_export["Vars"] = self.x_test.columns[id]
TpeIpls.index_export.index = id
# Save model
#TpeIpls.index_export.to_excel(path + 'variables.xlsx')
##3-performance
metrics(train=(self.y_train, yc), cv=(self.y_train, ycv) , test=(self.y_test, yt)).round(2).to_excel(path + "performance.xlsx")
self.segments = arrays
print("''---------------------------- evolution noticed, hence a new model was saved-------------------------------''")
self.idx = self.idx
return score
def tune(self, n_iter):
print('------------------------------------------------ Optimization of the process has started ---------------------------------------------')
##############################################
def BandSelect(self, n_iter):
trials = Trials()
best_params = fmin(fn=self._objective,
best_params = fmin(fn=self.objective,
space=self.PLS_params,
algo=tpe.suggest, # Tree of Parzen Estimators’ (tpe) which is a Bayesian approach
max_evals=n_iter,
trials=trials,
verbose=2)
@property
def segments_(self):
self.bands = {}
ban = {}
for i in range(len(self.segments)):
self.bands[f'band{i+1}'] = [self.segments[i][0], self.segments[i][self.segments[i].shape[0]-1]]
ban[f'band{i+1}'] = [self.segments[i][0], self.segments[i][self.segments[i].shape[0]-1]]
bands = pd.DataFrame(self.bands).T
bands.columns = ['from', 'to']
return bands
self.bands = pd.DataFrame(ban).T
self.bands.columns = ['from', 'to']
@property
def tpe_pls_performance(self):
f = []
for i in range(self.segments_.shape[0]):
f.extend(np.arange(self.segments_["from"][i], self.segments_["to"][i]+1))
for i in range(self.bands.shape[0]):
f.extend(np.arange(self.bands["from"][i], self.bands["to"][i]+1))
variables_idx = list(set(f))
############################################
for i in range(self.bands.shape[0]):
f.extend(np.arange(self.bands["from"][i], self.bands["to"][i]+1))
variables_idx = list(set(f))
pls = PLSRegression(n_components=self.nlv, scale= self.scale)
pls.fit(self.x_train.iloc[:,variables_idx], self.y_train)
self.pls = PLSRegression(n_components=self.nlv, scale= self.scale)
self.pls.fit(self.x_train.iloc[:,variables_idx], self.y_train)
self.yc = pls.predict(self.x_train.iloc[:,variables_idx]).ravel()
self.ycv = cross_val_predict(pls, self.x_train.iloc[:,variables_idx], self.y_train, cv=self.Kfold, n_jobs=-1).ravel()
self.yt = pls.predict(self.x_test.iloc[:,variables_idx]).ravel()
self.yc = self.pls.predict(self.x_train.iloc[:,variables_idx]).ravel()
self.ycv = cross_val_predict(self.pls, self.x_train.iloc[:,variables_idx], self.y_train, cv=self.Kfold, n_jobs=-1).ravel()
self.yt = self.pls.predict(self.x_test.iloc[:,variables_idx]).ravel()
perf = metrics(train=(self.y_train, self.yc), cv=(self.y_train, self.ycv) , test=(self.y_test, self.yt)).round(2)
return self.bands, variables_idx
return perf
@property
def model_(self):
return self.pls
@property
def metrics_(self):
metc = metrics(self.y_train, self.yc)
metc = metc.evaluate_
metcv = metrics(self.y_train, self.ycv)
metcv = metcv.evaluate_
mett = metrics( self.y_test, self.yt)
mett = mett.evaluate_
met = pd.concat([metc, metcv, mett], axis = 0)
met.index = ['calib','cv','test']
return met
@property
def meas_vs_pred(self):
fig, ax = plt.subplots()
sns.regplot(x = self.y_train ,y = self.yc, ax = ax)
sns.regplot(x = self.y_train ,y = self.ycv,ax = ax)
sns.regplot(x = self.y_test,y = self.yt,ax = ax)
plt.show()
\ No newline at end of file
def pred_data_(self):
return self.yc, self.ycv, self.yt
\ No newline at end of file
......@@ -57,4 +57,7 @@ import joblib
# import pickle as pkl
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, anneal
st.set_option('deprecation.showPyplotGlobalUse', False)
......@@ -3,24 +3,36 @@ st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
from Modules import *
from Class_Mod.DATA_HANDLING import *
def nn(x):
    """Return True when *x* is not None, False otherwise."""
    return x is not None
########################################################################################
# Model creation module
container2 = st.container(border=True)
reg_algo = ["","Full-PLS", "Locally Weighted PLS", "Interval-PLS"]
# Model creation module
st.header("Calibration Model Development", divider='blue')
st.write("Create a predictive model, then use it for predicting your target variable(chemical values) from NIRS spectra")
M1, M2, M3 = st.columns([2,2,2])
M1.write("-- Performance metrics --")
M4, M5 = st.columns([6,2])
container3 = st.container(border=True)
st.write("---")
st.header("Model Diagnosis", divider='blue')
M7, M8 = st.columns([2,2])
M7.write('Predicted vs Measured values')
M8.write('Residuals plot')
M9, M10 = st.columns([2,2])
M9.write("-- Save the model --")
available_regression_algo = ["","SciKitLearn PLSR", "Jchemo Local Weighted PLSR", "Intervalle Selection PLSR"]
with container2:
st.header("Calibration Model Development", divider='blue')
st.write("Create a predictive model, then use it for predicting your target variable(chemical values) from NIRS spectra")
# CSV files loader
xcal_csv = M3.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
ycal_csv = M3.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
# CSV files loader
xcal_csv = M3.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
ycal_csv = M3.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
if xcal_csv is not None and ycal_csv is not None:
if xcal_csv is not None and ycal_csv is not None:
# Select list for CSV delimiter
sep = M3.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+xcal_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+xcal_csv.name))), key=0)
# Select list for CSV header True / False
......@@ -29,61 +41,55 @@ with container2:
col = 0
else:
col = False
rd_seed = M1.slider("Choose seed", min_value=1, max_value=1212, value=42, format="%i")
rd_seed = M1.slider("Change Train-test split", min_value=1, max_value=1212, value=42, format="%i")
x, y = utils.load_csv(xcal_csv, ycal_csv, autoremove_na=True, sep=sep, x_hdr=0, y_hdr=0, x_index_col=col, y_index_col=col)
# Split data into training and test sets using the kennard_stone method and correlation metric, 25% of data is used for testing
train_index, test_index = train_test_split_idx(x, y=y, method="kennard_stone", metric="correlation", test_size=0.25, random_state=rd_seed)
# Assign data to training and test sets
X_train, y_train, X_test, y_test = pd.DataFrame(x[train_index]), pd.DataFrame(y[train_index]), pd.DataFrame(x[test_index]), pd.DataFrame(y[test_index])
#############################
y_train = y_train.iloc[:,0]
y_test = y_test.iloc[:,0]
regression_algo = M1.selectbox("Choose the algorithm for regression", options=available_regression_algo, key = 12)
if regression_algo == 'SciKitLearn PLSR':
############################# Regression modelling ##########################################
regression_algo = M1.selectbox("Choose the algorithm for regression", options=reg_algo, key = 12)
if regression_algo == reg_algo[1]:
# Train model with model function from application_functions.py
Reg = PinardPlsr(x_train=X_train, x_test=X_test,y_train=y_train, y_test=y_test)
reg_model = Reg.model_
#M2.dataframe(Pin.pred_data_)
elif regression_algo == 'Jchemo Local Weighted PLSR':
elif regression_algo == reg_algo[2]:
reg_model = model_LWPLSR(xcal_csv, ycal_csv, sep, hdr)
elif regression_algo == "Intervalle Selection PLSR":
elif regression_algo == reg_algo[3]:
s = M2.number_input(label='Enter the maximum number of intervalls', min_value=1, max_value=6, value="min")
reg_model = TpeIpls(x_train= X_train, y_train= y_train, x_test=X_test, y_test= y_test,Kfold= 3,scale= True, n_intervall = 3)
reg_model.tune(n_iter=10)
if regression_algo in ["SciKitLearn PLSR", "Jchemo Local Weighted PLSR", "Intervalle Selection PLSR"]:
with container3:
st.header("Model Diagnosis", divider='blue')
yc = Reg.pred_data_[0]
ycv = Reg.pred_data_[1]
yt = Reg.pred_data_[2]
M7.write('Predicted vs Measured values')
M7.pyplot(reg_plot([y_train, y_train, y_test],[yc, ycv, yt]))
M8.write('Residuals plot')
M8.pyplot(resid_plot([y_train, y_train, y_test],[yc, ycv, yt]))
# Export the model with pickle or joblib
if regression_algo != '':
M1.write("-- Performance metrics --")
it = M2.number_input(label='Enter the maximum number of iteration', min_value=50, max_value=1000, value="min")
Reg = TpeIpls(x_train = X_train, x_test=X_test, y_train = y_train, y_test = y_test, scale = False, Kfold = 3, n_intervall = 6)
rega = Reg.BandSelect(n_iter=it)
reg_model = Reg.model_
################# Model analysis ############
if regression_algo in reg_algo[1:]:
yc = Reg.pred_data_[0]
ycv = Reg.pred_data_[1]
yt = Reg.pred_data_[2]
M7.pyplot(reg_plot([y_train, y_train, y_test],[yc, ycv, yt]))
M8.pyplot(resid_plot([y_train, y_train, y_test],[yc, ycv, yt]))
M1.dataframe(Reg.metrics_)
M1.write("-- Save the model --")
#model_export = M1.selectbox("Choose way to export", options=["pickle", "joblib"], key=20)
model_name = M1.text_input('Give it a name')
if M1.button('Export Model'):
model_name = M9.text_input('Give it a name')
if M9.button('Export Model'):
#export_package = __import__(model_export)
with open('data/models/model_' + model_name + '_on_' + xcal_csv.name + '_and_' + ycal_csv.name + '_data_' + '.pkl','wb') as f:
joblib.dump(reg_model,f)
joblib.dump(reg_model, f)
st.write('Model Exported')
# create a report with information on the model
## see https://stackoverflow.com/a/59578663
#M4.pyplot(reg_plot(meas==(ycal_csv,ycal_csv,ycal_csv], pred=[ycal_csv,ycal_csv,ycal_csv]))
# graphical delimiter
st.write("---")
#M4.pyplot(reg_plot(meas==(ycal_csv,ycal_csv,ycal_csv], pred=[ycal_csv,ycal_csv,ycal_csv]))
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment