Newer
Older
from Packages import *
st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
from Modules import *
from Class_Mod.DATA_HANDLING import *
add_sidebar(pages_folder)
repertoire_a_vider = Path('Report/figures')
if os.path.exists(repertoire_a_vider):
for fichier in os.listdir(repertoire_a_vider):
chemin_fichier = repertoire_a_vider / fichier
if os.path.isfile(chemin_fichier) or os.path.islink(chemin_fichier):
os.unlink(chemin_fichier)
elif os.path.isdir(chemin_fichier):
local_css(css_file / "style_model.css")
####################################### page preamble #######################################
st.title("Calibration Model Development") # page title
st.markdown("Create a predictive model, then use it for predicting your target variable (chemical data) from NIRS spectra")
M0, M00 = st.columns([1, .4])
M0.image("C:/Users/diane/Desktop/nirs_workflow/src/images/graphical_abstract.jpg", use_column_width=True) # graphical abstract
####################################### I- Data preparation
files_format = ['.csv', '.dx'] # Supported files format
file = M00.radio('Select files format:', options = files_format) # Select a file format
spectra = pd.DataFrame() # preallocate the spectral data block
y = pd.DataFrame() # preallocate the target(s) data block
match file:
## load .csv file
case '.csv':
# Load X-block data
xcal_csv = M00.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
if xcal_csv:
sepx = M00.radio("Select separator (X file) - _detected_: " + str(find_delimiter('data/'+xcal_csv.name)),
options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+xcal_csv.name))), key=0)
hdrx = M00.radio("samples name (X file)? - _detected_: " + str(find_col_index('data/'+xcal_csv.name)),
options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+xcal_csv.name))), key=1)
match hdrx:
case "yes":
col = 0
case "no":
col = False
else:
M00.warning('Insert your spectral data file here!')
# Load Y-block data
ycal_csv = M00.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
if ycal_csv:
sepy = M00.radio("Select separator (Y file) - _detected_: " + str(find_delimiter('data/'+ycal_csv.name)),
options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+ycal_csv.name))), key=2)
hdry = M00.radio("samples name (Y file)? - _detected_: " + str(find_col_index('data/'+ycal_csv.name)),
options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+ycal_csv.name))), key=3)
else:
M00.warning('Insert your target data file here!')
if xcal_csv and ycal_csv:
file_name = str(xcal_csv.name) +' and '+ str(ycal_csv.name)
xfile = pd.read_csv(xcal_csv, decimal='.', sep=sepx, index_col=col, header=0)
yfile = pd.read_csv(ycal_csv, decimal='.', sep=sepy, index_col=col)
if yfile.shape[1]>0 and xfile.shape[1]>0 :
spectra, meta_data = col_cat(xfile)
chem_data, idx = col_cat(yfile)
if chem_data.shape[1]>1:
yname = M00.selectbox('Select target', options=chem_data.columns)
y = chem_data.loc[:,yname]
else:
y = chem_data.iloc[:,0]
spectra = pd.DataFrame(spectra).astype(float)
# if not meta_data.empty :
# st.write(meta_data)
if spectra.shape[0] != y.shape[0]:
M00.warning('X and Y have different sample size')
y = pd.DataFrame
spectra = pd.DataFrame
M00.error('Error: The data has not been loaded successfully, please consider tuning the decimal and separator !')
## Load .dx file
case '.dx':
data_file = M00.file_uploader("Select Data", type=".dx", help=" :mushroom: select a dx file")
if not data_file:
M00.warning('Load your file here!')
else :
file_name = str(data_file.name)
with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
tmp.write(data_file.read())
tmp_path = tmp.name
chem_data, spectra, meta_data, meta_data_st = read_dx(file = tmp_path)
M00.success("The data have been loaded successfully", icon="✅")
if chem_data.shape[1]>0:
yname = M00.selectbox('Select target', options=chem_data.columns)
measured = chem_data.loc[:,yname] > 0
y = chem_data.loc[:,yname].loc[measured]
spectra = spectra.loc[measured]
else:
M00.warning('Warning: your file includes no target variables to model !', icon="⚠️")
os.unlink(tmp_path)

DIANE
committed
st.header("I - Data visualization", divider='blue')

DIANE
committed
M0, M000 = st.columns([1, .4])
if np.array(spectra.columns).dtype.kind in ['i','f']:
colnames = spectra.columns
else:
colnames = np.arange(spectra.shape[1])
# Split data into training and test sets using the kennard_stone method and correlation metric, 25% of data is used for testing
train_index, test_index = train_test_split_idx(spectra, y = y, method="kennard_stone", metric="correlation", test_size=0.25, random_state=42)
# Assign data to training and test sets
X_train, y_train = pd.DataFrame(spectra.iloc[train_index,:]), y.iloc[train_index]
X_test, y_test = pd.DataFrame(spectra.iloc[test_index,:]), y.iloc[test_index]
fig, ax1 = plt.subplots( figsize = (12,3))
spectra.T.plot(legend=False, ax = ax1, linestyle = '--')
ax1.set_ylabel('Signal intensity')
ax1.margins(0)
plt.tight_layout()
fig.savefig("./Report/figures/spectra_plot.png")
fig, ax2 = plt.subplots(figsize = (12,3))
sns.histplot(y, color="deeppink", kde = True,label="y",ax = ax2, fill=True)
sns.histplot(y_train, color="blue", kde = True,label="y (train)",ax = ax2, fill=True)
sns.histplot(y_test, color="green", kde = True,label="y (test)",ax = ax2, fill=True)
ax2.set_xlabel('y')
plt.legend()
plt.tight_layout()
M0.pyplot(fig)
fig.savefig("./Report/figures/Histogram.png")

DIANE
committed
M000.write('Loaded data summary')
M000.write(pd.DataFrame([desc_stats(y_train),desc_stats(y_test),desc_stats(y)], index =['train', 'test', 'total'] ).round(2))
stats=pd.DataFrame([desc_stats(y_train),desc_stats(y_test),desc_stats(y)], index =['train', 'test', 'total'] ).round(2)
####################################### Model creation ###################################################
regression_algo = None # initialize the selected regression algorithm
Reg = None # initialize the regression model object

DIANE
committed
st.header("II - Model creation", divider='blue')

DIANE
committed
M10, M20, M30, M40, M50 = st.columns([1,1,1,1,1])
# select type of supervised modelling problem
modes = ['regression', 'classification']
mode =M10.radio("Analysis Methods", options=modes)
match mode:
case "regression":
reg_algo = ["","PLS", "LW-PLS", "TPE-iPLS"]
regression_algo = M20.selectbox("Choose the regression algorithm", options= reg_algo, key = 12, format_func=lambda x: x if x else "<Select>")
case 'classification':
reg_algo = ["","PLS", "LW-PLS", "TPE-iPLS"]
regression_algo = M20.selectbox("Choose the classification algorithm", options= reg_algo, key = 12, format_func=lambda x: x if x else "<Select>")

DIANE
committed
nb_folds = 3
folds = KF_CV.CV(X_train, y_train, nb_folds)# split train data into nb_folds for cross_validation
M1, M2 = st.columns([2 ,4])
# Model creation
match regression_algo:
case "":
M20.warning('Choose a modelling algorithm from the dropdown list !')
case "PLS":
Reg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter=1)
case 'LW-PLS':
M20.write(f'K-Fold for Cross-Validation (K = {str(nb_folds)})')
info = M20.info('Starting LWPLSR model creation... Please wait a few minutes.')
# export data to csv for Julia train/test
data_to_work_with = ['x_train_np', 'y_train_np', 'x_test_np', 'y_test_np']
x_train_np, y_train_np, x_test_np, y_test_np = X_train.to_numpy(), y_train.to_numpy(), X_test.to_numpy(), y_test.to_numpy()
# Cross-Validation calculation
d = {}
for i in range(nb_folds):
d["xtr_fold{0}".format(i+1)], d["ytr_fold{0}".format(i+1)], d["xte_fold{0}".format(i+1)], d["yte_fold{0}".format(i+1)] = np.delete(x_train_np, folds[list(folds)[i]], axis=0), np.delete(y_train_np, folds[list(folds)[i]], axis=0), x_train_np[folds[list(folds)[i]]], y_train_np[folds[list(folds)[i]]]
data_to_work_with.append("xtr_fold{0}".format(i+1))
data_to_work_with.append("ytr_fold{0}".format(i+1))
data_to_work_with.append("xte_fold{0}".format(i+1))
data_to_work_with.append("yte_fold{0}".format(i+1))
# check best pre-treatment with a global PLSR model
preReg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter=20)
temp_path = Path('temp/')
with open(temp_path / "lwplsr_preTreatments.json", "w+") as outfile:
json.dump(preReg.best_hyperparams_, outfile)
# export Xtrain, Xtest, Ytrain, Ytest and all CV folds to temp folder as csv files
for i in data_to_work_with:
if 'fold' in i:
j = d[i]
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
j = globals()[i]
np.savetxt(temp_path / str(i + ".csv"), j, delimiter=",")
# run Julia Jchemo as subprocess
import subprocess
subprocess_path = Path("Class_Mod/")
subprocess.run([f"{sys.executable}", subprocess_path / "LWPLSR_Call.py"])
# retrieve json results from Julia JChemo
try:
with open(temp_path / "lwplsr_outputs.json", "r") as outfile:
Reg_json = json.load(outfile)
# delete csv files
for i in data_to_work_with: os.unlink(temp_path / str(i + ".csv"))
# # delete json file after import
os.unlink(temp_path / "lwplsr_outputs.json")
os.unlink(temp_path / "lwplsr_preTreatments.json")
# format result data into Reg object
pred = ['pred_data_train', 'pred_data_test']### keys of the dict
for i in range(nb_folds):
pred.append("CV" + str(i+1)) ### add cv folds keys to pred
Reg = type('obj', (object,), {'model_' : Reg_json['model'], 'best_hyperparams_' : Reg_json['best_lwplsr_params'],
'pred_data_' : [pd.json_normalize(Reg_json[i]) for i in pred]})
reg_model = Reg.model_
Reg.CV_results_ = pd.DataFrame()
Reg.cv_data_ = {'YpredCV' : {}, 'idxCV' : {}}
# # set indexes to Reg.pred_data (train, test, folds idx)
for i in range(len(pred)):
Reg.pred_data_[i] = Reg.pred_data_[i].T.reset_index().drop(columns = ['index'])
if i == 0: # data_train
# Reg.pred_data_[i] = np.array(Reg.pred_data_[i])
Reg.pred_data_[i].index = list(y_train.index)
Reg.pred_data_[i] = Reg.pred_data_[i].iloc[:,0]
elif i == 1: # data_test
# Reg.pred_data_[i] = np.array(Reg.pred_data_[i])
Reg.pred_data_[i].index = list(y_test.index)
Reg.pred_data_[i] = Reg.pred_data_[i].iloc[:,0]
else:
# CVi
Reg.pred_data_[i].index = folds[list(folds)[i-2]]
# Reg.CV_results_ = pd.concat([Reg.CV_results_, Reg.pred_data_[i]])
Reg.cv_data_['YpredCV']['Fold' + str(i-1)] = np.array(Reg.pred_data_[i]).reshape(-1)
Reg.cv_data_['idxCV']['Fold' + str(i-1)] = np.array(folds[list(folds)[i-2]]).reshape(-1)
Reg.CV_results_= KF_CV.metrics_cv(y = y_train, ypcv = Reg.cv_data_['YpredCV'], folds = folds)[1]
#### cross validation results print
Reg.best_hyperparams_print = Reg.best_hyperparams_
## plots
Reg.cv_data_ = KF_CV().meas_pred_eq(y = np.array(y_train), ypcv= Reg.cv_data_['YpredCV'], folds=folds)
Reg.pretreated_spectra_ = preReg.pretreated_spectra_
Reg.best_hyperparams_print = {**preReg.best_hyperparams_, **Reg.best_hyperparams_}
Reg.best_hyperparams_ = {**preReg.best_hyperparams_, **Reg.best_hyperparams_}
info.empty()
M20.success('Model created!')
except FileNotFoundError as e:
# Display error message on the interface if modeling is wrong
info.empty()
M20.warning('- ERROR during model creation -')
Reg = None
for i in data_to_work_with: os.unlink(temp_path / str(i + ".csv"))
case 'TPE-iPLS':
s = M20.number_input(label='Enter the maximum number of intervals', min_value=1, max_value=6, value=3)
it = M20.number_input(label='Enter the number of iterations', min_value=1, max_value=3, value=2)
progress_text = "The model is being created. Please wait."
Reg = TpeIpls(train = [X_train, y_train], test=[X_test, y_test], n_intervall = s, n_iter=it)
pro = M1.info("The model is being created. Please wait!")
pro.empty()
M20.info("The model has successfully been created!")
time.sleep(1)
reg_model = Reg.model_
intervalls = Reg.selected_features_.T
intervalls_with_cols = Reg.selected_features_.T
for i in range(intervalls.shape[0]):
for j in range(intervalls.shape[1]):
intervalls_with_cols.iloc[i,j] = spectra.columns[intervalls.iloc[i,j]]
M2.write('-- Important Spectral regions used for model creation --')
M2.table(intervalls_with_cols)

DIANE
committed
################# Model analysis ############
if not (spectra.empty and y.empty):
if regression_algo in reg_algo[1:] and Reg:
fig, (ax1, ax2) = plt.subplots(2,1, figsize = (12, 6))
fig = make_subplots(rows=3, cols=1, shared_xaxes=True, vertical_spacing=0.02)

DIANE
committed
st.header("Cross-Validation results")
cv1, cv2 = st.columns([2,2])
############
cv2.write('-- Cross-Validation Summary--')
cv2.write(Reg.CV_results_)
cv_results=pd.DataFrame(Reg.CV_results_)
cv2.write('-- Out-of-Fold Predictions Visualization (All in one) --')
fig1 = px.scatter(Reg.cv_data_[0], x ='Measured', y = 'Predicted' , trendline='ols', color='Folds', symbol="Folds",
fig1.add_shape(type='line', x0 = .95 * min(Reg.cv_data_[0].loc[:,'Measured']), x1 = 1.05 * max(Reg.cv_data_[0].loc[:,'Measured']),
y0 = .95 * min(Reg.cv_data_[0].loc[:,'Measured']), y1 = 1.05 * max(Reg.cv_data_[0].loc[:,'Measured']), line = dict(color='black', dash = "dash"))
cv2.plotly_chart(fig1, use_container_width=True)
fig0 = px.scatter(Reg.cv_data_[0], x ='Measured', y = 'Predicted' , trendline='ols', color='Folds', symbol="Folds", facet_col = 'Folds',facet_col_wrap=1,
color_discrete_sequence=px.colors.qualitative.G10, text='index', width=800, height=1000)
fig0.update_traces(marker_size=8, showlegend=False)
fig0.write_image("./Report/figures/meas_vs_pred_cv_onebyone.png")
cv1.write('-- Out-of-Fold Predictions Visualization (Separate plots) --')
cv1.plotly_chart(fig0, use_container_width=True)
fig1.write_image("./Report/figures/meas_vs_pred_cv_all.png")
M1.write('-- Spectral preprocessing info --')
M1.write(Reg.best_hyperparams_print)
with open("data/params/Preprocessing.json", "w") as outfile:
json.dump(Reg.best_hyperparams_, outfile)
# ##########
M1.write("-- Model performance --")

DIANE
committed
if regression_algo != reg_algo[2]:
M1.dataframe(metrics(c = [y_train, yc], t = [y_test, yt], method='regression').scores_)
else:
M1.dataframe(metrics(t = [y_test, yt], method='regression').scores_)
model_per=pd.DataFrame(metrics(c = [y_train, yc], t = [y_test, yt], method='regression').scores_)

DIANE
committed
if regression_algo != reg_algo[2]:
a = reg_plot([y_train, y_test],[yc, yt], train_idx = train_index, test_idx = test_index)
else:
a = reg_plot([y_train, y_test],[yc, yt], train_idx = train_index, test_idx = test_index)

DIANE
committed
st.header("III - Model Diagnosis", divider='blue')
if not spectra.empty and not y.empty:

DIANE
committed
M7, M8 = st.columns([2,2])
M7.write('Predicted vs Measured values')
M8.write('Residuals plot')
plt.savefig('./Report/figures/measured_vs_predicted.png')
prep_para = Reg.best_hyperparams_

DIANE
committed
if regression_algo != reg_algo[2]:
prep_para.pop('n_components')
for i in ['deriv','polyorder']:
if Reg.best_hyperparams_[i] == 0:
prep_para[i] = '0'
elif Reg.best_hyperparams_[i] == 1:
prep_para[i] = '1st'
elif Reg.best_hyperparams_[i] > 1:
prep_para[i] = f"{Reg.best_hyperparams_[i]}nd"

DIANE
committed
if regression_algo != reg_algo[2]:
residual_plot = resid_plot([y_train, y_test], [yc, yt], train_idx=train_index, test_idx=test_index)
else:
residual_plot = resid_plot([y_train, y_test], [yc, yt], train_idx=train_index, test_idx=test_index)
plt.savefig('./Report/figures/residuals_plot.png')

DIANE
committed
if regression_algo != reg_algo[2]:
rega = Reg.selected_features_ ##### ADD FEATURES IMPORTANCE PLOT

DIANE
committed
M9 = st.container()
M9.write("-- Save the model --")
date_time = datetime.datetime.strftime(datetime.date.today(), '_%Y_%m_%d_')
if M9.button('Export Model'):
path = 'data/models/model_'
match file:
case '.csv':
#export_package = __import__(model_export)
with open(path + model_name + date_time + '_created_on_' + xcal_csv.name[:xcal_csv.name.find(".")] +""+
'_and_' + ycal_csv.name[:ycal_csv.name.find(".")] + '_data_' + '.pkl','wb') as f:
joblib.dump(reg_model, f)
if regression_algo == reg_algo[3]:
Reg.selected_features_.T.to_csv(path + model_name + date_time + '_on_' + xcal_csv.name[:xcal_csv.name.find(".")]
+ '_and_' + ycal_csv.name[:ycal_csv.name.find(".")] + '_data_'+'Wavelengths_index.csv', sep = ';')
case '.dx':
#export_package = __import__(model_export)
with open(path + model_name + '_on_'+ data_file.name[:data_file.name.find(".")] + '_data_' + '.pkl','wb') as f:
joblib.dump(reg_model, f)
if regression_algo == reg_algo[3]:
Reg.selected_features_.T.to_csv(path +data_file.name[:data_file.name.find(".")]+ model_name + date_time+ '_on_' + '_data_'+'Wavelengths_index.csv', sep = ';')
st.write('Model Exported ')
pages_folder = Path("pages/")
show_pages(
[Page("app.py", "Home"),
Page(str(pages_folder / "4-inputs.py"), "Inputs"),
Page(str(pages_folder / "1-samples_selection.py"), "Samples Selection"),
Page(str(pages_folder / "2-model_creation.py"), "Models Creation"),
Page(str(pages_folder / "3-prediction.py"), "Predictions"),
]
)
st.page_link('pages\\3-prediction.py', label = 'Keep on keepin\' on to predict your values !')
if not spectra.empty and not y.empty and regression_algo:
fig, (ax1, ax2) = plt.subplots(2,1, figsize = (12, 4), sharex=True)
ax1.plot(colnames, np.mean(X_train, axis = 0), color = 'black', label = 'Average spectrum (Raw)')
# if regression_algo != reg_algo[2]:
ax2.plot(colnames, np.mean(Reg.pretreated_spectra_ , axis = 0), color = 'black', label = 'Average spectrum (Pretreated)')
ax2.set_xlabel('Wavelenghts')
plt.tight_layout()
for i in range(2):
eval(f'ax{i+1}').grid(color='grey', linestyle=':', linewidth=0.2)
eval(f'ax{i+1}').margins(x = 0)
eval(f'ax{i+1}').legend(loc = 'upper right')
eval(f'ax{i+1}').set_ylabel('Intensity')
if regression_algo == reg_algo[3]:
for j in range(s):
if np.array(spectra.columns).dtype.kind in ['i','f']:
min, max = intervalls_with_cols['from'][j], intervalls_with_cols['to'][j]
else:
min, max = intervalls['from'][j], intervalls['to'][j]

DIANE
committed
eval(f'ax{i+1}').axvspan(min, max, color='#00ff00', alpha=0.5, lw=0)
ax1.scatter(colnames[np.array(Reg.sel_ratio_.index)], np.mean(X_train, axis = 0)[np.array(Reg.sel_ratio_.index)],
ax2.scatter(colnames[Reg.sel_ratio_.index], np.mean(Reg.pretreated_spectra_, axis = 0)[np.array(Reg.sel_ratio_.index)],
color = 'red', label = 'Important variables')
ax1.legend()
ax2.legend()
M2.write('-- Visualization of the spectral regions used for model creation --')
fig.savefig("./Report/figures/Variable_importance.png")
M2.pyplot(fig)
######################## Download report ###############################
if Reg:
with st.container():
if st.button("Download the report"):
match regression_algo:
case 'PLS':
latex_report = report.report('Predictive model development', file_name, stats, list(Reg.best_hyperparams_.values()), regression_algo, model_per, cv_results)
case 'LW-PLS':
latex_report = report.report('Predictive model development', file_name, stats,
list({key: Reg.best_hyperparams_[key] for key in ['deriv', 'normalization', 'polyorder', 'window_length'] if key in Reg.best_hyperparams_}.values()), regression_algo, model_per, cv_results)
case 'TPE-iPLS':
latex_report = report.report('Predictive model development', file_name, stats,
list({key: Reg.best_hyperparams_[key] for key in ['deriv', 'normalization', 'polyorder', 'window_length'] if key in Reg.best_hyperparams_}.values()), regression_algo, model_per, cv_results)
case _:
st.warning('Data processing has not been performed or finished yet!', icon = "⚠️")

DIANE
committed
report.compile_latex()