Newer
Older
from Packages import *
st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
from Modules import *
from Class_Mod.DATA_HANDLING import *
add_sidebar(pages_folder)
repertoire_a_vider = Path('Report/figures')
if os.path.exists(repertoire_a_vider):
for fichier in os.listdir(repertoire_a_vider):
chemin_fichier = repertoire_a_vider / fichier
if os.path.isfile(chemin_fichier) or os.path.islink(chemin_fichier):
os.unlink(chemin_fichier)
elif os.path.isdir(chemin_fichier):
local_css(css_file / "style_model.css")
####################################### page Design #######################################
st.title("Calibration Model Development")
st.markdown("Create a predictive model, then use it for predicting your target variable (chemical data) from NIRS spectra")
st.header("I - Data visualization", divider='blue')
M0, M00 = st.columns([1, .4])
st.header("II - Model creation", divider='blue')
M1, M2 = st.columns([2 ,4])
cv1, cv2 = st.columns([2,2])
cv3 = st.container()
M7, M8 = st.columns([2,2])
M7.write('Predicted vs Measured values')
M8.write('Residuals plot')
M9.write("-- Save the model --")
##############################################################################################
reg_algo = ["","Full-PLSR", "Locally Weighted PLSR", "Interval-PLSR"]
####################################### ###########################################
xcal_csv = M00.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
sepx = M00.radio("Select separator (X file) - _detected_: " + str(find_delimiter('data/'+xcal_csv.name)),
options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+xcal_csv.name))), key=0)
hdrx = M00.radio("samples name (X file)? - _detected_: " + str(find_col_index('data/'+xcal_csv.name)),
options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+xcal_csv.name))), key=1)
if hdrx == "yes": col = 0
else: col = False
ycal_csv = M00.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
sepy = M00.radio("separator (Y file): ", options=[";", ","], key=2)
hdry = M00.radio("samples name (Y file)?: ", options=["no", "yes"], key=3)
xfile = pd.read_csv(xcal_csv, decimal='.', sep=sepx, index_col=col, header=0)
yfile = pd.read_csv(ycal_csv, decimal='.', sep=sepy, index_col=col)
if yfile.shape[1]>0 and xfile.shape[1]>0 :
spectra, meta_data = col_cat(xfile)
y, idx = col_cat(yfile)
if y.shape[1]>1:
yname = M00.selectbox('Select target', options=y.columns)
y = y.loc[:,yname]
else:
y = y.iloc[:,0]
spectra = pd.DataFrame(spectra).astype(float)
if not meta_data.empty :
st.write(meta_data)
M1.warning('Tune decimal and separator parameters')
data_file = M00.file_uploader("Select Data", type=".dx", help=" :mushroom: select a dx file")
if data_file:
with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
tmp.write(data_file.read())
tmp_path = tmp.name
chem_data, spectra, meta_data, meta_data_st = read_dx(file = tmp_path)
M00.success("The data have been loaded successfully", icon="✅")
yname = M00.selectbox('Select target', options=chem_data.columns)
measured = chem_data.loc[:,yname] > 0
y = chem_data.loc[:,yname].loc[measured]
spectra = spectra.loc[measured]
else:
M00.warning('Warning: Chemical data are not included in your file !', icon="⚠️")
os.unlink(tmp_path)
### split the data
if not spectra.empty and not y.empty:
if np.array(spectra.columns).dtype.kind in ['i','f']:
colnames = spectra.columns
else:
colnames = np.arange(spectra.shape[1])
#rd_seed = M1.slider("Customize Train-test split", min_value=1, max_value=100, value=42, format="%i")
# Split data into training and test sets using the kennard_stone method and correlation metric, 25% of data is used for testing
train_index, test_index = train_test_split_idx(spectra, y = y, method="kennard_stone", metric="correlation", test_size=0.25, random_state=42)
# Assign data to training and test sets
X_train, y_train = pd.DataFrame(spectra.iloc[train_index,:]), y.iloc[train_index]
X_test, y_test = pd.DataFrame(spectra.iloc[test_index,:]), y.iloc[test_index]
fig, ax1 = plt.subplots( figsize = (12,3))
spectra.T.plot(legend=False, ax = ax1, linestyle = '--')
ax1.set_ylabel('Signal intensity')
ax1.margins(0)
plt.tight_layout()
fig.savefig("./Report/figures/Spectre_mod.png")
fig, ax2 = plt.subplots(figsize = (12,3))
sns.histplot(y, color="deeppink", kde = True,label="y",ax = ax2, fill=True)
sns.histplot(y_train, color="blue", kde = True,label="y (train)",ax = ax2, fill=True)
sns.histplot(y_test, color="green", kde = True,label="y (test)",ax = ax2, fill=True)
ax2.set_xlabel('y')
plt.legend()
plt.tight_layout()
M0.pyplot(fig)
fig.savefig("./Report/figures/histo.png")
M0.write('Loaded data summary')
M0.write(pd.DataFrame([desc_stats(y_train),desc_stats(y_test),desc_stats(y)], index =['Train', 'Test', 'Total'] ).round(2))
LoDaSum=pd.DataFrame([desc_stats(y_train),desc_stats(y_test),desc_stats(y)], index =['Train', 'Test', 'Total'] ).round(2)
####################################### Insight into the loaded data
#######################################
regression_algo = M1.selectbox("Choose the algorithm for regression", options=reg_algo, key = 12)
if regression_algo == reg_algo[1]:
# Train model with model function from application_functions.py
Reg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter=10)
reg_model = Reg.model_
#M2.dataframe(Pin.pred_data_)
elif regression_algo == reg_algo[2]:
info = M1.info('Starting LWPLSR model creation... Please wait a few minutes.')
# export data to csv for Julia train/test
data_to_work_with = ['x_train_np', 'y_train_np', 'x_test_np', 'y_test_np']
x_train_np, y_train_np, x_test_np, y_test_np = X_train.to_numpy(), y_train.to_numpy(), X_test.to_numpy(), y_test.to_numpy()
# Cross-Validation calculation
nb_folds = 3
st.write('KFold = ' + str(nb_folds))
folds = KF_CV.CV(x_train_np, y_train_np, nb_folds)
d = {}
for i in range(nb_folds):
d["xtr_fold{0}".format(i+1)], d["ytr_fold{0}".format(i+1)], d["xte_fold{0}".format(i+1)], d["yte_fold{0}".format(i+1)] = np.delete(x_train_np, folds[list(folds)[i]], axis=0), np.delete(y_train_np, folds[list(folds)[i]], axis=0), x_train_np[folds[list(folds)[i]]], y_train_np[folds[list(folds)[i]]]
data_to_work_with.append("xtr_fold{0}".format(i+1))
data_to_work_with.append("ytr_fold{0}".format(i+1))
data_to_work_with.append("xte_fold{0}".format(i+1))
data_to_work_with.append("yte_fold{0}".format(i+1))
temp_path = Path('temp/')
for i in data_to_work_with:
if 'fold' in i:
j = d[i]
else:
j = globals()[i]
np.savetxt(temp_path / str(i + ".csv"), j, delimiter=",")
import subprocess
subprocess_path = Path("Class_Mod/")
subprocess.run([f"{sys.executable}", subprocess_path / "LWPLSR_Call.py"])
# retrieve json results from Julia JChemo
try:
with open(temp_path / "lwplsr_outputs.json", "r") as outfile:
Reg_json = json.load(outfile)
for i in data_to_work_with: os.unlink(temp_path / str(i + ".csv"))
os.unlink(temp_path / "lwplsr_outputs.json")
pred = ['pred_data_train', 'pred_data_test']
Reg = type('obj', (object,), {'model' : Reg_json['model'], 'best_lwplsr_params' : Reg_json['best_lwplsr_params'], 'pred_data_' : [pd.json_normalize(Reg_json[i]) for i in pred]})
for i in range(len(pred)):
Reg.pred_data_[i] = Reg.pred_data_[i].T.reset_index().drop(columns = ['index'])
if i != 1: # if not pred_data_test
Reg.pred_data_[i].index = list(y_train.index)
else:
Reg.pred_data_[i].index = list(y_test.index)
Reg.CV_results_ = pd.DataFrame()
Reg.cv_data_ = pd.DataFrame()
info.empty()
M1.success('Model created!')
except FileNotFoundError as e:
info.empty()
M1.warning('- ERROR during model creation -')
Reg = None
s = M1.number_input(label='Enter the maximum number of intervals', min_value=1, max_value=6, value=3)
it = M1.number_input(label='Enter the number of iterations', min_value=2, max_value=10, value=3)
progress_text = "The model is being created. Please wait."
Reg = TpeIpls(train = [X_train, y_train], test=[X_test, y_test], n_intervall = s, n_iter=it)
pro = M1.progress(0, text="The model is being created. Please wait!")
pro.empty()
M1.progress(100, text = "The model has successfully been created!")
time.sleep(1)
reg_model = Reg.model_
M2.write('-- Important Spectral regions used for model creation --')
intervalls = Reg.selected_features_.T
intervalls_with_cols = Reg.selected_features_.T
for i in range(intervalls.shape[0]):
for j in range(intervalls.shape[1]):
intervalls_with_cols.iloc[i,j] = spectra.columns[intervalls.iloc[i,j]]
M2.table(intervalls_with_cols)
# elif regression_algo == reg_algo[4]:
# Reg = PlsR(x_train = X_train, x_test = X_test, y_train = y_train, y_test = y_test)
# reg_model = Reg.model_
################# Model analysis ############
if regression_algo in reg_algo[1:] and Reg is not None:
#M2.write('-- Pretreated data (train) visualization and important spectral regions in the model -- ')
fig, (ax1, ax2) = plt.subplots(2,1, figsize = (12, 6))
fig = make_subplots(rows=3, cols=1, shared_xaxes=True, vertical_spacing=0.02)
# fig.append_trace(go.Scatter(x=[3, 4, 5],
# y=[1000, 1100, 1200],), row=1, col=1)
# fig.append_trace(go.Scatter(x=[2, 3, 4],
# y=[100, 110, 120],), row=2, col=1)
# fig.append_trace(go.Scatter(x=[0, 1, 2],
# y=[10, 11, 12]), row=3, col=1)
# fig.update_layout(height=600, width=600, title_text="Stacked Subplots")
# a = Reg.pretreated_spectra_
# r = pd.concat([y_train, a], axis = 1)
# rr = r.melt("x")
# rr.columns = ['y values', 'x_axis', 'y_axis']
# fig = px.scatter(rr, x = 'x_axis', y = 'y_axis', color_continuous_scale=px.colors.sequential.Viridis, color = 'y values')
# M3.plotly_chart(fig)
# from matplotlib.colors import Normalize
# color_variable = y_train
# norm = Normalize(vmin=color_variable.min(), vmax= color_variable.max())
# cmap = plt.get_cmap('viridis')
# colors = cmap(norm(color_variable.values))
# fig, ax = plt.subplots(figsize = (10,3))
# for i in range(Reg.pretreated_spectra_.shape[0]):
# ax.plot(Reg.pretreated_spectra_.columns, Reg.pretreated_spectra_.iloc[i,:], color = colors[i])
# sm = ScalarMappable(norm = norm, cmap = cmap)
# cbar = plt.colorbar(sm, ax = ax)
# # cbar.set_label('Target range')
# plt.tight_layout()
# htmlfig = mpld3.fig_to_html(fig)
# with M2:
# st.components.v1.html(htmlfig, height=600)
############
cv2.write('-- Cross-Validation Summary--')
cv2.write(Reg.CV_results_)
cv2.write('-- Out-of-Fold Predictions Visualization (All in one) --')
fig1 = px.scatter(Reg.cv_data_[2], x ='Measured', y = 'Predicted' , trendline='ols', color='Folds', symbol="Folds",
color_discrete_sequence=px.colors.qualitative.G10)
fig1.add_shape(type='line', x0 = .95 * min(Reg.cv_data_[2].loc[:,'Measured']), x1 = 1.05 * max(Reg.cv_data_[2].loc[:,'Measured']), y0 = .95 * min(Reg.cv_data_[2].loc[:,'Measured']), y1 = 1.05 * max(Reg.cv_data_[2].loc[:,'Measured']), line = dict(color='black', dash = "dash"))
fig1.update_traces(marker_size=7, showlegend=False)
cv2.plotly_chart(fig1, use_container_width=True)
fig0 = px.scatter(Reg.cv_data_[2], x ='Measured', y = 'Predicted' , trendline='ols', color='Folds', symbol="Folds", facet_col = 'Folds',facet_col_wrap=1,
color_discrete_sequence=px.colors.qualitative.G10, text='index', width=800, height=1000)
fig0.update_traces(marker_size=8, showlegend=False)
fig0.write_image("./Report/figures/Allinone.png")
cv1.write('-- Out-of-Fold Predictions Visualization (Separate plots) --')
cv1.plotly_chart(fig0, use_container_width=True)
fig0.write_image("./Report/figures/Predictions_V.png")
M1.write('-- Spectral preprocessing info --')
M1.write(Reg.best_hyperparams_print)
with open("data/params/Preprocessing.json", "w") as outfile:
json.dump(Reg.best_hyperparams_, outfile)
M1.write("-- Model performance --")
M1.dataframe(metrics(c = [y_train, yc], t = [y_test, yt], method='regression').scores_)
model_per=pd.DataFrame(metrics(c = [y_train, yc], t = [y_test, yt], method='regression').scores_)
#from st_circular_progress import CircularProgress
#my_circular_progress = CircularProgress(label = 'Performance',value = 50, key = 'my performance',
# size = "medium", track_color = "black", color = "blue")
#my_circular_progress.st_circular_progress()
#my_circular_progress.update_value(progress=20)
a=reg_plot([y_train, y_test],[yc, yt], train_idx = train_index, test_idx = test_index)
M7.pyplot(a)
plt.savefig('./Report/figures/Predictedvs.png')
residual_plot = resid_plot([y_train, y_test], [yc, yt], train_idx=train_index, test_idx=test_index)
M8.pyplot(residual_plot)
plt.savefig('./Report/figures/residual_plot.png')
rega = Reg.selected_features_ ##### ADD FEATURES IMPORTANCE PLOT
#model_export = M1.selectbox("Choose way to export", options=["pickle", "joblib"], key=20)
date_time = datetime.datetime.strftime(datetime.date.today(), '_%Y_%m_%d_')
if M9.button('Export Model'):
path = 'data/models/model_'
if file == files_format[0]:
#export_package = __import__(model_export)
with open(path + model_name + date_time + '_created_on_' + xcal_csv.name[:xcal_csv.name.find(".")] +""+
'_and_' + ycal_csv.name[:ycal_csv.name.find(".")] + '_data_' + '.pkl','wb') as f:
Reg.selected_features_.T.to_csv(path + model_name + date_time + '_on_' + xcal_csv.name[:xcal_csv.name.find(".")]
+ '_and_' + ycal_csv.name[:ycal_csv.name.find(".")] + '_data_'+'Wavelengths_index.csv', sep = ';')
elif file == files_format[1]:
#export_package = __import__(model_export)
with open(path + model_name + '_on_'+ data_file.name[:data_file.name.find(".")] + '_data_' + '.pkl','wb') as f:
joblib.dump(reg_model, f)
if regression_algo == reg_algo[3]:
Reg.selected_features_.T.to_csv(path +data_file.name[:data_file.name.find(".")]+ model_name + date_time+ '_on_' + '_data_'+'Wavelengths_index.csv', sep = ';')
# create a report with information on the model
## see https://stackoverflow.com/a/59578663
pages_folder = Path("pages/")
show_pages(
[Page("app.py", "Home"),
Page(str(pages_folder / "4-inputs.py"), "Inputs"),
Page(str(pages_folder / "1-samples_selection.py"), "Samples Selection"),
Page(str(pages_folder / "2-model_creation.py"), "Models Creation"),
Page(str(pages_folder / "3-prediction.py"), "Predictions"),
]
)
st.page_link('pages\\3-prediction.py', label = 'Keep on keepin\' on to predict your values !')
## Load .dx file
Ac_Km = ['histo.png', 'Spectre_mod.png','Predictions_V.png','Allinone.png','Predictedvs.png','residual_plot.png']
with st.container():
latex_report = report.report(LoDaSum, 'model',Ac_Km,a_Test,json_sp,model_per,'full_plsr',cv99)
pass
if not spectra.empty and not y.empty:
if regression_algo in reg_algo[1:] and Reg is not None:
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
fig, (ax1, ax2) = plt.subplots(2,1, figsize = (12, 4), sharex=True)
ax1.plot(colnames, np.mean(X_train, axis = 0), color = 'black', label = 'Average spectrum (Raw)')
ax2.plot(colnames, np.mean(Reg.pretreated_spectra_ , axis = 0), color = 'black', label = 'Average spectrum (pretreated)')
ax2.set_xlabel('Wavelenghts')
plt.tight_layout()
for i in range(2):
eval(f'ax{i+1}').grid(color='grey', linestyle=':', linewidth=0.2)
eval(f'ax{i+1}').margins(x = 0)
eval(f'ax{i+1}').legend(loc = 'upper right')
eval(f'ax{i+1}').set_ylabel('Intensity')
if regression_algo == reg_algo[3]:
for j in range(s):
if np.array(spectra.columns).dtype.kind in ['i','f']:
min, max = intervalls_with_cols['from'][j], intervalls_with_cols['to'][j]
else:
min, max = intervalls['from'][j], intervalls['to'][j]
eval(f'ax{i+1}').axvspan(min, max, color='#00ff00', alpha=0.5, lw=0)
if regression_algo == reg_algo[1]:
ax1.scatter(colnames[np.array(Reg.sel_ratio_.index)], np.mean(X_train, axis = 0).ravel()[np.array(Reg.sel_ratio_.index)],
color = 'red', label = 'Important variables')
ax2.scatter(colnames[Reg.sel_ratio_.index], np.mean(Reg.pretreated_spectra_, axis = 0).ravel()[np.array(Reg.sel_ratio_.index)],
color = 'red', label = 'Important variables')
ax1.legend()
ax2.legend()
M2.write('-- Visualization of the spectral regions used for model creation -- ')
M2.pyplot(fig)