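# NOTE: the words "Newer" and "Older" that stood here were navigation labels
# captured from a diff viewer; they are not part of the script.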
from Packages import *
# Configure the Streamlit page (title, icon, wide layout).
# NOTE(review): this call sits between the import lines, presumably because
# Streamlit expects set_page_config to run before any other Streamlit command - confirm.
st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
from Modules import *
from Class_Mod.DATA_HANDLING import *
# Re-store the current 'interface' value in session state; .get() yields None
# when the key was never set, so the key always exists afterwards.
st.session_state["interface"] = st.session_state.get('interface')
# When the 'simple' interface is active, hide the "Predictions" page.
if st.session_state["interface"] == 'simple':
hide_pages("Predictions")
#path = os.path.dirname(os.path.abspath(__file__)).replace('\\','/')
#css_file = path[:path.find('/pages')]+'/style'
#local_css(css_file +"/style_model.css")
# Load the page stylesheet.
# NOTE(review): `css_file` is not assigned in the visible lines (its former
# definition is commented out above); presumably it comes from a star import - confirm.
local_css(css_file / "style_model.css")
####################################### page Design #######################################
# Static page skeleton: titles, section headers and the containers/columns
# that later sections write their widgets and plots into.
st.title("Calibration Model Development")
st.markdown("Create a predictive model, then use it for predicting your target variable (chemical data) from NIRS spectra")
st.header("I - Data visualization", divider='blue')
# M0 receives the data plots/summary, M00 the upload widgets (see usages below).
M0, M00 = st.columns([1, .4])
st.header("II - Model creation", divider='blue')
st.header("Cross_Validation")
# cv1 / cv2 hold the cross-validation plots and summary; cv3 is created but
# not used in the visible lines.
cv1, cv2 = st.columns([2,2])
cv3 = st.container()
# M7 / M8 hold the predicted-vs-measured plot and the residuals plot.
M7, M8 = st.columns([2,2])
M7.write('Predicted vs Measured values')
M8.write('Residuals plot')
# NOTE(review): `M9` (and `M1`, `M2`, `M3` used further down) is not created in
# the visible lines - the corresponding layout statements appear to be missing
# from this view.
M9.write("-- Save the model --")
##############################################################################################
# Regression algorithms offered in the selectbox further down; the code later
# compares the selection against reg_algo[1], reg_algo[2] and reg_algo[3].
# The leading empty entry presumably stands for "nothing selected yet".
reg_algo = ["","Full-PLSR", "Locally Weighted PLSR", "Interval-PLSR"]
####################################### ###########################################
# --- CSV input: spectra (X) file ---
# NOTE(review): the statements below dereference `xcal_csv.name` without a
# visible check that a file was uploaded; the guarding condition is presumably
# among the lines missing from this view.
xcal_csv = M00.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
# Separator / sample-name radios default to the value detected by
# find_delimiter / find_col_index on 'data/<uploaded file name>'.
# NOTE(review): this reads from the 'data/' folder rather than from the
# uploaded buffer - it assumes a file of the same name exists there; confirm.
sepx = M00.radio("Select separator (X file) - _detected_: " + str(find_delimiter('data/'+xcal_csv.name)),
options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+xcal_csv.name))), key=0)
hdrx = M00.radio("samples name (X file)? - _detected_: " + str(find_col_index('data/'+xcal_csv.name)),
options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+xcal_csv.name))), key=1)
# `col` is passed as index_col to read_csv: first column when sample names are
# present, otherwise False.
if hdrx == "yes": col = 0
else: col = False
# --- CSV input: chemical (Y) file ---
ycal_csv = M00.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
sepy = M00.radio("separator (Y file): ", options=[";", ","], key=2)
# NOTE(review): `hdry` is not read anywhere in the visible lines; the Y file is
# read below with the X file's `col` setting - confirm this is intended.
hdry = M00.radio("samples name (Y file)?: ", options=["no", "yes"], key=3)
xfile = pd.read_csv(xcal_csv, decimal='.', sep=sepx, index_col=col, header=0)
yfile = pd.read_csv(ycal_csv, decimal='.', sep=sepy, index_col=col)
# Proceed only when both files yielded at least one column.
if yfile.shape[1]>0 and xfile.shape[1]>0 :
# col_cat returns two objects per file; presumably the numeric part and the
# categorical/meta part - confirm against its definition.
spectra, meta_data = col_cat(xfile)
y, idx = col_cat(yfile)
# With several candidate target columns the user picks one; otherwise the
# single column is taken.
if y.shape[1]>1:
yname = M00.selectbox('Select target', options=y.columns)
y = y.loc[:,yname]
else:
y = y.iloc[:,0]
spectra = pd.DataFrame(spectra).astype(float)
if not meta_data.empty :
st.write(meta_data)
# NOTE(review): the condition under which this warning is shown is not visible
# here (indentation and, likely, an `else:` line are missing); `M1` is also not
# created in the visible lines.
M1.warning('Tune decimal and separator parameters')
# --- DX input: a single .dx file providing spectra and chemical data ---
data_file = M00.file_uploader("Select Data", type=".dx", help=" :mushroom: select a dx file")
if data_file:
# Copy the upload to a temporary .dx file on disk (delete=False so it survives
# the `with` block) and hand its path to read_dx; it is removed again by the
# os.unlink call at the end of this section.
with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
tmp.write(data_file.read())
tmp_path = tmp.name
chem_data, spectra, meta_data, meta_data_st = read_dx(file = tmp_path)
M00.success("The data have been loaded successfully", icon="✅")
yname = M00.selectbox('Select target', options=chem_data.columns)
# Keep only the samples whose selected target value is strictly positive, and
# restrict the spectra to the same samples.
measured = chem_data.loc[:,yname] > 0
y = chem_data.loc[:,yname].loc[measured]
spectra = spectra.loc[measured]
# NOTE(review): the `if` matching this `else:` is not visible in this view;
# judging by the message it presumably tests whether chem_data is non-empty.
else:
M00.warning('Warning: Chemical data are not included in your file !', icon="⚠️")
# Remove the temporary .dx copy created above.
os.unlink(tmp_path)
### split the data
# Runs only when both spectra and target values are available.
# NOTE(review): indentation is lost in this view, so how far this `if` body
# extends cannot be determined from here.
if not spectra.empty and not y.empty:
#rd_seed = M1.slider("Customize Train-test split", min_value=1, max_value=100, value=42, format="%i")
# Split data into training and test sets using the kennard_stone method and correlation metric, 25% of data is used for testing
train_index, test_index = train_test_split_idx(spectra, y = y, method="kennard_stone", metric="correlation", test_size=0.25, random_state=42)
# Assign data to training and test sets
# (the returned indices are used positionally via .iloc)
X_train, y_train = pd.DataFrame(spectra.iloc[train_index,:]), y.iloc[train_index]
X_test, y_test = pd.DataFrame(spectra.iloc[test_index,:]), y.iloc[test_index]
#### insight on loaded data
# Plot 1: every spectrum as one dashed line (spectra transposed so that each
# sample becomes a plotted series).
fig, ax1 = plt.subplots( figsize = (12,3))
spectra.T.plot(legend=False, ax = ax1, linestyle = '--')
ax1.set_ylabel('Signal intensity')
ax1.margins(0)
plt.tight_layout()
M0.pyplot(fig)
# Plot 2: overlaid histograms (with KDE) of the target for all samples, the
# training subset and the test subset.
fig, ax2 = plt.subplots(figsize = (12,3))
sns.histplot(y, color="deeppink", kde = True,label="y",ax = ax2, fill=True)
sns.histplot(y_train, color="blue", kde = True,label="y (train)",ax = ax2, fill=True)
sns.histplot(y_test, color="green", kde = True,label="y (test)",ax = ax2, fill=True)
ax2.set_xlabel('y')
plt.legend()
plt.tight_layout()
M0.pyplot(fig)
# Table of desc_stats() results for train, test and total, rounded to 2 decimals.
M0.write('Loaded data summary')
M0.write(pd.DataFrame([desc_stats(y_train),desc_stats(y_test),desc_stats(y)], index =['Train', 'Test', 'Total'] ).round(2))
####################################### Insight into the loaded data
#######################################
# Algorithm selection and model fitting. Each branch is expected to leave the
# fitted wrapper in `Reg`.
# NOTE(review): several structural lines are missing from this view (see the
# notes below); read the branch boundaries with care.
regression_algo = M1.selectbox("Choose the algorithm for regression", options=reg_algo, key = 12)
# --- Branch: "Full-PLSR" ---
if regression_algo == reg_algo[1]:
# Train model with model function from application_functions.py
Reg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter=10)
reg_model = Reg.model_
#M2.dataframe(Pin.pred_data_)
# --- Branch: "Locally Weighted PLSR" ---
# The train/test arrays are written to CSV files under temp/, an external
# script is run with the current interpreter, and its JSON output is read back.
elif regression_algo == reg_algo[2]:
data_to_work_with = ['x_train_np', 'y_train_np', 'x_test_np', 'y_test_np']#,'x_train_np_cv1', 'y_train_np_cv1', 'x_test_np_cv1', 'y_test_np_cv1', 'x_train_np_cv2', 'y_train_np_cv2', 'x_test_np_cv2', 'y_test_np_cv2', 'x_train_np_cv3', 'y_train_np_cv3', 'x_test_np_cv3', 'y_test_np_cv3',]
x_train_np, y_train_np, x_test_np, y_test_np = X_train.to_numpy(), y_train.to_numpy(), X_test.to_numpy(), y_test.to_numpy()
# x_train_np_cv1, y_train_np_cv1, x_test_np_cv1, y_test_np_cv1, x_train_np_cv2, y_train_np_cv2, x_test_np_cv2, y_test_np_cv2, x_train_np_cv3, y_train_np_cv3, x_test_np_cv3, y_test_np_cv3, = X_train_cv1.to_numpy(), y_train_cv1.to_numpy(), X_test_cv1.to_numpy(), y_test_cv1.to_numpy(), X_train_cv2.to_numpy(), y_train_cv2.to_numpy(), X_test_cv2.to_numpy(), y_test_cv2.to_numpy(), X_train_cv3.to_numpy(), y_train_cv3.to_numpy(), X_test_cv3.to_numpy(), y_test_cv3.to_numpy()
temp_path = Path('temp/')
# vars()[i] looks up the array whose variable name is the string `i`, so each
# array is saved as temp/<variable name>.csv.
for i in data_to_work_with: np.savetxt(temp_path / str(i + ".csv"), vars()[i], delimiter=",")
import subprocess
subprocess_path = Path("Class_Mod/")
# Run Class_Mod/LWPLSR_Call.py with the interpreter running this app.
# Presumably that script reads the CSV files above and writes
# temp/lwplsr_outputs.json - confirm against the script.
subprocess.run([f"{sys.executable}", subprocess_path / "LWPLSR_Call.py"])
with open(temp_path / "lwplsr_outputs.json", "r") as outfile:
Reg_json = json.load(outfile)
# Clean up the exchange files.
for i in data_to_work_with: os.unlink(temp_path / str(i + ".csv"))
os.unlink(temp_path / "lwplsr_outputs.json")
# Reg = type('obj', (object,), {'model' : pd.json_normalize(Reg_json['model']), 'pred_data_' : [pd.json_normalize(Reg_json['pred_data_train']), pd.json_normalize(Reg_json['pred_data_cv1']), pd.json_normalize(Reg_json['pred_data_cv2']), pd.json_normalize(Reg_json['pred_data_cv3']), pd.json_normalize(Reg_json['pred_data_test'])]})
# Build an ad-hoc class exposing `model` and `pred_data_` (one DataFrame per
# key listed in `pred`) from the JSON output.
pred = ['pred_data_train', 'pred_data_cv1', 'pred_data_cv2', 'pred_data_cv3', 'pred_data_test']
Reg = type('obj', (object,), {'model' : pd.json_normalize(Reg_json['model']), 'pred_data_' : [pd.json_normalize(Reg_json[i]) for i in pred]})
# Transpose each prediction frame, drop the helper 'index' column and re-label
# the rows with the sample index of the train or test set.
# NOTE(review): the `if` that pairs with the `else:` below is missing from
# this view; presumably it distinguishes the last entry (test) from the others.
for i in range(len(pred)):
Reg.pred_data_[i] = Reg.pred_data_[i].T.reset_index().drop(columns = ['index'])
Reg.pred_data_[i].index = list(y_train.index)
else:
Reg.pred_data_[i].index = list(y_test.index)
# --- Presumably the "Interval-PLSR" branch ---
# NOTE(review): no `elif regression_algo == reg_algo[3]:` line is visible
# before these statements; it appears to be missing from this view.
s = M1.number_input(label='Enter the maximum number of intervals', min_value=1, max_value=6, value=3)
it = M1.number_input(label='Enter the number of iterations', min_value=50, max_value=1000, value=100)
# `progress_text` is not read in the visible lines.
progress_text = "The model is being created. Please wait."
Reg = TpeIpls(train = [X_train, y_train], test=[X_test, y_test], n_intervall = s, n_iter=it)
# The progress bar is created and cleared only after TpeIpls(...) has returned,
# then replaced by a completed bar shown for one second.
pro = M1.progress(0, text="The model is being created. Please wait!")
pro.empty()
M1.progress(100, text = "The model has successfully been created!")
time.sleep(1)
reg_model = Reg.model_
# Plot the mean training spectrum and shade the selected spectral intervals.
# NOTE(review): `intervalls` is not assigned in the visible lines, `colnames`
# is not read afterwards, and two different axvspan calls appear (column
# labels vs. positional bounds) - they likely belong to different branches
# whose headers are missing here.
fig, ax = plt.subplots(figsize = (12, 6))
X_train.mean().plot(ax = ax)
for i in range(s):
colnames = np.array(y)
ax.axvspan(X_train.columns[intervalls['from'][i]], X_train.columns[intervalls['to'][i]],
color='#2a52be', alpha=0.5, lw=0)
plt.plot(np.arange(X_train.shape[1]), X_train.mean(), color = 'black')
ax.axvspan(intervalls['from'][i], intervalls['to'][i], color='#2a52be', alpha=0.5, lw=0)
M3.write('-- Visualization of the spectral regions used for model creation -- ')
M3.pyplot(fig)
# elif regression_algo == reg_algo[4]:
# Reg = PlsR(x_train = X_train, x_test = X_test, y_train = y_train, y_test = y_test)
# reg_model = Reg.model_
################# Model analysis ############
# Cross-validation summary table taken from the fitted wrapper.
cv2.write('-- Cross-Validation Summary--')
cv2.write(Reg.CV_results_)
cv2.write('-- Out-of-Fold Predictions Visualization (All in one) --')
# Scatter of measured vs. predicted values from Reg.cv_data_[2], coloured by
# fold, with an OLS trendline per fold.
fig1 = px.scatter(Reg.cv_data_[2], x ='Measured', y = 'Predicted' , trendline='ols', color='Folds', symbol="Folds",
color_discrete_sequence=px.colors.qualitative.G10)
# Dashed black diagonal spanning 95% of the minimum to 105% of the maximum
# measured value (same bounds on both axes).
fig1.add_shape(type='line', x0 = .95 * min(Reg.cv_data_[2].loc[:,'Measured']), x1 = 1.05 * max(Reg.cv_data_[2].loc[:,'Measured']), y0 = .95 * min(Reg.cv_data_[2].loc[:,'Measured']), y1 = 1.05 * max(Reg.cv_data_[2].loc[:,'Measured']), line = dict(color='black', dash = "dash"))
fig1.update_traces(marker_size=7, showlegend=False)
cv2.plotly_chart(fig1)
# Same data, one facet per fold stacked vertically, points labelled with the
# 'index' column.
fig0 = px.scatter(Reg.cv_data_[2], x ='Measured', y = 'Predicted' , trendline='ols', color='Folds', symbol="Folds", facet_col = 'Folds',facet_col_wrap=1,
color_discrete_sequence=px.colors.qualitative.G10, text='index', width=800, height=1000)
fig0.update_traces(marker_size=8, showlegend=False)
cv1.write('-- Out-of-Fold Predictions Visualization (Separate plots) --')
cv1.plotly_chart(fig0)
# Show the selected hyper-parameters and persist them as JSON so they can be
# read back from data/params/Preprocessing.json.
M2.write('-- Spectral preprocessing info --')
M2.write(Reg.best_hyperparams)
with open("data/params/Preprocessing.json", "w") as outfile:
json.dump(Reg.best_hyperparams, outfile)
# Performance scores for calibration (train) and test predictions.
# NOTE(review): `yc` and `yt` are not assigned in the visible lines; presumably
# the predicted values for the train and test sets - confirm.
M2.dataframe(metrics(c = [y_train, yc], t = [y_test, yt], method='regression').scores_)
#from st_circular_progress import CircularProgress
#my_circular_progress = CircularProgress(label = 'Performance',value = 50, key = 'my performance',
# size = "medium", track_color = "black", color = "blue")
#my_circular_progress.st_circular_progress()
#my_circular_progress.update_value(progress=20)
# Predicted-vs-measured and residuals plots for train and test sets.
M7.pyplot(reg_plot([y_train, y_test],[yc, yt], train_idx = train_index, test_idx = test_index))
M8.pyplot(resid_plot([y_train, y_test],[yc, yt], train_idx = train_index, test_idx = test_index))
# rega = Reg.important_features_ ##### ADD FEATURES IMPORTANCE PLOT
#model_export = M1.selectbox("Choose way to export", options=["pickle", "joblib"], key=20)
# Date suffix used in exported file names, formatted as '_YYYY_MM_DD_'.
date_time = datetime.datetime.strftime(datetime.date.today(), '_%Y_%m_%d_')
# Export the fitted model (and, for interval models, the selected wavelength
# indices) under data/models/ when the button is pressed.
# NOTE(review): `file`, `files_format`, `model_name` and `rega` are not
# assigned in the visible lines (the only `rega` assignment is commented out
# above), and the nesting of the branches below is lost in this view.
if M9.button('Export Model'):
path = 'data/models/model_'
# Case: data came from the two CSV files - file name built from both uploads.
if file == files_format[0]:
#export_package = __import__(model_export)
# NOTE(review): this `with open(...)` has no visible body; a
# `joblib.dump(reg_model, f)` line (as in the .dx case below) is presumably
# missing from this view.
with open(path + model_name + date_time + '_created_on_' + xcal_csv.name[:xcal_csv.name.find(".")] +""+
'_and_' + ycal_csv.name[:ycal_csv.name.find(".")] + '_data_' + '.pkl','wb') as f:
pd.DataFrame(rega[1]).to_csv(path + model_name + date_time + '_on_' + xcal_csv.name[:xcal_csv.name.find(".")]
+ '_and_' + ycal_csv.name[:ycal_csv.name.find(".")] + '_data_'+'Wavelengths_index.csv', sep = ';')
# Case: data came from a .dx file - file name built from that upload.
elif file == files_format[1]:
#export_package = __import__(model_export)
with open(path + model_name + '_on_'+ data_file.name[:data_file.name.find(".")] + '_data_' + '.pkl','wb') as f:
joblib.dump(reg_model, f)
# For the "Interval-PLSR" algorithm, also export the sorted wavelength indices.
if regression_algo == reg_algo[3]:
rega[1].sort()
pd.DataFrame(rega[1]).to_csv(path +data_file.name[:data_file.name.find(".")]+ model_name + date_time+ '_on_' + '_data_'+'Wavelengths_index.csv', sep = ';')
# NOTE(review): the two confirmation messages below presumably belong to
# different branches; as displayed here both would run.
st.write('Model Exported ')
st.write('Model Exported')
# create a report with information on the model
## see https://stackoverflow.com/a/59578663

# Register the application's pages with their display names. Page paths are
# built from `pages_folder` so the separator matches the running platform.
pages_folder = Path("pages/")
show_pages(
    [
        Page("app.py", "Home"),
        Page(str(pages_folder / "4-inputs.py"), "Inputs"),
        Page(str(pages_folder / "1-samples_selection.py"), "Samples Selection"),
        Page(str(pages_folder / "2-model_creation.py"), "Models Creation"),
        Page(str(pages_folder / "3-prediction.py"), "Predictions"),
    ]
)

# Link to the prediction page. The target is derived from `pages_folder`
# instead of a hard-coded 'pages\\...' literal: a backslash is only a path
# separator on Windows, so the literal named a non-existent file elsewhere.
# On Windows the resulting string is unchanged.
st.page_link(str(pages_folder / "3-prediction.py"), label = 'Keep on keepin\' on to predict your values !')