# Streamlit page "Calibration Model Development".
#
# Flow of the script (re-executed top to bottom by Streamlit on every interaction):
#   1. reset the report figure folder and draw the static page layout;
#   2. load spectra (X) and chemical reference values (y) from CSV or .dx files;
#   3. split the data into train/test sets and show a summary of the loaded data;
#   4. fit the selected regression algorithm (Full-PLSR, LWPLSR or Interval-PLSR);
#   5. display cross-validation results and diagnostics, and offer model export;
#   6. offer a LaTeX report and plot the spectral regions used by the model.
import shutil
import subprocess

import pandas as pd
from Packages import *
st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
from Modules import *
from Class_Mod.DATA_HANDLING import *
from Class_Mod.Miscellaneous import desc_stats


def _reset_directory(folder):
    """Create *folder* if it is missing and delete everything it contains."""
    folder.mkdir(parents=True, exist_ok=True)
    for entry in folder.iterdir():
        if entry.is_dir() and not entry.is_symlink():
            # rmtree also copes with non-empty sub-directories
            shutil.rmtree(entry)
        else:
            entry.unlink()


def _file_stem(filename):
    """Return the part of *filename* before its first dot (whole name if no dot)."""
    return filename.partition(".")[0]


add_header()
add_sidebar(pages_folder)

# Figures of a previous run must not leak into the next report.
figures_dir = Path('Report/figures')
_reset_directory(figures_dir)

json_sp = pd.DataFrame()
local_css(css_file / "style_model.css")

####################################### page Design #######################################
st.title("Calibration Model Development")
st.markdown("Create a predictive model, then use it for predicting your target variable (chemical data) from NIRS spectra")
st.header("I - Data visualization", divider='blue')
M0, M00 = st.columns([1, .4])
st.header("II - Model creation", divider='blue')
M1, M2 = st.columns([2, 4])
st.header("Cross-Validation results")
cv1, cv2 = st.columns([2, 2])
cv3 = st.container()

st.header("III - Model Diagnosis", divider='blue')
M7, M8 = st.columns([2, 2])
M7.write('Predicted vs Measured values')
M8.write('Residuals plot')
M9 = st.container()
M9.write("-- Save the model --")
##############################################################################################

reg_algo = ["", "Full-PLSR", "Locally Weighted PLSR", "Interval-PLSR"]
# Defined up front so that the sections at the bottom of the page can test them
# even when no data has been loaded yet.
regression_algo = reg_algo[0]
Reg = None

files_format = ['.csv', '.dx']
file = M00.radio('Select files format:', options=files_format)

### Data
spectra = pd.DataFrame()
y = pd.DataFrame()

# load .csv file
if file == files_format[0]:
    xcal_csv = M00.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
    if xcal_csv:
        # NOTE(review): detection is given the path 'data/<uploaded name>', not the
        # uploaded buffer - presumably the file is expected to exist there; confirm.
        separators = [";", ","]
        detected_sep = str(find_delimiter('data/' + xcal_csv.name))
        sepx = M00.radio("Select separator (X file) - _detected_: " + detected_sep,
                         options=separators,
                         index=separators.index(detected_sep) if detected_sep in separators else 0,
                         key=0)
        header_choices = ["no", "yes"]
        detected_hdr = str(find_col_index('data/' + xcal_csv.name))
        hdrx = M00.radio("samples name (X file)? - _detected_: " + detected_hdr,
                         options=header_choices,
                         index=header_choices.index(detected_hdr) if detected_hdr in header_choices else 0,
                         key=1)
        col_x = 0 if hdrx == "yes" else False

    ycal_csv = M00.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
    if ycal_csv:
        sepy = M00.radio("separator (Y file): ", options=[";", ","], key=2)
        hdry = M00.radio("samples name (Y file)?: ", options=["no", "yes"], key=3)
        # Kept separate from col_x: the Y choice must not override the X choice.
        col_y = 0 if hdry == "yes" else False

    if xcal_csv and ycal_csv:
        xfile = pd.read_csv(xcal_csv, decimal='.', sep=sepx, index_col=col_x, header=0)
        yfile = pd.read_csv(ycal_csv, decimal='.', sep=sepy, index_col=col_y)
        if yfile.shape[1] > 0 and xfile.shape[1] > 0:
            spectra, meta_data = col_cat(xfile)
            y, idx = col_cat(yfile)
            if y.shape[1] > 1:
                yname = M00.selectbox('Select target', options=y.columns)
                y = y.loc[:, yname]
            else:
                y = y.iloc[:, 0]

            spectra = pd.DataFrame(spectra).astype(float)
            if not meta_data.empty:
                st.write(meta_data)

            if spectra.shape[0] != y.shape[0]:
                M00.warning('X and Y have different sample size')
                # Reset to empty *instances* so the `.empty` guards below skip modelling.
                y = pd.DataFrame()
                spectra = pd.DataFrame()
        else:
            M1.warning('Tune decimal and separator parameters')

## Load .dx file
elif file == files_format[1]:
    data_file = M00.file_uploader("Select Data", type=".dx", help=" :mushroom: select a dx file")
    if data_file:
        # read_dx takes a path, so the upload is first written to a temporary file.
        with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
            tmp.write(data_file.read())
            tmp_path = tmp.name
        # The file is closed (hence flushed) before being parsed, and always removed.
        try:
            chem_data, spectra, meta_data, meta_data_st = read_dx(file=tmp_path)
        finally:
            os.unlink(tmp_path)
        M00.success("The data have been loaded successfully", icon="✅")
        if chem_data.shape[1] > 0:
            yname = M00.selectbox('Select target', options=chem_data.columns)
            # Only samples whose selected target value is > 0 are kept.
            measured = chem_data.loc[:, yname] > 0
            y = chem_data.loc[:, yname].loc[measured]
            spectra = spectra.loc[measured]
        else:
            M00.warning('Warning: Chemical data are not included in your file !', icon="⚠️")

### split the data
if not spectra.empty and not y.empty:
    # Use the column labels as x axis when they are numeric, positions otherwise.
    if np.array(spectra.columns).dtype.kind in ['i', 'f']:
        colnames = spectra.columns
    else:
        colnames = np.arange(spectra.shape[1])

    # Split data into training and test sets using the kennard_stone method and
    # correlation metric, 25% of data is used for testing
    train_index, test_index = train_test_split_idx(spectra, y=y, method="kennard_stone",
                                                   metric="correlation", test_size=0.25,
                                                   random_state=42)

    # Assign data to training and test sets
    X_train, y_train = pd.DataFrame(spectra.iloc[train_index, :]), y.iloc[train_index]
    X_test, y_test = pd.DataFrame(spectra.iloc[test_index, :]), y.iloc[test_index]

    #### insight on loaded data
    fig, ax1 = plt.subplots(figsize=(12, 3))
    spectra.T.plot(legend=False, ax=ax1, linestyle='--')
    ax1.set_ylabel('Signal intensity')
    ax1.margins(0)
    plt.tight_layout()
    M0.pyplot(fig)
    fig.savefig("./Report/figures/Spectre_mod.png")
    plt.close(fig)  # pyplot keeps figures alive until closed; the script reruns often

    fig, ax2 = plt.subplots(figsize=(12, 3))
    sns.histplot(y, color="deeppink", kde=True, label="y", ax=ax2, fill=True)
    sns.histplot(y_train, color="blue", kde=True, label="y (train)", ax=ax2, fill=True)
    sns.histplot(y_test, color="green", kde=True, label="y (test)", ax=ax2, fill=True)
    ax2.set_xlabel('y')
    plt.legend()
    plt.tight_layout()
    M0.pyplot(fig)
    fig.savefig("./Report/figures/histo.png")
    plt.close(fig)

    M0.write('Loaded data summary')
    LoDaSum = pd.DataFrame([desc_stats(y_train), desc_stats(y_test), desc_stats(y)],
                           index=['Train', 'Test', 'Total']).round(2)
    M0.write(LoDaSum)

    ####################################### Model creation #######################################
    regression_algo = M1.selectbox("Choose the algorithm for regression", options=reg_algo, key=12)

    if regression_algo == reg_algo[1]:
        # Full-PLSR
        Reg = Plsr(train=[X_train, y_train], test=[X_test, y_test], n_iter=10)
        reg_model = Reg.model_

    elif regression_algo == reg_algo[2]:
        # Locally weighted PLSR: arrays are exchanged as CSV files with an external
        # script, and its results are read back from a JSON file.
        # NOTE(review): the result object built below has no `model_`,
        # `best_hyperparams_*`, `selected_features_` or `pretreated_spectra_`, and its
        # `cv_data_` is an empty DataFrame, while the analysis/export code further down
        # reads all of these - that code cannot complete for this algorithm as written.
        info = M1.info('Starting LWPLSR model creation... Please wait a few minutes.')
        x_train_np, y_train_np = X_train.to_numpy(), y_train.to_numpy()
        lwplsr_arrays = {
            'x_train_np': x_train_np,
            'y_train_np': y_train_np,
            'x_test_np': X_test.to_numpy(),
            'y_test_np': y_test.to_numpy(),
        }
        # Cross-Validation calculation
        nb_folds = 3
        st.write('KFold = ' + str(nb_folds))
        folds = KF_CV.CV(x_train_np, y_train_np, nb_folds)
        fold_keys = list(folds)
        for fold_no in range(1, nb_folds + 1):
            held_out = folds[fold_keys[fold_no - 1]]
            lwplsr_arrays[f"xtr_fold{fold_no}"] = np.delete(x_train_np, held_out, axis=0)
            lwplsr_arrays[f"ytr_fold{fold_no}"] = np.delete(y_train_np, held_out, axis=0)
            lwplsr_arrays[f"xte_fold{fold_no}"] = x_train_np[held_out]
            lwplsr_arrays[f"yte_fold{fold_no}"] = y_train_np[held_out]

        temp_path = Path('temp/')
        temp_path.mkdir(parents=True, exist_ok=True)
        for name, array in lwplsr_arrays.items():
            np.savetxt(temp_path / (name + ".csv"), array, delimiter=",")

        # run the external LWPLSR script with the current interpreter
        subprocess_path = Path("Class_Mod/")
        subprocess.run([sys.executable, str(subprocess_path / "LWPLSR_Call.py")])

        # retrieve json results
        try:
            with open(temp_path / "lwplsr_outputs.json", "r") as outfile:
                Reg_json = json.load(outfile)
            for name in lwplsr_arrays:
                os.unlink(temp_path / (name + ".csv"))
            os.unlink(temp_path / "lwplsr_outputs.json")

            pred = ['pred_data_train', 'pred_data_test']
            Reg = type('obj', (object,), {
                'model': Reg_json['model'],
                'best_lwplsr_params': Reg_json['best_lwplsr_params'],
                'pred_data_': [pd.json_normalize(Reg_json[i]) for i in pred],
            })
            # Re-index the predictions like the matching measured values (train, then test).
            for i, sample_index in enumerate((y_train.index, y_test.index)):
                Reg.pred_data_[i] = Reg.pred_data_[i].T.reset_index().drop(columns=['index'])
                Reg.pred_data_[i].index = list(sample_index)
            Reg.CV_results_ = pd.DataFrame()
            Reg.cv_data_ = pd.DataFrame()
            info.empty()
            M1.success('Model created!')
        except FileNotFoundError:
            # No output file: the external run did not produce a result.
            info.empty()
            M1.warning('- ERROR during model creation -')
            Reg = None

    elif regression_algo == reg_algo[3]:
        # Interval-PLSR
        s = M1.number_input(label='Enter the maximum number of intervals', min_value=1, max_value=6, value=3)
        it = M1.number_input(label='Enter the number of iterations', min_value=2, max_value=10, value=3)

        # Show the progress bar before the (long) fit rather than after it.
        pro = M1.progress(0, text="The model is being created. Please wait!")
        Reg = TpeIpls(train=[X_train, y_train], test=[X_test, y_test], n_intervall=s, n_iter=it)
        pro.empty()
        M1.progress(100, text="The model has successfully been created!")
        time.sleep(1)
        reg_model = Reg.model_

        M2.write('-- Important Spectral regions used for model creation --')
        intervalls = Reg.selected_features_.T
        # Explicit copy: the positional indices in `intervalls` must stay untouched
        # while the copy is filled with the matching column labels.
        intervalls_with_cols = intervalls.copy()
        for i in range(intervalls.shape[0]):
            for j in range(intervalls.shape[1]):
                intervalls_with_cols.iloc[i, j] = spectra.columns[intervalls.iloc[i, j]]
        M2.table(intervalls_with_cols)

    ################# Model analysis ############
    if regression_algo in reg_algo[1:] and Reg is not None:
        cv2.write('-- Cross-Validation Summary--')
        cv2.write(Reg.CV_results_)
        cv99 = pd.DataFrame(Reg.CV_results_)
        cv2.write('-- Out-of-Fold Predictions Visualization (All in one) --')

        cv_frame = Reg.cv_data_[2]
        measured_lo = .95 * min(cv_frame.loc[:, 'Measured'])
        measured_hi = 1.05 * max(cv_frame.loc[:, 'Measured'])

        fig1 = px.scatter(cv_frame, x='Measured', y='Predicted', trendline='ols',
                          color='Folds', symbol="Folds",
                          color_discrete_sequence=px.colors.qualitative.G10)
        # dashed identity line (predicted == measured)
        fig1.add_shape(type='line', x0=measured_lo, x1=measured_hi,
                       y0=measured_lo, y1=measured_hi,
                       line=dict(color='black', dash="dash"))
        fig1.update_traces(marker_size=7, showlegend=False)
        cv2.plotly_chart(fig1, use_container_width=True)
        # The all-in-one picture is fig1 (it used to be written from the faceted figure).
        fig1.write_image("./Report/figures/Allinone.png")

        fig0 = px.scatter(cv_frame, x='Measured', y='Predicted', trendline='ols',
                          color='Folds', symbol="Folds", facet_col='Folds', facet_col_wrap=1,
                          color_discrete_sequence=px.colors.qualitative.G10,
                          text='index', width=800, height=1000)
        fig0.update_traces(marker_size=8, showlegend=False)
        cv1.write('-- Out-of-Fold Predictions Visualization (Separate plots) --')
        cv1.plotly_chart(fig0, use_container_width=True)
        fig0.write_image("./Report/figures/Predictions_V.png")

        yc = Reg.pred_data_[0]  # predictions on the training set
        yt = Reg.pred_data_[1]  # predictions on the test set

        M1.write('-- Spectral preprocessing info --')
        M1.write(Reg.best_hyperparams_print)
        a_Test = Reg.best_hyperparams_print
        with open("data/params/Preprocessing.json", "w") as outfile:
            json.dump(Reg.best_hyperparams_, outfile)

        M1.write("-- Model performance --")
        scores = metrics(c=[y_train, yc], t=[y_test, yt], method='regression').scores_
        M1.dataframe(scores)
        model_per = pd.DataFrame(scores)

        a = reg_plot([y_train, y_test], [yc, yt], train_idx=train_index, test_idx=test_index)
        M7.pyplot(a)
        plt.savefig('./Report/figures/Predictedvs.png')
        residual_plot = resid_plot([y_train, y_test], [yc, yt], train_idx=train_index, test_idx=test_index)
        M8.pyplot(residual_plot)
        plt.savefig('./Report/figures/residual_plot.png')

        model_name = M9.text_input('Give it a name')
        date_time = datetime.date.today().strftime('_%Y_%m_%d_')
        if M9.button('Export Model'):
            path = 'data/models/model_'
            if file == files_format[0]:
                x_stem, y_stem = _file_stem(xcal_csv.name), _file_stem(ycal_csv.name)
                with open(f"{path}{model_name}{date_time}_created_on_{x_stem}_and_{y_stem}_data_.pkl", 'wb') as f:
                    joblib.dump(reg_model, f)
                if regression_algo == reg_algo[3]:
                    Reg.selected_features_.T.to_csv(
                        f"{path}{model_name}{date_time}_on_{x_stem}_and_{y_stem}_data_Wavelengths_index.csv",
                        sep=';')
            elif file == files_format[1]:
                dx_stem = _file_stem(data_file.name)
                with open(f"{path}{model_name}_on_{dx_stem}_data_.pkl", 'wb') as f:
                    joblib.dump(reg_model, f)
                if regression_algo == reg_algo[3]:
                    Reg.selected_features_.T.to_csv(
                        f"{path}{dx_stem}{model_name}{date_time}_on__data_Wavelengths_index.csv",
                        sep=';')
            st.write('Model Exported ')

        # NOTE(review): the nesting of this block was reconstructed from a
        # whitespace-collapsed source; confirm it belongs under the model-analysis guard.
        if st.session_state.get('interface') == 'simple':
            pages_folder = Path("pages/")
            show_pages(
                [Page("app.py", "Home"),
                 Page(str(pages_folder / "4-inputs.py"), "Inputs"),
                 Page(str(pages_folder / "1-samples_selection.py"), "Samples Selection"),
                 Page(str(pages_folder / "2-model_creation.py"), "Models Creation"),
                 Page(str(pages_folder / "3-prediction.py"), "Predictions"),
                 ]
            )
            # Built from pages_folder so the separator is right on every OS.
            st.page_link(str(pages_folder / "3-prediction.py"),
                         label="Keep on keepin' on to predict your values !")

## Report
# NOTE(review): placed at top level (reconstructed from a collapsed source); the
# names passed to report.report only exist once a Full-PLSR model has been analysed,
# which the regression_algo test below ensures.
Ac_Km = ['histo.png', 'Spectre_mod.png', 'Predictions_V.png', 'Allinone.png', 'Predictedvs.png', 'residual_plot.png']
with st.container():
    if st.button("Download the report"):
        if regression_algo == reg_algo[1]:
            latex_report = report.report(LoDaSum, 'model', Ac_Km, a_Test, json_sp, model_per, 'full_plsr', cv99)
            report.compile_latex()

## Spectral regions used by the model
if not spectra.empty and not y.empty:
    if regression_algo in reg_algo[1:] and Reg is not None:
        raw_mean = np.asarray(np.mean(X_train, axis=0)).ravel()
        pretreated_mean = np.asarray(np.mean(Reg.pretreated_spectra_, axis=0)).ravel()

        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 4), sharex=True)
        ax1.plot(colnames, raw_mean, color='black', label='Average spectrum (Raw)')
        ax2.plot(colnames, pretreated_mean, color='black', label='Average spectrum (pretreated)')
        ax2.set_xlabel('Wavelenghts')
        plt.tight_layout()

        if regression_algo == reg_algo[3]:
            # Interval bounds are drawn in the same units as the x axis: column labels
            # when they are numeric, positional indices otherwise.
            if np.array(spectra.columns).dtype.kind in ['i', 'f']:
                bounds = intervalls_with_cols
            else:
                bounds = intervalls

        for ax in (ax1, ax2):
            ax.grid(color='grey', linestyle=':', linewidth=0.2)
            ax.margins(x=0)
            ax.legend(loc='upper right')
            ax.set_ylabel('Intensity')
            if regression_algo == reg_algo[3]:
                for j in range(s):
                    span_start, span_end = bounds['from'][j], bounds['to'][j]
                    ax.axvspan(span_start, span_end, color='#00ff00', alpha=0.5, lw=0)

        if regression_algo == reg_algo[1]:
            important = np.array(Reg.sel_ratio_.index)
            ax1.scatter(colnames[important], raw_mean[important], color='red', label='Important variables')
            ax2.scatter(colnames[important], pretreated_mean[important], color='red', label='Important variables')
            ax1.legend()
            ax2.legend()

        M2.write('-- Visualization of the spectral regions used for model creation --   ')
        M2.pyplot(fig)
        plt.close(fig)