From 9c51385277a32857e62fd82f97a306f93fdd5b3e Mon Sep 17 00:00:00 2001
From: DIANE <abderrahim.diane@cefe.cnrs.fr>
Date: Mon, 15 Apr 2024 16:10:02 +0200
Subject: [PATCH] model creation from dx

---
 pages/2-model_creation.py | 171 ++++++++++++++++++++++++--------------
 1 file changed, 109 insertions(+), 62 deletions(-)

diff --git a/pages/2-model_creation.py b/pages/2-model_creation.py
index 3fadcb4..3f506ea 100644
--- a/pages/2-model_creation.py
+++ b/pages/2-model_creation.py
@@ -3,9 +3,12 @@ st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
 from Modules import *
 from Class_Mod.DATA_HANDLING import *
 
+
 st.session_state["interface"] = st.session_state.get('interface')
 if st.session_state["interface"] == 'simple':
     hide_pages("Predictions")
+
+
 def nn(x):
     return x is not None
 ########################################################################################
@@ -26,91 +29,135 @@ M9, M10 = st.columns([2,2])
 M9.write("-- Save the model --")
 
+files_format = ['.csv', '.dx']
+file = M3.radio('select data file format:', options = files_format)
 
-# CSV files loader
-xcal_csv = M3.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
-ycal_csv = M3.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
+### Data
+spectra = pd.DataFrame
+y = pd.DataFrame
 
-
-if xcal_csv is not None and ycal_csv is not None:
+# load .csv file
+if file == files_format[0]:
+    xcal_csv = M3.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
+    ycal_csv = M3.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
+
+    if xcal_csv and ycal_csv:
+        # Select list for CSV delimiter
-    sep = M3.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+xcal_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+xcal_csv.name))), key=0)
+        sep = M3.radio("Select csv separator - _detected_: " + str(find_delimiter('data/'+xcal_csv.name)),
+                       options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+xcal_csv.name))), key=0)
         # Select list for CSV header True / False
-    hdr = M3.selectbox("indexes column in csv? - _detected_: " + str(find_col_index('data/'+xcal_csv.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+xcal_csv.name))), key=1)
+        hdr = M3.radio("indexes column in csv? - _detected_: " + str(find_col_index('data/'+xcal_csv.name)),
+                       options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+xcal_csv.name))), key=1)
+        ###############
         if hdr == 'yes':
             col = 0
         else:
             col = False
-    rd_seed = M1.slider("Customize Train-test split", min_value=1, max_value=100, value=42, format="%i")
-    x, y = utils.load_csv(xcal_csv, ycal_csv, autoremove_na=True, sep=sep, x_hdr=0, y_hdr=0, x_index_col=col, y_index_col=col)
-    # Split data into training and test sets using the kennard_stone method and correlation metric, 25% of data is used for testing
-    train_index, test_index = train_test_split_idx(x, y=y, method="kennard_stone", metric="correlation", test_size=0.25, random_state=rd_seed)
-    # Assign data to training and test sets
-    X_train, y_train, X_test, y_test = pd.DataFrame(x[train_index]), pd.DataFrame(y[train_index]), pd.DataFrame(x[test_index]), pd.DataFrame(y[test_index])
-    y_train = y_train.iloc[:,0]
-    y_test = y_test.iloc[:,0]
-
-
-
-    ############################# Regression modelling ##########################################
-    regression_algo = M1.selectbox("Choose the algorithm for regression", options=reg_algo, key = 12)
-    if regression_algo == reg_algo[1]:
-        # Train model with model function from application_functions.py
-        Reg = PinardPlsr(x_train = X_train, x_test = X_test,y_train = y_train, y_test = y_test)
-        reg_model = Reg.model_
-        #M2.dataframe(Pin.pred_data_)
-
-    elif regression_algo == reg_algo[2]:
-        reg_model = model_LWPLSR(xcal_csv, ycal_csv, sep, hdr)
-
-    elif regression_algo == reg_algo[3]:
-        s = M2.number_input(label='Enter the maximum number of intervalls', min_value=1, max_value=6, value=3)
-        it = M2.number_input(label='Enter the number of iterations', min_value=50, max_value=1000, value=100)
-        progress_text = "The model is being created. Please wait."
-
-        Reg = TpeIpls(x_train = X_train, x_test=X_test, y_train = y_train, y_test = y_test, scale = False, Kfold = 3, n_intervall = s)
-        pro = M1.progress(0, text="The model is being created. Please wait!")
-        rega = Reg.BandSelect(n_iter=it)
-        pro.empty()
-        M1.progress(100, text = "The model has successfully been created!")
+        ###############
+        spectra, y = utils.load_csv(xcal_csv, ycal_csv, autoremove_na=True, sep=sep, x_hdr=0, y_hdr=0, x_index_col=col, y_index_col=col)
+        spectra = pd.DataFrame(spectra)
+        y = pd.DataFrame(y)
+
+
+
+## Load .dx file
+elif file == files_format[1]:
+    data_file = M3.file_uploader("Select Data", type=".dx", help=" :mushroom: select a dx file")
+    if data_file:
+        with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
+            tmp.write(data_file.read())
+            tmp_path = tmp.name
+        chem_data, spectra, meta_data = read_dx(file = tmp_path)
+        M3.success("The data have been loaded successfully", icon="✅")
+        yname = M3.selectbox('Select target', options=chem_data.columns)
+        spectra = spectra
+        y = chem_data.loc[:,yname]
+
+        os.unlink(tmp_path)
+
+### split the data
+if not spectra.empty and not y.empty:
+    rd_seed = M1.slider("Customize Train-test split", min_value=1, max_value=100, value=42, format="%i")
+    # Split data into training and test sets using the kennard_stone method and correlation metric, 25% of data is used for testing
+    train_index, test_index = train_test_split_idx(spectra, y=y, method="kennard_stone", metric="correlation", test_size=0.25, random_state=rd_seed)
+    # Assign data to training and test sets
+    X_train, y_train, X_test, y_test = pd.DataFrame(spectra.iloc[train_index,:]), pd.DataFrame(y.iloc[train_index]), pd.DataFrame(spectra.iloc[test_index,:]), pd.DataFrame(y.iloc[test_index])
+    y_train = y_train.iloc[:,0]
+    y_test = y_test.iloc[:,0]
+
+
+#######################################
+    regression_algo = M1.selectbox("Choose the algorithm for regression", options=reg_algo, key = 12)
+    if regression_algo == reg_algo[1]:
+        # Train model with model function from application_functions.py
+        Reg = PinardPlsr(x_train = X_train, x_test = X_test,y_train = y_train, y_test = y_test)
+        reg_model = Reg.model_
+        #M2.dataframe(Pin.pred_data_)
+    elif regression_algo == reg_algo[2]:
+        reg_model = model_LWPLSR(xcal_csv, ycal_csv, sep, hdr)
+
+    elif regression_algo == reg_algo[3]:
+        s = M1.number_input(label='Enter the maximum number of intervalls', min_value=1, max_value=6, value=3)
+        it = M1.number_input(label='Enter the number of iterations', min_value=50, max_value=1000, value=100)
+        progress_text = "The model is being created. Please wait."
-        time.sleep(1)
-        reg_model = Reg.model_
-        M2.table(rega[0])
+        Reg = TpeIpls(x_train = X_train, x_test=X_test, y_train = y_train, y_test = y_test, scale = False, Kfold = 3, n_intervall = s)
+        pro = M1.progress(0, text="The model is being created. Please wait!")
+        rega = Reg.BandSelect(n_iter=it)
+        pro.empty()
+        M1.progress(100, text = "The model has successfully been created!")
+        time.sleep(1)
+        reg_model = Reg.model_
+        M2.write('-- Table of selected wavelengths --')
+        M2.table(rega[0])
 
 ################# Model analysis ############
-    if regression_algo in reg_algo[1:]:
-        yc = Reg.pred_data_[0]
-        ycv = Reg.pred_data_[1]
-        yt = Reg.pred_data_[2]
+    if regression_algo in reg_algo[1:]:
+        yc = Reg.pred_data_[0]
+        ycv = Reg.pred_data_[1]
+        yt = Reg.pred_data_[2]
 
-        M1.write("-- Performance metrics --")
-        M1.dataframe(Reg.metrics_)
+        M2.write("-- Performance metrics --")
+        M2.dataframe(Reg.metrics_)
 
-        M7.pyplot(reg_plot([y_train, y_train, y_test],[yc, ycv, yt]))
-        M8.pyplot(resid_plot([y_train, y_train, y_test],[yc, ycv, yt]))
+        M7.pyplot(reg_plot([y_train, y_train, y_test],[yc, ycv, yt]))
+        M8.pyplot(resid_plot([y_train, y_train, y_test],[yc, ycv, yt]))
 
         #model_export = M1.selectbox("Choose way to export", options=["pickle", "joblib"], key=20)
-        model_name = M9.text_input('Give it a name')
-        if M9.button('Export Model'):
+        model_name = M9.text_input('Give it a name')
+        if M9.button('Export Model'):
+            path = 'data/models/model_'
+            if file == files_format[0]:
             #export_package = __import__(model_export)
-            with open('data/models/model_' + model_name + '_on_' + xcal_csv.name + '_and_' + ycal_csv.name + '_data_' + '.pkl','wb') as f:
+                with open(path + model_name + '_on_' + xcal_csv.name + '_and_' + ycal_csv.name + '_data_' + '.pkl','wb') as f:
                 joblib.dump(reg_model, f)
-
-            if regression_algo == reg_algo[3]:
-                rega[1].sort()
-                pd.DataFrame(rega[1]).to_csv('data/models/model_' + model_name + '_on_' + xcal_csv.name + '_and_' + ycal_csv.name + '_data_''Wavelengths_index.csv', sep = ';')
+                if regression_algo == reg_algo[3]:
+                    rega[1].sort()
+                    pd.DataFrame(rega[1]).to_csv(path + model_name + '_on_' + xcal_csv.name + '_and_' + ycal_csv.name + '_data_'+'Wavelengths_index.csv', sep = ';')
+
+            elif file == files_format[1]:
+                #export_package = __import__(model_export)
+                with open(path + model_name + '_on_' + '_data_' + '.pkl','wb') as f:
+                    joblib.dump(reg_model, f)
+                if regression_algo == reg_algo[3]:
+                    rega[1].sort()
+                    pd.DataFrame(rega[1]).to_csv(path + model_name + '_on_' + '_data_'+'Wavelengths_index.csv', sep = ';')
+            st.write('Model Exported')
+
+
        if regression_algo == reg_algo[3]:
            st.write('Model Exported')
-
+
 # create a report with information on the model
 ## see https://stackoverflow.com/a/59578663
-#M4.pyplot(reg_plot(meas==(ycal_csv,ycal_csv,ycal_csv], pred=[ycal_csv,ycal_csv,ycal_csv]))
-if st.session_state['interface'] == 'simple':
-    st.page_link('pages\\3-prediction.py', label = 'Keep on keepin\' on to predict your values !')
+    if st.session_state['interface'] == 'simple':
+        st.page_link('pages\\3-prediction.py', label = 'Keep on keepin\' on to predict your values !')
+
+
+## Load .dx file
-- 
GitLab