diff --git a/src/Class_Mod/Miscellaneous.py b/src/Class_Mod/Miscellaneous.py index 3c2f560ff3e8b000b4a4d813f881d9768a2276c4..a4e934e317db106458c29e9fd3f4f56f0252c8f9 100644 --- a/src/Class_Mod/Miscellaneous.py +++ b/src/Class_Mod/Miscellaneous.py @@ -23,13 +23,11 @@ def prediction(NIRS_csv, qsep, qhdr, model): @st.cache_data def reg_plot( meas, pred, train_idx, test_idx): ec = np.subtract(np.array(meas[0]).reshape(-1), np.array(pred[0]).reshape(-1)) - ecv = np.subtract(np.array(meas[1]).reshape(-1), np.array(pred[1]).reshape(-1)) - et = np.subtract(np.array(meas[2]).reshape(-1), np.array(pred[2]).reshape(-1)) + et = np.subtract(np.array(meas[1]).reshape(-1), np.array(pred[1]).reshape(-1)) fig, ax = plt.subplots(figsize = (12,4)) sns.regplot(x = meas[0] , y = pred[0], color='blue', label = 'Calib') - sns.regplot(x = meas[1], y = pred[1], color='red', label = 'CV') - sns.regplot(x = meas[2], y = pred[2], color='green', label = 'Test') + sns.regplot(x = meas[1], y = pred[1], color='green', label = 'Test') plt.plot([np.min(meas[0])-0.05, np.max([meas[0]])+0.05], [np.min(meas[0])-0.05, np.max([meas[0]])+0.05], color = 'black') for i, txt in enumerate(train_idx): @@ -37,14 +35,9 @@ def reg_plot( meas, pred, train_idx, test_idx): if np.abs(ec[i])> np.mean(ec)+ 3*np.std(ec): plt.annotate(txt ,(np.array(meas[0]).reshape(-1)[i], np.array(pred[0]).reshape(-1)[i])) - - for i, txt in enumerate(train_idx): - if np.abs(ecv[i])> np.mean(ecv)+ 3*np.std(ecv): - plt.annotate(txt ,(np.array(meas[1]).reshape(-1)[i], np.array(pred[1]).reshape(-1)[i])) - for i, txt in enumerate(test_idx): if np.abs(et[i])> np.mean(et)+ 3*np.std(et): - plt.annotate(txt ,(np.array(meas[2]).reshape(-1)[i], np.array(pred[2]).reshape(-1)[i])) + plt.annotate(txt ,(np.array(meas[1]).reshape(-1)[i], np.array(pred[1]).reshape(-1)[i])) ax.set_ylabel('Predicted values') ax.set_xlabel('Measured values') @@ -55,33 +48,33 @@ def reg_plot( meas, pred, train_idx, test_idx): def resid_plot( meas, pred, train_idx, test_idx): ec = np.subtract(np.array(meas[0]).reshape(-1), np.array(pred[0]).reshape(-1)) - ecv = np.subtract(np.array(meas[1]).reshape(-1), np.array(pred[1]).reshape(-1)) - et = np.subtract(np.array(meas[2]).reshape(-1), np.array(pred[2]).reshape(-1)) + et = np.subtract(np.array(meas[1]).reshape(-1), np.array(pred[1]).reshape(-1)) fig, ax = plt.subplots(figsize = (12,4)) - sns.residplot(x = meas[0], y = pred[0], color='blue', label = 'Calib') - sns.residplot(x = meas[1], y = pred[1], color='red', label = 'CV') - sns.residplot(x = meas[2], y = pred[2], color='green', label = 'Test') + sns.scatterplot(x = meas[0], y = ec, color='blue', label = 'Calib') + sns.scatterplot(x = meas[1], y = et, color='green', label = 'Test') + plt.axhline(y= 0, c ='black', linestyle = ':') + lim = np.max(abs(np.concatenate([ec, et], axis = 0)))*1.1 + plt.ylim(- lim, lim ) + + for i, txt in enumerate(train_idx): #plt.annotate(txt ,(np.array(meas[0]).reshape(-1)[i],ec[i])) if np.abs(ec[i])> np.mean(ec)+ 3*np.std(ec): - plt.annotate(txt ,(np.array(pred[0]).reshape(-1)[i],ec[i])) - - - for i, txt in enumerate(train_idx): - if np.abs(ecv[i])> np.mean(ecv)+ 3*np.std(ecv): - plt.annotate(txt ,(np.array(pred[1]).reshape(-1)[i],ecv[i])) + plt.annotate(txt ,(np.array(meas[0]).reshape(-1)[i],ec[i])) for i, txt in enumerate(test_idx): if np.abs(et[i])> np.mean(et)+ 3*np.std(et): - plt.annotate(txt ,(np.array(pred[2]).reshape(-1)[i],et[i])) + plt.annotate(txt ,(np.array(meas[1]).reshape(-1)[i],et[i])) ax.set_xlabel(f'{ train_idx.shape}') ax.set_ylabel('Residuals') ax.set_xlabel('Measured values') plt.legend() + plt.margins(0) + # function that create a download button - needs the data to save and the file name to store to @@ -105,3 +98,18 @@ def plot_spectra(df, xunits, yunits): plt.margins(x = 0) return fig + + +## descriptive stat +def desc_stats(x): + a = {} + a['N samples'] = x.shape[0] + a['Min'] = np.min(x) + a['Max'] = np.max(x) + a['Mean'] = np.mean(x) + a['Median'] = np.median(x) + a['S'] = np.std(x) + a['RSD(%)'] = np.std(x)*100/np.mean(x) + a['Skewness'] = skew(x, axis=0, bias=True) + a['Kurtosis'] = kurtosis(x, axis=0, bias=True) + return a \ No newline at end of file diff --git a/src/pages/2-model_creation.py b/src/pages/2-model_creation.py index 502ad9527d2d8a0074a625c335a6015bfa9b4f68..44f97e701b8651b3e2018054b94ee8b23468e2cf 100644 --- a/src/pages/2-model_creation.py +++ b/src/pages/2-model_creation.py @@ -4,7 +4,7 @@ st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide") from Modules import * from Class_Mod.DATA_HANDLING import * from pandas.api.types import is_float_dtype - +from Class_Mod.Miscellaneous import desc_stats add_header() st.session_state["interface"] = st.session_state.get('interface') @@ -13,11 +13,14 @@ if st.session_state["interface"] == 'simple': ####################################### page Design ####################################### -st.header("Calibration Model Development", divider='blue') -st.write("Create a predictive model, then use it for predicting your target variable (chemical values) from NIRS spectra") +st.title("Calibration Model Development") +st.markdown("Create a predictive model, then use it for predicting your target variable (chemical data) from NIRS spectra") +st.header("I - Data visualization", divider='blue') +M0, M00 = st.columns([1, .4]) +st.header("II - Model creation", divider='blue') + M1, M2, M3 = st.columns([2,3,2]) M4, M5 = st.columns([6,2]) -st.write("---") st.header("Model Diagnosis", divider='blue') M7, M8 = st.columns([2,2]) @@ -27,12 +30,11 @@ M9, M10 = st.columns([2,2]) M9.write("-- Save the model --") ###################################################################### - reg_algo = ["","Full-PLSR", "Locally Weighted PLSR", "Interval-PLSR", "Full-PLSR-sklearn", "PrePLStester"] ####################################### ########################################### files_format = ['.csv', '.dx'] -file = M3.radio('select files format:', options = files_format) +file = M00.radio('select files format:', options = files_format) ### Data spectra = pd.DataFrame @@ -40,19 +42,19 @@ y = pd.DataFrame # load .csv file if file == files_format[0]: - xcal_csv = M3.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns") + xcal_csv = M00.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns") if xcal_csv: - sepx = M3.radio("Select separator (X file) - _detected_: " + str(find_delimiter('data/'+xcal_csv.name)), + sepx = M00.radio("Select separator (X file) - _detected_: " + str(find_delimiter('data/'+xcal_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+xcal_csv.name))), key=0) - hdrx = M3.radio("samples name (X file)? - _detected_: " + str(find_col_index('data/'+xcal_csv.name)), + hdrx = M00.radio("samples name (X file)? - _detected_: " + str(find_col_index('data/'+xcal_csv.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+xcal_csv.name))), key=1) if hdrx == "yes": col = 0 else: col = False - ycal_csv = M3.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column") + ycal_csv = M00.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column") if ycal_csv: - sepy = M3.radio("separator (Y file): ", options=[";", ","], key=2) - hdry = M3.radio("samples name (Y file)?: ", options=["no", "yes"], key=3) + sepy = M00.radio("separator (Y file): ", options=[";", ","], key=2) + hdry = M00.radio("samples name (Y file)?: ", options=["no", "yes"], key=3) if hdry == "yes": col = 0 else: col = False @@ -64,7 +66,7 @@ if file == files_format[0]: spectra, meta_data = col_cat(xfile) y, idx = col_cat(yfile) if y.shape[1]>1: - yname = M3.selectbox('Select target', options=y.columns) + yname = M00.selectbox('Select target', options=y.columns) y = y.loc[:,yname] else: y = y.iloc[:,0] @@ -75,7 +77,7 @@ if file == files_format[0]: st.write(meta_data) if spectra.shape[0] != y.shape[0]: - M3.warning('X and Y have different sample size') + M00.warning('X and Y have different sample size') y = pd.DataFrame spectra = pd.DataFrame @@ -89,50 +91,59 @@ if file == files_format[0]: ## Load .dx file elif file == files_format[1]: - data_file = M3.file_uploader("Select Data", type=".dx", help=" :mushroom: select a dx file") + data_file = M00.file_uploader("Select Data", type=".dx", help=" :mushroom: select a dx file") if data_file: with NamedTemporaryFile(delete=False, suffix=".dx") as tmp: tmp.write(data_file.read()) tmp_path = tmp.name chem_data, spectra, meta_data, meta_data_st = read_dx(file = tmp_path) - M3.success("The data have been loaded successfully", icon="✅") + M00.success("The data have been loaded successfully", icon="✅") if chem_data.shape[1]>0: - yname = M3.selectbox('Select target', options=chem_data.columns) + yname = M00.selectbox('Select target', options=chem_data.columns) measured = chem_data.loc[:,yname] > 0 y = chem_data.loc[:,yname].loc[measured] spectra = spectra.loc[measured] else: - M3.warning('Warning: Chemical data are not included in your file !', icon="⚠️") + M00.warning('Warning: Chemical data are not included in your file !', icon="⚠️") os.unlink(tmp_path) ### split the data if not spectra.empty and not y.empty: - rd_seed = M1.slider("Customize Train-test split", min_value=1, max_value=100, value=42, format="%i") - # Split data into training and test sets using the kennard_stone method and correlation metric, 25% of data is used for testing - train_index, test_index = train_test_split_idx(spectra, y = y, method="kennard_stone", metric="correlation", test_size=0.25, random_state=rd_seed) + #rd_seed = M1.slider("Customize Train-test split", min_value=1, max_value=100, value=42, format="%i") + # Split data into training and test sets using the kennard_stone method and correlation metric, 25% of data is used for testing + train_index, test_index = train_test_split_idx(spectra, y = y, method="kennard_stone", metric="correlation", test_size=0.25, random_state=42) + # Assign data to training and test sets X_train, y_train = pd.DataFrame(spectra.iloc[train_index,:]), y.iloc[train_index] X_test, y_test = pd.DataFrame(spectra.iloc[test_index,:]), y.iloc[test_index] - sk = lambda x: skew(x, axis=0, bias=True) - ku = lambda x:kurtosis(x, axis=0, bias=True) - cv = lambda x: x.std()*100/x.mean() - - M2.write('Loaded data summary') - M2.write(f'The loaded spectra consist of {spectra.shape[1]} wavelengths') - datainf = pd.DataFrame() - datainf['N samples'] = [X_train.shape[0], X_test.shape[0], spectra.shape[0] ] - datainf['Mean'] = [y_train.mean(), y_test.mean(), y.mean()] - datainf['SD'] = [y_train.std(), y_test.std(), y.std()] - datainf['CV(%)'] = [cv(y_train), cv(y_test), cv(y)] - datainf['Skewness'] = [sk(y_train), sk(y_test), sk(y)] - datainf['Kurtosis'] = [ku(y_train), ku(y_test), ku(y)] - datainf.index = ['Train', 'Test', 'Total'] - M2.write(datainf.round(3)) - -####################################### + + #### insight on loaded data + fig, ax1 = plt.subplots( figsize = (12,3)) + spectra.T.plot(legend=False, ax = ax1, linestyle = '--') + ax1.set_ylabel('Signal intensity') + ax1.margins(0) + plt.tight_layout() + M0.pyplot(fig) + + fig, ax2 = plt.subplots(figsize = (12,3)) + sns.histplot(y, color="deeppink", kde = True,label="y",ax = ax2, fill=True) + sns.histplot(y_train, color="blue", kde = True,label="y (train)",ax = ax2, fill=True) + sns.histplot(y_test, color="green", kde = True,label="y (test)",ax = ax2, fill=True) + ax2.set_xlabel('y') + plt.legend() + plt.tight_layout() + + M0.pyplot(fig) + + + M0.write('Loaded data summary') + M0.write(pd.DataFrame([desc_stats(y_train),desc_stats(y_test),desc_stats(y)], index =['Train', 'Test', 'Total'] ).round(2)) + ####################################### Insight into the loaded data + + ####################################### regression_algo = M1.selectbox("Choose the algorithm for regression", options=reg_algo, key = 12) if regression_algo == reg_algo[1]: # Train model with model function from application_functions.py @@ -161,16 +172,6 @@ if not spectra.empty and not y.empty: Reg.pred_data_[i].index = list(y_train.index) else: Reg.pred_data_[i].index = list(y_test.index) - # Reg.pred_data_[0] = Reg.pred_data_[0].T.reset_index().drop(columns = ['index']) - # Reg.pred_data_[0].index = list(y_train.index) - # Reg.pred_data_[1] = Reg.pred_data_[1].T.reset_index().drop(columns = ['index']) - # Reg.pred_data_[1].index = list(y_train_cv1.index) - # Reg.pred_data_[2] = Reg.pred_data_[2].T.reset_index().drop(columns = ['index']) - # Reg.pred_data_[2].index = list(y_train_cv2.index) - # Reg.pred_data_[3] = Reg.pred_data_[3].T.reset_index().drop(columns = ['index']) - # Reg.pred_data_[3].index = list(y_train_cv3.index) - # Reg.pred_data_[4] = Reg.pred_data_[4].T.reset_index().drop(columns = ['index']) - # Reg.pred_data_[4].index = list(y_test.index) elif regression_algo == reg_algo[3]: s = M1.number_input(label='Enter the maximum number of intervals', min_value=1, max_value=6, value=3) @@ -220,7 +221,6 @@ if not spectra.empty and not y.empty: ################# Model analysis ############ if regression_algo in reg_algo[1:]: yc = Reg.pred_data_[0] - ycv = Reg.pred_data_[1] yt = Reg.pred_data_[2] #if @@ -230,7 +230,7 @@ if not spectra.empty and not y.empty: json.dump(Reg.best_hyperparams, outfile) M2.write("-- Performance metrics --") - M2.dataframe(metrics(c = [y_train, yc], cv = [y_train, ycv], t = [y_test, yt], method='regression').scores_) + M2.dataframe(metrics(c = [y_train, yc], t = [y_test, yt], method='regression').scores_) #from st_circular_progress import CircularProgress #my_circular_progress = CircularProgress(label = 'Performance',value = 50, key = 'my performance', # size = "medium", track_color = "black", color = "blue") @@ -238,8 +238,8 @@ if not spectra.empty and not y.empty: #my_circular_progress.st_circular_progress() #my_circular_progress.update_value(progress=20) - M7.pyplot(reg_plot([y_train, y_train, y_test],[yc, ycv, yt], train_idx = train_index, test_idx = test_index)) - M8.pyplot(resid_plot([y_train, y_train, y_test],[yc, ycv, yt], train_idx = train_index, test_idx = test_index)) + M7.pyplot(reg_plot([y_train, y_test],[yc, yt], train_idx = train_index, test_idx = test_index)) + M8.pyplot(resid_plot([y_train, y_test],[yc, yt], train_idx = train_index, test_idx = test_index)) #model_export = M1.selectbox("Choose way to export", options=["pickle", "joblib"], key=20)