### Selectivity ratio
def sel_ratio(model, x, alpha=0.05):
    """Return the variables whose selectivity ratio (SR) is significant.

    The centred predictors are projected onto the normalised regression
    vector of ``model`` (target projection). For each variable the SR is the
    sum of squares reproduced by that single target component divided by the
    residual sum of squares. A variable is kept when its SR exceeds the upper
    critical value of an F-distribution with ``(n - 2, n - 3)`` degrees of
    freedom, ``n`` being the number of samples (rows of ``x``).

    Parameters
    ----------
    model : object
        Fitted single-response linear model exposing ``coef_`` with exactly
        one coefficient per column of ``x`` (``(p,)``, ``(1, p)`` or
        ``(p, 1)`` are all accepted).
    x : array-like of shape (n_samples, n_features)
        Predictor matrix to analyse.
    alpha : float, default 0.05
        Significance level of the F-test.

    Returns
    -------
    pandas.DataFrame
        Single column ``'sr'``; the index holds the 0-based positional column
        numbers of the selected variables. Empty when nothing is significant
        or when fewer than four samples are supplied (the F-test is then
        undefined).

    Raises
    ------
    ValueError
        If ``model.coef_`` does not hold one coefficient per column of ``x``.
    """
    from scipy.stats import f

    x = pd.DataFrame(x)
    n_samples, n_features = x.shape

    coef = np.asarray(model.coef_, dtype=float).ravel()
    if coef.size != n_features:
        raise ValueError(
            f"model.coef_ holds {coef.size} coefficient(s) but x has "
            f"{n_features} column(s); sel_ratio supports single-response models only"
        )

    # Centre once and use the SAME centred matrix for scores, loadings and
    # residuals; mixing raw and centred data makes the explained/residual
    # split meaningless.
    xc = x.to_numpy(dtype=float)
    xc = xc - xc.mean(axis=0)

    # Target projection: scores along the unit regression vector, then the
    # loadings obtained by regressing every variable on those scores.
    wtp = (coef / np.linalg.norm(coef)).reshape(-1, 1)
    ttp = xc @ wtp
    ptp = (xc.T @ ttp) / (ttp.T @ ttp)
    x_tp = ttp @ ptp.T

    qexpi = np.linalg.norm(x_tp, axis=0) ** 2
    qres = np.linalg.norm(xc - x_tp, axis=0) ** 2

    # A perfectly explained variable has zero residual: let it become inf
    # (kept) or nan (dropped by the comparison below) without warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        ratio = qexpi / qres
    sr = pd.DataFrame(ratio, index=np.arange(n_features), columns=['sr'])

    if n_samples < 4:
        # Degrees of freedom (n - 2, n - 3) would be non-positive.
        return sr.iloc[0:0, :]

    # Upper-tail critical value; degrees of freedom depend on the number of
    # samples, not on the number of variables.
    fcr = f.ppf(1 - alpha, n_samples - 2, n_samples - 3)
    return sr.loc[sr['sr'] > fcr, :]
['Snv', 'No_transformation'])} + 'normalization': hp.choice('normalization', ['Snv', 'No_transformation'])} if add_hyperparams is not None: self._hyper_params.update(add_hyperparams) self._best = None @@ -54,10 +55,10 @@ class Regmodel(object): return self._best @property def best_hyperparams_print(self): - if self._best['scatter'] == 'Snv': + if self._best['normalization'] == 'Snv': a = 'Standard Normal Variate (SNV)' - elif self._best['scatter'] == 'No_transformation': + elif self._best['normalization'] == 'No_transformation': a = " No transformation was performed" SG = f'- Savitzky-Golay derivative parameters \:(Window_length:{self._best['window_length']}; polynomial order: {self._best['polyorder']}; Derivative order : {self._best['deriv']})' @@ -85,6 +86,10 @@ class Regmodel(object): @property def selected_features_(self): return self._selected_bands + + @property + def sel_ratio_(self): + return self._sel_ratio ########################################### ######################################### class Plsr(Regmodel): @@ -95,7 +100,7 @@ class Plsr(Regmodel): def objective(self, params): x0 = [self._xc, self._xt] - x1 = [eval(str(params['scatter'])+"(x0[i])") for i in range(2)] + x1 = [eval(str(params['normalization'])+"(x0[i])") for i in range(2)] a, b, c = params['deriv'], params['polyorder'], params['window_length'] if a > b or b > c: @@ -127,6 +132,7 @@ class Plsr(Regmodel): self._model = Model self._best = params self.pretreated = pd.DataFrame(x2[0]) + self._sel_ratio = sel_ratio(Model, x2[0]) return score @@ -153,7 +159,7 @@ class TpeIpls(Regmodel): # ## Preprocessing x0 = [self._xc, self._xt] - x1 = [eval(str(params['scatter'])+"(x0[i])") for i in range(2)] + x1 = [eval(str(params['normalization'])+"(x0[i])") for i in range(2)] a, b, c = params['deriv'], params['polyorder'], params['window_length'] if a > b or b > c: diff --git a/src/pages/1-samples_selection.py b/src/pages/1-samples_selection.py index 
a30e15e9b743656132755c45e5b5f5aebc9fce3d..1cd06b868fa7c769c57992d5426bc69789c36b9f 100644 --- a/src/pages/1-samples_selection.py +++ b/src/pages/1-samples_selection.py @@ -499,8 +499,7 @@ Ac_Km = ['Spectra_Plot.png', 'Elbow.png', 'graphe_loadings.png', 'plot_axe1_axe2 # Streamlit container with st.container(): - header3, header4 = st.columns(2) - if header3.button("Exporter le rapport"): + if st.button("Download report"): if test == '.csv': if dim_red_method == dim_red_methods[1] and clus_method == cluster_methods[1]: latex_report = report.report(sam, tcr, Nb_ech, nb_clu, 'sample', Ac_Km, 'csv', 'kmeans') diff --git a/src/pages/2-model_creation.py b/src/pages/2-model_creation.py index 49a680f7e8a4ff4075f00271a5b7e23ab050bb15..8e3082ae221ddd33363a41ed21187db9e7a18e87 100644 --- a/src/pages/2-model_creation.py +++ b/src/pages/2-model_creation.py @@ -67,8 +67,9 @@ files_format = ['.csv', '.dx'] file = M00.radio('Select files format:', options = files_format) ### Data -spectra = pd.DataFrame -y = pd.DataFrame +spectra = pd.DataFrame() +y = pd.DataFrame() + # load .csv file if file == files_format[0]: @@ -139,6 +140,10 @@ elif file == files_format[1]: ### split the data if not spectra.empty and not y.empty: + if np.array(spectra.columns).dtype.kind in ['i','f']: + colnames = spectra.columns + else: + colnames = np.arange(spectra.shape[1]) #rd_seed = M1.slider("Customize Train-test split", min_value=1, max_value=100, value=42, format="%i") @@ -216,39 +221,25 @@ if not spectra.empty and not y.empty: M1.progress(100, text = "The model has successfully been created!") time.sleep(1) reg_model = Reg.model_ - # M3.write('-- Spectral regions used for model creation --') - # intervalls = Reg.bands.T - # M3.table(intervalls) - # fig, ax = plt.subplots(figsize = (12, 6)) - # X_train.mean().plot(ax = ax) - # for i in range(s): - # colnames = np.array(y) - # num = {'u','i','f','c'} - # if np.array(X_train.columns).dtype.kind in num: - # plt.plot(X_train.columns, X_train.mean(), 
color = 'black') - # ax.axvspan(X_train.columns[intervalls['from'][i]], X_train.columns[intervalls['to'][i]], - # color='#2a52be', alpha=0.5, lw=0) - # plt.tight_layout() - # plt.margins(x = 0) - # else: - # plt.plot(np.arange(X_train.shape[1]), X_train.mean(), color = 'black') - # ax.axvspan(intervalls['from'][i], intervalls['to'][i], color='#2a52be', alpha=0.5, lw=0) - # plt.tight_layout() - # plt.margins(x = 0) - - # M3.write('-- Visualization of the spectral regions used for model creation -- ') - # M3.pyplot(fig) - M2.write('-- Spectral regions used for model creation --') - intervalls = Reg.selected_features_.T - M2.table(intervalls) + M2.write('-- Important Spectral regions used for model creation --') + intervalls = Reg.selected_features_.T + intervalls_with_cols = Reg.selected_features_.T + for i in range(intervalls.shape[0]): + for j in range(intervalls.shape[1]): + intervalls_with_cols.iloc[i,j] = spectra.columns[intervalls.iloc[i,j]] + M2.table(intervalls_with_cols) + # elif regression_algo == reg_algo[4]: # Reg = PlsR(x_train = X_train, x_test = X_test, y_train = y_train, y_test = y_test) # reg_model = Reg.model_ + + + ################# Model analysis ############ if regression_algo in reg_algo[1:]: - M2.write('-- Pretreated data (train) visualization and important spectral regions in the model -- ') + #M2.write('-- Pretreated data (train) visualization and important spectral regions in the model -- ') fig, (ax1, ax2) = plt.subplots(2,1, figsize = (12, 6)) fig = make_subplots(rows=3, cols=1, shared_xaxes=True, vertical_spacing=0.02) @@ -270,43 +261,22 @@ if not spectra.empty and not y.empty: # M3.plotly_chart(fig) - from matplotlib.colors import Normalize - color_variable = y_train - norm = Normalize(vmin=color_variable.min(), vmax= color_variable.max()) - cmap = plt.get_cmap('viridis') - colors = cmap(norm(color_variable.values)) - fig, ax = plt.subplots(figsize = (10,3)) - - for i in range(Reg.pretreated_spectra_.shape[0]): - 
ax.plot(Reg.pretreated_spectra_.columns, Reg.pretreated_spectra_.iloc[i,:], color = colors[i]) - sm = ScalarMappable(norm = norm, cmap = cmap) - cbar = plt.colorbar(sm, ax = ax) - # cbar.set_label('Target range') - plt.tight_layout() - htmlfig = mpld3.fig_to_html(fig) - with M2: - st.components.v1.html(htmlfig, height=600) - - - - # X_train.mean().plot(ax = ax2) - # for i in range(s): - # colnames = np.array(y) - # num = {'u','i','f','c'} - # if np.array(X_train.columns).dtype.kind in num: - # plt.plot(X_train.columns, X_train.mean(), color = 'black') - # ax2.axvspan(X_train.columns[intervalls['from'][i]], X_train.columns[intervalls['to'][i]], - # color='#2a52be', alpha=0.5, lw=0) - # plt.tight_layout() - # plt.margins(x = 0) - # else: - # plt.plot(np.arange(X_train.shape[1]), X_train.mean(), color = 'black') - # ax2.axvspan(intervalls['from'][i], intervalls['to'][i], color='#2a52be', alpha=0.5, lw=0) - # plt.tight_layout() - # plt.margins(x = 0) - - # pd.DataFrame(Reg.pretreated_spectra_).plot(ax = ax1) - # M3.pyplot(fig) + # from matplotlib.colors import Normalize + # color_variable = y_train + # norm = Normalize(vmin=color_variable.min(), vmax= color_variable.max()) + # cmap = plt.get_cmap('viridis') + # colors = cmap(norm(color_variable.values)) + # fig, ax = plt.subplots(figsize = (10,3)) + + # for i in range(Reg.pretreated_spectra_.shape[0]): + # ax.plot(Reg.pretreated_spectra_.columns, Reg.pretreated_spectra_.iloc[i,:], color = colors[i]) + # sm = ScalarMappable(norm = norm, cmap = cmap) + # cbar = plt.colorbar(sm, ax = ax) + # # cbar.set_label('Target range') + # plt.tight_layout() + # htmlfig = mpld3.fig_to_html(fig) + # with M2: + # st.components.v1.html(htmlfig, height=600) ############ @@ -329,7 +299,7 @@ if not spectra.empty and not y.empty: cv1.plotly_chart(fig0, use_container_width=True) fig0.write_image("./Report/figures/Predictions_V.png") - + yc = Reg.pred_data_[0] yt = Reg.pred_data_[1] @@ -339,8 +309,8 @@ if not spectra.empty and not y.empty: 
M1.write(Reg.best_hyperparams_print) a_Test=Reg.best_hyperparams_print - # with open("data/params/Preprocessing.json", "w") as outfile: - # json.dump(Reg.best_hyperparams_, outfile) + with open("data/params/Preprocessing.json", "w") as outfile: + json.dump(Reg.best_hyperparams_, outfile) ########## M1.write("-- Model performance --") M1.dataframe(metrics(c = [y_train, yc], t = [y_test, yt], method='regression').scores_) @@ -361,7 +331,7 @@ if not spectra.empty and not y.empty: M8.pyplot(residual_plot) plt.savefig('./Report/figures/residual_plot.png') - rega = Reg.important_features_ ##### ADD FEATURES IMPORTANCE PLOT + rega = Reg.selected_features_ ##### ADD FEATURES IMPORTANCE PLOT #model_export = M1.selectbox("Choose way to export", options=["pickle", "joblib"], key=20) model_name = M9.text_input('Give it a name') @@ -374,8 +344,7 @@ if not spectra.empty and not y.empty: '_and_' + ycal_csv.name[:ycal_csv.name.find(".")] + '_data_' + '.pkl','wb') as f: joblib.dump(reg_model, f) if regression_algo == reg_algo[3]: - rega[1].sort() - pd.DataFrame(rega[1]).to_csv(path + model_name + date_time + '_on_' + xcal_csv.name[:xcal_csv.name.find(".")] + Reg.selected_features_.T.to_csv(path + model_name + date_time + '_on_' + xcal_csv.name[:xcal_csv.name.find(".")] + '_and_' + ycal_csv.name[:ycal_csv.name.find(".")] + '_data_'+'Wavelengths_index.csv', sep = ';') elif file == files_format[1]: @@ -383,13 +352,8 @@ if not spectra.empty and not y.empty: with open(path + model_name + '_on_'+ data_file.name[:data_file.name.find(".")] + '_data_' + '.pkl','wb') as f: joblib.dump(reg_model, f) if regression_algo == reg_algo[3]: - rega[1].sort() - pd.DataFrame(rega[1]).to_csv(path +data_file.name[:data_file.name.find(".")]+ model_name + date_time+ '_on_' + '_data_'+'Wavelengths_index.csv', sep = ';') + Reg.selected_features_.T.to_csv(path +data_file.name[:data_file.name.find(".")]+ model_name + date_time+ '_on_' + '_data_'+'Wavelengths_index.csv', sep = ';') st.write('Model Exported ') - 
# Overlay the mean raw and mean pretreated training spectra with the spectral
# information the fitted model relies on: shaded intervals for reg_algo[3],
# red markers on the variables returned by Reg.sel_ratio_ for reg_algo[1].
# NOTE(review): nesting was reconstructed from whitespace-collapsed text --
# confirm that the scatter section belongs outside the per-axis loop.
if not spectra.empty and not y.empty:
    if regression_algo in reg_algo[1:]:
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize = (12, 4), sharex = True)
        ax1.plot(colnames, np.mean(X_train, axis = 0), color = 'black', label = 'Average spectrum (Raw)')
        ax2.plot(colnames, np.mean(Reg.pretreated_spectra_, axis = 0), color = 'black', label = 'Average spectrum (pretreated)')

        ax2.set_xlabel('Wavelenghts')
        plt.tight_layout()

        # Numeric column headers: interval bounds are drawn in header units,
        # otherwise positional indices are used. Evaluated once, not per span.
        numeric_cols = np.array(spectra.columns).dtype.kind in ['i', 'f']

        # Iterate over the axes directly instead of eval()-ing their names.
        for ax in (ax1, ax2):
            ax.grid(color = 'grey', linestyle = ':', linewidth = 0.2)
            ax.margins(x = 0)
            ax.legend(loc = 'upper right')
            ax.set_ylabel('Intensity')
            if regression_algo == reg_algo[3]:
                # The interval tables are only referenced inside this branch.
                bounds = intervalls_with_cols if numeric_cols else intervalls
                for j in range(s):
                    # Do not name these min/max: at module level that would
                    # rebind the builtins for the rest of the script.
                    lower, upper = bounds['from'][j], bounds['to'][j]
                    ax.axvspan(lower, upper, color = '#00ff00', alpha = 0.5, lw = 0)

        if regression_algo == reg_algo[1]:
            important = np.array(Reg.sel_ratio_.index)
            # np.asarray(...) avoids calling the deprecated Series.ravel().
            raw_mean = np.asarray(np.mean(X_train, axis = 0)).ravel()
            pretreated_mean = np.asarray(np.mean(Reg.pretreated_spectra_, axis = 0)).ravel()
            ax1.scatter(colnames[important], raw_mean[important],
                        color = 'red', label = 'Important variables')
            ax2.scatter(colnames[important], pretreated_mean[important],
                        color = 'red', label = 'Important variables')
            # Refresh the legends so the new scatter label is listed.
            ax1.legend()
            ax2.legend()

        M2.write('-- Visualization of the spectral regions used for model creation -- ')
        M2.pyplot(fig)
b/src/pages/3-prediction.py @@ -17,18 +17,21 @@ local_css(css_file / "style_model.css") st.header("Data loading", divider='blue') -model_column1, space1, file_column1= st.columns([2, 1, 1]) +M1, M2= st.columns([2, 1]) + +st.header('Data preprocessing', divider='blue') +M3, M4= st.columns([2, 1]) + st.header("Prediction making", divider='blue') -model_column2, space2, file_column2= st.columns([2, 1, 1]) -_, space3, _ = st.columns([1, 3, 1]) +M5, M6 = st.columns([2, 0.01]) files_format = ['.csv', '.dx'] -file = file_column1.file_uploader("Select NIRS Data to predict", type = files_format, help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns") +file = M2.file_uploader("Select NIRS Data to predict", type = files_format, help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns") export_folder = './data/predictions/' export_name = 'Predictions_of_' reg_algo = ["Interval-PLS"] -pred_data = pd.DataFrame +pred_data = pd.DataFrame() loaded_model = None @@ -38,8 +41,8 @@ if file: if test == files_format[0]: # - qsep = file_column1.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+file.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+file.name))), key=2) - qhdr = file_column1.selectbox("indexes column in csv? - _detected_: " + str(find_col_index('data/'+file.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+file.name))), key=3) + qsep = M2.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+file.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+file.name))), key=2) + qhdr = M2.selectbox("indexes column in csv? 
- _detected_: " + str(find_col_index('data/'+file.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+file.name))), key=3) if qhdr == 'yes': col = 0 else: @@ -51,55 +54,61 @@ if file: tmp.write(file.read()) tmp_path = tmp.name chem_data, spectra, meta_data, _ = read_dx(file = tmp_path) - file_column1.success("The data have been loaded successfully", icon="✅") + M2.success("The data have been loaded successfully", icon="✅") if chem_data.to_numpy().shape[1]>0: - yname = file_column1.selectbox('Select target', options=chem_data.columns) + yname = M2.selectbox('Select target', options=chem_data.columns) measured = chem_data.loc[:,yname] == 0 y = chem_data.loc[:,yname].loc[measured] pred_data = spectra.loc[measured] else: pred_data = spectra - os.unlink(tmp_path) # Load parameters if not pred_data.empty:# Load the model with joblib - model_column1.write('Raw spectra') + M1.write('Raw spectra') fig = plot_spectra(pred_data, xunits = 'lab', yunits = "meta_data.loc[:,'yunits'][0]") - model_column1.pyplot(fig) + M1.pyplot(fig) + ### preprocessing preprocessed = pd.DataFrame if not pred_data.empty: - params = file_column1.file_uploader("Load preprocessings params", type = '.json', help=" .json file") + params = M4.file_uploader("Load preprocessings params", type = '.json', help=" .json file") if params: prep = json.load(params) + # M4.write(ProcessLookupError) - if prep['Scatter'] == 'SNV': + if prep['normalization'] == 'Snv': x1 = Snv(pred_data) + norm = 'Standard Normal Variate' else: + norm = 'No Normalization was applied' x1 = pred_data x2 = savgol_filter(x1, - window_length = prep["Saitzky-Golay derivative parameters"]["window_length"], - polyorder = prep["Saitzky-Golay derivative parameters"]["polyorder"], - deriv=prep["Saitzky-Golay derivative parameters"]["deriv"], + window_length = prep["window_length"], + polyorder = prep["polyorder"], + deriv=prep["deriv"], delta=1.0, axis=-1, mode="interp", cval=0.0) preprocessed = pd.DataFrame(x2, 
index = pred_data.index, columns = pred_data.columns) - + +################################################################################################ ## plot preprocessed spectra if not preprocessed.empty: - model_column1.write('Preprocessed spectra') + M3.write('Preprocessed spectra') fig2 = plot_spectra(preprocessed, xunits = 'lab', yunits = "meta_data.loc[:,'yunits'][0]") - model_column1.pyplot(fig2) - + M3.pyplot(fig2) + SG = f'- Savitzky-Golay derivative parameters \:(Window_length:{prep['window_length']}; polynomial order: {prep['polyorder']}; Derivative order : {prep['deriv']})' + Norm = f'- Spectral Normalization \: {norm}' + M4.write('The spectra were preprocessed using:\n'+SG+"\n"+Norm) ################### Predictions making ########################## if not pred_data.empty:# Load the model with joblib #dir = os.listdir('data/models/')[1:] dir = os.listdir('data/models/') dir.insert(0,'') - model_name = model_column2.selectbox("Select your model from the dropdown list:", options = dir, key = 21) + model_name = M5.selectbox("Select your model from the dropdown list:", options = dir, key = 21) if model_name and model_name !='': export_name += '_with_' + model_name[:model_name.find('.')] @@ -107,16 +116,18 @@ if not pred_data.empty:# Load the model with joblib loaded_model = joblib.load(f) if loaded_model: - model_column2.success("The model has been loaded successfully", icon="✅") - s = model_column2.checkbox('the model is of ipls type?') + M5.success("The model has been loaded successfully", icon="✅") + s = M5.checkbox('the model is of ipls type?') if s: - index = model_column2.file_uploader("select wavelengths index file", type="csv") + index = M5.file_uploader("select wavelengths index file", type="csv") if index: - idx = pd.read_csv(index, sep=';', index_col=0).iloc[:,0].to_numpy() - + intervalls = pd.read_csv(index, sep=';', index_col=0).to_numpy() + idx = [] + for i in range(intervalls.shape[0]): + idx.extend(np.arange(intervalls[i,0], 
intervalls[i,1]+1)) if loaded_model: - if model_column2.button('Predict'): + if M5.button('Predict'): if s: result = loaded_model.predict(preprocessed.iloc[:,idx]) else: @@ -124,19 +135,14 @@ if loaded_model: result = loaded_model.predict(x2) result = pd.DataFrame(result, index = pred_data.index) - st.write('Predicted values') - st.dataframe(result.T) ############################# + M5.write('Predicted values distribution') # Creating histogram - fig, axs = plt.subplots(1, 1, - figsize =(12, 6), + fig, axs = plt.subplots(1, 1, figsize =(15, 3), tight_layout = True) # Add x, y gridlines - axs.grid( color ='grey', - linestyle ='-.', linewidth = 0.5, - alpha = 0.6) - plt.title('Predicted values distribution') + axs.grid( color ='grey', linestyle ='-.', linewidth = 0.5, alpha = 0.6) # Remove axes splines for s in ['top', 'bottom', 'left', 'right']: axs.spines[s].set_visible(False) @@ -156,13 +162,13 @@ if loaded_model: color = plt.cm.viridis(norm(thisfrac)) thispatch.set_facecolor(color) - space3.pyplot(fig) + M5.pyplot(fig) + M6.write('Predicted values table') + M6.dataframe(result.T) ################################## result.to_csv(export_folder + export_name + '.csv', sep = ';') # export to local drive - Download download_results(export_folder + export_name + '.csv', export_name + '.csv') # create a report with information on the prediction - ## see https://stackoverflow.com/a/59578663 - - + ## see https://stackoverflow.com/a/59578663 \ No newline at end of file