diff --git a/src/Class_Mod/RegModels.py b/src/Class_Mod/RegModels.py index 0063bccc675e8aadd7bd2b9f773d5b245bd6a891..01fda381bc0463bb325a2fe89ab036f5c8b5b239 100644 --- a/src/Class_Mod/RegModels.py +++ b/src/Class_Mod/RegModels.py @@ -93,8 +93,8 @@ class Regmodel(object): ########################################### PLSR ######################################### class Plsr(Regmodel): - def __init__(self, train, test, n_iter = 10): - super().__init__(train, test, n_iter, add_hyperparams = {'n_components': hp.randint('n_components', 2,20)}) + def __init__(self, train, test, n_iter = 10, nfolds = 3): + super().__init__(train, test, n_iter, nfolds = nfolds, add_hyperparams = {'n_components': hp.randint('n_components', 1,20)}) ### parameters in common def objective(self, params): @@ -114,14 +114,9 @@ class Plsr(Regmodel): params['deriv'], params['polyorder'], params['window_length'] = a, b, c x2 = [savgol_filter(x1[i], polyorder=params['polyorder'], deriv=params['deriv'], window_length = params['window_length']) for i in range(2)] - Model = PLSRegression(scale = False, n_components = params['n_components']) - # self._cv_df = KF_CV().process(model = Model, x = x2[0], y = self._ytrain, n_folds = self._nfolds) - # self._cv_df['Average'] = self._cv_df.mean(axis = 1) - # self._cv_df['S'] = self._cv_df.std(axis = 1) - # self._cv_df['CV(%)'] = self._cv_df['S'] * 100 / self._cv_df['Average'] - # self._cv_df = self._cv_df.T.round(2) + model = PLSRegression(scale = False, n_components = params['n_components']) folds = KF_CV().CV(x = x2[0], y = np.array(self._ytrain), n_folds = self._nfolds) - yp = KF_CV().cross_val_predictor(model = Model, folds = folds, x = x2[0], y = np.array(self._ytrain)) + yp = KF_CV().cross_val_predictor(model = model, folds = folds, x = x2[0], y = np.array(self._ytrain)) self._cv_df = KF_CV().metrics_cv(y = np.array(self._ytrain), ypcv = yp, folds =folds)[1] score = self._cv_df.loc["cv",'rmse'] @@ -147,15 +142,15 @@ class Plsr(Regmodel): ############################################ iplsr ######################################### class TpeIpls(Regmodel): - def __init__(self, train, test, n_iter = 10, n_intervall = 5): + def __init__(self, train, test, n_iter = 10, n_intervall = 5, nfolds = 3): self.n_intervall = n_intervall self.n_arrets = self.n_intervall*2 - r = {'n_components': hp.randint('n_components', 2,10)} + r = {'n_components': hp.randint('n_components', 1,20)} r.update({f'v{i}': hp.randint(f'v{i}', 0, train[0].shape[1]) for i in range(1,self.n_arrets+1)}) - super().__init__(train, test, n_iter, add_hyperparams = r) + super().__init__(train, test, n_iter, add_hyperparams = r, nfolds = nfolds) ### parameters in common @@ -166,7 +161,7 @@ class TpeIpls(Regmodel): arrays = [np.arange(self.idx[2*i],self.idx[2*i+1]+1) for i in range(self.n_intervall)] id = np.unique(np.concatenate(arrays, axis=0), axis=0) - # ## Preprocessing + ### Preprocessing x0 = [self._xc, self._xt] x1 = [eval(str(params['normalization'])+"(x0[i])") for i in range(2)] @@ -180,35 +175,35 @@ class TpeIpls(Regmodel): params['deriv'], params['polyorder'], params['window_length'] = a, b, c x2 = [savgol_filter(x1[i], polyorder=params['polyorder'], deriv=params['deriv'], window_length = params['window_length']) for i in range(2)] - # print(x2) - # ## Modelling - folds = KF_CV().CV(x = x2[0], y = np.array(self._ytrain), n_folds = self._nfolds) + + prepared_data = [x2[i][:,id] for i in range(2)] + + + ### Modelling + folds = KF_CV().CV(x = prepared_data[0], y = np.array(self._ytrain), n_folds = self._nfolds) try: - 
- Model = PLSRegression(scale = False, n_components = params['n_components']) - yp = KF_CV().cross_val_predictor(model = Model, folds = folds, x = x2[0], y = np.array(self._ytrain)) + model = PLSRegression(scale = False, n_components = params['n_components']) + yp = KF_CV().cross_val_predictor(model = model, folds = folds, x = prepared_data[0], y = np.array(self._ytrain)) self._cv_df = KF_CV().metrics_cv(y = np.array(self._ytrain), ypcv = yp, folds =folds)[1] except ValueError as ve: - Model = PLSRegression(scale = False, n_components = 1) params["n_components"] = 1 - yp = KF_CV().cross_val_predictor(model = Model, folds = folds, x = x2[0], y = np.array(self._ytrain)) + model = PLSRegression(scale = False, n_components = params["n_components"]) + yp = KF_CV().cross_val_predictor(model = model, folds = folds, x = prepared_data[0], y = np.array(self._ytrain)) self._cv_df = KF_CV().metrics_cv(y = np.array(self._ytrain), ypcv = yp, folds =folds)[1] - # self._cv_df['Average'] = self._cv_df.mean(axis = 1) - # self._cv_df['S'] = self._cv_df.std(axis = 1) - # self._cv_df['CV(%)'] = self._cv_df['S'] * 100 / self._cv_df['Average'] - # self._cv_df = self._cv_df.T.round(2) + + score = self._cv_df.loc['cv','rmse'] - Model = PLSRegression(scale = False, n_components = params['n_components']) - Model.fit(x2[0][:,id], self._ytrain) + Model = PLSRegression(scale = False, n_components = model.n_components) + Model.fit(prepared_data[0], self._ytrain) if self.SCORE > score: self.SCORE = score self._ycv = KF_CV().meas_pred_eq(y = np.array(self._ytrain), ypcv=yp, folds=folds) - self._yc = Model.predict(x2[0][:,id]) - self._yt = Model.predict(x2[1][:,id]) + self._yc = Model.predict(prepared_data[0]) + self._yt = Model.predict(prepared_data[1]) self._model = Model for key,value in params.items(): try: params[key] = int(value) @@ -231,4 +226,4 @@ class TpeIpls(Regmodel): class Pcr(Regmodel): def __init__(self, train, test, n_iter = 10, n_val = 5): super.__init__() - {f'pc{i}': hp.randint(f'pc{i+1}', 0, train[0].shape[1]) for i in range(self.n_val)} \ No newline at end of file + {f'pc{i}': hp.randint(f'pc{i+1}', 0, train[0].shape[1]) for i in range(self.n_val)} diff --git a/src/pages/1-samples_selection.py b/src/pages/1-samples_selection.py index 1aed061b5ff76028733454a5afb343613ae0739d..19e2ab6a3153282b84197dc21625b01554dd0306 100644 --- a/src/pages/1-samples_selection.py +++ b/src/pages/1-samples_selection.py @@ -658,6 +658,6 @@ if not sam.empty: zipname = json.load(f) if os.path.split(recent_file)[1] == os.path.split(zipname)[1]: with open("./temp/"+zipname, "rb") as fp: - st.write('Download the Analysis Results') + st.subheader('Download the Analysis Results') st.download_button('Download', data = fp, file_name=zipname, mime="application/zip", - args=None, kwargs=None,type="primary",use_container_width=True) + args=None, kwargs=None,type="primary",use_container_width=True) \ No newline at end of file diff --git a/src/pages/2-model_creation.py b/src/pages/2-model_creation.py index a9f995a776b5475dcccbd362fa2449a1437b6317..ad94b1fb602a581c9bd26715af15c75028976053 100644 --- a/src/pages/2-model_creation.py +++ b/src/pages/2-model_creation.py @@ -1,5 +1,5 @@ from Packages import * -st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide") +st.set_page_config(page_title = "NIRS Utils", page_icon = ":goat:", layout = "wide") from Modules import * from Class_Mod.DATA_HANDLING import * add_header() @@ -33,6 +33,7 @@ def delete_dir(): def increment(): st.session_state.counter += 1 + # 
#################################### Methods ############################################## class lw: def __init__(self, Reg_json, pred): @@ -40,28 +41,12 @@ class lw: self.best_hyperparams_ = Reg_json['best_lwplsr_params'] self.pred_data_ = [pd.json_normalize(Reg_json[i]) for i in pred] -# @st.cache_data -# # def tpeipls_(change, n_intervall, n_iter): -# Reg = TpeIpls(train = [X_train, y_train], test=[X_test, y_test], n_intervall = n_intervall, n_iter=n_iter) -# # time.sleep(1) -# # reg_model = Reg.model_ -# # global intervalls -# # intervalls = Reg.selected_features_.T -# # intervalls_with_cols = Reg.selected_features_.T -# # for i in range(intervalls.shape[0]): -# # for j in range(intervalls.shape[1]): -# # intervalls_with_cols.iloc[i,j] = spectra.columns[intervalls.iloc[i,j]] -# # rega = Reg.selected_features_ -# return Reg #, reg_model, intervalls, intervalls_with_cols, rega -def auto_execute(func): - func() - return func # ####################################### page preamble ####################################### st.title("Calibration Model Development") # page title st.markdown("Create a predictive model, then use it for predicting your target variable (chemical data) from NIRS spectra") M0, M00 = st.columns([1, .4]) -M0.image("./images/model_creation.png", use_column_width=True) # graphical abstract +M0.image("./images/model_creation.png", use_column_width = True) # graphical abstract @@ -69,7 +54,7 @@ M0.image("./images/model_creation.png", use_column_width=True) # graphical abstr ################################################################# Begin : I- Data loading and preparation ###################################### files_format = ['csv', 'dx'] # Supported files format -file = M00.radio('Select files format:', options = files_format,horizontal=True) # Select a file format +file = M00.radio('Select files format:', options = files_format,horizontal = True) # Select a file format spectra = pd.DataFrame() # preallocate the spectral data block y = pd.DataFrame() # preallocate the target(s) data block match file: @@ -77,12 +62,12 @@ match file: case 'csv': with M00: # Load X-block data - xcal_csv = st.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns") + xcal_csv = st.file_uploader("Select NIRS Data", type = "csv", help = " :mushroom: select a csv matrix with samples as rows and lambdas as columns") if xcal_csv: sepx = st.radio("Select separator (X file) - _detected_: " + str(find_delimiter('data/'+xcal_csv.name)), - options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+xcal_csv.name))), key=0,horizontal=True) + options = [";", ","], index = [";", ","].index(str(find_delimiter('data/'+xcal_csv.name))), key = 0,horizontal = True) hdrx = st.radio("samples name (X file)? 
- _detected_: " + str(find_col_index('data/'+xcal_csv.name)), - options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+xcal_csv.name))), key=1,horizontal=True) + options = ["no", "yes"], index = ["no", "yes"].index(str(find_col_index('data/'+xcal_csv.name))), key = 1,horizontal = True) match hdrx: case "yes": col = 0 @@ -92,12 +77,12 @@ match file: st.warning('Insert your spectral data file here!') # Load Y-block data - ycal_csv = st.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column") + ycal_csv = st.file_uploader("Select corresponding Chemical Data", type = "csv", help = " :mushroom: select a csv matrix with samples as rows and chemical values as a column") if ycal_csv: sepy = st.radio("Select separator (Y file) - _detected_: " + str(find_delimiter('data/'+ycal_csv.name)), - options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+ycal_csv.name))), key=2,horizontal=True) + options = [";", ","], index = [";", ","].index(str(find_delimiter('data/'+ycal_csv.name))), key = 2, horizontal = True) hdry = st.radio("samples name (Y file)? - _detected_: " + str(find_col_index('data/'+ycal_csv.name)), - options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+ycal_csv.name))), key=3,horizontal=True) + options = ["no", "yes"], index = ["no", "yes"].index(str(find_col_index('data/'+ycal_csv.name))), key = 3, horizontal = True) match hdry: case "yes": @@ -121,50 +106,59 @@ match file: @st.cache_data def csv_loader(change): file_name = str(xcal_csv.name) +' and '+ str(ycal_csv.name) - xfile = pd.read_csv(xcal_csv, decimal='.', sep=sepx, index_col=col, header=0) - yfile = pd.read_csv(ycal_csv, decimal='.', sep=sepy, index_col=col) - xfile.to_csv("./Report/datasets/"+xcal_csv.name,sep = ';', encoding='utf-8', mode='a') - yfile.to_csv("./Report/datasets/"+ycal_csv.name,sep = ';', encoding='utf-8', mode='a') + xfile = pd.read_csv(xcal_csv, decimal = '.', sep = sepx, index_col = col, header = 0) + yfile = pd.read_csv(ycal_csv, decimal = '.', sep = sepy, index_col = col) + xfile.to_csv("./Report/datasets/"+xcal_csv.name,sep = ';', encoding = 'utf-8', mode = 'a') + yfile.to_csv("./Report/datasets/"+ycal_csv.name,sep = ';', encoding = 'utf-8', mode = 'a') return xfile, yfile, file_name - xfile, yfile, file_name = csv_loader(change =xy_hash) + + xfile, yfile, file_name = csv_loader(change = xy_hash) if yfile.shape[1]>0 and xfile.shape[1]>0 : # prepare x data - spectra, meta_data = col_cat(xfile) + try: + spectra, meta_data = col_cat(xfile) + except: + st.error('Error: The format of the X-file does not correspond to the expected dialect settings. To read the file correctly, please adjust the separator parameters.') spectra = pd.DataFrame(spectra).astype(float) # prepare y data - chem_data, idx = col_cat(yfile) - if chem_data.shape[1]>1: - yname = M00.selectbox('Select target', options=chem_data.columns) - y = chem_data.loc[:,yname] - else: - y = chem_data.iloc[:,0] - + try: + chem_data, idx = col_cat(yfile) + except: + st.error('Error: The format of the Y-file does not correspond to the expected dialect settings. 
To read the file correctly, please adjust the separator parameters.') + + if 'chem_data' in globals(): + if chem_data.shape[1]>1: + yname = M00.selectbox('Select target', options = chem_data.columns) + y = chem_data.loc[:, yname] + else: + y = chem_data.iloc[:, 0] + ### warning if spectra.shape[0] != y.shape[0]: - st.warning('X and Y have different sample size') + st.error('Error: X and Y have different sample size') y = pd.DataFrame spectra = pd.DataFrame else: - st.error('Error: The data has not been loaded successfully, please consider tuning the decimal and separator !') + st.error('Error: The data has not been loaded successfully, please consider tuning the dialect settings !') # Load .dx file case 'dx': with M00: - data_file = st.file_uploader("Select Data", type=".dx", help=" :mushroom: select a dx file") + data_file = st.file_uploader("Select Data", type = ".dx", help = " :mushroom: select a dx file") if data_file: file_name = str(data_file.name) ## creating the temp file - with NamedTemporaryFile(delete=False, suffix=".dx") as tmp: + with NamedTemporaryFile(delete = False, suffix = ".dx") as tmp: tmp.write(data_file.read()) tmp_path = tmp.name with open(tmp.name, 'r') as dd: dxdata = dd.read() - xy_hash = str(dxdata) + xy_hash = hash_data(str(dxdata)) with open('Report/datasets/'+data_file.name, 'w') as dd: dd.write(dxdata) ## load and parse the temp dx file @@ -173,18 +167,18 @@ match file: chem_data, spectra, meta_data, meta_data_st = read_dx(file = tmp_path) os.unlink(tmp_path) return chem_data, spectra, meta_data, meta_data_st - chem_data, spectra, meta_data, meta_data_st = dx_loader(change = hash_data(xy_hash)) + chem_data, spectra, meta_data, meta_data_st = dx_loader(change = dxdata) if not spectra.empty: - st.success("The data have been loaded successfully", icon="✅") + st.success("The data have been loaded successfully", icon = "✅") if chem_data.shape[1]>0: - yname = st.selectbox('Select target', options=chem_data.columns) - measured = chem_data.loc[:,yname] > 0 - y = chem_data.loc[:,yname].loc[measured] + yname = st.selectbox('Select target', options = chem_data.columns) + measured = chem_data.loc[:, yname] > 0 + y = chem_data.loc[:, yname].loc[measured] spectra = spectra.loc[measured] else: - st.warning('Warning: your file includes no target variables to model !', icon="âš ï¸") + st.warning('Warning: your file includes no target variables to model !', icon = "âš ï¸") else : @@ -198,19 +192,18 @@ match file: ################################################### BEGIN : visualize and split the data #################################################### -st.header("I - Data visualization", divider='blue') +st.header("I - Data visualization", divider = 'blue') if not spectra.empty and not y.empty: @st.cache_data def visualize(change): - - if np.array(spectra.columns).dtype.kind in ['i','f']: + if np.array(spectra.columns).dtype.kind in ['i', 'f']: colnames = spectra.columns else: colnames = np.arange(spectra.shape[1]) # Split data into training and test sets using the kennard_stone method and correlation metric, 25% of data is used for testing - train_index, test_index = train_test_split_idx(spectra, y = y, method="kennard_stone", metric="correlation", test_size=0.25, random_state=42) + train_index, test_index = train_test_split_idx(spectra, y = y, method = "kennard_stone", metric = "correlation", test_size = 0.25, random_state = 42) # Assign data to training and test sets X_train, y_train = pd.DataFrame(spectra.iloc[train_index,:]), y.iloc[train_index] @@ -219,30 +212,32 @@ if 
not spectra.empty and not y.empty: #### insight on loaded data # M0, M000 = st.columns([1, .4]) - fig1, ax1 = plt.subplots( figsize = (12,3)) - spectra.T.plot(legend=False, ax = ax1, linestyle = '-', linewidth = 0.6) + fig1, ax1 = plt.subplots( figsize = (12, 3)) + spectra.T.plot(legend = False, ax = ax1, linestyle = '-', linewidth = 0.6) ax1.set_ylabel('Signal intensity') ax1.margins(0) plt.tight_layout() fig2, ax2 = plt.subplots(figsize = (12,3)) - sns.histplot(y, color="deeppink", kde = True,label="y",ax = ax2, fill=True) - sns.histplot(y_train, color="blue", kde = True,label="y (train)",ax = ax2, fill=True) - sns.histplot(y_test, color="green", kde = True,label="y (test)",ax = ax2, fill=True) + sns.histplot(y, color = "deeppink", kde = True, label = "y", ax = ax2, fill = True) + sns.histplot(y_train, color = "blue", kde = True, label = "y (train)", ax = ax2, fill = True) + sns.histplot(y_test, color = "green", kde = True, label = "y (test)", ax = ax2, fill = True) ax2.set_xlabel('y') plt.legend() plt.tight_layout() - stats=pd.DataFrame([desc_stats(y_train), desc_stats(y_test), desc_stats(y)], index =['train', 'test', 'total'] ).round(2) + stats = pd.DataFrame([desc_stats(y_train), desc_stats(y_test), desc_stats(y)], index =['train', 'test', 'total'] ).round(2) + + fig1.savefig("./Report/figures/spectra_plot.png") + fig2.savefig("./Report/figures/histogram.png") - return X_train, X_test,y_train, y_test, colnames, train_index, test_index, stats, fig1, fig2 - X_train, X_test,y_train, y_test, colnames, train_index, test_index, stats, fig1, fig2= visualize(change = hash_data(y+np.median(spectra))) + return X_train, X_test, y_train, y_test, colnames, train_index, test_index, stats, fig1, fig2 + X_train, X_test, y_train, y_test, colnames, train_index, test_index, stats, fig1, fig2 = visualize(change = xy_hash) M0, M000 = st.columns([1, .4]) with M0: st.pyplot(fig1) ######## Loaded graph st.pyplot(fig2) - fig1.savefig("./Report/figures/spectra_plot.png") - fig2.savefig("./Report/figures/Histogram.png") + with M000: st.write('Loaded data summary') st.write(stats) @@ -254,25 +249,25 @@ if not spectra.empty and not y.empty: -########################################################## BEGIN : Create Model #################################################### + ################################################### BEGIN : Create Model #################################################### regression_algo = None # initialize the selected regression algorithm Reg = None # initialize the regression model object -intervalls_with_cols = pd.DataFrame() +# intervalls_with_cols = pd.DataFrame() -st.header("II - Model creation", divider='blue') +st.header("II - Model creation", divider = 'blue') if not (spectra.empty and y.empty): - M10, M20, M30, M40, M50 = st.columns([1,1,1,1,1]) + M10, M20, M30, M40, M50 = st.columns([1, 1, 1, 1, 1]) # select type of supervised modelling problem modes = ['regression', 'classification'] - mode =M10.radio("Analysis Methods", options=modes) + mode = M10.radio("Analysis Methods", options=modes) match mode: case "regression": - reg_algo = ["","PLS", "LW-PLS", "TPE-iPLS"] - regression_algo = M20.selectbox("Choose the regression algorithm", options= reg_algo, key = "regression_algo", format_func=lambda x: x if x else "<Select>") + reg_algo = ["", "PLS", "LW-PLS", "TPE-iPLS"] + regression_algo = M20.selectbox("Choose the regression algorithm", options = reg_algo, key = "regression_algo", format_func = lambda x: x if x else "<Select>") case 'classification': - reg_algo = ["","PLS", 
"LW-PLS", "TPE-iPLS"] - regression_algo = M20.selectbox("Choose the classification algorithm", options= reg_algo, key = 12, format_func=lambda x: x if x else "<Select>") + reg_algo = ["", "PLS", "LW-PLS", "TPE-iPLS"] + regression_algo = M20.selectbox("Choose the classification algorithm", options = reg_algo, key = 12, format_func = lambda x: x if x else "<Select>") # # Training set preparation for cross-validation(CV) @@ -282,121 +277,131 @@ if not (spectra.empty and y.empty): # Model creation-M20 columns with M20: - if regression_algo: - info = st.info('The model is being created. This may take a few minutes.') - if regression_algo == 'TPE-iPLS':# if model type is ipls then ask for the number of iterations and intervalls - s = st.number_input(label='Enter the maximum number of intervals', min_value=1, max_value=6) - it = st.number_input(label='Enter the number of iterations', min_value=2, max_value=500, value=2) - - if regression_algo: # if a regression method is selected then create the model - @st.cache_data - def RequestingModelCreation(change, regression_algo): - match regression_algo: - case 'PLS': - Reg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter=5) - reg_model = Reg.model_ - rega = Reg.selected_features_ - case 'LW-PLS': - # export data to csv for Julia train/test - global x_train_np, y_train_np, x_test_np, y_test_np - data_to_work_with = ['x_train_np', 'y_train_np', 'x_test_np', 'y_test_np'] - x_train_np, y_train_np, x_test_np, y_test_np = X_train.to_numpy(), y_train.to_numpy(), X_test.to_numpy(), y_test.to_numpy() - # Cross-Validation calculation - - d = {} + @st.cache_data + def RequestingModelCreation(xydata, change, regression_algo, s, it): + match regression_algo: + case 'PLS': + Reg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter = 50, nfolds = nb_folds) + reg_model = Reg.model_ + rega = Reg.selected_features_ + + case 'LW-PLS': + # export data to csv for Julia train/test + global x_train_np, y_train_np, x_test_np, y_test_np + data_to_work_with = ['x_train_np', 'y_train_np', 'x_test_np', 'y_test_np'] + x_train_np, y_train_np, x_test_np, y_test_np = X_train.to_numpy(), y_train.to_numpy(), X_test.to_numpy(), y_test.to_numpy() + # Cross-Validation calculation + d = {} + for i in range(nb_folds): + d["xtr_fold{0}".format(i+1)], d["ytr_fold{0}".format(i+1)], d["xte_fold{0}".format(i+1)], d["yte_fold{0}".format(i+1)] = np.delete(x_train_np, folds[list(folds)[i]], axis=0), np.delete(y_train_np, folds[list(folds)[i]], axis=0), x_train_np[folds[list(folds)[i]]], y_train_np[folds[list(folds)[i]]] + data_to_work_with.append("xtr_fold{0}".format(i+1)) + data_to_work_with.append("ytr_fold{0}".format(i+1)) + data_to_work_with.append("xte_fold{0}".format(i+1)) + data_to_work_with.append("yte_fold{0}".format(i+1)) + # check best pre-treatment with a global PLSR model + preReg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter=20) + temp_path = Path('temp/') + with open(temp_path / "lwplsr_preTreatments.json", "w+") as outfile: + json.dump(preReg.best_hyperparams_, outfile) + # export Xtrain, Xtest, Ytrain, Ytest and all CV folds to temp folder as csv files + for i in data_to_work_with: + if 'fold' in i: + j = d[i] + else: + j = globals()[i] + # st.write(j) + np.savetxt(temp_path / str(i + ".csv"), j, delimiter=",") + # run Julia Jchemo as subprocess + import subprocess + subprocess_path = Path("Class_Mod/") + subprocess.run([f"{sys.executable}", subprocess_path / "LWPLSR_Call.py"]) + # retrieve json results from Julia JChemo + try: + 
with open(temp_path / "lwplsr_outputs.json", "r") as outfile: + Reg_json = json.load(outfile) + # delete csv files + for i in data_to_work_with: os.unlink(temp_path / str(i + ".csv")) + # delete json file after import + os.unlink(temp_path / "lwplsr_outputs.json") + os.unlink(temp_path / "lwplsr_preTreatments.json") + # format result data into Reg object + pred = ['pred_data_train', 'pred_data_test']### keys of the dict for i in range(nb_folds): - d["xtr_fold{0}".format(i+1)], d["ytr_fold{0}".format(i+1)], d["xte_fold{0}".format(i+1)], d["yte_fold{0}".format(i+1)] = np.delete(x_train_np, folds[list(folds)[i]], axis=0), np.delete(y_train_np, folds[list(folds)[i]], axis=0), x_train_np[folds[list(folds)[i]]], y_train_np[folds[list(folds)[i]]] - data_to_work_with.append("xtr_fold{0}".format(i+1)) - data_to_work_with.append("ytr_fold{0}".format(i+1)) - data_to_work_with.append("xte_fold{0}".format(i+1)) - data_to_work_with.append("yte_fold{0}".format(i+1)) - # check best pre-treatment with a global PLSR model - preReg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter=20) - temp_path = Path('temp/') - with open(temp_path / "lwplsr_preTreatments.json", "w+") as outfile: - json.dump(preReg.best_hyperparams_, outfile) - # export Xtrain, Xtest, Ytrain, Ytest and all CV folds to temp folder as csv files - for i in data_to_work_with: - if 'fold' in i: - j = d[i] - else: - j = globals()[i] - # st.write(j) - np.savetxt(temp_path / str(i + ".csv"), j, delimiter=",") - # run Julia Jchemo as subprocess - import subprocess - subprocess_path = Path("Class_Mod/") - subprocess.run([f"{sys.executable}", subprocess_path / "LWPLSR_Call.py"]) - # retrieve json results from Julia JChemo - try: - with open(temp_path / "lwplsr_outputs.json", "r") as outfile: - Reg_json = json.load(outfile) - # delete csv files - for i in data_to_work_with: os.unlink(temp_path / str(i + ".csv")) - # delete json file after import - os.unlink(temp_path / "lwplsr_outputs.json") - os.unlink(temp_path / "lwplsr_preTreatments.json") - # format result data into Reg object - pred = ['pred_data_train', 'pred_data_test']### keys of the dict - for i in range(nb_folds): - pred.append("CV" + str(i+1)) ### add cv folds keys to pred - # global Reg - # Reg = type('obj', (object,), {'model_' : Reg_json['model'], 'best_hyperparams_' : Reg_json['best_lwplsr_params'], - # 'pred_data_' : [pd.json_normalize(Reg_json[i]) for i in pred]}) - # global Reg - Reg = lw(Reg_json=Reg_json, pred = pred) - reg_model = Reg.model_ - Reg.CV_results_ = pd.DataFrame() - Reg.cv_data_ = {'YpredCV' : {}, 'idxCV' : {}} - # set indexes to Reg.pred_data (train, test, folds idx) - for i in range(len(pred)): - Reg.pred_data_[i] = Reg.pred_data_[i].T.reset_index().drop(columns = ['index']) - if i == 0: # data_train - # Reg.pred_data_[i] = np.array(Reg.pred_data_[i]) - Reg.pred_data_[i].index = list(y_train.index) - Reg.pred_data_[i] = Reg.pred_data_[i].iloc[:,0] - elif i == 1: # data_test - # Reg.pred_data_[i] = np.array(Reg.pred_data_[i]) - Reg.pred_data_[i].index = list(y_test.index) - Reg.pred_data_[i] = Reg.pred_data_[i].iloc[:,0] - else: - # CVi - Reg.pred_data_[i].index = folds[list(folds)[i-2]] - # Reg.CV_results_ = pd.concat([Reg.CV_results_, Reg.pred_data_[i]]) - Reg.cv_data_['YpredCV']['Fold' + str(i-1)] = np.array(Reg.pred_data_[i]).reshape(-1) - Reg.cv_data_['idxCV']['Fold' + str(i-1)] = np.array(folds[list(folds)[i-2]]).reshape(-1) - - Reg.CV_results_= KF_CV.metrics_cv(y = y_train, ypcv = Reg.cv_data_['YpredCV'], folds = folds)[1] - #### cross 
validation results print - Reg.best_hyperparams_print = Reg.best_hyperparams_ - ## plots - Reg.cv_data_ = KF_CV().meas_pred_eq(y = np.array(y_train), ypcv= Reg.cv_data_['YpredCV'], folds=folds) - Reg.pretreated_spectra_ = preReg.pretreated_spectra_ + pred.append("CV" + str(i+1)) ### add cv folds keys to pred - Reg.best_hyperparams_print = {**preReg.best_hyperparams_, **Reg.best_hyperparams_} - Reg.best_hyperparams_ = {**preReg.best_hyperparams_, **Reg.best_hyperparams_} - - Reg.__hash__ = hash_data(Reg.best_hyperparams_print) - except FileNotFoundError as e: - Reg = None - for i in data_to_work_with: os.unlink(temp_path / str(i + ".csv")) - Reg.__hash__ = 0 - case 'TPE-iPLS': - Reg = TpeIpls(train = [X_train, y_train], test=[X_test, y_test], n_intervall = s, n_iter=it) + # global Reg + # Reg = type('obj', (object,), {'model_' : Reg_json['model'], 'best_hyperparams_' : Reg_json['best_lwplsr_params'], + # 'pred_data_' : [pd.json_normalize(Reg_json[i]) for i in pred]}) + # global Reg + Reg = lw(Reg_json = Reg_json, pred = pred) reg_model = Reg.model_ + Reg.CV_results_ = pd.DataFrame() + Reg.cv_data_ = {'YpredCV' : {}, 'idxCV' : {}} + # set indexes to Reg.pred_data (train, test, folds idx) + for i in range(len(pred)): + Reg.pred_data_[i] = Reg.pred_data_[i].T.reset_index().drop(columns = ['index']) + if i == 0: # data_train + # Reg.pred_data_[i] = np.array(Reg.pred_data_[i]) + Reg.pred_data_[i].index = list(y_train.index) + Reg.pred_data_[i] = Reg.pred_data_[i].iloc[:,0] + elif i == 1: # data_test + # Reg.pred_data_[i] = np.array(Reg.pred_data_[i]) + Reg.pred_data_[i].index = list(y_test.index) + Reg.pred_data_[i] = Reg.pred_data_[i].iloc[:,0] + else: + # CVi + Reg.pred_data_[i].index = folds[list(folds)[i-2]] + # Reg.CV_results_ = pd.concat([Reg.CV_results_, Reg.pred_data_[i]]) + Reg.cv_data_['YpredCV']['Fold' + str(i-1)] = np.array(Reg.pred_data_[i]).reshape(-1) + Reg.cv_data_['idxCV']['Fold' + str(i-1)] = np.array(folds[list(folds)[i-2]]).reshape(-1) + + Reg.CV_results_= KF_CV.metrics_cv(y = y_train, ypcv = Reg.cv_data_['YpredCV'], folds = folds)[1] + #### cross validation results print + Reg.best_hyperparams_print = Reg.best_hyperparams_ + ## plots + Reg.cv_data_ = KF_CV().meas_pred_eq(y = np.array(y_train), ypcv = Reg.cv_data_['YpredCV'], folds = folds) + Reg.pretreated_spectra_ = preReg.pretreated_spectra_ - intervalls = Reg.selected_features_.T - intervalls_with_cols = Reg.selected_features_.T - - for i in range(intervalls.shape[0]): - for j in range(intervalls.shape[1]): - intervalls_with_cols.iloc[i,j] = spectra.columns[intervalls.iloc[i,j]] - rega = Reg.selected_features_ - - st.session_state.intervalls = Reg.selected_features_.T - st.session_state.intervalls_with_cols =intervalls_with_cols - return Reg - Reg = RequestingModelCreation(change =st.session_state.counter, regression_algo = regression_algo) + Reg.best_hyperparams_print = {**preReg.best_hyperparams_, **Reg.best_hyperparams_} + Reg.best_hyperparams_ = {**preReg.best_hyperparams_, **Reg.best_hyperparams_} + + Reg.__hash__ = hash_data(Reg.best_hyperparams_print) + except FileNotFoundError as e: + Reg = None + for i in data_to_work_with: os.unlink(temp_path / str(i + ".csv")) + + case 'TPE-iPLS': + Reg = TpeIpls(train = [X_train, y_train], test=[X_test, y_test], n_intervall = s, n_iter=it, nfolds = nb_folds) + reg_model = Reg.model_ + + global intervalls, intervalls_with_cols + intervalls = Reg.selected_features_.T + intervalls_with_cols = Reg.selected_features_.T + + for i in range(intervalls.shape[0]): + for j in 
range(intervalls.shape[1]): + intervalls_with_cols.iloc[i,j] = spectra.columns[intervalls.iloc[i,j]] + rega = Reg.selected_features_ + + st.session_state.intervalls = Reg.selected_features_.T + st.session_state.intervalls_with_cols = intervalls_with_cols + return Reg + + + + + + if regression_algo: + + info = st.info('The model is being created. This may take a few minutes.') + if regression_algo == 'TPE-iPLS':# if model type is ipls then ask for the number of iterations and intervalls + s = st.number_input(label = 'Enter the maximum number of intervals', min_value = 1, max_value = 6) + it = st.number_input(label = 'Enter the number of iterations', min_value = 2, max_value = 500, value = 2) + else: + s, it = None, None + + st.write() + Reg = RequestingModelCreation( xydata = hash_data(xy_hash), change = st.session_state.counter, regression_algo = regression_algo, s = s, it = it) else: st.warning('Choose a modelling algorithm from the dropdown list !') @@ -409,13 +414,13 @@ if not (spectra.empty and y.empty): if regression_algo: if regression_algo == 'TPE-iPLS': - intervalls = st.session_state.intervalls - intervalls_with_cols = st.session_state.intervalls_with_cols + if ('intervalls' and 'intervalls_with_cols') in st.session_state: + intervalls = st.session_state.intervalls + intervalls_with_cols = st.session_state.intervalls_with_cols if Reg: - if st.button('re-model the data', key=4, help=None, type="primary", use_container_width=True):# remodel feature for re-tuning the model increment() @@ -435,9 +440,9 @@ if Reg: # Show the model performance table st.write("-- Model performance --") if regression_algo != reg_algo[2]: - model_per=pd.DataFrame(metrics(c = [y_train, yc], t = [y_test, yt], method='regression').scores_) + model_per = pd.DataFrame(metrics(c = [y_train, yc], t = [y_test, yt], method = 'regression').scores_) else: - model_per=pd.DataFrame(metrics(c = [y_train, yc], t = [y_test, yt], method='regression').scores_) + model_per = pd.DataFrame(metrics(c = [y_train, yc], t = [y_test, yt], method = 'regression').scores_) st.dataframe(model_per) @@ -453,7 +458,7 @@ if Reg: plt.tight_layout() for i in range(2): - eval(f'ax{i+1}').grid(color='grey', linestyle=':', linewidth=0.2) + eval(f'ax{i+1}').grid(color = 'grey', linestyle = ':', linewidth = 0.2) eval(f'ax{i+1}').margins(x = 0) eval(f'ax{i+1}').legend(loc = 'upper right') eval(f'ax{i+1}').set_ylabel('Intensity') @@ -461,14 +466,14 @@ if Reg: a = change for j in range(s): if np.array(spectra.columns).dtype.kind in ['i','f']: - min, max = intervalls_with_cols['from'][j], intervalls_with_cols['to'][j] + min, max = intervalls_with_cols.iloc[j,0], intervalls_with_cols.iloc[j,1] else: - min, max = intervalls['from'][j], intervalls['to'][j] + min, max = intervalls.iloc[j,0], intervalls.iloc[j,1] - eval(f'ax{i+1}').axvspan(min, max, color='#00ff00', alpha=0.5, lw=0) + eval(f'ax{i+1}').axvspan(min, max, color = '#00ff00', alpha = 0.5, lw = 0) if regression_algo == 'PLS': - ax1.scatter(colnames[np.array(Reg.sel_ratio_.index)], np.mean(X_train, axis = 0)[np.array(Reg.sel_ratio_.index)], + ax1.scatter(colnames[np.array(Reg.sel_ratio_.index)], np.mean(X_train, axis = 0).iloc[np.array(Reg.sel_ratio_.index)], color = '#7ab0c7', label = 'Important variables') ax2.scatter(colnames[Reg.sel_ratio_.index], np.mean(Reg.pretreated_spectra_, axis = 0)[np.array(Reg.sel_ratio_.index)], color = '#7ab0c7', label = 'Important variables') @@ -476,14 +481,15 @@ if Reg: ax2.legend() return fig - fig = prep_important(change = st.session_state.counter, 
regression_algo = regression_algo, model_hash = str(Reg.__hash__)) + if Reg: + fig = prep_important(change = st.session_state.counter, regression_algo = regression_algo, model_hash = str(Reg.__hash__)) with M2:## Visualize raw,preprocessed spectra, and selected intervalls(in case of ipls) - if not intervalls_with_cols.empty: - st.write('-- Important Spectral regions used for model creation --') - st.table(intervalls_with_cols) + if regression_algo =='TPE-iPLS' : + st.write('-- Important Spectral regions used for model creation --') + st.table(intervalls_with_cols) st.write('-- Visualization of the spectral regions used for model creation --') - fig.savefig("./Report/figures/Variable_importance.png") + fig.savefig("./Report/figures/variable_importance.png") st.pyplot(fig) @@ -493,23 +499,22 @@ if Reg: 6: "Six",7: "Seven",8: "Eight",9: "Nine",10: "Ten"} st.header(f" {numbers_dict[nb_folds]}-Fold Cross-Validation results") - cv1, cv2 = st.columns([2,2]) + cv1, cv2 = st.columns([2, 2]) with cv2: + cv_results = pd.DataFrame(Reg.CV_results_).round(4)# CV table st.write('-- Cross-Validation Summary--') - st.write(Reg.CV_results_.style.map(lambda _: "background-color: #cecece;", subset=(Reg.CV_results_.index.drop(['sd', 'mean', 'cv']), slice(None)))) - # st.write(Reg.CV_results_) - cv_results=pd.DataFrame(Reg.CV_results_)# CV table + st.write(cv_results.astype(str).style.map(lambda _: "background-color: #cecece;", subset = (cv_results.index.drop(['sd', 'mean', 'cv']), slice(None)))) st.write('-- Out-of-Fold Predictions Visualization (All in one) --') - fig1 = px.scatter(Reg.cv_data_[0], x ='Measured', y = 'Predicted' , trendline='ols', color='Folds', symbol="Folds", + fig1 = px.scatter(Reg.cv_data_[0], x = 'Measured', y = 'Predicted' , trendline = 'ols', color = 'Folds', symbol = 'Folds', color_discrete_sequence=px.colors.qualitative.G10) - fig1.add_shape(type='line', x0 = .95 * min(Reg.cv_data_[0].loc[:,'Measured']), x1 = 1.05 * max(Reg.cv_data_[0].loc[:,'Measured']), - y0 = .95 * min(Reg.cv_data_[0].loc[:,'Measured']), y1 = 1.05 * max(Reg.cv_data_[0].loc[:,'Measured']), line = dict(color='black', dash = "dash")) - fig1.update_traces(marker_size=7, showlegend=False) - st.plotly_chart(fig1, use_container_width=True) - fig0 = px.scatter(Reg.cv_data_[0], x ='Measured', y = 'Predicted' , trendline='ols', color='Folds', symbol="Folds", facet_col = 'Folds',facet_col_wrap=1, - color_discrete_sequence=px.colors.qualitative.G10, text='index', width=800, height=1000) - fig0.update_traces(marker_size=8, showlegend=False) + fig1.add_shape(type = 'line', x0 = .95 * min(Reg.cv_data_[0].loc[:,'Measured']), x1 = 1.05 * max(Reg.cv_data_[0].loc[:,'Measured']), + y0 = .95 * min(Reg.cv_data_[0].loc[:,'Measured']), y1 = 1.05 * max(Reg.cv_data_[0].loc[:,'Measured']), line = dict(color = 'black', dash = "dash")) + fig1.update_traces(marker_size = 7, showlegend=False) + st.plotly_chart(fig1, use_container_width = True) + fig0 = px.scatter(Reg.cv_data_[0], x ='Measured', y = 'Predicted' , trendline = 'ols', color = 'Folds', symbol = "Folds", facet_col = 'Folds',facet_col_wrap = 1, + color_discrete_sequence = px.colors.qualitative.G10, text = 'index', width = 800, height = 1000) + fig0.update_traces(marker_size = 8, showlegend = False) fig0.write_image("./Report/figures/meas_vs_pred_cv_onebyone.png") with cv1: @@ -539,7 +544,7 @@ if Reg: # reg plot and residuals plot if regression_algo != reg_algo[2]: regression_plot = reg_plot([y_train, y_test],[yc, yt], train_idx = train_index, test_idx = test_index) - residual_plot = 
resid_plot([y_train, y_test], [yc, yt], train_idx=train_index, test_idx=test_index)
+        residual_plot = resid_plot([y_train, y_test], [yc, yt], train_idx = train_index, test_idx = test_index)
     else:
         regression_plot = reg_plot([y_train, y_test],[yc, yt], train_idx = train_index, test_idx = test_index)
         residual_plot = resid_plot([y_train, y_test], [yc, yt], train_idx=train_index, test_idx=test_index)
@@ -556,8 +561,36 @@ if Reg:
         residual_plot.savefig('./Report/figures/residuals_plot.png')
 ################################################### END : Model Diagnosis #######################################################
-st.write('Download the Analysis Results')
+
+
+
+################################################### BEGIN : Download results ####################################################
+date_time = datetime.datetime.now().strftime('_%y_%m_%d_%H_%M_')
+# 1-
+# 2-
+# 3-
+# 4-
+# 5-
+
+if Reg:
+    @st.cache_data
+    def download_res(file, sam):
+        zipname = f'results{date_time}subset_selection_{file.name.split(".")[0]}.zip' # name of the zipfile
+        with open('./temp/fname.json', 'w') as f: # dump filename and save it as a .json file
+            json.dump(zipname, f)
+        shutil.make_archive(base_name = zipname.split('.')[0], format = "zip", root_dir = "./Report", base_dir = "figures") # create zip containing figures and report
+
+
+
+
+
+
+
+
+st.subheader('Download the Analysis Results')
+# st.download_button('Download', data = fp, file_name = zipname, mime = "application/zip",
+#                    args = None, kwargs = None, type = "primary", use_container_width = True)
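
Note on the new "Download results" section above: the patch adds download_res(), which stores the archive name in ./temp/fname.json and builds the zip with shutil.make_archive(), but the st.download_button call is still commented out, so nothing is actually offered for download yet. Below is a minimal sketch of how the flow could be closed, mirroring the pattern already used in 1-samples_selection.py. Only download_res(), the fname.json handshake and st.download_button() come from the patch; the call arguments, the re-reading of fname.json and the final button wiring are assumptions, not committed code.

    # Sketch only -- assumptions are marked; not part of the patch.
    if Reg:
        # hypothetical call: pass whichever uploaded file object is active (xcal_csv for csv, data_file for dx)
        download_res(file = data_file if file == 'dx' else xcal_csv, sam = y)
        with open('./temp/fname.json', 'r') as f:
            zipname = json.load(f)              # archive name written by download_res()
        # shutil.make_archive() above writes the zip into the current working directory
        with open(zipname, 'rb') as fp:
            st.download_button('Download', data = fp, file_name = zipname, mime = "application/zip",
                               type = "primary", use_container_width = True)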