diff --git a/src/Class_Mod/DATA_HANDLING.py b/src/Class_Mod/DATA_HANDLING.py
index 17e4dcb44161db3710fec2ccf94ad8363e35cfbc..7f73676037f807782b933c9638d1ac7afb0a384d 100644
--- a/src/Class_Mod/DATA_HANDLING.py
+++ b/src/Class_Mod/DATA_HANDLING.py
@@ -3,14 +3,17 @@ from .Evaluation_Metrics import metrics
 
 ## try to automatically detect the field separator within the CSV
 def find_delimiter(filename):
-    sniffer = csv.Sniffer()
-    with open(filename) as fp:
-        delimiter = sniffer.sniff(fp.read(200)).delimiter
+    import clevercsv
+    with open(filename, newline='') as csvfile:
+        delimiter = clevercsv.Sniffer().sniff(csvfile.read(100)).delimiter
+    # sniffer = csv.Sniffer()
+    # with open(filename) as fp:
+    #     delimiter = sniffer.sniff(fp.read(200)).delimiter
     return delimiter
 
 def find_col_index(filename):
     with open(filename) as fp:
-        lines = pd.read_csv(fp, skiprows=3, nrows=3, index_col=False, sep=str(find_delimiter(filename)))
+        lines = pd.read_csv(fp, skiprows=3, nrows=3, index_col=False, sep=find_delimiter(filename))
         col_index = 'yes' if lines.iloc[:,0].dtypes != np.float64 else 'no'
     return col_index
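
Note on the hunk above: it swaps the standard library's csv.Sniffer for clevercsv, a third-party sniffer that copes better with messy, real-world CSV dialects. A minimal sketch of the same logic with a guard for environments where clevercsv is not installed (the ImportError fallback is illustrative, not part of the patch):

    import csv

    def find_delimiter(filename):
        try:
            import clevercsv
            sniffer = clevercsv.Sniffer()
        except ImportError:
            sniffer = csv.Sniffer()  # stdlib fallback, less tolerant of messy files
        with open(filename, newline='') as fp:
            return sniffer.sniff(fp.read(200)).delimiter
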
diff --git a/src/Class_Mod/RegModels.py b/src/Class_Mod/RegModels.py
index 6e4cc01aa69614f1e85eb20165c68063a440d026..0063bccc675e8aadd7bd2b9f773d5b245bd6a891 100644
--- a/src/Class_Mod/RegModels.py
+++ b/src/Class_Mod/RegModels.py
@@ -185,12 +185,13 @@ class TpeIpls(Regmodel):
     #    ## Modelling
         folds = KF_CV().CV(x = x2[0], y = np.array(self._ytrain), n_folds = self._nfolds)
         try:
+            Model = PLSRegression(scale = False, n_components = params['n_components'])
             yp = KF_CV().cross_val_predictor(model = Model, folds = folds, x = x2[0], y = np.array(self._ytrain))
             self._cv_df = KF_CV().metrics_cv(y = np.array(self._ytrain), ypcv = yp, folds =folds)[1]
         except ValueError as ve:
+            Model = PLSRegression(scale = False, n_components = 1)
             params["n_components"] = 1
-            Model = PLSRegression(scale = False, n_components = params['n_components'])
             yp = KF_CV().cross_val_predictor(model = Model, folds = folds, x = x2[0], y = np.array(self._ytrain))
             self._cv_df = KF_CV().metrics_cv(y = np.array(self._ytrain), ypcv = yp, folds =folds)[1]
         # self._cv_df['Average'] = self._cv_df.mean(axis = 1)
diff --git a/src/Class_Mod/VarSel.py b/src/Class_Mod/VarSel.py
index a2d53630df7532440eee387a16e103e8c803740d..9f60fc4a73294660fb31517afe74c112430ab12c 100644
--- a/src/Class_Mod/VarSel.py
+++ b/src/Class_Mod/VarSel.py
@@ -69,10 +69,11 @@ class TpeIpls:
             try:
                 Model = PLSRegression(scale = self.scale,n_components = params['n_components'])
                 Model.fit(self.x_train.iloc[:,id], self.y_train)
-            except ValueError as ve:
-                params["n_components"] = 1
-                Model = PLSRegression(scale = self.scale,n_components = params['n_components'])
+            except ValueError as ve:
+                Model = PLSRegression(scale = self.scale,n_components = 1)
                 Model.fit(self.x_train.iloc[:,id], self.y_train)
+                params['n_components'] = 1
+
             ## make prediction
             yc = Model.predict(self.x_train.iloc[:,id]).ravel()
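
Both hunks above apply the same fix: the PLSRegression instance is now built inside the try block, so an out-of-range n_components raises its ValueError where it can be caught, and the except branch rebuilds the model with a single component before refitting. A self-contained sketch of the pattern, with illustrative names rather than repo code:

    import numpy as np
    from sklearn.cross_decomposition import PLSRegression

    def fit_pls_with_fallback(x, y, n_components):
        try:
            model = PLSRegression(scale=False, n_components=n_components)
            model.fit(x, y)
        except ValueError:
            # n_components likely exceeded min(n_samples, n_features): retry with 1
            model = PLSRegression(scale=False, n_components=1)
            model.fit(x, y)
        return model

    fit_pls_with_fallback(np.random.rand(10, 5), np.random.rand(10), n_components=8)
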
diff --git a/src/pages/2-model_creation.py b/src/pages/2-model_creation.py
index f89dd4e98f7cc7b75cfdfe567c9ef4c18d57d2fe..a9f995a776b5475dcccbd362fa2449a1437b6317 100644
--- a/src/pages/2-model_creation.py
+++ b/src/pages/2-model_creation.py
@@ -34,192 +34,227 @@ def delete_dir():
 def increment():
     st.session_state.counter += 1
 # #################################### Methods ##############################################
-@st.cache_data
-def csv_loader(x,y):
-    file_name = str(xcal_csv.name) +' and '+ str(ycal_csv.name)
-    xfile = pd.read_csv(xcal_csv, decimal='.', sep=sepx, index_col=col, header=0)
-    yfile = pd.read_csv(ycal_csv, decimal='.', sep=sepy, index_col=col)
-    return xfile, yfile, file_name
+class lw:
+    def __init__(self, Reg_json, pred):
+        self.model_ = Reg_json['model']
+        self.best_hyperparams_ = Reg_json['best_lwplsr_params']
+        self.pred_data_ = [pd.json_normalize(Reg_json[i]) for i in pred]
+
+# @st.cache_data
+# # def tpeipls_(change, n_intervall, n_iter):
+#     Reg = TpeIpls(train = [X_train, y_train], test=[X_test, y_test], n_intervall = n_intervall, n_iter=n_iter)
+# #     time.sleep(1)
+# #     reg_model = Reg.model_
+# #     global intervalls
+# #     intervalls = Reg.selected_features_.T
+# #     intervalls_with_cols = Reg.selected_features_.T
+# #     for i in range(intervalls.shape[0]):
+# #         for j in range(intervalls.shape[1]):
+# #             intervalls_with_cols.iloc[i,j] = spectra.columns[intervalls.iloc[i,j]]
+# #     rega = Reg.selected_features_
+#     return Reg #, reg_model, intervalls, intervalls_with_cols, rega
+def auto_execute(func):
+    func()
+    return func
-@st.cache_data
-def dx_loader(change):
-    with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
-        tmp.write(data_file.read())
-        tmp_path = tmp.name
-        chem_data, spectra, meta_data, meta_data_st = read_dx(file = tmp_path)
-        os.unlink(tmp_path)
-    return chem_data, spectra, meta_data, meta_data_st
+# ####################################### page preamble #######################################
+st.title("Calibration Model Development") # page title
+st.markdown("Create a predictive model, then use it for predicting your target variable (chemical data) from NIRS spectra")
+M0, M00 = st.columns([1, .4])
+M0.image("./images/model_creation.png", use_column_width=True) # graphical abstract
-@st.cache_data
-def visualize(change):
-
-    if np.array(spectra.columns).dtype.kind in ['i','f']:
-        colnames = spectra.columns
-    else:
-        colnames = np.arange(spectra.shape[1])
-    # Split data into training and test sets using the kennard_stone method and correlation metric, 25% of data is used for testing
-    train_index, test_index = train_test_split_idx(spectra, y = y, method="kennard_stone", metric="correlation", test_size=0.25, random_state=42)
-    # Assign data to training and test sets
-    X_train, y_train = pd.DataFrame(spectra.iloc[train_index,:]), y.iloc[train_index]
-    X_test, y_test = pd.DataFrame(spectra.iloc[test_index,:]), y.iloc[test_index]
+################################################################# Begin : I- Data loading and preparation ######################################
+files_format = ['csv', 'dx'] # Supported files format
+file = M00.radio('Select files format:', options = files_format, horizontal=True) # Select a file format
+spectra = pd.DataFrame() # preallocate the spectral data block
+y = pd.DataFrame() # preallocate the target(s) data block
+match file:
+    # load csv file
+    case 'csv':
+        with M00:
+            # Load X-block data
+            xcal_csv = st.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
+            if xcal_csv:
+                sepx = st.radio("Select separator (X file) - _detected_: " + str(find_delimiter('data/'+xcal_csv.name)),
+                                options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+xcal_csv.name))), key=0, horizontal=True)
+                hdrx = st.radio("samples name (X file)? - _detected_: " + str(find_col_index('data/'+xcal_csv.name)),
+                                options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+xcal_csv.name))), key=1, horizontal=True)
+                match hdrx:
+                    case "yes":
+                        col = 0
+                    case "no":
+                        col = False
+            else:
+                st.warning('Insert your spectral data file here!')
+
+            # Load Y-block data
+            ycal_csv = st.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
+            if ycal_csv:
+                sepy = st.radio("Select separator (Y file) - _detected_: " + str(find_delimiter('data/'+ycal_csv.name)),
+                                options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+ycal_csv.name))), key=2, horizontal=True)
+                hdry = st.radio("samples name (Y file)? - _detected_: " + str(find_col_index('data/'+ycal_csv.name)),
+                                options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+ycal_csv.name))), key=3, horizontal=True)
+
+                match hdry:
+                    case "yes":
+                        col = 0
+                    case "no":
+                        col = False
-
-    #### insight on loaded data
-    # M0, M000 = st.columns([1, .4])
-    fig1, ax1 = plt.subplots( figsize = (12,3))
-    spectra.T.plot(legend=False, ax = ax1, linestyle = '--')
-    ax1.set_ylabel('Signal intensity')
-    ax1.margins(0)
-    plt.tight_layout()
-    # M0.pyplot(fig1) ######## Loaded graph
-    # fig1.savefig("./Report/figures/spectra_plot.png")
+            else:
+                st.warning('Insert your target data file here!')
+
+
+        # AFTER LOADING BOTH X AND Y FILES
+        if xcal_csv and ycal_csv:
+            # create a str instance for storing the hash of both x and y data
+            xy_hash = ''
+            from io import StringIO
+            for f in [xcal_csv, ycal_csv]:
+                stringio = StringIO(f.getvalue().decode("utf-8"))
+                xy_hash += str(hash_data(stringio.getvalue()))
+
+            @st.cache_data
+            def csv_loader(change):
+                file_name = str(xcal_csv.name) +' and '+ str(ycal_csv.name)
+                xfile = pd.read_csv(xcal_csv, decimal='.', sep=sepx, index_col=col, header=0)
+                yfile = pd.read_csv(ycal_csv, decimal='.', sep=sepy, index_col=col)
+                xfile.to_csv("./Report/datasets/"+xcal_csv.name, sep = ';', encoding='utf-8', mode='a')
+                yfile.to_csv("./Report/datasets/"+ycal_csv.name, sep = ';', encoding='utf-8', mode='a')
+                return xfile, yfile, file_name
+            xfile, yfile, file_name = csv_loader(change = xy_hash)
+
+
+            if yfile.shape[1]>0 and xfile.shape[1]>0 :
+
+                # prepare x data
+                spectra, meta_data = col_cat(xfile)
+                spectra = pd.DataFrame(spectra).astype(float)
+
+                # prepare y data
+                chem_data, idx = col_cat(yfile)
+                if chem_data.shape[1]>1:
+                    yname = M00.selectbox('Select target', options=chem_data.columns)
+                    y = chem_data.loc[:,yname]
+                else:
+                    y = chem_data.iloc[:,0]
+
+                ### warning
+                if spectra.shape[0] != y.shape[0]:
+                    st.warning('X and Y have different sample sizes')
+                    y = pd.DataFrame()
+                    spectra = pd.DataFrame()
-
-    fig2, ax2 = plt.subplots(figsize = (12,3))
-    sns.histplot(y, color="deeppink", kde = True,label="y",ax = ax2, fill=True)
-    sns.histplot(y_train, color="blue", kde = True,label="y (train)",ax = ax2, fill=True)
-    sns.histplot(y_test, color="green", kde = True,label="y (test)",ax = ax2, fill=True)
-    ax2.set_xlabel('y')
-    plt.legend()
-    plt.tight_layout()
+
+            else:
+                st.error('Error: The data has not been loaded successfully, please consider tuning the decimal and separator !')
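
The xy_hash built above keys the st.cache_data loader, so Streamlit re-reads the files only when their content changes (the hash is now taken from the decoded text itself; the originally submitted hunk hashed str(stringio), i.e. the object's repr, which never reflects the data). A compact sketch of the pattern with illustrative helper names:

    import hashlib
    from io import BytesIO
    import pandas as pd
    import streamlit as st

    def hash_upload(uploaded):
        # digest of the raw upload bytes; assumes a Streamlit UploadedFile
        return hashlib.md5(uploaded.getvalue()).hexdigest()

    @st.cache_data
    def load_csv(change, raw, sep):
        # `change` exists only to key the cache; the body ignores it
        return pd.read_csv(BytesIO(raw), sep=sep)

    # usage: df = load_csv(change=hash_upload(up), raw=up.getvalue(), sep=';')
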
+        # Load .dx file
+    case 'dx':
+        with M00:
+            data_file = st.file_uploader("Select Data", type=".dx", help=" :mushroom: select a dx file")
+            if data_file:
+                file_name = str(data_file.name)
+                ## creating the temp file
+                with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
+                    tmp.write(data_file.read())
+                    tmp_path = tmp.name
+                    with open(tmp.name, 'r') as dd:
+                        dxdata = dd.read()
+                        xy_hash = str(dxdata)
+                    with open('Report/datasets/'+data_file.name, 'w') as dd:
+                        dd.write(dxdata)
+                ## load and parse the temp dx file
+                @st.cache_data
+                def dx_loader(change):
+                    chem_data, spectra, meta_data, meta_data_st = read_dx(file = tmp_path)
+                    os.unlink(tmp_path)
+                    return chem_data, spectra, meta_data, meta_data_st
+                chem_data, spectra, meta_data, meta_data_st = dx_loader(change = hash_data(xy_hash))
+
+                if not spectra.empty:
+                    st.success("The data have been loaded successfully", icon="✅")
+                    if chem_data.shape[1]>0:
+
+                        yname = st.selectbox('Select target', options=chem_data.columns)
+                        measured = chem_data.loc[:,yname] > 0
+                        y = chem_data.loc[:,yname].loc[measured]
+                        spectra = spectra.loc[measured]
+                    else:
+                        st.warning('Warning: your file includes no target variables to model !', icon="⚠️")
-
-    # M0.pyplot(fig2)
-    # fig2.savefig("./Report/figures/Histogram.png")
-
-    # M000.write('Loaded data summary')
-    # M000.write(pd.DataFrame([desc_stats(y_train),desc_stats(y_test),desc_stats(y)], index =['train', 'test', 'total'] ).round(2))
-    stats=pd.DataFrame([desc_stats(y_train),desc_stats(y_test),desc_stats(y)], index =['train', 'test', 'total'] ).round(2)
-
-    return X_train, X_test,y_train, y_test, colnames, train_index, test_index, stats, fig1, fig2
+            else:
+                st.warning('Load your file here!')
+
+################################################### END : I- Data loading and preparation ####################################################
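
The dx branch writes the upload to a temp file because read_dx needs a real filesystem path. The round-trip can be isolated into a small helper; a sketch, assuming `uploaded` is a Streamlit UploadedFile (the helper name is hypothetical):

    import os
    from tempfile import NamedTemporaryFile

    def to_temp_path(uploaded, suffix=".dx"):
        with NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp.write(uploaded.read())
            return tmp.name  # caller must os.unlink() after parsing

    # path = to_temp_path(data_file)
    # chem, spec, meta, meta_st = read_dx(file=path)
    # os.unlink(path)
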
-@st.cache_data
-def pls_(change):
-    Reg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter=200)
-    reg_model = Reg.model_
-    rega = Reg.selected_features_
-    return Reg, reg_model, rega
-
-@st.cache_data
-def tpeipls_(change, n_intervall, n_iter):
-    Reg = TpeIpls(train = [X_train, y_train], test=[X_test, y_test], n_intervall = n_intervall, n_iter=n_iter)
-    time.sleep(1)
-    reg_model = Reg.model_
-    intervalls = Reg.selected_features_.T
-    intervalls_with_cols = Reg.selected_features_.T
-    for i in range(intervalls.shape[0]):
-        for j in range(intervalls.shape[1]):
-            intervalls_with_cols.iloc[i,j] = spectra.columns[intervalls.iloc[i,j]]
-    rega = Reg.selected_features_
-    return Reg, reg_model, intervalls, intervalls_with_cols, rega
-
-# ####################################### page preamble #######################################
-st.title("Calibration Model Development") # page title
-st.markdown("Create a predictive model, then use it for predicting your target variable (chemical data) from NIRS spectra")
-M0, M00 = st.columns([1, .4])
-M0.image("./images/model_creation.png", use_column_width=True) # graphical abstract
-####################################### I- Data preparation
-files_format = ['.csv', '.dx'] # Supported files format
-file = M00.radio('Select files format:', options = files_format,horizontal=True) # Select a file format
-spectra = pd.DataFrame() # preallocate the spectral data block
-y = pd.DataFrame() # preallocate the target(s) data block
-match file:
-    ## load .csv file
-    case '.csv':
-        # Load X-block data
-        xcal_csv = M00.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
-        if xcal_csv:
-            sepx = M00.radio("Select separator (X file) - _detected_: " + str(find_delimiter('data/'+xcal_csv.name)),
-                             options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+xcal_csv.name))), key=0,horizontal=True)
-            hdrx = M00.radio("samples name (X file)? - _detected_: " + str(find_col_index('data/'+xcal_csv.name)),
-                             options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+xcal_csv.name))), key=1,horizontal=True)
-            match hdrx:
-                case "yes":
-                    col = 0
-                case "no":
-                    col = False
-        else:
-            M00.warning('Insert your spectral data file here!')
-
-        # Load Y-block data
-        ycal_csv = M00.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
-        if ycal_csv:
-            sepy = M00.radio("Select separator (Y file) - _detected_: " + str(find_delimiter('data/'+ycal_csv.name)),
-                             options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+ycal_csv.name))), key=2,horizontal=True)
-            hdry = M00.radio("samples name (Y file)? - _detected_: " + str(find_col_index('data/'+ycal_csv.name)),
-                             options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+ycal_csv.name))), key=3,horizontal=True)
-
-            match hdry:
-                case "yes":
-                    col = 0
-                case "no":
-                    col = False
-        else:
-            M00.warning('Insert your target data file here!')
-
-        if xcal_csv and ycal_csv:
-            xfile, yfile, file_name = csv_loader(x = hash_data(xcal_csv.name+str(xcal_csv.size)), y = hash_data(ycal_csv.name+str(ycal_csv.size)))
-
-            if yfile.shape[1]>0 and xfile.shape[1]>0 :
-                spectra, meta_data = col_cat(xfile)
-                chem_data, idx = col_cat(yfile)
-                if chem_data.shape[1]>1:
-                    yname = M00.selectbox('Select target', options=chem_data.columns)
-                    y = chem_data.loc[:,yname]
-                else:
-                    y = chem_data.iloc[:,0]
-
-
-                spectra = pd.DataFrame(spectra).astype(float)
-                # if not meta_data.empty :
-                #     st.write(meta_data)
-                if spectra.shape[0] != y.shape[0]:
-                    M00.warning('X and Y have different sample size')
-                    y = pd.DataFrame
-                    spectra = pd.DataFrame
-
-            else:
-                M00.error('Error: The data has not been loaded successfully, please consider tuning the decimal and separator !')
-
-    ## Load .dx file
-    case '.dx':
-        data_file = M00.file_uploader("Select Data", type=".dx", help=" :mushroom: select a dx file")
-        if not data_file:
-            M00.warning('Load your file here!')
-        else :
-            file_name = str(data_file.name)
-            chem_data, spectra, meta_data, meta_data_st = dx_loader(change = hash_data(str(data_file.size)))
-            if not spectra.empty:
-                M00.success("The data have been loaded successfully", icon="✅")
-                if chem_data.shape[1]>0:
-                    yname = M00.selectbox('Select target', options=chem_data.columns)
-                    measured = chem_data.loc[:,yname] > 0
-                    y = chem_data.loc[:,yname].loc[measured]
-                    spectra = spectra.loc[measured]
-                else:
-                    M00.warning('Warning: your file includes no target variables to model !', icon="⚠️")
-
-# visualize and split the data
-st.header("I - Data visualization", divider='blue')
-if not spectra.empty and not y.empty:
-    X_train, X_test,y_train, y_test, colnames, train_index, test_index, stats, fig1, fig2= visualize(hash_data(y+np.median(spectra)))
+################################################### BEGIN : visualize and split the data ####################################################
+st.header("I - Data visualization", divider='blue')
+if not spectra.empty and not y.empty:
+    @st.cache_data
+    def visualize(change):
+
+        if np.array(spectra.columns).dtype.kind in ['i','f']:
+            colnames = spectra.columns
+        else:
+            colnames = np.arange(spectra.shape[1])
+
+        # Split data into training and test sets using the kennard_stone method and correlation metric, 25% of data is used for testing
+        train_index, test_index = train_test_split_idx(spectra, y = y, method="kennard_stone", metric="correlation", test_size=0.25, random_state=42)
+
+        # Assign data to training and test sets
+        X_train, y_train = pd.DataFrame(spectra.iloc[train_index,:]), y.iloc[train_index]
+        X_test, y_test = pd.DataFrame(spectra.iloc[test_index,:]), y.iloc[test_index]
+
+        #### insight on loaded data
+        # M0, M000 = st.columns([1, .4])
+        fig1, ax1 = plt.subplots( figsize = (12,3))
+        spectra.T.plot(legend=False, ax = ax1, linestyle = '-', linewidth = 0.6)
+        ax1.set_ylabel('Signal intensity')
+        ax1.margins(0)
+        plt.tight_layout()
+
+        fig2, ax2 = plt.subplots(figsize = (12,3))
+        sns.histplot(y, color="deeppink", kde = True,label="y",ax = ax2, fill=True)
+        sns.histplot(y_train, color="blue", kde = True,label="y (train)",ax = ax2, fill=True)
+        sns.histplot(y_test, color="green", kde = True,label="y (test)",ax = ax2, fill=True)
+        ax2.set_xlabel('y')
+        plt.legend()
+        plt.tight_layout()
+        stats=pd.DataFrame([desc_stats(y_train), desc_stats(y_test), desc_stats(y)], index =['train', 'test', 'total'] ).round(2)
+
+        return X_train, X_test, y_train, y_test, colnames, train_index, test_index, stats, fig1, fig2
+    X_train, X_test, y_train, y_test, colnames, train_index, test_index, stats, fig1, fig2 = visualize(change = hash_data(y+np.median(spectra)))
+
     M0, M000 = st.columns([1, .4])
-    M0.pyplot(fig1) ######## Loaded graph
-    fig1.savefig("./Report/figures/spectra_plot.png")
-    M0.pyplot(fig2)
-    fig2.savefig("./Report/figures/Histogram.png")
-    M000.write('Loaded data summary')
-    M000.write(stats)
-
-    ####################################### Model creation ###################################################
+    with M0:
+        st.pyplot(fig1) ######## Loaded graph
+        st.pyplot(fig2)
+        fig1.savefig("./Report/figures/spectra_plot.png")
+        fig2.savefig("./Report/figures/Histogram.png")
+    with M000:
+        st.write('Loaded data summary')
+        st.write(stats)
+
+################################################### END : visualize and split the data #######################################################
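
The split inside visualize() delegates to train_test_split_idx with method="kennard_stone" and a correlation metric. For intuition, the core maximin selection behind Kennard-Stone can be sketched in a few lines of NumPy (Euclidean distance here; purely illustrative, not the app's implementation):

    import numpy as np

    def kennard_stone(x, n_select):
        # pairwise distances; seed with the two most distant samples
        dist = np.linalg.norm(x[:, None, :] - x[None, :, :], axis=-1)
        selected = list(np.unravel_index(dist.argmax(), dist.shape))
        while len(selected) < n_select:
            remaining = [i for i in range(len(x)) if i not in selected]
            # add the sample farthest from its nearest already-selected neighbour
            selected.append(max(remaining, key=lambda i: dist[i, selected].min()))
        return selected

    # train_idx = kennard_stone(np.random.rand(20, 5), n_select=15)
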
+
+
+
+
+########################################################## BEGIN : Create Model ####################################################
 regression_algo = None # initialize the selected regression algorithm
 Reg = None # initialize the regression model object
 intervalls_with_cols = pd.DataFrame()
@@ -245,139 +280,171 @@ if not (spectra.empty and y.empty):
     folds = KF_CV.CV(X_train, y_train, nb_folds)# split train data into nb_folds for cross_validation
 
-    # Model creation
-    match regression_algo:
-        case "":
-            M20.warning('Choose a modelling algorithm from the dropdown list !')
-
-        case "PLS":
-            Reg, reg_model, rega = pls_(change =st.session_state.counter)
-
-        case 'LW-PLS':
-            M20.write(f'K-Fold for Cross-Validation (K = {str(nb_folds)})')
-            info = M20.info('Starting LWPLSR model creation... Please wait a few minutes.')
-            # export data to csv for Julia train/test
-            data_to_work_with = ['x_train_np', 'y_train_np', 'x_test_np', 'y_test_np']
-            x_train_np, y_train_np, x_test_np, y_test_np = X_train.to_numpy(), y_train.to_numpy(), X_test.to_numpy(), y_test.to_numpy()
-            # Cross-Validation calculation
-
-            d = {}
-            for i in range(nb_folds):
-                d["xtr_fold{0}".format(i+1)], d["ytr_fold{0}".format(i+1)], d["xte_fold{0}".format(i+1)], d["yte_fold{0}".format(i+1)] = np.delete(x_train_np, folds[list(folds)[i]], axis=0), np.delete(y_train_np, folds[list(folds)[i]], axis=0), x_train_np[folds[list(folds)[i]]], y_train_np[folds[list(folds)[i]]]
-                data_to_work_with.append("xtr_fold{0}".format(i+1))
-                data_to_work_with.append("ytr_fold{0}".format(i+1))
-                data_to_work_with.append("xte_fold{0}".format(i+1))
-                data_to_work_with.append("yte_fold{0}".format(i+1))
-            # check best pre-treatment with a global PLSR model
-            preReg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter=20)
-            temp_path = Path('temp/')
-            with open(temp_path / "lwplsr_preTreatments.json", "w+") as outfile:
-                json.dump(preReg.best_hyperparams_, outfile)
-            # export Xtrain, Xtest, Ytrain, Ytest and all CV folds to temp folder as csv files
-            for i in data_to_work_with:
-                if 'fold' in i:
-                    j = d[i]
-                else:
-                    j = globals()[i]
-                np.savetxt(temp_path / str(i + ".csv"), j, delimiter=",")
-            # run Julia Jchemo as subprocess
-            import subprocess
-            subprocess_path = Path("Class_Mod/")
-            subprocess.run([f"{sys.executable}", subprocess_path / "LWPLSR_Call.py"])
-            # retrieve json results from Julia JChemo
-            try:
-                with open(temp_path / "lwplsr_outputs.json", "r") as outfile:
-                    Reg_json = json.load(outfile)
-                # delete csv files
-                for i in data_to_work_with: os.unlink(temp_path / str(i + ".csv"))
-                # # delete json file after import
-                os.unlink(temp_path / "lwplsr_outputs.json")
-                os.unlink(temp_path / "lwplsr_preTreatments.json")
-                # format result data into Reg object
-                pred = ['pred_data_train', 'pred_data_test']### keys of the dict
-                for i in range(nb_folds):
-                    pred.append("CV" + str(i+1)) ### add cv folds keys to pred
-
-                Reg = type('obj', (object,), {'model_' : Reg_json['model'], 'best_hyperparams_' : Reg_json['best_lwplsr_params'],
-                                              'pred_data_' : [pd.json_normalize(Reg_json[i]) for i in pred]})
-                reg_model = Reg.model_
-                Reg.CV_results_ = pd.DataFrame()
-                Reg.cv_data_ = {'YpredCV' : {}, 'idxCV' : {}}
-                # # set indexes to Reg.pred_data (train, test, folds idx)
-                for i in range(len(pred)):
-                    Reg.pred_data_[i] = Reg.pred_data_[i].T.reset_index().drop(columns = ['index'])
-                    if i == 0: # data_train
-                        # Reg.pred_data_[i] = np.array(Reg.pred_data_[i])
-                        Reg.pred_data_[i].index = list(y_train.index)
-                        Reg.pred_data_[i] = Reg.pred_data_[i].iloc[:,0]
-                    elif i == 1: # data_test
-                        # Reg.pred_data_[i] = np.array(Reg.pred_data_[i])
-                        Reg.pred_data_[i].index = list(y_test.index)
-                        Reg.pred_data_[i] = Reg.pred_data_[i].iloc[:,0]
-                    else:
-                        # CVi
-                        Reg.pred_data_[i].index = folds[list(folds)[i-2]]
-                        # Reg.CV_results_ = pd.concat([Reg.CV_results_, Reg.pred_data_[i]])
-                        Reg.cv_data_['YpredCV']['Fold' + str(i-1)] = np.array(Reg.pred_data_[i]).reshape(-1)
-                        Reg.cv_data_['idxCV']['Fold' + str(i-1)] = np.array(folds[list(folds)[i-2]]).reshape(-1)
-
-                Reg.CV_results_= KF_CV.metrics_cv(y = y_train, ypcv = Reg.cv_data_['YpredCV'], folds = folds)[1]
-                #### cross validation results print
-                Reg.best_hyperparams_print = Reg.best_hyperparams_
-                ## plots
-                Reg.cv_data_ = KF_CV().meas_pred_eq(y = np.array(y_train), ypcv= Reg.cv_data_['YpredCV'], folds=folds)
-                Reg.pretreated_spectra_ = preReg.pretreated_spectra_
+    # Model creation - M20 columns
+    with M20:
+        if regression_algo:
+            info = st.info('The model is being created. This may take a few minutes.')
+            if regression_algo == 'TPE-iPLS': # if model type is ipls then ask for the number of iterations and intervals
+                s = st.number_input(label='Enter the maximum number of intervals', min_value=1, max_value=6)
+                it = st.number_input(label='Enter the number of iterations', min_value=2, max_value=500, value=2)
+
+        if regression_algo: # if a regression method is selected then create the model
+            @st.cache_data
+            def RequestingModelCreation(change, regression_algo):
+                match regression_algo:
+                    case 'PLS':
+                        Reg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter=5)
+                        reg_model = Reg.model_
+                        rega = Reg.selected_features_
+                    case 'LW-PLS':
+                        # export data to csv for Julia train/test
+                        global x_train_np, y_train_np, x_test_np, y_test_np
+                        data_to_work_with = ['x_train_np', 'y_train_np', 'x_test_np', 'y_test_np']
+                        x_train_np, y_train_np, x_test_np, y_test_np = X_train.to_numpy(), y_train.to_numpy(), X_test.to_numpy(), y_test.to_numpy()
+                        # Cross-Validation calculation
+
+                        d = {}
+                        for i in range(nb_folds):
+                            d["xtr_fold{0}".format(i+1)], d["ytr_fold{0}".format(i+1)], d["xte_fold{0}".format(i+1)], d["yte_fold{0}".format(i+1)] = np.delete(x_train_np, folds[list(folds)[i]], axis=0), np.delete(y_train_np, folds[list(folds)[i]], axis=0), x_train_np[folds[list(folds)[i]]], y_train_np[folds[list(folds)[i]]]
+                            data_to_work_with.append("xtr_fold{0}".format(i+1))
+                            data_to_work_with.append("ytr_fold{0}".format(i+1))
+                            data_to_work_with.append("xte_fold{0}".format(i+1))
+                            data_to_work_with.append("yte_fold{0}".format(i+1))
+                        # check best pre-treatment with a global PLSR model
+                        preReg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter=20)
+                        temp_path = Path('temp/')
+                        with open(temp_path / "lwplsr_preTreatments.json", "w+") as outfile:
+                            json.dump(preReg.best_hyperparams_, outfile)
+                        # export Xtrain, Xtest, Ytrain, Ytest and all CV folds to temp folder as csv files
+                        for i in data_to_work_with:
+                            if 'fold' in i:
+                                j = d[i]
+                            else:
+                                j = globals()[i]
+                                # st.write(j)
+                            np.savetxt(temp_path / str(i + ".csv"), j, delimiter=",")
+                        # run Julia Jchemo as subprocess
+                        import subprocess
+                        subprocess_path = Path("Class_Mod/")
+                        subprocess.run([f"{sys.executable}", subprocess_path / "LWPLSR_Call.py"])
+                        # retrieve json results from Julia JChemo
+                        try:
+                            with open(temp_path / "lwplsr_outputs.json", "r") as outfile:
+                                Reg_json = json.load(outfile)
+                            # delete csv files
+                            for i in data_to_work_with: os.unlink(temp_path / str(i + ".csv"))
+                            # delete json file after import
+                            os.unlink(temp_path / "lwplsr_outputs.json")
+                            os.unlink(temp_path / "lwplsr_preTreatments.json")
+                            # format result data into Reg object
+                            pred = ['pred_data_train', 'pred_data_test']### keys of the dict
+                            for i in range(nb_folds):
+                                pred.append("CV" + str(i+1)) ### add cv folds keys to pred
+                            # global Reg
+                            # Reg = type('obj', (object,), {'model_' : Reg_json['model'], 'best_hyperparams_' : Reg_json['best_lwplsr_params'],
+                            #                               'pred_data_' : [pd.json_normalize(Reg_json[i]) for i in pred]})
+                            # global Reg
+                            Reg = lw(Reg_json = Reg_json, pred = pred)
+                            reg_model = Reg.model_
+                            Reg.CV_results_ = pd.DataFrame()
+                            Reg.cv_data_ = {'YpredCV' : {}, 'idxCV' : {}}
+                            # set indexes to Reg.pred_data (train, test, folds idx)
+                            for i in range(len(pred)):
+                                Reg.pred_data_[i] = Reg.pred_data_[i].T.reset_index().drop(columns = ['index'])
+                                if i == 0: # data_train
+                                    # Reg.pred_data_[i] = np.array(Reg.pred_data_[i])
+                                    Reg.pred_data_[i].index = list(y_train.index)
+                                    Reg.pred_data_[i] = Reg.pred_data_[i].iloc[:,0]
+                                elif i == 1: # data_test
+                                    # Reg.pred_data_[i] = np.array(Reg.pred_data_[i])
+                                    Reg.pred_data_[i].index = list(y_test.index)
+                                    Reg.pred_data_[i] = Reg.pred_data_[i].iloc[:,0]
+                                else:
+                                    # CVi
+                                    Reg.pred_data_[i].index = folds[list(folds)[i-2]]
+                                    # Reg.CV_results_ = pd.concat([Reg.CV_results_, Reg.pred_data_[i]])
+                                    Reg.cv_data_['YpredCV']['Fold' + str(i-1)] = np.array(Reg.pred_data_[i]).reshape(-1)
+                                    Reg.cv_data_['idxCV']['Fold' + str(i-1)] = np.array(folds[list(folds)[i-2]]).reshape(-1)
+
+                            Reg.CV_results_= KF_CV.metrics_cv(y = y_train, ypcv = Reg.cv_data_['YpredCV'], folds = folds)[1]
+                            #### cross validation results print
+                            Reg.best_hyperparams_print = Reg.best_hyperparams_
+                            ## plots
+                            Reg.cv_data_ = KF_CV().meas_pred_eq(y = np.array(y_train), ypcv= Reg.cv_data_['YpredCV'], folds=folds)
+                            Reg.pretreated_spectra_ = preReg.pretreated_spectra_
+
+                            Reg.best_hyperparams_print = {**preReg.best_hyperparams_, **Reg.best_hyperparams_}
+                            Reg.best_hyperparams_ = {**preReg.best_hyperparams_, **Reg.best_hyperparams_}
+
+                            Reg.__hash__ = hash_data(Reg.best_hyperparams_print)
+                        except FileNotFoundError as e:
+                            # clean up the exported csv files, then flag the failure
+                            for i in data_to_work_with: os.unlink(temp_path / str(i + ".csv"))
+                            Reg = None
+                    case 'TPE-iPLS':
+                        Reg = TpeIpls(train = [X_train, y_train], test=[X_test, y_test], n_intervall = s, n_iter=it)
+                        reg_model = Reg.model_
+
+                        intervalls = Reg.selected_features_.T
+                        intervalls_with_cols = Reg.selected_features_.T
+
+                        for i in range(intervalls.shape[0]):
+                            for j in range(intervalls.shape[1]):
+                                intervalls_with_cols.iloc[i,j] = spectra.columns[intervalls.iloc[i,j]]
+                        rega = Reg.selected_features_
+
+                        st.session_state.intervalls = Reg.selected_features_.T
+                        st.session_state.intervalls_with_cols = intervalls_with_cols
+                return Reg
+            Reg = RequestingModelCreation(change = st.session_state.counter, regression_algo = regression_algo)
+        else:
+            st.warning('Choose a modelling algorithm from the dropdown list!')
-
-                Reg.best_hyperparams_print = {**preReg.best_hyperparams_, **Reg.best_hyperparams_}
-                Reg.best_hyperparams_ = {**preReg.best_hyperparams_, **Reg.best_hyperparams_}
-
-                info.empty()
-                M20.success('Model created!')
-            except FileNotFoundError as e:
-                # Display error message on the interface if modeling is wrong
-                info.empty()
-                M20.warning('- ERROR during model creation -')
-                Reg = None
-                for i in data_to_work_with: os.unlink(temp_path / str(i + ".csv"))
-
-        case 'TPE-iPLS':
-            s = M20.number_input(label='Enter the maximum number of intervals', min_value=1, max_value=6)
-            it = M20.number_input(label='Enter the number of iterations', min_value=50, max_value=500, value=50)
-            progress_text = "The model is being created. Please wait."
-            Reg, reg_model, intervalls, intervalls_with_cols, rega = tpeipls_(change = st.session_state.counter, n_intervall= s, n_iter = it)
-
-            # pro = M1.info("The model is being created. Please wait!")
-            # pro.empty()
-            M20.info("The model has successfully been created!")
-
+        if regression_algo:
+            info.empty()
+            if Reg:
+                st.success('Success! Your model has been created and is ready to use.')
+            else:
+                st.error("Error: Model creation failed. Please try again.")
+
+    if regression_algo:
+        if regression_algo == 'TPE-iPLS':
+            intervalls = st.session_state.intervalls
+            intervalls_with_cols = st.session_state.intervalls_with_cols
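
A quirk worth knowing about the session_state handoff used for the TPE-iPLS intervals above: only the return value of a st.cache_data function is cached, so secondary outputs are pushed into st.session_state as a side channel. A minimal illustration (hypothetical names):

    import streamlit as st

    @st.cache_data
    def build(change):
        result = change * 2
        # side channel for secondary outputs; note that on a cache hit this
        # body is skipped, so the session must already hold the value
        st.session_state.extra = result + 1
        return result

    value = build(st.session_state.get('counter', 0))
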
Please try again.") + + if regression_algo: + if regression_algo == 'TPE-iPLS': + intervalls = st.session_state.intervalls + intervalls_with_cols = st.session_state.intervalls_with_cols -if Reg: - if st.button('re-model the data', key=4, help=None, type="primary", use_container_width=True): - increment() - - M1, M2 = st.columns([2 ,4]) - M1.write('-- Spectral preprocessing info --') - M1.write(Reg.best_hyperparams_print) - with open("data/params/Preprocessing.json", "w") as outfile: - json.dump(Reg.best_hyperparams_, outfile) +if Reg: + if st.button('re-model the data', key=4, help=None, type="primary", use_container_width=True):# remodel feature for re-tuning the model + increment() + # fitted values and predicted values yc = Reg.pred_data_[0] yt = Reg.pred_data_[1] - # ########## - M1.write("-- Model performance --") - if regression_algo != reg_algo[2]: - M1.dataframe(metrics(c = [y_train, yc], t = [y_test, yt], method='regression').scores_) - else: - M1.dataframe(metrics(t = [y_test, yt], method='regression').scores_) + + M1, M2 = st.columns([2 ,4]) + with M1: + # Show and export the preprocessing methods + st.write('-- Spectral preprocessing info --') + st.write(Reg.best_hyperparams_print) + with open("data/params/Preprocessing.json", "w") as outfile: + json.dump(Reg.best_hyperparams_, outfile) + + # Show the model performance table + st.write("-- Model performance --") + if regression_algo != reg_algo[2]: + model_per=pd.DataFrame(metrics(c = [y_train, yc], t = [y_test, yt], method='regression').scores_) + else: + model_per=pd.DataFrame(metrics(c = [y_train, yc], t = [y_test, yt], method='regression').scores_) + st.dataframe(model_per) - model_per=pd.DataFrame(metrics(c = [y_train, yc], t = [y_test, yt], method='regression').scores_) - # M1.dataframe(model_per) # duplicate with line 371 + + # M1.dataframe(model_per) # duplicate with line 371 @st.cache_data - def prep_important(change,regression_algo): + def prep_important(change, regression_algo, model_hash): fig, (ax1, ax2) = plt.subplots(2,1, figsize = (12, 4), sharex=True) ax1.plot(colnames, np.mean(X_train, axis = 0), color = 'black', label = 'Average spectrum (Raw)') # if regression_algo != reg_algo[2]: @@ -409,51 +476,53 @@ if Reg: ax2.legend() return fig + fig = prep_important(change = st.session_state.counter, regression_algo = regression_algo, model_hash = str(Reg.__hash__)) + + with M2:## Visualize raw,preprocessed spectra, and selected intervalls(in case of ipls) + if not intervalls_with_cols.empty: + st.write('-- Important Spectral regions used for model creation --') + st.table(intervalls_with_cols) + st.write('-- Visualization of the spectral regions used for model creation --') + fig.savefig("./Report/figures/Variable_importance.png") + st.pyplot(fig) - fig = prep_important(change = st.session_state.counter, regression_algo = regression_algo) - if not intervalls_with_cols.empty: - M2.write('-- Important Spectral regions used for model creation --') - M2.table(intervalls_with_cols) - - M2.write('-- Visualization of the spectral regions used for model creation --') - fig.savefig("./Report/figures/Variable_importance.png") - M2.pyplot(fig) + if Reg: # Display CV results + numbers_dict = {1: "One", 2: "Two",3: "Three",4: "Four",5: "Five", + 6: "Six",7: "Seven",8: "Eight",9: "Nine",10: "Ten"} + st.header(f" {numbers_dict[nb_folds]}-Fold Cross-Validation results") - ################# CV results ############ + cv1, cv2 = st.columns([2,2]) + with cv2: + st.write('-- Cross-Validation Summary--') + 
+            st.write(Reg.CV_results_.style.map(lambda _: "background-color: #cecece;", subset=(Reg.CV_results_.index.drop(['sd', 'mean', 'cv']), slice(None))))
+            # st.write(Reg.CV_results_)
+            cv_results = pd.DataFrame(Reg.CV_results_) # CV table
-    if Reg:
-        # fig, (ax1, ax2) = plt.subplots(2,1, figsize = (12, 6))
-        # fig = make_subplots(rows=3, cols=1, shared_xaxes=True, vertical_spacing=0.02)
+            st.write('-- Out-of-Fold Predictions Visualization (All in one) --')
+            fig1 = px.scatter(Reg.cv_data_[0], x ='Measured', y = 'Predicted' , trendline='ols', color='Folds', symbol="Folds",
+                              color_discrete_sequence=px.colors.qualitative.G10)
+            fig1.add_shape(type='line', x0 = .95 * min(Reg.cv_data_[0].loc[:,'Measured']), x1 = 1.05 * max(Reg.cv_data_[0].loc[:,'Measured']),
+                           y0 = .95 * min(Reg.cv_data_[0].loc[:,'Measured']), y1 = 1.05 * max(Reg.cv_data_[0].loc[:,'Measured']), line = dict(color='black', dash = "dash"))
+            fig1.update_traces(marker_size=7, showlegend=False)
+            st.plotly_chart(fig1, use_container_width=True)
+            fig0 = px.scatter(Reg.cv_data_[0], x ='Measured', y = 'Predicted' , trendline='ols', color='Folds', symbol="Folds", facet_col = 'Folds', facet_col_wrap=1,
+                              color_discrete_sequence=px.colors.qualitative.G10, text='index', width=800, height=1000)
+            fig0.update_traces(marker_size=8, showlegend=False)
+            fig0.write_image("./Report/figures/meas_vs_pred_cv_onebyone.png")
-        st.header("Cross-Validation results")
-        cv1, cv2 = st.columns([2,2])
-        ############
-        cv2.write('-- Cross-Validation Summary--')
-        cv2.write(Reg.CV_results_)
-        cv_results=pd.DataFrame(Reg.CV_results_)
-        cv2.write('-- Out-of-Fold Predictions Visualization (All in one) --')
+        with cv1:
+            st.write('-- Out-of-Fold Predictions Visualization (Separate plots) --')
+            st.plotly_chart(fig0, use_container_width=True)
+            fig1.write_image("./Report/figures/meas_vs_pred_cv_all.png")
-
-        fig1 = px.scatter(Reg.cv_data_[0], x ='Measured', y = 'Predicted' , trendline='ols', color='Folds', symbol="Folds",
-                          color_discrete_sequence=px.colors.qualitative.G10)
-        fig1.add_shape(type='line', x0 = .95 * min(Reg.cv_data_[0].loc[:,'Measured']), x1 = 1.05 * max(Reg.cv_data_[0].loc[:,'Measured']),
-                       y0 = .95 * min(Reg.cv_data_[0].loc[:,'Measured']), y1 = 1.05 * max(Reg.cv_data_[0].loc[:,'Measured']), line = dict(color='black', dash = "dash"))
-        fig1.update_traces(marker_size=7, showlegend=False)
-        cv2.plotly_chart(fig1, use_container_width=True)
-        fig0 = px.scatter(Reg.cv_data_[0], x ='Measured', y = 'Predicted' , trendline='ols', color='Folds', symbol="Folds", facet_col = 'Folds',facet_col_wrap=1,
-                          color_discrete_sequence=px.colors.qualitative.G10, text='index', width=800, height=1000)
-        fig0.update_traces(marker_size=8, showlegend=False)
-        fig0.write_image("./Report/figures/meas_vs_pred_cv_onebyone.png")
-        cv1.write('-- Out-of-Fold Predictions Visualization (Separate plots) --')
-        cv1.plotly_chart(fig0, use_container_width=True)
-        fig1.write_image("./Report/figures/meas_vs_pred_cv_all.png")
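
The shaded Cross-Validation summary above relies on the pandas Styler: every per-fold row is painted grey so the 'sd', 'mean' and 'cv' rows stand out. A small reproduction of the trick (Styler.map requires pandas >= 2.1; on older versions the same method was called applymap):

    import pandas as pd

    scores = pd.DataFrame({"Fold1": [0.91, 0.02], "Fold2": [0.89, 0.03]}, index=["r2", "sd"])
    styled = scores.style.map(lambda _: "background-color: #cecece;",
                              subset=(scores.index.drop(["sd"]), slice(None)))
    # st.write(styled) renders the shaded table in Streamlit
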
+################################################### BEGIN : Model Diagnosis ####################################################
 st.header("III - Model Diagnosis", divider='blue')
-if Reg:
     # signal preprocessing results preparation for latex report
     prep_para = Reg.best_hyperparams_
@@ -467,7 +536,7 @@ if Reg:
             elif Reg.best_hyperparams_[i] > 1:
                 prep_para[i] = f"{Reg.best_hyperparams_[i]}nd"
 
-    ### reg plot and residuals plot
+    # reg plot and residuals plot
     if regression_algo != reg_algo[2]:
         regression_plot = reg_plot([y_train, y_test],[yc, yt], train_idx = train_index, test_idx = test_index)
         residual_plot = resid_plot([y_train, y_test], [yc, yt], train_idx=train_index, test_idx=test_index)
@@ -476,130 +545,19 @@
         residual_plot = resid_plot([y_train, y_test], [yc, yt], train_idx=train_index, test_idx=test_index)
 
     M7, M8 = st.columns([2,2])
+    with M7:
+        st.write('Predicted vs Measured values')
+        st.pyplot(regression_plot)
+        regression_plot.savefig('./Report/figures/measured_vs_predicted.png')
-    M7.write('Predicted vs Measured values')
-    M8.write('Residuals plot')
-
-    M7.pyplot(regression_plot)
-    M8.pyplot(residual_plot)
+    with M8:
+        st.write('Residuals plot')
+        st.pyplot(residual_plot)
+        residual_plot.savefig('./Report/figures/residuals_plot.png')
+
+################################################### END : Model Diagnosis #######################################################
+st.write('Download the Analysis Results')
-    residual_plot.savefig('./Report/figures/residuals_plot.png')
-    regression_plot.savefig('./Report/figures/measured_vs_predicted.png')
-#########################################
-if Reg:
-    st.header('Download Analysis Results', divider='blue')
-    def export_model():
-        path = 'data/models/model_'
-        match file:
-            case '.csv':
-                #export_package = __import__(model_export)
-                with open(path + model_name + date_time + '_created_on_' + xcal_csv.name[:xcal_csv.name.find(".")] +""+
-                          '_and_' + ycal_csv.name[:ycal_csv.name.find(".")] + '_data_' + '.pkl','wb') as f:
-                    joblib.dump(reg_model, f)
-                if regression_algo == reg_algo[3]:
-                    Reg.selected_features_.T.to_csv(path + model_name + date_time + '_on_' + xcal_csv.name[:xcal_csv.name.find(".")]
-                                                    + '_and_' + ycal_csv.name[:ycal_csv.name.find(".")] + '_data_'+'Wavelengths_index.csv', sep = ';')
-
-            case '.dx':
-                #export_package = __import__(model_export)
-                with open(path + model_name + '_on_'+ data_file.name[:data_file.name.find(".")] + '_data_' + '.pkl','wb') as f:
-                    joblib.dump(reg_model, f)
-                if regression_algo == reg_algo[3]:
-                    Reg.selected_features_.T.to_csv(path +data_file.name[:data_file.name.find(".")]+ model_name + date_time+ '_on_' + '_data_'+'Wavelengths_index.csv', sep = ';')
-
-
-
-
-
-    def export_report():
-        match regression_algo:
-            case 'PLS':
-                latex_report = report.report('Predictive model development', file_name, stats, list(Reg.best_hyperparams_.values()), regression_algo, model_per, cv_results)
-
-            case 'LW-PLS':
-                latex_report = report.report('Predictive model development', file_name, stats,
-                                             list({key: Reg.best_hyperparams_[key] for key in ['deriv', 'normalization', 'polyorder', 'window_length'] if key in Reg.best_hyperparams_}.values()), regression_algo, model_per, cv_results)
-
-            case 'TPE-iPLS':
-                latex_report = report.report('Predictive model development', file_name, stats,
-                                             list({key: Reg.best_hyperparams_[key] for key in ['deriv', 'normalization', 'polyorder', 'window_length'] if key in Reg.best_hyperparams_}.values()), regression_algo, model_per, cv_results)
-
-            case _:
-                st.warning('Data processing has not been performed or finished yet!', icon = "⚠️")
-
-        report.compile_latex()
-
-
-
-    M9, M10 = st.columns([1,1])
-    M10.info('The results are automatically converted into LaTeX code, a strong typesetting system noted for its remarkable document formatting.\
-              The comprehensive capabilities of LaTeX ensure that your data and findings are cleanly and properly presented,\
-              swith accurate formatting and organizing.')
-    # M9.write("-- Save the model --")
-    model_name = M9.text_input("Please provide a name for the created model: ",value = 'UNNAMED' , placeholder = 'model name')
-    items_download = M9.selectbox('To proceed, please choose the file or files you want to download from the list below:',
-                                  options = ['','Model', 'Report', 'Both Model & Report'], index=0, format_func=lambda x: x if x else "<Select>",
-                                  key=None, help=None, on_change=None, args=None, kwargs=None, placeholder="Choose an option", disabled=False, label_visibility="visible")
-
-
-    ## Save model and download report
-
-    # st.session_state.a = "Please wait while your LaTeX report is being compiled..."
-    date_time = datetime.datetime.strftime(datetime.date.today(), '_%Y_%m_%d_')
-    # match items_download:
-    #     case '':
-
-    if items_download:
-        if M9.button('Download', type="primary"):
-            match items_download:
-                case '':
-                    M9.warning('Please select an item from the dropdown list!')
-                case 'Model':
-                    export_model()
-                case 'Report':
-                    # M9.info("Please wait while your LaTeX report is being compiled...")
-                    export_report()
-                case 'Both Model & Report':
-                    export_model()
-                    export_report()
-            M9.success('The selected item has been exported successfully!')
-
-
-if Reg:
-
-    if st.session_state['interface'] == 'simple':
-        pages_folder = Path("pages/")
-        show_pages(
-            [Page("app.py", "Home"),
-             Page(str(pages_folder / "4-inputs.py"), "Inputs"),
-             Page(str(pages_folder / "1-samples_selection.py"), "Samples Selection"),
-             Page(str(pages_folder / "2-model_creation.py"), "Models Creation"),
-             Page(str(pages_folder / "3-prediction.py"), "Predictions"),
-             ]
-        )
-    st.page_link('pages\\3-prediction.py', label = 'Keep on keepin\' on to predict your values !')
-
-
-##### for outliers removal
-    # data_df = pd.DataFrame(
-    #     {
-    #         "widgets": ["st.selectbox", "st.number_input", "st.text_area", "st.button"],
-    #         "favorite": [True, False, False, True],
-    #     }
-    # )
-    # st.data_editor(
-    #     data_df,
-    #     column_config={
-    #         "favorite": st.column_config.CheckboxColumn(
-    #             "Your favorite",
-    #             help="Select your widgets",
-    #             default=False,
-    #         )
-    #     },
-    #     disabled=["widgets"],
-    #     hide_index=True,
-    # )
\ No newline at end of file
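
The removed download block bundled model serialisation and LaTeX report compilation; its core was a joblib dump of the fitted estimator. For reference, a compact sketch of that export pattern, should the stubbed-out download section be reinstated (the path is illustrative):

    import joblib

    def export_model(model, path="data/models/model.pkl"):
        # serialise the fitted estimator; reload later with joblib.load(path)
        with open(path, "wb") as f:
            joblib.dump(model, f)
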