From 6718fc6faf9440a9cc50aff1fd3665929838ccd9 Mon Sep 17 00:00:00 2001
From: DIANE <abderrahim.diane@cefe.cnrs.fr>
Date: Wed, 10 Apr 2024 16:07:42 +0200
Subject: [PATCH] - Wavelength selection successfully incorporated
 - Modifications to model creation pages
 - Correction of regression metrics

---
 Class_Mod/Miscellaneous.py      |   4 +-
 Class_Mod/PLSR_.py              |   2 +-
 Class_Mod/Regression_metrics.py |   7 ++-
 Class_Mod/VarSel.py             | 102 +++++++++++++++++---------------
 Packages.py                     |   3 +
 pages/2-model_creation.py       |  98 ++++++++++++++++--------------
 6 files changed, 115 insertions(+), 101 deletions(-)

diff --git a/Class_Mod/Miscellaneous.py b/Class_Mod/Miscellaneous.py
index 1ea7dde..1627b39 100644
--- a/Class_Mod/Miscellaneous.py
+++ b/Class_Mod/Miscellaneous.py
@@ -40,10 +40,10 @@ def resid_plot( meas, pred):
     sns.residplot(x = meas[1], y = pred[1], color='red', label = 'CV')
     sns.residplot(x = meas[2], y = pred[2], color='green', label = 'Test')
     ax.set_ylabel('Residuals')
-    ax.set_xlabel('Predicted values')
+    ax.set_xlabel('Measured values')
     plt.legend()
 
 # function that create a download button - needs the data to save and the file name to store to
 def download_results(data, export_name):
     with open(data) as f:
-        st.download_button('Download Results', f, export_name)
\ No newline at end of file
+        st.download_button('Download Results', f, export_name)
diff --git a/Class_Mod/PLSR_.py b/Class_Mod/PLSR_.py
index 7050ae4..709b8c4 100644
--- a/Class_Mod/PLSR_.py
+++ b/Class_Mod/PLSR_.py
@@ -25,7 +25,7 @@ class PinardPlsr:
         pipeline = Pipeline([
             ('scaler', MinMaxScaler()), # scaling the data
             ('preprocessing', FeatureUnion(preprocessing)), # preprocessing
-            ('PLS', PLSRegression())])
+            ('PLS', PLSRegression(n_components=14))])
         # Estimator including y values scaling
         estimator = TransformedTargetRegressor(regressor = pipeline, transformer = MinMaxScaler())
         # Training
diff --git a/Class_Mod/Regression_metrics.py b/Class_Mod/Regression_metrics.py
index a450a46..f958d8c 100644
--- a/Class_Mod/Regression_metrics.py
+++ b/Class_Mod/Regression_metrics.py
@@ -17,11 +17,12 @@ class metrics:
     @property
     def evaluate_(self):
         xbar = np.mean(self.meas) # the average of measured values
-        e2 = np.square(np.subtract(self.meas, self.pred))# the squared error
+        e = np.subtract(self.meas.ravel(), self.pred.ravel())
+        e2 = e**2 # the squared error
 
         # Sum of squared:
         # TOTAL
-        sst = np.sum((self.meas-xbar)**2)
+        sst = np.sum((self.meas - xbar)**2)
         # RESIDUAL
         ssr = np.sum(e2)
         # REGRESSION OR MODEL
@@ -32,7 +33,7 @@ class metrics:
         # Compute statistical metrics
         metr = pd.DataFrame()
         metr['r'] = [np.corrcoef(self.meas.ravel(), self.pred)[0,1]]
-        metr['r2'] = [ssm/sst]
+        metr['r2'] = [1 - ssr/sst]
         metr['rmse'] = [np.sqrt(np.mean(e2))]
         metr['mae'] = [np.mean(np.abs(e2))]
         metr['rpd'] = [np.std(self.meas)/np.sqrt(np.mean(e2))]
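
For reference, the corrected r2 above follows the standard residual-based definition, R2 = 1 - SSR/SST, instead of the previous SSM/SST ratio. A minimal NumPy sketch of the same computation, with made-up meas/pred arrays rather than the project's metrics class:

    import numpy as np

    meas = np.array([2.1, 3.4, 5.0, 6.2])   # hypothetical measured values
    pred = np.array([2.3, 3.1, 5.2, 6.0])   # hypothetical predicted values

    e = meas - pred                          # residuals
    sst = np.sum((meas - meas.mean())**2)    # total sum of squares
    ssr = np.sum(e**2)                       # residual sum of squares

    r2 = 1 - ssr/sst                         # coefficient of determination
    rmse = np.sqrt(np.mean(e**2))            # root mean squared error
    rpd = np.std(meas)/rmse                  # ratio of performance to deviation
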
diff --git a/Class_Mod/VarSel.py b/Class_Mod/VarSel.py
index 453602a..6e4a378 100644
--- a/Class_Mod/VarSel.py
+++ b/Class_Mod/VarSel.py
@@ -1,7 +1,6 @@
 from Packages import *
 from Class_Mod import metrics
-
 
 class TpeIpls:
     '''
     This framework is added to the clan of wavelengths selection algorithms.It was introduced as an improvement
@@ -14,10 +13,10 @@ class TpeIpls:
     '''Optimization algorithms can be used to find the subset of variables that optimize a certain
     criterion (e.g., maximize predictive performance, minimize overfitting)'''
 
-    SCORE = 10000
+    SCORE = 100000000
     index_export = pd.DataFrame()
-    def __init__(self, x_train, x_test, y_train, y_test, scale, Kfold, n_intervall):
-
+    def __init__(self, x_train, x_test, y_train, y_test,
+                 scale, Kfold, n_intervall):
         TpeIpls.SCORE = 10000
         self.x_train = x_train
         self.x_test = x_test
@@ -27,13 +26,12 @@ class TpeIpls:
         self.Kfold = Kfold
         self.p = self.x_train.shape[1]
         self.n_intervall = n_intervall
-        self.__n_arrets = self.n_intervall*2
-        self.PLS_params = {f'v{i}': hp.randint(f'v{i}', 0, self.p) for i in range(1,self.__n_arrets+1)}
+        self.n_arrets = self.n_intervall*2
+        self.PLS_params = {f'v{i}': hp.randint(f'v{i}', 0, self.p) for i in range(1,self.n_arrets+1)}
         self.PLS_params['n_components'] = hp.randint("n_components", 1, 6)
 
-
-    def _objective(self, params):
-        self.idx = [params[f'v{i}'] for i in range(1,self.__n_arrets+1)]
+    def objective(self, params):
+        self.idx = [params[f'v{i}'] for i in range(1,self.n_arrets+1)]
         self.idx.sort()
 
         arrays = [np.arange(self.idx[2*i],self.idx[2*i+1]+1) for i in range(self.n_intervall)]
@@ -65,72 +63,78 @@ class TpeIpls:
             TpeIpls.SCORE = score
             self.nlv = params['n_components']
-            print('--**-------------##---------#~###~#---------##---------------**--')
-            print(f'***** R²train : [{round(r2c * 100)}]**** R²cv : [{round(r2cv * 100)}]**** R²test : [{round(r2t * 100)}]*****')
-            print(f'***** N Predictiors : [{len(id)}] ******** NLV : [{params["n_components"]}]*****')
             TpeIpls.index_export = pd.DataFrame()
             TpeIpls.index_export["Vars"] = self.x_test.columns[id]
             TpeIpls.index_export.index = id
-            # Save model
-            #TpeIpls.index_export.to_excel(path + 'variables.xlsx')
-            ##3-performance
-            metrics(train=(self.y_train, yc), cv=(self.y_train, ycv) , test=(self.y_test, yt)).round(2).to_excel(path + "performance.xlsx")
+            self.segments = arrays
-
-            print("''---------------------------- evolution noticed, hence a new model was saved-------------------------------''")
-            self.idx = self.idx
         return score
 
-    def tune(self, n_iter):
-        print('------------------------------------------------ Optimization of the process has started ---------------------------------------------')
+
+    ##############################################
+
+    def BandSelect(self, n_iter):
         trials = Trials()
-        best_params = fmin(fn=self._objective,
+        best_params = fmin(fn=self.objective,
                            space=self.PLS_params,
                            algo=tpe.suggest, # Tree of Parzen Estimators’ (tpe) which is a Bayesian approach
                            max_evals=n_iter,
                            trials=trials,
                            verbose=2)
-
-
-    @property
-    def segments_(self):
-        self.bands = {}
+        ban = {}
         for i in range(len(self.segments)):
-            self.bands[f'band{i+1}'] = [self.segments[i][0], self.segments[i][self.segments[i].shape[0]-1]]
+            ban[f'band{i+1}'] = [self.segments[i][0], self.segments[i][self.segments[i].shape[0]-1]]
 
-        bands = pd.DataFrame(self.bands).T
-        bands.columns = ['from', 'to']
-        return bands
-
+        self.bands = pd.DataFrame(ban).T
+        self.bands.columns = ['from', 'to']
+
-    @property
-    def tpe_pls_performance(self):
         f = []
-        for i in range(self.segments_.shape[0]):
-            f.extend(np.arange(self.segments_["from"][i], self.segments_["to"][i]+1))
+        for i in range(self.bands.shape[0]):
+            f.extend(np.arange(self.bands["from"][i], self.bands["to"][i]+1))
+        variables_idx = list(set(f))
+
+
+
+        ############################################
+        for i in range(self.bands.shape[0]):
+            f.extend(np.arange(self.bands["from"][i], self.bands["to"][i]+1))
         variables_idx = list(set(f))
 
-        pls = PLSRegression(n_components=self.nlv, scale= self.scale)
-        pls.fit(self.x_train.iloc[:,variables_idx], self.y_train)
+        self.pls = PLSRegression(n_components=self.nlv, scale= self.scale)
+        self.pls.fit(self.x_train.iloc[:,variables_idx], self.y_train)
 
-        self.yc = pls.predict(self.x_train.iloc[:,variables_idx]).ravel()
-        self.ycv = cross_val_predict(pls, self.x_train.iloc[:,variables_idx], self.y_train, cv=self.Kfold, n_jobs=-1).ravel()
-        self.yt = pls.predict(self.x_test.iloc[:,variables_idx]).ravel()
+        self.yc = self.pls.predict(self.x_train.iloc[:,variables_idx]).ravel()
+        self.ycv = cross_val_predict(self.pls, self.x_train.iloc[:,variables_idx], self.y_train, cv=self.Kfold, n_jobs=-1).ravel()
+        self.yt = self.pls.predict(self.x_test.iloc[:,variables_idx]).ravel()
 
-        perf = metrics(train=(self.y_train, self.yc), cv=(self.y_train, self.ycv) , test=(self.y_test, self.yt)).round(2)
+        return self.bands, variables_idx
-
-        return perf
+
+    @property
+    def model_(self):
+        return self.pls
+
+    @property
+    def metrics_(self):
+        metc = metrics(self.y_train, self.yc)
+        metc = metc.evaluate_
+
+        metcv = metrics(self.y_train, self.ycv)
+        metcv = metcv.evaluate_
+
+        mett = metrics( self.y_test, self.yt)
+        mett = mett.evaluate_
+
+        met = pd.concat([metc, metcv, mett], axis = 0)
+        met.index = ['calib','cv','test']
+        return met
 
     @property
-    def meas_vs_pred(self):
-        fig, ax = plt.subplots()
-        sns.regplot(x = self.y_train ,y = self.yc, ax = ax)
-        sns.regplot(x = self.y_train ,y = self.ycv,ax = ax)
-        sns.regplot(x = self.y_test,y = self.yt,ax = ax)
-        plt.show()
\ No newline at end of file
+    def pred_data_(self):
+        return self.yc, self.ycv, self.yt
\ No newline at end of file
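
A minimal usage sketch of the reworked TpeIpls interface (the DataFrames and the parameter values below are illustrative, and the import path is an assumption):

    from Class_Mod.VarSel import TpeIpls  # assumed import path

    # X_train, X_test, y_train, y_test: pandas objects prepared elsewhere
    # (spectra as columns, one sample per row; y as a Series).
    selector = TpeIpls(x_train=X_train, x_test=X_test, y_train=y_train, y_test=y_test,
                       scale=False, Kfold=3, n_intervall=6)

    bands, variables_idx = selector.BandSelect(n_iter=100)  # TPE search over interval bounds

    fitted_pls = selector.model_        # PLSRegression fitted on the selected wavelengths
    scores = selector.metrics_          # calib / cv / test metrics as one DataFrame
    yc, ycv, yt = selector.pred_data_   # predictions for train, CV and test
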
diff --git a/Packages.py b/Packages.py
index 68c0bf0..cc180ac 100644
--- a/Packages.py
+++ b/Packages.py
@@ -57,4 +57,7 @@ import joblib
 # import pickle as pkl
 from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, anneal
+
+
+
 
 st.set_option('deprecation.showPyplotGlobalUse', False)
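
These hyperopt imports are what the interval search builds on; a self-contained sketch of the same fmin/TPE pattern with a toy objective (the names, bounds and scoring rule here are purely illustrative, not the app's values):

    import numpy as np
    from hyperopt import fmin, hp, tpe, Trials

    # Toy search space mirroring the TpeIpls pattern: integer split positions
    # plus a number of PLS latent variables.
    space = {f'v{i}': hp.randint(f'v{i}', 0, 200) for i in range(1, 5)}
    space['n_components'] = hp.randint('n_components', 1, 6)

    def objective(params):
        # Stand-in score; the real objective fits a PLS model on the
        # wavelengths delimited by the sampled positions and returns its error.
        idx = sorted(int(params[f'v{i}']) for i in range(1, 5))
        return float(idx[-1] - idx[0])  # fmin minimizes this value

    trials = Trials()
    best = fmin(fn=objective, space=space, algo=tpe.suggest,
                max_evals=20, trials=trials, verbose=2)
    print(best)
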
diff --git a/pages/2-model_creation.py b/pages/2-model_creation.py
index 7752440..23d7fea 100644
--- a/pages/2-model_creation.py
+++ b/pages/2-model_creation.py
@@ -3,24 +3,36 @@ st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
 from Modules import *
 from Class_Mod.DATA_HANDLING import *
+
+def nn(x):
+    return x is not None
 ########################################################################################
-# Model creation module
-container2 = st.container(border=True)
+reg_algo = ["","Full-PLS", "Locally Weighted PLS", "Interval-PLS"]
+# Model creation module
+st.header("Calibration Model Development", divider='blue')
+st.write("Create a predictive model, then use it for predicting your target variable (chemical values) from NIRS spectra")
 M1, M2, M3 = st.columns([2,2,2])
+M1.write("-- Performance metrics --")
 M4, M5 = st.columns([6,2])
-container3 = st.container(border=True)
+st.write("---")
+st.header("Model Diagnosis", divider='blue')
+
 M7, M8 = st.columns([2,2])
+M7.write('Predicted vs Measured values')
+M8.write('Residuals plot')
+M9, M10 = st.columns([2,2])
+M9.write("-- Save the model --")
+
+
+
-available_regression_algo = ["","SciKitLearn PLSR", "Jchemo Local Weighted PLSR", "Intervalle Selection PLSR"]
-with container2:
-    st.header("Calibration Model Development", divider='blue')
-    st.write("Create a predictive model, then use it for predicting your target variable(chemical values) from NIRS spectra")
-    # CSV files loader
-    xcal_csv = M3.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
-    ycal_csv = M3.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
+
+# CSV files loader
+xcal_csv = M3.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
+ycal_csv = M3.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
 
-    if xcal_csv is not None and ycal_csv is not None:
+
+if xcal_csv is not None and ycal_csv is not None:
     # Select list for CSV delimiter
     sep = M3.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+xcal_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+xcal_csv.name))), key=0)
     # Select list for CSV header True / False
@@ -29,61 +41,55 @@ with container2:
         col = 0
     else:
         col = False
-    rd_seed = M1.slider("Choose seed", min_value=1, max_value=1212, value=42, format="%i")
+    rd_seed = M1.slider("Change Train-test split", min_value=1, max_value=1212, value=42, format="%i")
     x, y = utils.load_csv(xcal_csv, ycal_csv, autoremove_na=True, sep=sep, x_hdr=0, y_hdr=0, x_index_col=col, y_index_col=col)
     # Split data into training and test sets using the kennard_stone method and correlation metric, 25% of data is used for testing
     train_index, test_index = train_test_split_idx(x, y=y, method="kennard_stone", metric="correlation", test_size=0.25, random_state=rd_seed)
     # Assign data to training and test sets
     X_train, y_train, X_test, y_test = pd.DataFrame(x[train_index]), pd.DataFrame(y[train_index]), pd.DataFrame(x[test_index]), pd.DataFrame(y[test_index])
-    #############################
+    y_train = y_train.iloc[:,0]
+    y_test = y_test.iloc[:,0]
+
-    regression_algo = M1.selectbox("Choose the algorithm for regression", options=available_regression_algo, key = 12)
-    if regression_algo == 'SciKitLearn PLSR':
+    ############################# Regression modelling ##########################################
+    regression_algo = M1.selectbox("Choose the algorithm for regression", options=reg_algo, key = 12)
+    if regression_algo == reg_algo[1]:
         # Train model with model function from application_functions.py
         Reg = PinardPlsr(x_train=X_train, x_test=X_test,y_train=y_train, y_test=y_test)
         reg_model = Reg.model_
-        #M2.dataframe(Pin.pred_data_)
-    elif regression_algo == 'Jchemo Local Weighted PLSR':
+    elif regression_algo == reg_algo[2]:
         reg_model = model_LWPLSR(xcal_csv, ycal_csv, sep, hdr)
-    elif regression_algo == "Intervalle Selection PLSR":
+    elif regression_algo == reg_algo[3]:
         s = M2.number_input(label='Enter the maximum number of intervalls', min_value=1, max_value=6, value="min")
-        reg_model = TpeIpls(x_train= X_train, y_train= y_train, x_test=X_test, y_test= y_test,Kfold= 3,scale= True, n_intervall = 3)
-        reg_model.tune(n_iter=10)
-
-    if regression_algo in ["SciKitLearn PLSR", "Jchemo Local Weighted PLSR", "Intervalle Selection PLSR"]:
-        with container3:
-            st.header("Model Diagnosis", divider='blue')
-            yc = Reg.pred_data_[0]
-            ycv = Reg.pred_data_[1]
-            yt = Reg.pred_data_[2]
-            M7.write('Predicted vs Measured values')
-            M7.pyplot(reg_plot([y_train, y_train, y_test],[yc, ycv, yt]))
-            M8.write('Residuals plot')
-            M8.pyplot(resid_plot([y_train, y_train, y_test],[yc, ycv, yt]))
-
-
-    # Export the model with pickle or joblib
-    if regression_algo != '':
-        M1.write("-- Performance metrics --")
+        it = M2.number_input(label='Enter the maximum number of iterations', min_value=50, max_value=1000, value="min")
+        Reg = TpeIpls(x_train = X_train, x_test=X_test, y_train = y_train, y_test = y_test, scale = False, Kfold = 3, n_intervall = 6)
+        rega = Reg.BandSelect(n_iter=it)
+        reg_model = Reg.model_
+
+    ################# Model analysis ############
+
+    if regression_algo in reg_algo[1:]:
+        yc = Reg.pred_data_[0]
+        ycv = Reg.pred_data_[1]
+        yt = Reg.pred_data_[2]
+
+        M7.pyplot(reg_plot([y_train, y_train, y_test],[yc, ycv, yt]))
+        M8.pyplot(resid_plot([y_train, y_train, y_test],[yc, ycv, yt]))
         M1.dataframe(Reg.metrics_)
-        M1.write("-- Save the model --")
+
         #model_export = M1.selectbox("Choose way to export", options=["pickle", "joblib"], key=20)
-        model_name = M1.text_input('Give it a name')
-        if M1.button('Export Model'):
+        model_name = M9.text_input('Give it a name')
+        if M9.button('Export Model'):
             #export_package = __import__(model_export)
             with open('data/models/model_' + model_name + '_on_' + xcal_csv.name + '_and_' + ycal_csv.name + '_data_' + '.pkl','wb') as f:
-                joblib.dump(reg_model,f)
+                joblib.dump(reg_model, f)
                 st.write('Model Exported')
 
         # create a report with information on the model
         ## see https://stackoverflow.com/a/59578663
-        #M4.pyplot(reg_plot(meas==(ycal_csv,ycal_csv,ycal_csv], pred=[ycal_csv,ycal_csv,ycal_csv]))
-
-
-# graphical delimiter
-st.write("---")
-
+        #M4.pyplot(reg_plot(meas==(ycal_csv,ycal_csv,ycal_csv], pred=[ycal_csv,ycal_csv,ycal_csv]))
\ No newline at end of file
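
Since the page now persists the fitted estimator with joblib, a short sketch of reloading and reusing an exported model (the file name and the new-spectra CSV below are hypothetical):

    import joblib
    import pandas as pd

    # Hypothetical path following the naming pattern produced by the export button.
    with open('data/models/model_mymodel_on_Xcal.csv_and_Ycal.csv_data_.pkl', 'rb') as f:
        model = joblib.load(f)

    # Hypothetical new spectra; for the interval-PLS model the columns must
    # match the wavelengths the model was fitted on.
    new_spectra = pd.read_csv('data/new_spectra.csv', sep=';', index_col=0)
    predictions = model.predict(new_spectra)
    print(predictions[:5])
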
-- 
GitLab