From 6718fc6faf9440a9cc50aff1fd3665929838ccd9 Mon Sep 17 00:00:00 2001
From: DIANE <abderrahim.diane@cefe.cnrs.fr>
Date: Wed, 10 Apr 2024 16:07:42 +0200
Subject: [PATCH] - Wavelength selection successfully incorporated
 - Modifications to model creation pages
 - Correction of regression metrics

---
 Class_Mod/Miscellaneous.py      |   4 +-
 Class_Mod/PLSR_.py              |   2 +-
 Class_Mod/Regression_metrics.py |   7 ++-
 Class_Mod/VarSel.py             | 102 +++++++++++++++++---------------
 Packages.py                     |   3 +
 pages/2-model_creation.py       |  98 ++++++++++++++++--------------
 6 files changed, 115 insertions(+), 101 deletions(-)

diff --git a/Class_Mod/Miscellaneous.py b/Class_Mod/Miscellaneous.py
index 1ea7dde..1627b39 100644
--- a/Class_Mod/Miscellaneous.py
+++ b/Class_Mod/Miscellaneous.py
@@ -40,10 +40,10 @@ def resid_plot( meas, pred):
     sns.residplot(x = meas[1], y = pred[1], color='red', label = 'CV')
     sns.residplot(x = meas[2], y = pred[2], color='green', label = 'Test')
     ax.set_ylabel('Residuals')
-    ax.set_xlabel('Predicted values')
+    ax.set_xlabel('Measured values')
     plt.legend()
 
 # function that create a download button - needs the data to save and the file name to store to
 def download_results(data, export_name):
     with open(data) as f:
-        st.download_button('Download Results', f, export_name)
\ No newline at end of file
+        st.download_button('Download Results', f, export_name)
diff --git a/Class_Mod/PLSR_.py b/Class_Mod/PLSR_.py
index 7050ae4..709b8c4 100644
--- a/Class_Mod/PLSR_.py
+++ b/Class_Mod/PLSR_.py
@@ -25,7 +25,7 @@ class PinardPlsr:
         pipeline = Pipeline([
             ('scaler', MinMaxScaler()), # scaling the data
             ('preprocessing', FeatureUnion(preprocessing)), # preprocessing
-            ('PLS', PLSRegression())])
+            ('PLS', PLSRegression(n_components=14))])
         # Estimator including y values scaling
         estimator = TransformedTargetRegressor(regressor = pipeline, transformer = MinMaxScaler())
         # Training
diff --git a/Class_Mod/Regression_metrics.py b/Class_Mod/Regression_metrics.py
index a450a46..f958d8c 100644
--- a/Class_Mod/Regression_metrics.py
+++ b/Class_Mod/Regression_metrics.py
@@ -17,11 +17,12 @@ class metrics:
     @property
     def evaluate_(self):
         xbar = np.mean(self.meas) # the average of measured values
-        e2 = np.square(np.subtract(self.meas, self.pred))# the squared error
+        e = np.subtract(self.meas.ravel(), self.pred.ravel())
+        e2 = e**2 # the squared error
 
         # Sum of squared:
         # TOTAL
-        sst = np.sum((self.meas-xbar)**2)
+        sst = np.sum((self.meas - xbar)**2)
         # RESIDUAL
         ssr = np.sum(e2)
         # REGRESSION OR MODEL
@@ -32,7 +33,7 @@ class metrics:
         # Compute statistical metrics
         metr = pd.DataFrame()
         metr['r'] = [np.corrcoef(self.meas.ravel(), self.pred)[0,1]]
-        metr['r2'] = [ssm/sst]
+        metr['r2'] = [1 - ssr/sst]
         metr['rmse'] = [np.sqrt(np.mean(e2))]
         metr['mae'] = [np.mean(np.abs(e2))]
         metr['rpd'] = [np.std(self.meas)/np.sqrt(np.mean(e2))]
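
For reference, the corrected r2 above follows the standard residual-based definition, R2 = 1 - SSR/SST, instead of the previous SSM/SST ratio. A minimal NumPy sketch of the same computation, with made-up meas/pred arrays rather than the project's metrics class:

    import numpy as np

    meas = np.array([2.1, 3.4, 5.0, 6.2])   # hypothetical measured values
    pred = np.array([2.3, 3.1, 5.2, 6.0])   # hypothetical predicted values

    e = meas - pred                          # residuals
    sst = np.sum((meas - meas.mean())**2)    # total sum of squares
    ssr = np.sum(e**2)                       # residual sum of squares

    r2 = 1 - ssr/sst                         # coefficient of determination
    rmse = np.sqrt(np.mean(e**2))            # root mean squared error
    rpd = np.std(meas)/rmse                  # ratio of performance to deviation
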
diff --git a/Class_Mod/VarSel.py b/Class_Mod/VarSel.py
index 453602a..6e4a378 100644
--- a/Class_Mod/VarSel.py
+++ b/Class_Mod/VarSel.py
@@ -1,7 +1,6 @@
 from Packages import *
 from Class_Mod import metrics
-
 
 class TpeIpls:
     '''
     This framework is added to the clan of wavelengths selection algorithms.It was introduced as an improvement
@@ -14,10 +13,10 @@ class TpeIpls:
     '''Optimization algorithms can be used to find the subset of variables that optimize a certain
     criterion (e.g., maximize predictive performance, minimize overfitting)'''
 
-    SCORE = 10000
+    SCORE = 100000000
     index_export = pd.DataFrame()
-    def __init__(self, x_train, x_test, y_train, y_test, scale, Kfold, n_intervall):
-
+    def __init__(self, x_train, x_test, y_train, y_test,
+                 scale, Kfold, n_intervall):
         TpeIpls.SCORE = 10000
         self.x_train = x_train
         self.x_test = x_test
@@ -27,13 +26,12 @@ class TpeIpls:
         self.Kfold = Kfold
         self.p = self.x_train.shape[1]
         self.n_intervall = n_intervall
-        self.__n_arrets = self.n_intervall*2
-        self.PLS_params = {f'v{i}': hp.randint(f'v{i}', 0, self.p) for i in range(1,self.__n_arrets+1)}
+        self.n_arrets = self.n_intervall*2
+        self.PLS_params = {f'v{i}': hp.randint(f'v{i}', 0, self.p) for i in range(1,self.n_arrets+1)}
         self.PLS_params['n_components'] = hp.randint("n_components", 1, 6)
 
-
-    def _objective(self, params):
-        self.idx = [params[f'v{i}'] for i in range(1,self.__n_arrets+1)]
+    def objective(self, params):
+        self.idx = [params[f'v{i}'] for i in range(1,self.n_arrets+1)]
         self.idx.sort()
 
         arrays = [np.arange(self.idx[2*i],self.idx[2*i+1]+1) for i in range(self.n_intervall)]
@@ -65,72 +63,78 @@ class TpeIpls:
             TpeIpls.SCORE = score
             self.nlv = params['n_components']
-            print('--**-------------##---------#~###~#---------##---------------**--')
-            print(f'***** R²train : [{round(r2c * 100)}]**** R²cv : [{round(r2cv * 100)}]**** R²test : [{round(r2t * 100)}]*****')
-            print(f'***** N Predictiors : [{len(id)}] ******** NLV : [{params["n_components"]}]*****')
             TpeIpls.index_export = pd.DataFrame()
             TpeIpls.index_export["Vars"] = self.x_test.columns[id]
             TpeIpls.index_export.index = id
-            # Save model
-            #TpeIpls.index_export.to_excel(path + 'variables.xlsx')
-            ##3-performance
-            metrics(train=(self.y_train, yc), cv=(self.y_train, ycv) , test=(self.y_test, yt)).round(2).to_excel(path + "performance.xlsx")
+            self.segments = arrays
-
-            print("''---------------------------- evolution noticed, hence a new model was saved-------------------------------''")
-            self.idx = self.idx
         return score
 
-    def tune(self, n_iter):
-        print('------------------------------------------------ Optimization of the process has started ---------------------------------------------')
+
+    ##############################################
+
+    def BandSelect(self, n_iter):
         trials = Trials()
-        best_params = fmin(fn=self._objective,
+        best_params = fmin(fn=self.objective,
                            space=self.PLS_params,
                            algo=tpe.suggest, # Tree of Parzen Estimators’ (tpe) which is a Bayesian approach
                            max_evals=n_iter,
                            trials=trials,
                            verbose=2)
-
-
-    @property
-    def segments_(self):
-        self.bands = {}
+        ban = {}
         for i in range(len(self.segments)):
-            self.bands[f'band{i+1}'] = [self.segments[i][0], self.segments[i][self.segments[i].shape[0]-1]]
+            ban[f'band{i+1}'] = [self.segments[i][0], self.segments[i][self.segments[i].shape[0]-1]]
 
-        bands = pd.DataFrame(self.bands).T
-        bands.columns = ['from', 'to']
-        return bands
-
+        self.bands = pd.DataFrame(ban).T
+        self.bands.columns = ['from', 'to']
+
-    @property
-    def tpe_pls_performance(self):
         f = []
-        for i in range(self.segments_.shape[0]):
-            f.extend(np.arange(self.segments_["from"][i], self.segments_["to"][i]+1))
+        for i in range(self.bands.shape[0]):
+            f.extend(np.arange(self.bands["from"][i], self.bands["to"][i]+1))
+        variables_idx = list(set(f))
+
+
+
+        ############################################
+        for i in range(self.bands.shape[0]):
+            f.extend(np.arange(self.bands["from"][i], self.bands["to"][i]+1))
         variables_idx = list(set(f))
 
-        pls = PLSRegression(n_components=self.nlv, scale= self.scale)
-        pls.fit(self.x_train.iloc[:,variables_idx], self.y_train)
+        self.pls = PLSRegression(n_components=self.nlv, scale= self.scale)
+        self.pls.fit(self.x_train.iloc[:,variables_idx], self.y_train)
 
-        self.yc = pls.predict(self.x_train.iloc[:,variables_idx]).ravel()
-        self.ycv = cross_val_predict(pls, self.x_train.iloc[:,variables_idx], self.y_train, cv=self.Kfold, n_jobs=-1).ravel()
-        self.yt = pls.predict(self.x_test.iloc[:,variables_idx]).ravel()
+        self.yc = self.pls.predict(self.x_train.iloc[:,variables_idx]).ravel()
+        self.ycv = cross_val_predict(self.pls, self.x_train.iloc[:,variables_idx], self.y_train, cv=self.Kfold, n_jobs=-1).ravel()
+        self.yt = self.pls.predict(self.x_test.iloc[:,variables_idx]).ravel()
 
-        perf = metrics(train=(self.y_train, self.yc), cv=(self.y_train, self.ycv) , test=(self.y_test, self.yt)).round(2)
+        return self.bands, variables_idx
-
-        return perf
+
+    @property
+    def model_(self):
+        return self.pls
+
+    @property
+    def metrics_(self):
+        metc = metrics(self.y_train, self.yc)
+        metc = metc.evaluate_
+
+        metcv = metrics(self.y_train, self.ycv)
+        metcv = metcv.evaluate_
+
+        mett = metrics( self.y_test, self.yt)
+        mett = mett.evaluate_
+
+        met = pd.concat([metc, metcv, mett], axis = 0)
+        met.index = ['calib','cv','test']
+        return met
 
     @property
-    def meas_vs_pred(self):
-        fig, ax = plt.subplots()
-        sns.regplot(x = self.y_train ,y = self.yc, ax = ax)
-        sns.regplot(x = self.y_train ,y = self.ycv,ax = ax)
-        sns.regplot(x = self.y_test,y = self.yt,ax = ax)
-        plt.show()
\ No newline at end of file
+    def pred_data_(self):
+        return self.yc, self.ycv, self.yt
\ No newline at end of file
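
A minimal usage sketch of the reworked TpeIpls interface (the DataFrames and the parameter values below are illustrative, and the import path is an assumption):

    from Class_Mod.VarSel import TpeIpls  # assumed import path

    # X_train, X_test, y_train, y_test: pandas objects prepared elsewhere
    # (spectra as columns, one sample per row; y as a Series).
    selector = TpeIpls(x_train=X_train, x_test=X_test, y_train=y_train, y_test=y_test,
                       scale=False, Kfold=3, n_intervall=6)

    bands, variables_idx = selector.BandSelect(n_iter=100)  # TPE search over interval bounds

    fitted_pls = selector.model_        # PLSRegression fitted on the selected wavelengths
    scores = selector.metrics_          # calib / cv / test metrics as one DataFrame
    yc, ycv, yt = selector.pred_data_   # predictions for train, CV and test
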
diff --git a/Packages.py b/Packages.py
index 68c0bf0..cc180ac 100644
--- a/Packages.py
+++ b/Packages.py
@@ -57,4 +57,7 @@ import joblib
 # import pickle as pkl
 from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, anneal
+
+
+
 
 st.set_option('deprecation.showPyplotGlobalUse', False)
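
These hyperopt imports are what the interval search builds on; a self-contained sketch of the same fmin/TPE pattern with a toy objective (the names, bounds and scoring rule here are purely illustrative, not the app's values):

    import numpy as np
    from hyperopt import fmin, hp, tpe, Trials

    # Toy search space mirroring the TpeIpls pattern: integer split positions
    # plus a number of PLS latent variables.
    space = {f'v{i}': hp.randint(f'v{i}', 0, 200) for i in range(1, 5)}
    space['n_components'] = hp.randint('n_components', 1, 6)

    def objective(params):
        # Stand-in score; the real objective fits a PLS model on the
        # wavelengths delimited by the sampled positions and returns its error.
        idx = sorted(int(params[f'v{i}']) for i in range(1, 5))
        return float(idx[-1] - idx[0])  # fmin minimizes this value

    trials = Trials()
    best = fmin(fn=objective, space=space, algo=tpe.suggest,
                max_evals=20, trials=trials, verbose=2)
    print(best)
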
diff --git a/pages/2-model_creation.py b/pages/2-model_creation.py
index 7752440..23d7fea 100644
--- a/pages/2-model_creation.py
+++ b/pages/2-model_creation.py
@@ -3,24 +3,36 @@ st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
 from Modules import *
 from Class_Mod.DATA_HANDLING import *
+
+def nn(x):
+    return x is not None
 ########################################################################################
-# Model creation module
-container2 = st.container(border=True)
+reg_algo = ["","Full-PLS", "Locally Weighted PLS", "Interval-PLS"]
+# Model creation module
+st.header("Calibration Model Development", divider='blue')
+st.write("Create a predictive model, then use it for predicting your target variable (chemical values) from NIRS spectra")
 M1, M2, M3 = st.columns([2,2,2])
+M1.write("-- Performance metrics --")
 M4, M5 = st.columns([6,2])
-container3 = st.container(border=True)
+st.write("---")
+st.header("Model Diagnosis", divider='blue')
+
 M7, M8 = st.columns([2,2])
+M7.write('Predicted vs Measured values')
+M8.write('Residuals plot')
+M9, M10 = st.columns([2,2])
+M9.write("-- Save the model --")
+
+
+
-available_regression_algo = ["","SciKitLearn PLSR", "Jchemo Local Weighted PLSR", "Intervalle Selection PLSR"]
-with container2:
-    st.header("Calibration Model Development", divider='blue')
-    st.write("Create a predictive model, then use it for predicting your target variable(chemical values) from NIRS spectra")
-    # CSV files loader
-    xcal_csv = M3.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
-    ycal_csv = M3.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
+
+# CSV files loader
+xcal_csv = M3.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
+ycal_csv = M3.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
 
-    if xcal_csv is not None and ycal_csv is not None:
+
+if xcal_csv is not None and ycal_csv is not None:
     # Select list for CSV delimiter
     sep = M3.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+xcal_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+xcal_csv.name))), key=0)
     # Select list for CSV header True / False
@@ -29,61 +41,55 @@ with container2:
         col = 0
     else:
         col = False
-    rd_seed = M1.slider("Choose seed", min_value=1, max_value=1212, value=42, format="%i")
+    rd_seed = M1.slider("Change Train-test split", min_value=1, max_value=1212, value=42, format="%i")
     x, y = utils.load_csv(xcal_csv, ycal_csv, autoremove_na=True, sep=sep, x_hdr=0, y_hdr=0, x_index_col=col, y_index_col=col)
     # Split data into training and test sets using the kennard_stone method and correlation metric, 25% of data is used for testing
     train_index, test_index = train_test_split_idx(x, y=y, method="kennard_stone", metric="correlation", test_size=0.25, random_state=rd_seed)
     # Assign data to training and test sets
     X_train, y_train, X_test, y_test = pd.DataFrame(x[train_index]), pd.DataFrame(y[train_index]), pd.DataFrame(x[test_index]), pd.DataFrame(y[test_index])
-    #############################
+    y_train = y_train.iloc[:,0]
+    y_test = y_test.iloc[:,0]
+
-    regression_algo = M1.selectbox("Choose the algorithm for regression", options=available_regression_algo, key = 12)
-    if regression_algo == 'SciKitLearn PLSR':
+    ############################# Regression modelling ##########################################
+    regression_algo = M1.selectbox("Choose the algorithm for regression", options=reg_algo, key = 12)
+    if regression_algo == reg_algo[1]:
         # Train model with model function from application_functions.py
         Reg = PinardPlsr(x_train=X_train, x_test=X_test,y_train=y_train, y_test=y_test)
         reg_model = Reg.model_
-        #M2.dataframe(Pin.pred_data_)
-    elif regression_algo == 'Jchemo Local Weighted PLSR':
+    elif regression_algo == reg_algo[2]:
         reg_model = model_LWPLSR(xcal_csv, ycal_csv, sep, hdr)
-    elif regression_algo == "Intervalle Selection PLSR":
+    elif regression_algo == reg_algo[3]:
         s = M2.number_input(label='Enter the maximum number of intervalls', min_value=1, max_value=6, value="min")
-        reg_model = TpeIpls(x_train= X_train, y_train= y_train, x_test=X_test, y_test= y_test,Kfold= 3,scale= True, n_intervall = 3)
-        reg_model.tune(n_iter=10)
-
-    if regression_algo in ["SciKitLearn PLSR", "Jchemo Local Weighted PLSR", "Intervalle Selection PLSR"]:
-        with container3:
-            st.header("Model Diagnosis", divider='blue')
-            yc = Reg.pred_data_[0]
-            ycv = Reg.pred_data_[1]
-            yt = Reg.pred_data_[2]
-            M7.write('Predicted vs Measured values')
-            M7.pyplot(reg_plot([y_train, y_train, y_test],[yc, ycv, yt]))
-            M8.write('Residuals plot')
-            M8.pyplot(resid_plot([y_train, y_train, y_test],[yc, ycv, yt]))
-
-
-    # Export the model with pickle or joblib
-    if regression_algo != '':
-        M1.write("-- Performance metrics --")
+        it = M2.number_input(label='Enter the maximum number of iterations', min_value=50, max_value=1000, value="min")
+        Reg = TpeIpls(x_train = X_train, x_test=X_test, y_train = y_train, y_test = y_test, scale = False, Kfold = 3, n_intervall = 6)
+        rega = Reg.BandSelect(n_iter=it)
+        reg_model = Reg.model_
+
+    ################# Model analysis ############
+
+    if regression_algo in reg_algo[1:]:
+        yc = Reg.pred_data_[0]
+        ycv = Reg.pred_data_[1]
+        yt = Reg.pred_data_[2]
+
+        M7.pyplot(reg_plot([y_train, y_train, y_test],[yc, ycv, yt]))
+        M8.pyplot(resid_plot([y_train, y_train, y_test],[yc, ycv, yt]))
         M1.dataframe(Reg.metrics_)
-        M1.write("-- Save the model --")
+
         #model_export = M1.selectbox("Choose way to export", options=["pickle", "joblib"], key=20)
-        model_name = M1.text_input('Give it a name')
-        if M1.button('Export Model'):
+        model_name = M9.text_input('Give it a name')
+        if M9.button('Export Model'):
             #export_package = __import__(model_export)
             with open('data/models/model_' + model_name + '_on_' + xcal_csv.name + '_and_' + ycal_csv.name + '_data_' + '.pkl','wb') as f:
-                joblib.dump(reg_model,f)
+                joblib.dump(reg_model, f)
                 st.write('Model Exported')
 
         # create a report with information on the model
         ## see https://stackoverflow.com/a/59578663
-        #M4.pyplot(reg_plot(meas==(ycal_csv,ycal_csv,ycal_csv], pred=[ycal_csv,ycal_csv,ycal_csv]))
-
-
-# graphical delimiter
-st.write("---")
-
+        #M4.pyplot(reg_plot(meas==(ycal_csv,ycal_csv,ycal_csv], pred=[ycal_csv,ycal_csv,ycal_csv]))
\ No newline at end of file
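
Since the page now persists the fitted estimator with joblib, a short sketch of reloading and reusing an exported model (the file name and the new-spectra CSV below are hypothetical):

    import joblib
    import pandas as pd

    # Hypothetical path following the naming pattern produced by the export button.
    with open('data/models/model_mymodel_on_Xcal.csv_and_Ycal.csv_data_.pkl', 'rb') as f:
        model = joblib.load(f)

    # Hypothetical new spectra; for the interval-PLS model the columns must
    # match the wavelengths the model was fitted on.
    new_spectra = pd.read_csv('data/new_spectra.csv', sep=';', index_col=0)
    predictions = model.predict(new_spectra)
    print(predictions[:5])
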
-- 
GitLab