- Cross validation methods

- Implementing CV and visualizing the results - Organization des classes des algos de régression

- Cross validation methods
- Implementing CV and visualizing the results - Organization des classes des algos de régression
2ec31fd7 · DIANE · ac1a9cbd · 2ec31fd7 · 2ec31fd7 · 2ec31fd7
Commit 2ec31fd7 authored 9 months ago by DIANE
--- a/src/Class_Mod/DATA_HANDLING.py
+++ b/src/Class_Mod/DATA_HANDLING.py
 from Packages import *
+from .Evaluation_Metrics import metrics

 ## try to automatically detect the field separator within the CSV
 def find_delimiter(filename):
@@ -62,24 +63,17 @@ def MinMaxScale(X):
    sc = pd.DataFrame(sk.fit_transform(X), index = t.index, columns = t.columns)
    return sc

-
-
 ######################################## Spectral preprocessing
-
 def Detrend(X):
    c = detrend(X, axis=-1, type='linear', bp=0, overwrite_data=False)
    return c

-def Sg11(X):
-    c = savgol_filter(X, polyorder=1, deriv=1, window_length = 7)
-    return c
-
 def Snv(X):
    xt = np.array(X).T
    c = (xt-xt.mean())/xt.std()
    return pd.DataFrame(c.T, index=X.index, columns= X.columns)

-def Non(X):
+def No_transformation(X):
    return X


@@ -105,18 +99,36 @@ class KF_CV:
        y = np.array(y)

        yp = {}
-        folds = CV(x=x, y=y, n_folds=n_folds)### Test index
+        folds = KF_CV.CV(x=x, y=y, n_folds=n_folds)### Test index
        key = list(folds.keys())

        for i in range(n_folds):
            model.fit(np.delete(x, folds[key[i]], axis=0), np.delete(y, folds[key[i]], axis=0))
            yp[key[i]] = model.predict(x[folds[key[i]]]) #### predictions/fold
-        return yp, folds
+        
+
+        cvcv = {}
+        coeff = {}
+        for i, Fname in enumerate(folds.keys()):
+            r = pd.DataFrame()
+            r['Predicted'] = yp[Fname]
+            r['Measured'] = y[folds[Fname]]
+            ols = LinearRegression().fit(pd.DataFrame(y[folds[Fname]]),yp[Fname].reshape(-1,1))
+            r.index = folds[Fname]
+            r['Folds'] = [f'{Fname} (Predicted = {np.round(ols.intercept_[0], 2)} + {np.round(ols.coef_[0][0],2)} x Measured'] * r.shape[0]
+            cvcv[i] = r
+            coeff[Fname] = [ols.coef_[0][0], ols.intercept_[0]]
+
+        data = pd.concat(cvcv, axis = 0)
+        data['index'] = [data.index[i][1] for i in range(data.shape[0])]
+        data.index = data['index']
+        coeff = pd.DataFrame(coeff, index = ['Slope', 'Intercept'])    
+        return yp, folds, data, coeff

    ### compute metrics for each fold
    @staticmethod
    def process(model, x, y, n_folds:int):
-        f, idx = KF_CV.cross_val_predictor(model, x=x,y=y, n_folds=n_folds)
+        f, idx,_ , _ = KF_CV.cross_val_predictor(model, x=x,y=y, n_folds=n_folds)
        e = {}
        for i in idx.keys():
            e[i] = metrics().reg_(y.iloc[idx[i]],f[i])
@@ -138,7 +150,7 @@ class KF_CV:
    @staticmethod
    def ycv(model, x, y, n_folds:int):
        ycv = np.zeros(y.shape[0])
-        f, idx = KF_CV.cross_val_predictor(model, x,y, n_folds)
+        f, idx,_,_ = KF_CV.cross_val_predictor(model, x,y, n_folds)
        for i in f.keys():
            ycv[idx[i]] = f[i]            
        return ycv
--- a/src/Class_Mod/RegModels.py
+++ b/src/Class_Mod/RegModels.py
+from Packages import *
+from Class_Mod import metrics, Snv, No_transformation, KF_CV
+
+
+class Regmodel(object):
+    
+    def __init__(self, train : [pd.DataFrame, pd.DataFrame], test : [pd.DataFrame, pd.DataFrame], n_iter, add_hyperparams = None, nfolds = 5, **kwargs):
+        self.SCORE = 100000000
+        self._xc, self._xt, self._ytrain, self._ytest = train[0], test[0], train[1], test[1]
+        self._nc, self._nt, self._p = train[0].shape[0], test[0].shape[0], train[0].shape[1]
+        self._model, self._best = None, None
+        self._yc, self._ycv, self._yt = None, None, None
+        self._cv_df = pd.DataFrame
+        self._nfolds = nfolds
+        self.bands = pd.DataFrame()
+        self.important_features = pd.DataFrame()
+        self._hyper_params = {'polyorder': hp.choice('polyorder', [0, 1, 2]),
+                            'deriv': hp.choice('deriv', [0, 1, 2]),
+                            'window_length': hp.choice('window_length', [15, 21, 27, 33]),
+                            'scatter': hp.choice('scatter', ['Snv', 'No_transformation'])}
+        if add_hyperparams is not None:
+            self._hyper_params.update(add_hyperparams)
+            self._best = None
+
+        trials = Trials()
+        best_params = fmin(fn=self.objective,
+                            space=self._hyper_params,
+                            algo=tpe.suggest,  # Tree of Parzen Estimators’ (tpe) which is a Bayesian approach
+                            max_evals=n_iter,
+                            trials=trials,
+                            verbose=1)
+    
+    @property
+    def train_data_(self):
+        return [self._xc, self._ytrain]
+    
+    @property
+    def test_data_(self):
+        return [self._xt, self._ytest]
+
+    @property
+    def get_params_(self):
+       return self._hyper_params
+    
+    def objective(self, params):
+       pass
+    
+    @property
+    def best_hyperparams(self): 
+       return self._best
+    
+    @property
+    def model_(self):
+        return self._model
+    
+    @property
+    def pred_data_(self):
+        return self._yc, self._yt
+    
+    @property
+    def cv_data_(self):
+        return self._ycv
+    
+    @property
+    def CV_results_(self):
+        return self._cv_df
+    @property
+    def important_features_(self):
+        return self.important_features
+  
+###########################################    #########################################
+class Plsr(Regmodel):
+    def __init__(self, train: [pd.DataFrame, pd.DataFrame], test: [pd.DataFrame, pd.DataFrame], n_iter = 10):
+        super().__init__(train, test, n_iter, add_hyperparams = {'n_components': hp.randint('n_components', 2,20)})
+        ### parameters in common
+        
+    def objective(self, params):
+        x0 = [self._xc, self._xt]
+        
+        x1 = [eval(str(params['scatter'])+"(x0[i])") for i in range(2)]
+
+        a, b, c = params['deriv'], params['polyorder'], params['window_length']
+        if a > b or b > c:
+            if self._best is not None:
+                a, b, c = self._best['deriv'], self._best['polyorder'], self._best['window_length']
+
+            else:
+                a, b, c = 0, 0, 1
+
+        params['deriv'], params['polyorder'], params['window_length']  = a, b, c
+        x2 = [savgol_filter(x1[i], polyorder=params['polyorder'], deriv=params['deriv'], window_length = params['window_length']) for i in range(2)]
+
+        Model = PLSRegression(scale = False, n_components = params['n_components'])
+        self._cv_df = KF_CV().process(model = Model, x = x2[0], y = self._ytrain, n_folds = self._nfolds)
+        self._cv_df['Average'] = self._cv_df.mean(axis = 1)
+        self._cv_df['S'] = self._cv_df.std(axis = 1)
+        self._cv_df['CV(%)'] = self._cv_df['S'] * 100 / self._cv_df['Average']
+        self._cv_df = self._cv_df.T.round(2)
+        score = self._cv_df.loc["CV(%)",'rmse']
+        
+        Model = PLSRegression(scale = False, n_components = params['n_components'])
+        Model.fit(x2[0], self._ytrain)
+
+        if self.SCORE > score:
+            self.SCORE = score
+            self._ycv = KF_CV().cross_val_predictor(model = Model, x = x2[0], y = self._ytrain, n_folds = self._nfolds)
+            self._yc = Model.predict(x2[0])
+            self._yt = Model.predict(x2[1])
+            self._model = Model
+            self._best = params
+            self.x2 = x2[0]
+        return score
+
+
+    ############################################    #########################################
+class TpeIpls(Regmodel):
+    def __init__(self, train: [pd.DataFrame, pd.DataFrame], test: [pd.DataFrame, pd.DataFrame], n_iter = 10, n_intervall = 5):
+        self.n_intervall = n_intervall
+        self.n_arrets = self.n_intervall*2
+        self.bands = pd.DataFrame()
+        self.bands.index = ['from', 'to']
+        
+        r = {'n_components': hp.randint('n_components', 2,20)}
+        r.update({f'v{i}': hp.randint(f'v{i}', 0, train[0].shape[1]) for i in range(1,self.n_arrets+1)})
+
+        super().__init__(train, test, n_iter, add_hyperparams = r)
+        ### parameters in common
+        
+    def objective(self, params):
+        ### wevelengths index
+        self.idx = [params[f'v{i}'] for i in range(1,self.n_arrets+1)]
+        self.idx.sort()
+        arrays = [np.arange(self.idx[2*i],self.idx[2*i+1]+1) for i in range(self.n_intervall)]
+        id = np.unique(np.concatenate(arrays, axis=0), axis=0)
+
+        # ## Preprocessing
+        x0 = [self._xc, self._xt]
+        x1 = [eval(str(params['scatter'])+"(x0[i])") for i in range(2)]
+
+        a, b, c = params['deriv'], params['polyorder'], params['window_length']
+        if a > b or b > c:
+            if self._best is not None:
+                a, b, c = self._best['deriv'], self._best['polyorder'], self._best['window_length']
+
+            else:
+                a, b, c = 0, 0, 1
+
+        params['deriv'], params['polyorder'], params['window_length']  = a, b, c
+        x2 = [savgol_filter(x1[i], polyorder=params['polyorder'], deriv=params['deriv'], window_length = params['window_length']) for i in range(2)]
+        # print(x2)
+
+        # ## Modelling
+        Model = PLSRegression(scale = False, n_components = params['n_components'])
+        self._cv_df = KF_CV().process(model = Model, x = x2[0][:,id], y = self._ytrain, n_folds = self._nfolds)
+        self._cv_df['Average'] = self._cv_df.mean(axis = 1)
+        self._cv_df['S'] = self._cv_df.std(axis = 1)
+        self._cv_df['CV(%)'] = self._cv_df['S'] * 100 / self._cv_df['Average']
+        self._cv_df = self._cv_df.T.round(2)
+        score = self._cv_df.loc['CV(%)','rmse']
+        
+        Model = PLSRegression(scale = False, n_components = params['n_components'])
+        Model.fit(x2[0][:,id], self._ytrain)
+
+        if self.SCORE > score:
+            self.SCORE = score
+            self._ycv = KF_CV().cross_val_predictor(model = Model, x = x2[0][:,id], y = self._ytrain, n_folds = self._nfolds)
+            self._yc = Model.predict(x2[0][:,id])
+            self._yt = Model.predict(x2[1][:,id])
+            self._model = Model
+            self._best = params
+            self.x2 = x2[0][:,id]
+            self.segments = arrays
+            
+            for i in range(len(self.segments)):
+                self.bands[f'band{i+1}'] = [self.segments[i][0], self.segments[i][self.segments[i].shape[0]-1]]
+            self.bands.index = ['from','to']
+                
+        return score
+    
+    ############################################    #########################################
+
+class Pcr(Regmodel):
+    def __init__(self, train: [pd.DataFrame, pd.DataFrame], test: [pd.DataFrame, pd.DataFrame], n_iter = 10, n_val = 5):
+        super.__init__()
+        {f'pc{i}': hp.randint(f'pc{i+1}', 0, train[0].shape[1]) for i in range(self.n_val)}
\ No newline at end of file
--- a/src/Class_Mod/__init__.py
+++ b/src/Class_Mod/__init__.py
@@ -7,7 +7,7 @@ from .DATA_HANDLING import *
 from .PLSR_ import PinardPlsr
 from .LWPLSR_ import LWPLSR
 from .Evaluation_Metrics import metrics
-from .VarSel import TpeIpls
+#from .VarSel import TpeIpls
 from .Miscellaneous import resid_plot, reg_plot
 from .DxReader import DxRead, read_dx
 from .HDBSCAN_Clustering import Hdbscan
@@ -15,3 +15,4 @@ from .SK_PLSR_ import PlsR
 from .PLSR_Preprocess import PlsProcess
 from .NMF_ import Nmf
 from .Ap import AP
+from .RegModels import Plsr, TpeIpls
\ No newline at end of file
--- a/src/Modules.py
+++ b/src/Modules.py
 from Packages import *
-from Class_Mod import PlsR, LinearPCA, Umap, find_col_index, PinardPlsr, Nmf, AP
+from Class_Mod import Plsr, LinearPCA, Umap, find_col_index, PinardPlsr, Nmf, AP
 from Class_Mod import LWPLSR, list_files, metrics, TpeIpls, reg_plot, resid_plot, Sk_Kmeans, DxRead, Hdbscan, read_dx, PlsProcess
 from Class_Mod.DATA_HANDLING import *
 from Class_Mod.Miscellaneous import prediction, download_results, plot_spectra, local_css
 from style.header import add_header
 from Report import report
 css_file = Path("style/")
-local_css(css_file / "style.css")
\ No newline at end of file
+
+local_css(css_file / "style.css")
+
+# path = os.path.dirname(os.path.abspath(__file__)).replace('\\','/')
+# d1 = path.find('/')
+# css_file = path[:d1]+'/style'
+# st.session_state["interface"] = st.session_state.get('interface')
+# if st.session_state["interface"] == 'simple':
+#     hide_pages("Predictions")
+# local_css(css_file +"/style.css")
--- a/src/pages/2-model_creation.py
+++ b/src/pages/2-model_creation.py
@@ -10,6 +10,10 @@ add_header()
 st.session_state["interface"] = st.session_state.get('interface')
 if st.session_state["interface"] == 'simple':
    hide_pages("Predictions")
+
+#path = os.path.dirname(os.path.abspath(__file__)).replace('\\','/')
+#css_file = path[:path.find('/pages')]+'/style'
+#local_css(css_file +"/style_model.css")
 local_css(css_file / "style_model.css")

    ####################################### page Design #######################################
@@ -19,13 +23,17 @@ st.header("I - Data visualization", divider='blue')
 M0, M00 = st.columns([1, .4])
 st.header("II - Model creation", divider='blue')
 M1, M2, M3 = st.columns([2,2,2])
+st.header("Cross_Validation")
+cv1, cv2 = st.columns([2,2])
+cv3 = st.container()
+
 st.header("III - Model Diagnosis", divider='blue')
 M7, M8 = st.columns([2,2])
 M7.write('Predicted vs Measured values')
 M8.write('Residuals plot')
 M9 = st.container()
 M9.write("-- Save the model --")
-            ######################################################################
+    ##############################################################################################

 reg_algo = ["","Full-PLSR", "Locally Weighted PLSR", "Interval-PLSR"]
      #######################################        ###########################################
@@ -144,8 +152,7 @@ if not spectra.empty and not y.empty:
    regression_algo = M1.selectbox("Choose the algorithm for regression", options=reg_algo, key = 12)
    if regression_algo == reg_algo[1]:
        # Train model with model function from application_functions.py
-        Reg = PlsProcess(x_train = X_train, x_test = X_test, y_train = y_train, y_test = y_test, scale = False, Kfold=3)
-        Reg.tune(n_iter=500)
+        Reg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter=10)
        reg_model = Reg.model_
        #M2.dataframe(Pin.pred_data_)
    elif regression_algo == reg_algo[2]:
@@ -166,7 +173,7 @@ if not spectra.empty and not y.empty:
        Reg = type('obj', (object,), {'model' : pd.json_normalize(Reg_json['model']), 'pred_data_' : [pd.json_normalize(Reg_json[i]) for i in pred]})
        for i in range(len(pred)):
            Reg.pred_data_[i] = Reg.pred_data_[i].T.reset_index().drop(columns = ['index'])
-            if i is not 4:
+            if i != 4:
                Reg.pred_data_[i].index = list(y_train.index)
            else:
                Reg.pred_data_[i].index = list(y_test.index)
@@ -176,16 +183,15 @@ if not spectra.empty and not y.empty:
        it = M1.number_input(label='Enter the number of iterations', min_value=50, max_value=1000, value=100)
        progress_text = "The model is being created. Please wait."
            
-        Reg = TpeIpls(x_train = X_train, x_test=X_test, y_train = y_train, y_test = y_test, scale = False, Kfold = 3, n_intervall = s)
+        Reg = TpeIpls(train = [X_train, y_train], test=[X_test, y_test], n_intervall = s, n_iter=it)
        pro = M1.progress(0, text="The model is being created. Please wait!")
-        rega = Reg.BandSelect(n_iter = it)
        pro.empty()
        M1.progress(100, text = "The model has successfully been  created!")            
        time.sleep(1)
        reg_model = Reg.model_
        M3.write('-- Spectral regions used for model creation --')
-        wls = rega[0]
-        M3.table(wls)
+        intervalls = Reg.bands.T
+        M3.table(intervalls)
        fig, ax = plt.subplots(figsize = (12, 6))
        X_train.mean().plot(ax = ax)
        for i in range(s):
@@ -193,12 +199,13 @@ if not spectra.empty and not y.empty:
            num = {'u','i','f','c'}
            if np.array(X_train.columns).dtype.kind in num:
                plt.plot(X_train.columns, X_train.mean(), color = 'black')
-                ax.axvspan(X_train.columns[rega[0]['from'][i]], X_train.columns[rega[0]['to'][i]], color='#2a52be', alpha=0.5, lw=0)
+                ax.axvspan(X_train.columns[intervalls['from'][i]], X_train.columns[intervalls['to'][i]],
+                            color='#2a52be', alpha=0.5, lw=0)
                plt.tight_layout()
                plt.margins(x = 0)
            else:
-                plt.plot(np.arange(X_train.shape[1]), X_train.mean())
-                ax.axvspan(rega[0]['from'][i], rega[0]['to'][i], color='#2a52be', alpha=0.5, lw=0)
+                plt.plot(np.arange(X_train.shape[1]), X_train.mean(), color = 'black')
+                ax.axvspan(intervalls['from'][i], intervalls['to'][i], color='#2a52be', alpha=0.5, lw=0)
                plt.tight_layout()
                plt.margins(x = 0)
        
@@ -211,8 +218,26 @@ if not spectra.empty and not y.empty:

        ################# Model analysis ############
    if regression_algo in reg_algo[1:]:
+        cv2.write('-- Cross-Validation Summary--')
+        cv2.write(Reg.CV_results_)
+        cv2.write('-- Out-of-Fold Predictions Visualization (All in one) --')
+
+        fig1 = px.scatter(Reg.cv_data_[2], x ='Measured', y = 'Predicted' , trendline='ols', color='Folds', symbol="Folds", 
+                 color_discrete_sequence=px.colors.qualitative.G10)
+        fig1.add_shape(type='line', x0 = .95 * min(Reg.cv_data_[2].loc[:,'Measured']), x1 = 1.05 * max(Reg.cv_data_[2].loc[:,'Measured']), y0 = .95 * min(Reg.cv_data_[2].loc[:,'Measured']), y1 = 1.05 * max(Reg.cv_data_[2].loc[:,'Measured']), line = dict(color='black', dash = "dash"))
+        fig1.update_traces(marker_size=7, showlegend=False)
+        
+        cv2.plotly_chart(fig1)
+
+        fig0 = px.scatter(Reg.cv_data_[2], x ='Measured', y = 'Predicted' , trendline='ols', color='Folds', symbol="Folds", facet_col = 'Folds',facet_col_wrap=1,
+                 color_discrete_sequence=px.colors.qualitative.G10, text='index', width=800, height=1000)
+        fig0.update_traces(marker_size=8, showlegend=False)
+
+        cv1.write('-- Out-of-Fold Predictions Visualization (Separate plots) --')
+        cv1.plotly_chart(fig0)
+        
        yc = Reg.pred_data_[0]
-        yt = Reg.pred_data_[2]
+        yt = Reg.pred_data_[1]
            
        #if
        M2.write('-- Spectral preprocessing info --')
@@ -231,7 +256,8 @@ if not spectra.empty and not y.empty:

        M7.pyplot(reg_plot([y_train, y_test],[yc, yt], train_idx = train_index, test_idx = test_index))
        M8.pyplot(resid_plot([y_train, y_test],[yc, yt], train_idx = train_index, test_idx = test_index))
-            
+        
+        # rega = Reg.important_features_  ##### ADD FEATURES IMPORTANCE PLOT
            
            #model_export = M1.selectbox("Choose way to export", options=["pickle", "joblib"], key=20)
        model_name = M9.text_input('Give it a name')