LWPLSR CV - begin to format results (missing metrics)

996d9c7f · BARTHES Nicolas · 9fdcc282 · 996d9c7f
Commit 996d9c7f authored 8 months ago by BARTHES Nicolas
--- a/src/pages/2-model_creation.py
+++ b/src/pages/2-model_creation.py
@@ -174,7 +174,8 @@ if not spectra.empty and not y.empty:
        x_train_np, y_train_np, x_test_np, y_test_np = X_train.to_numpy(), y_train.to_numpy(), X_test.to_numpy(), y_test.to_numpy()
        # Cross-Validation calculation
        nb_folds = 3
-        st.write('KFold = ' + str(nb_folds))
+        st.write('KFold for Cross-Validation = ' + str(nb_folds))
+        # split the training data into nb_folds folds
        folds = KF_CV.CV(x_train_np, y_train_np, nb_folds)
        d = {}
        for i in range(nb_folds):
@@ -183,6 +184,7 @@ if not spectra.empty and not y.empty:
            data_to_work_with.append("ytr_fold{0}".format(i+1))
            data_to_work_with.append("xte_fold{0}".format(i+1))
            data_to_work_with.append("yte_fold{0}".format(i+1))
+        # export Xtrain, Xtest, Ytrain, Ytest and all CV folds to temp folder as csv files
        temp_path = Path('temp/')
        for i in data_to_work_with:
            if 'fold' in i:
@@ -190,7 +192,7 @@ if not spectra.empty and not y.empty:
            else:
                j = globals()[i]
            np.savetxt(temp_path / str(i + ".csv"), j, delimiter=",")
-        # run Julia Jchemo
+        # run Julia Jchemo as subprocess
        import subprocess
        subprocess_path = Path("Class_Mod/")
        subprocess.run([f"{sys.executable}", subprocess_path / "LWPLSR_Call.py"])
@@ -198,21 +200,38 @@ if not spectra.empty and not y.empty:
        try:
            with open(temp_path / "lwplsr_outputs.json", "r") as outfile:
                Reg_json = json.load(outfile)
+                # delete csv files
                for i in data_to_work_with: os.unlink(temp_path / str(i + ".csv"))
+            # delete json file after import
            os.unlink(temp_path / "lwplsr_outputs.json")
+            # format result data into Reg object
            pred = ['pred_data_train', 'pred_data_test']
+            for i in range(nb_folds):
+                pred.append("CV" + str(i+1))
            Reg = type('obj', (object,), {'model' : Reg_json['model'], 'best_lwplsr_params' : Reg_json['best_lwplsr_params'], 'pred_data_' : [pd.json_normalize(Reg_json[i]) for i in pred]})
+            Reg.CV_results_ = pd.DataFrame()
+            Reg.cv_data_ = {'YpredCV' : {}, 'idxCV' : {}}
+            # set the index of each Reg.pred_data_ frame (train index, test index, then each CV fold's indices)
            for i in range(len(pred)):
                Reg.pred_data_[i] = Reg.pred_data_[i].T.reset_index().drop(columns = ['index'])
-                if i != 1: # if not pred_data_test
+                if i == 0: # data_train
                    Reg.pred_data_[i].index = list(y_train.index)
-                else:
+                elif i == 1: # data_test
                    Reg.pred_data_[i].index = list(y_test.index)
-            Reg.CV_results_ = pd.DataFrame()
+                else: # CVi
-            Reg.cv_data_ = pd.DataFrame()
+                    Reg.pred_data_[i].index = folds[list(folds)[i-2]]
+                    Reg.CV_results_ = pd.concat([Reg.CV_results_, Reg.pred_data_[i]])
+                    Reg.cv_data_['YpredCV']['Fold' + str(i-1)] = Reg.pred_data_[i]
+                    Reg.cv_data_['idxCV']['Fold' + str(i-1)] = folds[list(folds)[i-2]]
+            Reg.CV_results_.sort_index(inplace = True)
+            Reg.CV_results_.columns = ['Ypredicted_CV']
+            # uncomment the line below to display Reg.cv_data_, which holds YpredCV and idxCV for each fold
+            # cv2.json(Reg.cv_data_)
+            # Display end of modeling message on the interface
            info.empty()
            M1.success('Model created!')
        except FileNotFoundError as e:
+            # Display an error message on the interface if an expected temp file (e.g. lwplsr_outputs.json) is missing
            info.empty()
            M1.warning('- ERROR during model creation -')
            Reg = None