From 2d045dd9eecc05d93eb1d80d7da740cb194d0069 Mon Sep 17 00:00:00 2001 From: DIANE <abderrahim.diane@cefe.cnrs.fr> Date: Thu, 4 Apr 2024 09:59:48 +0200 Subject: [PATCH] PLSR pinard update --- Class_Mod/PLSR_.py | 113 +++++++++++++++++++++++---------------------- 1 file changed, 59 insertions(+), 54 deletions(-) diff --git a/Class_Mod/PLSR_.py b/Class_Mod/PLSR_.py index c76718a..4070347 100644 --- a/Class_Mod/PLSR_.py +++ b/Class_Mod/PLSR_.py @@ -1,58 +1,63 @@ from Packages import * from Class_Mod.Miscellaneous import * +from Class_Mod.Regression_metrics import metrics -# create model module with PINARD -def model_PLSR(xcal_csv, ycal_csv, sep, hdr, rd_seed): - np.random.seed(rd_seed) - # hdr var correspond to column header True or False in the CSV - if hdr == 'yes': - col = 0 - else: - col = False - # loading the csv - x, y = utils.load_csv(xcal_csv, ycal_csv, autoremove_na=True, sep=sep, x_hdr=0, y_hdr=0, x_index_col=col, y_index_col=col) - # Split data into training and test sets using the kennard_stone method and correlation metric, 25% of data is used for testing - train_index, test_index = train_test_split_idx(x, y=y, method="kennard_stone", metric="correlation", test_size=0.25, random_state=rd_seed) - # Assign data to training and test sets - X_train, y_train, X_test, y_test = x[train_index], y[train_index], x[test_index], y[test_index] - st.write("Size of train and test sets: train " + str(X_train.shape) + ' ' + str(y_train.shape) + ' / test ' + str(X_test.shape) + ' ' + str(y_test.shape)) - # Declare preprocessing pipeline - svgolay = [ ('_sg1',pp.SavitzkyGolay()), - ('_sg2',pp.SavitzkyGolay()) # nested pipeline to perform the Savitzky-Golay method twice for 2nd order preprocessing - ] - preprocessing = [ ('id', pp.IdentityTransformer()), # Identity transformer, no change to the data - ('savgol', pp.SavitzkyGolay()), # Savitzky-Golay smoothing filter - ('derivate', pp.Derivate()), # Calculate the first derivative of the data - ('SVG', FeatureUnion(svgolay)) 
class PinardPlsr:
    """PLSR model built with the PINARD preprocessing toolbox.

    Fits a min-max-scaled, multi-branch-preprocessed PLS regression on the
    training set, then stores three sets of predictions:

    - calibration predictions on the training set (``yc``),
    - 3-fold cross-validation predictions on the training set (``ycv``),
    - predictions on the held-out test set (``yt``).
    """

    def __init__(self, x_train, y_train, x_test, y_test):
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train
        self.y_test = y_test

        # Nested pipeline: Savitzky-Golay applied twice, giving a
        # second-order smoothing branch.
        svgolay = [('_sg1', pp.SavitzkyGolay()),
                   ('_sg2', pp.SavitzkyGolay())]
        # Parallel preprocessing branches, concatenated by FeatureUnion.
        preprocessing = [('id', pp.IdentityTransformer()),   # raw data, unchanged
                         ('savgol', pp.SavitzkyGolay()),     # Savitzky-Golay smoothing
                         ('derivate', pp.Derivate()),        # first derivative
                         ('SVG', FeatureUnion(svgolay))]     # double Savitzky-Golay

        # Complete pipeline: X scaling -> preprocessing branches -> PLS.
        pipeline = Pipeline([
            ('scaler', MinMaxScaler()),
            ('preprocessing', FeatureUnion(preprocessing)),
            ('PLS', PLSRegression())])
        # Wrap the pipeline so the target y is min-max scaled as well,
        # and inverse-transformed back on predict.
        estimator = TransformedTargetRegressor(regressor=pipeline,
                                               transformer=MinMaxScaler())

        # Fit on the training set.
        self.trained = estimator.fit(self.x_train, self.y_train)

        # Calibration predictions on the training set.
        self.yc = pd.DataFrame(self.trained.predict(self.x_train))
        # 3-fold cross-validation predictions on the training set.
        # NOTE: cross_val_predict refits clones of the estimator per fold;
        # it does not reuse the fit above.
        self.ycv = pd.DataFrame(cross_val_predict(self.trained, self.x_train,
                                                  self.y_train, cv=3))
        # Predictions on the held-out test set.
        self.yt = pd.DataFrame(self.trained.predict(self.x_test))

    @property
    def model_(self):
        """Return the fitted ``TransformedTargetRegressor`` estimator."""
        return self.trained

    @property
    def metrics_(self):
        """Return a DataFrame of regression metrics.

        One row per prediction set, indexed 'calib' / 'cv' / 'test'.
        """
        metc = metrics(train=(self.y_train, self.yc))
        metcv = metrics(train=(self.y_train, self.ycv))
        # NOTE(review): the test pair is also passed through the `train`
        # keyword — confirm `metrics` treats the keyword generically.
        mett = metrics(train=(self.y_test, self.yt))
        met = pd.concat([metc, metcv, mett], axis=0)
        met.index = ['calib', 'cv', 'test']
        return met

    @property
    def pred_data_(self):
        """Return the (calibration, cross-validation, test) prediction DataFrames."""
        return self.yc, self.ycv, self.yt