PLSR_.py
    from Packages import *
    from Class_Mod.Miscellaneous import * 
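    # The wildcard imports above are assumed to provide, among others (a sketch of
    # the expected names, not a verified list):
    #   numpy as np, pandas as pd, streamlit as st
    #   pinard: utils, preprocessing as pp, train_test_split_idx
    #   scikit-learn: Pipeline, FeatureUnion, MinMaxScaler, PLSRegression,
    #                 TransformedTargetRegressor, mean_absolute_error,
    #                 mean_squared_error, mean_absolute_percentage_error
    #   Class_Mod.Miscellaneous: CV_model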
    
    
    # Create and train a PLSR model with the PINARD pipeline: load data, split, preprocess, fit, and evaluate
    def model_PLSR(xcal_csv, ycal_csv, sep, hdr, rd_seed):
        np.random.seed(rd_seed)
        # hdr indicates whether the CSV files have a column header ('yes' or 'no');
        # when 'yes', column 0 is used as the row index
        if hdr == 'yes':
            col = 0
        else:
            col = False
        # Load the x and y CSV files; rows with missing values are removed automatically
        x, y = utils.load_csv(xcal_csv, ycal_csv, autoremove_na=True, sep=sep, x_hdr=0, y_hdr=0, x_index_col=col, y_index_col=col)
        # Split the data into training and test sets with the kennard_stone method and a correlation metric; 25% of the data is held out for testing
        train_index, test_index = train_test_split_idx(x, y=y, method="kennard_stone", metric="correlation", test_size=0.25, random_state=rd_seed)
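        # Kennard-Stone greedily picks the most mutually distant samples (here under
        # a correlation distance) for the training set, so the training data cover
        # the spectral space instead of being drawn at random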
        # Assign data to training and test sets
        X_train, y_train, X_test, y_test = x[train_index], y[train_index], x[test_index], y[test_index]
        st.write("Size of train and test sets: train " + str(X_train.shape) + ' ' + str(y_train.shape) + ' / test ' + str(X_test.shape) + ' ' + str(y_test.shape))
        # Declare preprocessing pipeline
        svgolay = [('_sg1', pp.SavitzkyGolay()),
                   ('_sg2', pp.SavitzkyGolay())]  # nested pipeline: apply Savitzky-Golay twice for second-order preprocessing
        preprocessing = [('id', pp.IdentityTransformer()),   # identity transformer, no change to the data
                         ('savgol', pp.SavitzkyGolay()),     # Savitzky-Golay smoothing filter
                         ('derivate', pp.Derivate()),        # calculate the first derivative of the data
                         ('SVG', Pipeline(svgolay))]         # chain the two Savitzky-Golay steps defined above
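        # FeatureUnion fits each of the four transformers on the same input and
        # concatenates their outputs side by side, so the PLS regressor below sees
        # every preprocessed variant of the data at once (four times the original
        # number of features, since each transformer preserves the spectrum length)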
        # Declare complete pipeline
        pipeline = Pipeline([
            ('scaler', MinMaxScaler()), # scaling the data
            ('preprocessing', FeatureUnion(preprocessing)), # preprocessing
            ('PLS',  PLSRegression()) # regressor
        ])
        # Wrap the pipeline in an estimator that also scales the y values
        estimator = TransformedTargetRegressor(regressor = pipeline, transformer = MinMaxScaler())
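        # TransformedTargetRegressor scales y with the MinMaxScaler before fitting
        # and inverse-transforms predictions back to the original units, so the
        # predictions below are directly comparable to y_test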
        # Training
        trained = estimator.fit(X_train, y_train)
        # Coefficient of determination (R²) on the test set
        st.write("Test R²: " + str(estimator.score(X_test, y_test)))
        # Predictions on the test set
        Y_preds = estimator.predict(X_test)
        # Compute error metrics on the test set and display them as a table
        met = {"MAE": [mean_absolute_error(y_test, Y_preds)],
               "MSE": [mean_squared_error(y_test, Y_preds)],
               "MAPE": [mean_absolute_percentage_error(y_test, Y_preds)]}
        met = pd.DataFrame(met).T
        st.table(met)
    
        # Cross-Validate the model
        CV_model(estimator, X_train, y_train, 3)
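        # CV_model is the helper imported from Class_Mod.Miscellaneous; the final
        # argument is presumably the number of folds (an assumption; the helper's
        # signature is not shown here)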
    
        return trained
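
    # Example invocation (a sketch: the file names, separator, and seed below are
    # hypothetical, not taken from the project):
    #
    #     trained_model = model_PLSR('Xcal.csv', 'Ycal.csv', sep=';', hdr='yes', rd_seed=42)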