From 5ab1468862f81f5d2fa194eea1c1b13349dffa93 Mon Sep 17 00:00:00 2001 From: Nicolas Barthes <nicolas.barthes@cnrs.fr> Date: Mon, 25 Mar 2024 17:21:07 +0100 Subject: [PATCH] # Model Creation - added structure for multiple algorithms - added Cross-Validation for SciKit Learn PLSR - groundwork for adding LWPLSR from Jchemo (Julia) - README.md updated - added model export to disk # Prediction - working with Pinard prediction - load models from disk - export predicted values --- README.md | 9 +++++ app.py | 77 +++++++++++++++++++++++++++++++--------- application_functions.py | 61 ++++++++++++++++++++++++++++--- requirements.txt | 3 +- 4 files changed, 128 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 5af2bde..96ca7e9 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,15 @@ The process includes: This package is written in python. You can clone the repository: git clone https://src.koda.cnrs.fr/nicolas.barthes.5/nirs_workflow.git Then install the requirements: pip install -r requirements.txt +To use Locally Weighted PLS Regression (LWPLSR) for model creation, you'll need to install Jchemo.jl (https://github.com/mlesnoff/Jchemo.jl), a Julia package. +From the CLI: python +> '>>> import julia +'>>> julia.install() +'>>> from julia import Pkg +'>>> Pkg.add("Jchemo") + +To check if Jchemo is installed without errors: +> '>>> Pkg.status() You can then run: streamlit run ./app.py from the CLI. 
diff --git a/app.py b/app.py index f9cb557..708e629 100644 --- a/app.py +++ b/app.py @@ -8,7 +8,7 @@ import pandas as pd import plotly.express as px from sklearn.cluster import KMeans as km from sklearn.metrics import pairwise_distances_argmin_min -from application_functions import pca_maker, model, predict, find_delimiter, umap_maker, find_col_index +from application_functions import pca_maker, model_PLSR, model_LWPLSR, prediction, find_delimiter, umap_maker, find_col_index, list_files, CV_model # load images for web interface img_sselect = Image.open("images\sselect.JPG") @@ -89,10 +89,7 @@ with st.container(): nb_select = settings_column.slider("Choose cluster number (K-Means)", min_value=2, max_value=cluster_max, value=5, format="%i") kmeans_samples = km(n_clusters=nb_select, random_state=42) kmeans_samples.fit(pc_data.loc[:,[pc_1,pc_2]]) - # plot the pc with clustering only (no selected samples) - # graph = px.scatter(data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800, color=kmeans_samples.labels_, hover_name=pc_data.index, title="PC projection with K-Means Clusters") - # plot = scatter_column.plotly_chart(graph) - # choose between cluster centered sample and random samples + # choose between cluster centered sample and n-random samples selection = settings_column.select_slider('Centered samples or random ones', options=['center','random']) export = [] scatter_column.write("Selected samples for chemical analysis:") @@ -107,7 +104,6 @@ with st.container(): selection_number = settings_column.number_input('How many samples per cluster?', step=1, value = 3) for i in np.unique(kmeans_samples.labels_): if len(pd.DataFrame(pc_data.loc[pc_data.index[kmeans_samples.labels_==i],[pc_1,pc_2]])) >= selection_number: - # export.append(pc_data.loc[pc_data.index[kmeans_samples.labels_==i]].sample(n=selection_number).index) # another k-means to cluster in 'selection_number' clusters and random to ensure the selected samples are far from each other in each 
cluster kmeans_selected_samples = km(n_clusters=selection_number, random_state=42) kmeans_selected_samples.fit(pc_data.loc[pc_data.index[kmeans_samples.labels_==i],[pc_1,pc_2]]) @@ -131,7 +127,7 @@ with st.container(): plot = scatter_column.plotly_chart(graph_selected) # button to export the names of selected samples - by cluster if random - in a csv if scatter_column.button('Export'): - pd.DataFrame(export).T.to_csv('./data/Samples_for_Chemical_Analysis.csv') + pd.DataFrame(export).T.to_csv('./data/sample_selections/Samples_from_' + sselectx_csv.name + '_for_Chemical_Analysis.csv') else: scatter_column.write("_Please Choose a file_") # clustering via UMAP / HDBSCAN @@ -152,17 +148,34 @@ with st.container(): st.header("Create a model") st.image(img_predict) st.write("Create a model to then predict chemical values from NIRS spectra") + available_regression_algo = ["","SciKitLearn PLSR", "Jchemo Local Weighted PLSR"] # CSV files loader xcal_csv = st.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns") ycal_csv = st.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column") if xcal_csv is not None and ycal_csv is not None: # Select list for CSV delimiter - sep = st.selectbox("Select csv separator - CSV Detected separator: " + str(find_delimiter('data/'+xcal_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+xcal_csv.name))), key=0) - # Select list for column indexes True / False - hdr = st.selectbox("column indexes in csv?", options=["yes", "no"], key=1) - rd_seed = st.slider("Choose seed", min_value=1, max_value=1212, value=42, format="%i") - # Train model with model function from application_functions.py - trained_model = model(xcal_csv, ycal_csv, sep, hdr, rd_seed) + sep = st.selectbox("Select csv separator - _detected_: " + 
str(find_delimiter('data/'+xcal_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+xcal_csv.name))), key=0) + # Select list for CSV header True / False + hdr = st.selectbox("indexes column in csv? - _detected_: " + str(find_col_index('data/'+xcal_csv.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+xcal_csv.name))), key=1) + regression_algo = st.selectbox("Choose the algorithm for regression", options=available_regression_algo, key = 12) + if regression_algo == 'SciKitLearn PLSR': + rd_seed = st.slider("Choose seed", min_value=1, max_value=1212, value=42, format="%i") + # Train model with model function from application_functions.py + trained_model = model_PLSR(xcal_csv, ycal_csv, sep, hdr, rd_seed) + elif regression_algo == 'Jchemo Local Weighted PLSR': + trained_model = model_LWPLSR(xcal_csv, ycal_csv, sep, hdr) + # Export the model with pickle or joblib + if regression_algo != '': + st.write("-- Save the model --") + model_export = st.selectbox("Choose way to export", options=["pickle", "joblib"], key=20) + model_name = st.text_input('Give it a name') + if st.button('Export Model'): + export_package = __import__(model_export) + with open('data/models/model_' + model_name + '_on_' + xcal_csv.name + '_and_' + ycal_csv.name + '_data_' + model_export + '.pkl','wb') as f: + export_package.dump(trained_model,f) + st.write('Model Exported') + # create a report with information on the model + ## see https://stackoverflow.com/a/59578663 # graphical delimiter st.write("---") @@ -171,7 +184,37 @@ with st.container(): st.header("Predict") st.write("---") st.write("Predict chemical values from NIRS") - NIRS_csv = st.file_uploader("Select NIRS Data to predict", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns") - psep = st.selectbox("Select csv separator", options=[";", ","], key=2) - phdr = st.selectbox("indexes column in csv?", options=["yes", "no"], key=3) - 
st.button("Predict", on_click=predict) \ No newline at end of file + file_column, space, model_column = st.columns((3, 1, 3)) + NIRS_csv = file_column.file_uploader("Select NIRS Data to predict", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns") + export_name = './data/predictions/Predictions_of_' + if NIRS_csv: + export_name += str(NIRS_csv.name[:-4]) + qsep = file_column.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+NIRS_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+NIRS_csv.name))), key=2) + qhdr = file_column.selectbox("indexes column in csv? - _detected_: " + str(find_col_index('data/'+NIRS_csv.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+NIRS_csv.name))), key=3) + # Load the model with pickle or joblib + model_column.write("Load a saved model") + model_import = model_column.selectbox("Choose a way to import", options=["pickle", "joblib"], key=22) + model_name_import = model_column.selectbox('Choose file:', options=[' '] + list_files('data/models/', model_import), key = 21) + if model_name_import != ' ': + export_name += '_with_' + str(model_name_import[:-4]) + export_package = __import__(model_import) + with open('data/models/'+ model_name_import,'rb') as f: + model_loaded = export_package.load(f) + if model_loaded: + model_column.write('Model Imported') + result = '' + if st.button("Predict"): + result = prediction(NIRS_csv, qsep, qhdr, model_loaded) + st.write('Predicted values are: ') + st.dataframe(result) + pd.DataFrame(result).to_csv(export_name + '.csv') + st.write('Predictions exported to ' + export_name + '.csv') + # # export to local drive + # from urllib.request import urlretrieve + # url = ('http://localhost:8501' + export_name[1:] + '.csv') + # filename = export_name + '.csv' + # urlretrieve(url, filename) + # create a report with information on the prediction + ## see 
https://stackoverflow.com/a/59578663 + if type(result) is list: + st.write(result) \ No newline at end of file diff --git a/application_functions.py b/application_functions.py index c8ba4d8..d4ff907 100644 --- a/application_functions.py +++ b/application_functions.py @@ -83,7 +83,7 @@ def pca_maker(data_import): return output, list(categorical_data.columns), new_column_names # create model module with PINARD -def model(xcal_csv, ycal_csv, sep, hdr, rd_seed): +def model_PLSR(xcal_csv, ycal_csv, sep, hdr, rd_seed): from pinard import utils from pinard import preprocessing as pp from pinard.model_selection import train_test_split_idx @@ -133,11 +133,64 @@ def model(xcal_csv, ycal_csv, sep, hdr, rd_seed): st.write("MAE: " + str(mean_absolute_error(y_test, Y_preds))) st.write("MSE: " + str(mean_squared_error(y_test, Y_preds))) st.write("MAPE: " + str(mean_absolute_percentage_error(y_test, Y_preds))) + + # Cross-Validate the model + CV_model(estimator, X_train, y_train, 3) + return (trained) +# Cross-Validation of the model +def CV_model(estimator, x, y, cv): + from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate + from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score + st.write('Cross-Validation of this model') + st.write("CV_scores", cross_val_score(estimator, x, y, cv=cv)) + st.write("-- CV predict --") + Y_preds = cross_val_predict(estimator, x, y, cv=3) + st.write("MAE", mean_absolute_error(y, Y_preds)) + st.write("MSE", mean_squared_error(y, Y_preds)) + st.write("MAPE", mean_absolute_percentage_error(y, Y_preds)) + st.write("R²", r2_score(y, Y_preds)) + st.write("-- Cross Validate --") + cv_results = cross_validate(estimator, x, y, cv=cv, return_train_score=True, n_jobs=3) + for key in cv_results.keys(): + st.write(key, cv_results[key]) + +def model_LWPLSR(xcal_csv, ycal_csv, sep, hdr): + import julia + from julia import Jchemo + from pinard import utils + from 
pinard.model_selection import train_test_split_idx + # hdr var correspond to column header True or False in the CSV + if hdr == 'yes': + col = 0 + else: + col = False + # loading the csv + x, y = utils.load_csv(xcal_csv, ycal_csv, autoremove_na=True, sep=sep, x_hdr=0, y_hdr=0, x_index_col=col, y_index_col=col) + # Split data into training and test sets using the kennard_stone method and correlation metric, 25% of data is used for testing + train_index, test_index = train_test_split_idx(x, y=y, method="kennard_stone", metric="correlation", test_size=0.25, random_state=42) + # Assign data to training and test sets + X_train, y_train, X_test, y_test = x[train_index], y[train_index], x[test_index], y[test_index] + st.write("Size of train and test sets: train " + str(X_train.shape) + ' ' + str(y_train.shape) + ' / test ' + str(X_test.shape) + ' ' + str(y_test.shape)) + + Jchemo.lwplsr(X_train, y_train, nlvdis=4, metric = eucl, k = 10) + # predict module -def predict(): - display = "Prediction with: " + str(NIRS_csv), str(psep), str(phdr) - st.success(display) +def prediction(NIRS_csv, qsep, qhdr, model): + # hdr var correspond to column header True or False in the CSV + if qhdr == 'yes': + col = 0 + else: + col = False + X_test = pd.read_csv(NIRS_csv, sep=qsep, index_col=col) + Y_preds = model.predict(X_test) + # Y_preds = X_test + return Y_preds +def list_files(mypath, import_type): + from os import listdir + from os.path import isfile, join + list_files = [f for f in listdir(mypath) if isfile(join(mypath, f)) and f.endswith(import_type + '.pkl')] + return list_files \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 851ef9d..b833c6c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,5 @@ requests>=2.24.0 Pillow>=8.4.0 protobuf>=3.19.0 watchdog>=2.1.8 -pinard>=1.0 \ No newline at end of file +pinard>=1.0 +julia>=0.6.2 \ No newline at end of file -- GitLab