From 5ab1468862f81f5d2fa194eea1c1b13349dffa93 Mon Sep 17 00:00:00 2001
From: Nicolas Barthes <nicolas.barthes@cnrs.fr>
Date: Mon, 25 Mar 2024 17:21:07 +0100
Subject: [PATCH] # Model Creation - added structure for multiple algorithm -
 added Cross-Validation for SciKit Learn PLSR - bases to add LWPLSR from
 Jchemo (Julia) - README.md updated - added model export to disk # Prediction
 - working with Pinard prediction - load models from disk - export predicted
 values

---
 README.md                |  9 +++++
 app.py                   | 77 +++++++++++++++++++++++++++++++---------
 application_functions.py | 61 ++++++++++++++++++++++++++++---
 requirements.txt         |  3 +-
 4 files changed, 128 insertions(+), 22 deletions(-)

diff --git a/README.md b/README.md
index 5af2bde..96ca7e9 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,15 @@ The process includes:
 This package is written in python. You can clone the repository: git clone https://src.koda.cnrs.fr/nicolas.barthes.5/nirs_workflow.git
 
 Then install the requirements: pip install -r requirements.txt
+To use Locally Weighted PLS Regression (LWPLSR) for model creation, you'll need to install Jchemo.jl (https://github.com/mlesnoff/Jchemo.jl), a Julia package.
+From the CLI: python  
+> '>>> import julia  
+'>>> julia.install()  
+'>>> from julia import Pkg  
+'>>> Pkg.add("Jchemo")  
+
+To check if Jchemo is installed without errors:  
+> '>>> Pkg.status()
 
 You can then run: streamlit run ./app.py from the CLI.
 
diff --git a/app.py b/app.py
index f9cb557..708e629 100644
--- a/app.py
+++ b/app.py
@@ -8,7 +8,7 @@ import pandas as pd
 import plotly.express as px
 from sklearn.cluster import KMeans as km
 from sklearn.metrics import pairwise_distances_argmin_min
-from application_functions import pca_maker, model, predict, find_delimiter, umap_maker, find_col_index
+from application_functions import pca_maker, model_PLSR, model_LWPLSR, prediction, find_delimiter, umap_maker, find_col_index, list_files, CV_model
 
 # load images for web interface
 img_sselect = Image.open("images\sselect.JPG")
@@ -89,10 +89,7 @@ with st.container():
                 nb_select = settings_column.slider("Choose cluster number (K-Means)", min_value=2, max_value=cluster_max, value=5, format="%i")
                 kmeans_samples = km(n_clusters=nb_select, random_state=42)
                 kmeans_samples.fit(pc_data.loc[:,[pc_1,pc_2]])
-                # plot the pc with clustering only (no selected samples)
-                # graph = px.scatter(data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800, color=kmeans_samples.labels_, hover_name=pc_data.index, title="PC projection with K-Means Clusters")
-                # plot = scatter_column.plotly_chart(graph)
-                # choose between cluster centered sample and random samples
+                # choose between cluster centered sample and n-random samples
                 selection = settings_column.select_slider('Centered samples or random ones', options=['center','random'])
                 export = []
                 scatter_column.write("Selected samples for chemical analysis:")
@@ -107,7 +104,6 @@ with st.container():
                     selection_number = settings_column.number_input('How many samples per cluster?', step=1, value = 3)
                     for i in np.unique(kmeans_samples.labels_):
                         if len(pd.DataFrame(pc_data.loc[pc_data.index[kmeans_samples.labels_==i],[pc_1,pc_2]])) >= selection_number:
-                            # export.append(pc_data.loc[pc_data.index[kmeans_samples.labels_==i]].sample(n=selection_number).index)
                             # another k-means to cluster in 'selection_number' clusters and random to ensure the selected samples are far from each other in each cluster
                             kmeans_selected_samples = km(n_clusters=selection_number, random_state=42)
                             kmeans_selected_samples.fit(pc_data.loc[pc_data.index[kmeans_samples.labels_==i],[pc_1,pc_2]])
@@ -131,7 +127,7 @@ with st.container():
                 plot = scatter_column.plotly_chart(graph_selected)
                 # button to export the names of selected samples - by cluster if random - in a csv
                 if scatter_column.button('Export'):
-                    pd.DataFrame(export).T.to_csv('./data/Samples_for_Chemical_Analysis.csv')
+                    pd.DataFrame(export).T.to_csv('./data/sample_selections/Samples_from_' + sselectx_csv.name + '_for_Chemical_Analysis.csv')
             else:
                 scatter_column.write("_Please Choose a file_")
             # clustering via UMAP / HDBSCAN
@@ -152,17 +148,34 @@ with st.container():
     st.header("Create a model")
     st.image(img_predict)
     st.write("Create a model to then predict chemical values from NIRS spectra")
+    available_regression_algo = ["","SciKitLearn PLSR", "Jchemo Local Weighted PLSR"]
     # CSV files loader
     xcal_csv = st.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
     ycal_csv = st.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
     if xcal_csv is not None and ycal_csv is not None:
         # Select list for CSV delimiter
-        sep = st.selectbox("Select csv separator - CSV Detected separator: " + str(find_delimiter('data/'+xcal_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+xcal_csv.name))), key=0)
-        # Select list for column indexes True / False
-        hdr = st.selectbox("column indexes in csv?", options=["yes", "no"], key=1)
-        rd_seed = st.slider("Choose seed", min_value=1, max_value=1212, value=42, format="%i")
-        # Train model with model function from application_functions.py
-        trained_model = model(xcal_csv, ycal_csv, sep, hdr, rd_seed)
+        sep = st.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+xcal_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+xcal_csv.name))), key=0)
+        # Select list for CSV header True / False
+        hdr = st.selectbox("indexes column in csv? - _detected_: " + str(find_col_index('data/'+xcal_csv.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+xcal_csv.name))), key=1)
+        regression_algo = st.selectbox("Choose the algorithm for regression", options=available_regression_algo, key = 12)
+        if regression_algo == 'SciKitLearn PLSR':
+            rd_seed = st.slider("Choose seed", min_value=1, max_value=1212, value=42, format="%i")
+            # Train model with model function from application_functions.py
+            trained_model = model_PLSR(xcal_csv, ycal_csv, sep, hdr, rd_seed)
+        elif regression_algo == 'Jchemo Local Weighted PLSR':
+            trained_model = model_LWPLSR(xcal_csv, ycal_csv, sep, hdr)
+        # Export the model with pickle or joblib
+        if regression_algo != '':
+            st.write("-- Save the model --")
+            model_export = st.selectbox("Choose way to export", options=["pickle", "joblib"], key=20)
+            model_name = st.text_input('Give it a name')
+            if st.button('Export Model'):
+                export_package = __import__(model_export)
+                with open('data/models/model_' + model_name + '_on_' + xcal_csv.name + '_and_' + ycal_csv.name + '_data_' + model_export + '.pkl','wb') as f:
+                    export_package.dump(trained_model,f)
+                st.write('Model Exported')
+                # create a report with information on the model
+                ## see https://stackoverflow.com/a/59578663
 
 # graphical delimiter
 st.write("---")
@@ -171,7 +184,37 @@ with st.container():
     st.header("Predict")
     st.write("---")
     st.write("Predict chemical values from NIRS")
-    NIRS_csv = st.file_uploader("Select NIRS Data to predict", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
-    psep = st.selectbox("Select csv separator", options=[";", ","], key=2)
-    phdr = st.selectbox("indexes column in csv?", options=["yes", "no"], key=3)
-    st.button("Predict", on_click=predict)
\ No newline at end of file
+    file_column, space, model_column = st.columns((3, 1, 3))
+    NIRS_csv = file_column.file_uploader("Select NIRS Data to predict", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
+    export_name = './data/predictions/Predictions_of_'
+    if NIRS_csv:
+        export_name += str(NIRS_csv.name[:-4])
+        qsep = file_column.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+NIRS_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+NIRS_csv.name))), key=2)
+        qhdr = file_column.selectbox("indexes column in csv? - _detected_: " + str(find_col_index('data/'+NIRS_csv.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+NIRS_csv.name))), key=3)
+        # Load the model with pickle or joblib
+        model_column.write("Load a saved model")
+        model_import = model_column.selectbox("Choose a way to import", options=["pickle", "joblib"], key=22)
+        model_name_import = model_column.selectbox('Choose file:', options=[' '] + list_files('data/models/', model_import), key = 21)
+        if model_name_import != ' ':
+            export_name += '_with_' + str(model_name_import[:-4])
+            export_package = __import__(model_import)
+            with open('data/models/'+ model_name_import,'rb') as f:
+                model_loaded = export_package.load(f)
+            if model_loaded:
+                model_column.write('Model Imported')
+    result = ''
+    if st.button("Predict"):
+        result = prediction(NIRS_csv, qsep, qhdr, model_loaded)
+        st.write('Predicted values are: ')
+        st.dataframe(result)
+        pd.DataFrame(result).to_csv(export_name + '.csv')
+        st.write('Predictions exported to ' + export_name + '.csv')
+        # # export to local drive
+        # from urllib.request import urlretrieve
+        # url = ('http://localhost:8501' + export_name[1:] + '.csv')
+        # filename = export_name + '.csv'
+        # urlretrieve(url, filename)
+        # create a report with information on the prediction
+        ## see https://stackoverflow.com/a/59578663
+    if type(result) is list:
+        st.write(result)
\ No newline at end of file
diff --git a/application_functions.py b/application_functions.py
index c8ba4d8..d4ff907 100644
--- a/application_functions.py
+++ b/application_functions.py
@@ -83,7 +83,7 @@ def pca_maker(data_import):
     return output, list(categorical_data.columns), new_column_names
 
 # create model module with PINARD
-def model(xcal_csv, ycal_csv, sep, hdr, rd_seed):
+def model_PLSR(xcal_csv, ycal_csv, sep, hdr, rd_seed):
     from pinard import utils
     from pinard import preprocessing as pp
     from pinard.model_selection import train_test_split_idx
@@ -133,11 +133,64 @@ def model(xcal_csv, ycal_csv, sep, hdr, rd_seed):
     st.write("MAE: " + str(mean_absolute_error(y_test, Y_preds)))
     st.write("MSE: " + str(mean_squared_error(y_test, Y_preds)))
     st.write("MAPE: " + str(mean_absolute_percentage_error(y_test, Y_preds)))
+
+    # Cross-Validate the model
+    CV_model(estimator, X_train, y_train, 3)
+
     return (trained)
 
+def CV_model(estimator, x, y, cv):
+    """Cross-validate `estimator` on (x, y) with `cv` folds and write the scores and error metrics to the Streamlit page; returns None."""
+    from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate
+    from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score
+    st.write('Cross-Validation of this model')
+    st.write("CV_scores", cross_val_score(estimator, x, y, cv=cv))
+    st.write("-- CV predict --")
+    Y_preds = cross_val_predict(estimator, x, y, cv=cv)  # use the caller's fold count (was hard-coded to 3)
+    st.write("MAE", mean_absolute_error(y, Y_preds))
+    st.write("MSE", mean_squared_error(y, Y_preds))
+    st.write("MAPE", mean_absolute_percentage_error(y, Y_preds))
+    st.write("R²", r2_score(y, Y_preds))
+    st.write("-- Cross Validate --")
+    cv_results = cross_validate(estimator, x, y, cv=cv, return_train_score=True, n_jobs=3)
+    for key, values in cv_results.items():
+        st.write(key, values)
+
+def model_LWPLSR(xcal_csv, ycal_csv, sep, hdr):
+    """Load the calibration CSVs, split them into train/test sets and call Jchemo's lwplsr on the training set.
+
+    NOTE(review): `eucl` is not defined in the visible code and the lwplsr result is neither stored nor returned (callers get None) - confirm the intended Jchemo call.
+    """
+    import julia
+    from julia import Jchemo
+    from pinard import utils
+    from pinard.model_selection import train_test_split_idx
+    # hdr == 'yes' -> column 0 of both CSVs is passed as index column, otherwise False is passed
+    col = 0 if hdr == 'yes' else False
+    x, y = utils.load_csv(xcal_csv, ycal_csv, autoremove_na=True, sep=sep, x_hdr=0, y_hdr=0, x_index_col=col, y_index_col=col)
+    # Kennard-Stone split with the correlation metric; 25% of the samples go to the test set
+    train_index, test_index = train_test_split_idx(x, y=y, method="kennard_stone", metric="correlation", test_size=0.25, random_state=42)
+    X_train, X_test = x[train_index], x[test_index]
+    y_train, y_test = y[train_index], y[test_index]
+    st.write("Size of train and test sets: train " + str(X_train.shape) + ' ' + str(y_train.shape) + ' / test ' + str(X_test.shape) + ' ' + str(y_test.shape))
+
+    Jchemo.lwplsr(X_train, y_train, nlvdis=4, metric=eucl, k=10)
+
 
 # predict module
-def predict():
-    display = "Prediction with: " + str(NIRS_csv), str(psep), str(phdr)
-    st.success(display)
+def prediction(NIRS_csv, qsep, qhdr, model):
+    """Read the spectra CSV and return what `model.predict` yields for it.
+
+    NIRS_csv is handed to pandas.read_csv with `qsep` as separator; when
+    qhdr is 'yes' the first column is used as the row index, otherwise no
+    index column is used. `model` only needs to expose predict().
+    """
+    index_col = 0 if qhdr == 'yes' else False
+    spectra = pd.read_csv(NIRS_csv, sep=qsep, index_col=index_col)
+    return model.predict(spectra)
 
+def list_files(mypath, import_type):
+    """Return the names of the regular files in `mypath` whose name ends with `import_type + '.pkl'`."""
+    import os
+    suffix = import_type + '.pkl'
+    return [name for name in os.listdir(mypath) if name.endswith(suffix) and os.path.isfile(os.path.join(mypath, name))]
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 851ef9d..b833c6c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,4 +3,5 @@ requests>=2.24.0
 Pillow>=8.4.0
 protobuf>=3.19.0
 watchdog>=2.1.8
-pinard>=1.0
\ No newline at end of file
+pinard>=1.0
+julia>=0.6.2
\ No newline at end of file
-- 
GitLab