Skip to content
Snippets Groups Projects
Commit 5ab14688 authored by Nicolas Barthes's avatar Nicolas Barthes
Browse files

# Model Creation

- added structure for multiple algorithms
- added Cross-Validation for SciKit Learn PLSR
- bases to add LWPLSR from Jchemo (Julia) - README.md updated
- added model export to disk
# Prediction
- working with Pinard prediction
- load models from disk
- export predicted values
parent a4a322a4
No related branches found
No related tags found
No related merge requests found
...@@ -16,6 +16,15 @@ The process includes: ...@@ -16,6 +16,15 @@ The process includes:
This package is written in python. You can clone the repository: git clone https://src.koda.cnrs.fr/nicolas.barthes.5/nirs_workflow.git This package is written in python. You can clone the repository: git clone https://src.koda.cnrs.fr/nicolas.barthes.5/nirs_workflow.git
Then install the requirements: pip install -r requirements.txt Then install the requirements: pip install -r requirements.txt
To use Locally Weighted PLS Regression (LWPLSR) for model creation, you'll need to install Jchemo.jl (https://github.com/mlesnoff/Jchemo.jl), a Julia package.
From the CLI, start a Python session (python), then run:
>>> import julia
>>> julia.install()
>>> from julia import Pkg
>>> Pkg.add("Jchemo")
To check that Jchemo was installed without errors:
>>> Pkg.status()
You can then run: streamlit run ./app.py from the CLI. You can then run: streamlit run ./app.py from the CLI.
......
...@@ -8,7 +8,7 @@ import pandas as pd ...@@ -8,7 +8,7 @@ import pandas as pd
import plotly.express as px import plotly.express as px
from sklearn.cluster import KMeans as km from sklearn.cluster import KMeans as km
from sklearn.metrics import pairwise_distances_argmin_min from sklearn.metrics import pairwise_distances_argmin_min
from application_functions import pca_maker, model, predict, find_delimiter, umap_maker, find_col_index from application_functions import pca_maker, model_PLSR, model_LWPLSR, prediction, find_delimiter, umap_maker, find_col_index, list_files, CV_model
# load images for web interface # load images for web interface
img_sselect = Image.open("images\sselect.JPG") img_sselect = Image.open("images\sselect.JPG")
...@@ -89,10 +89,7 @@ with st.container(): ...@@ -89,10 +89,7 @@ with st.container():
nb_select = settings_column.slider("Choose cluster number (K-Means)", min_value=2, max_value=cluster_max, value=5, format="%i") nb_select = settings_column.slider("Choose cluster number (K-Means)", min_value=2, max_value=cluster_max, value=5, format="%i")
kmeans_samples = km(n_clusters=nb_select, random_state=42) kmeans_samples = km(n_clusters=nb_select, random_state=42)
kmeans_samples.fit(pc_data.loc[:,[pc_1,pc_2]]) kmeans_samples.fit(pc_data.loc[:,[pc_1,pc_2]])
# plot the pc with clustering only (no selected samples) # choose between cluster centered sample and n-random samples
# graph = px.scatter(data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800, color=kmeans_samples.labels_, hover_name=pc_data.index, title="PC projection with K-Means Clusters")
# plot = scatter_column.plotly_chart(graph)
# choose between cluster centered sample and random samples
selection = settings_column.select_slider('Centered samples or random ones', options=['center','random']) selection = settings_column.select_slider('Centered samples or random ones', options=['center','random'])
export = [] export = []
scatter_column.write("Selected samples for chemical analysis:") scatter_column.write("Selected samples for chemical analysis:")
...@@ -107,7 +104,6 @@ with st.container(): ...@@ -107,7 +104,6 @@ with st.container():
selection_number = settings_column.number_input('How many samples per cluster?', step=1, value = 3) selection_number = settings_column.number_input('How many samples per cluster?', step=1, value = 3)
for i in np.unique(kmeans_samples.labels_): for i in np.unique(kmeans_samples.labels_):
if len(pd.DataFrame(pc_data.loc[pc_data.index[kmeans_samples.labels_==i],[pc_1,pc_2]])) >= selection_number: if len(pd.DataFrame(pc_data.loc[pc_data.index[kmeans_samples.labels_==i],[pc_1,pc_2]])) >= selection_number:
# export.append(pc_data.loc[pc_data.index[kmeans_samples.labels_==i]].sample(n=selection_number).index)
# another k-means to cluster in 'selection_number' clusters and random to ensure the selected samples are far from each other in each cluster # another k-means to cluster in 'selection_number' clusters and random to ensure the selected samples are far from each other in each cluster
kmeans_selected_samples = km(n_clusters=selection_number, random_state=42) kmeans_selected_samples = km(n_clusters=selection_number, random_state=42)
kmeans_selected_samples.fit(pc_data.loc[pc_data.index[kmeans_samples.labels_==i],[pc_1,pc_2]]) kmeans_selected_samples.fit(pc_data.loc[pc_data.index[kmeans_samples.labels_==i],[pc_1,pc_2]])
...@@ -131,7 +127,7 @@ with st.container(): ...@@ -131,7 +127,7 @@ with st.container():
plot = scatter_column.plotly_chart(graph_selected) plot = scatter_column.plotly_chart(graph_selected)
# button to export the names of selected samples - by cluster if random - in a csv # button to export the names of selected samples - by cluster if random - in a csv
if scatter_column.button('Export'): if scatter_column.button('Export'):
pd.DataFrame(export).T.to_csv('./data/Samples_for_Chemical_Analysis.csv') pd.DataFrame(export).T.to_csv('./data/sample_selections/Samples_from_' + sselectx_csv.name + '_for_Chemical_Analysis.csv')
else: else:
scatter_column.write("_Please Choose a file_") scatter_column.write("_Please Choose a file_")
# clustering via UMAP / HDBSCAN # clustering via UMAP / HDBSCAN
...@@ -152,17 +148,34 @@ with st.container(): ...@@ -152,17 +148,34 @@ with st.container():
st.header("Create a model") st.header("Create a model")
st.image(img_predict) st.image(img_predict)
st.write("Create a model to then predict chemical values from NIRS spectra") st.write("Create a model to then predict chemical values from NIRS spectra")
available_regression_algo = ["","SciKitLearn PLSR", "Jchemo Local Weighted PLSR"]
# CSV files loader # CSV files loader
xcal_csv = st.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns") xcal_csv = st.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
ycal_csv = st.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column") ycal_csv = st.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
if xcal_csv is not None and ycal_csv is not None: if xcal_csv is not None and ycal_csv is not None:
# Select list for CSV delimiter # Select list for CSV delimiter
sep = st.selectbox("Select csv separator - CSV Detected separator: " + str(find_delimiter('data/'+xcal_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+xcal_csv.name))), key=0) sep = st.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+xcal_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+xcal_csv.name))), key=0)
# Select list for column indexes True / False # Select list for CSV header True / False
hdr = st.selectbox("column indexes in csv?", options=["yes", "no"], key=1) hdr = st.selectbox("indexes column in csv? - _detected_: " + str(find_col_index('data/'+xcal_csv.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+xcal_csv.name))), key=1)
rd_seed = st.slider("Choose seed", min_value=1, max_value=1212, value=42, format="%i") regression_algo = st.selectbox("Choose the algorithm for regression", options=available_regression_algo, key = 12)
# Train model with model function from application_functions.py if regression_algo == 'SciKitLearn PLSR':
trained_model = model(xcal_csv, ycal_csv, sep, hdr, rd_seed) rd_seed = st.slider("Choose seed", min_value=1, max_value=1212, value=42, format="%i")
# Train model with model function from application_functions.py
trained_model = model_PLSR(xcal_csv, ycal_csv, sep, hdr, rd_seed)
elif regression_algo == 'Jchemo Local Weighted PLSR':
trained_model = model_LWPLSR(xcal_csv, ycal_csv, sep, hdr)
# Export the model with pickle or joblib
if regression_algo != '':
st.write("-- Save the model --")
model_export = st.selectbox("Choose way to export", options=["pickle", "joblib"], key=20)
model_name = st.text_input('Give it a name')
if st.button('Export Model'):
export_package = __import__(model_export)
with open('data/models/model_' + model_name + '_on_' + xcal_csv.name + '_and_' + ycal_csv.name + '_data_' + model_export + '.pkl','wb') as f:
export_package.dump(trained_model,f)
st.write('Model Exported')
# create a report with information on the model
## see https://stackoverflow.com/a/59578663
# graphical delimiter # graphical delimiter
st.write("---") st.write("---")
...@@ -171,7 +184,37 @@ with st.container(): ...@@ -171,7 +184,37 @@ with st.container():
st.header("Predict") st.header("Predict")
st.write("---") st.write("---")
st.write("Predict chemical values from NIRS") st.write("Predict chemical values from NIRS")
NIRS_csv = st.file_uploader("Select NIRS Data to predict", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns") file_column, space, model_column = st.columns((3, 1, 3))
psep = st.selectbox("Select csv separator", options=[";", ","], key=2) NIRS_csv = file_column.file_uploader("Select NIRS Data to predict", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
phdr = st.selectbox("indexes column in csv?", options=["yes", "no"], key=3) export_name = './data/predictions/Predictions_of_'
st.button("Predict", on_click=predict) if NIRS_csv:
\ No newline at end of file export_name += str(NIRS_csv.name[:-4])
qsep = file_column.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+NIRS_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+NIRS_csv.name))), key=2)
qhdr = file_column.selectbox("indexes column in csv? - _detected_: " + str(find_col_index('data/'+NIRS_csv.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+NIRS_csv.name))), key=3)
# Load the model with pickle or joblib
model_column.write("Load a saved model")
model_import = model_column.selectbox("Choose a way to import", options=["pickle", "joblib"], key=22)
model_name_import = model_column.selectbox('Choose file:', options=[' '] + list_files('data/models/', model_import), key = 21)
if model_name_import != ' ':
export_name += '_with_' + str(model_name_import[:-4])
export_package = __import__(model_import)
with open('data/models/'+ model_name_import,'rb') as f:
model_loaded = export_package.load(f)
if model_loaded:
model_column.write('Model Imported')
result = ''
if st.button("Predict"):
result = prediction(NIRS_csv, qsep, qhdr, model_loaded)
st.write('Predicted values are: ')
st.dataframe(result)
pd.DataFrame(result).to_csv(export_name + '.csv')
st.write('Predictions exported to ' + export_name + '.csv')
# # export to local drive
# from urllib.request import urlretrieve
# url = ('http://localhost:8501' + export_name[1:] + '.csv')
# filename = export_name + '.csv'
# urlretrieve(url, filename)
# create a report with information on the prediction
## see https://stackoverflow.com/a/59578663
if type(result) is list:
st.write(result)
\ No newline at end of file
...@@ -83,7 +83,7 @@ def pca_maker(data_import): ...@@ -83,7 +83,7 @@ def pca_maker(data_import):
return output, list(categorical_data.columns), new_column_names return output, list(categorical_data.columns), new_column_names
# create model module with PINARD # create model module with PINARD
def model(xcal_csv, ycal_csv, sep, hdr, rd_seed): def model_PLSR(xcal_csv, ycal_csv, sep, hdr, rd_seed):
from pinard import utils from pinard import utils
from pinard import preprocessing as pp from pinard import preprocessing as pp
from pinard.model_selection import train_test_split_idx from pinard.model_selection import train_test_split_idx
...@@ -133,11 +133,64 @@ def model(xcal_csv, ycal_csv, sep, hdr, rd_seed): ...@@ -133,11 +133,64 @@ def model(xcal_csv, ycal_csv, sep, hdr, rd_seed):
st.write("MAE: " + str(mean_absolute_error(y_test, Y_preds))) st.write("MAE: " + str(mean_absolute_error(y_test, Y_preds)))
st.write("MSE: " + str(mean_squared_error(y_test, Y_preds))) st.write("MSE: " + str(mean_squared_error(y_test, Y_preds)))
st.write("MAPE: " + str(mean_absolute_percentage_error(y_test, Y_preds))) st.write("MAPE: " + str(mean_absolute_percentage_error(y_test, Y_preds)))
# Cross-Validate the model
CV_model(estimator, X_train, y_train, 3)
return (trained) return (trained)
# Cross-Validation of the model
def CV_model(estimator, x, y, cv):
    """Cross-validate ``estimator`` on ``(x, y)`` and print the results with Streamlit.

    Three views of the cross-validation are written to the page:
    the per-fold scores from ``cross_val_score``, error metrics computed on
    the out-of-fold predictions from ``cross_val_predict``, and every entry
    of the dictionary returned by ``cross_validate``.

    Args:
        estimator: scikit-learn compatible estimator to evaluate.
        x: feature matrix passed to the scikit-learn helpers.
        y: target values passed to the scikit-learn helpers.
        cv: cross-validation setting forwarded to every scikit-learn call.

    Returns:
        None. All output goes through ``st.write``.
    """
    from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate
    from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score
    st.write('Cross-Validation of this model')
    st.write("CV_scores", cross_val_score(estimator, x, y, cv=cv))
    st.write("-- CV predict --")
    # Use the caller's `cv` here as well; it used to be hard-coded to 3,
    # which silently disagreed with the other two evaluations.
    Y_preds = cross_val_predict(estimator, x, y, cv=cv)
    st.write("MAE", mean_absolute_error(y, Y_preds))
    st.write("MSE", mean_squared_error(y, Y_preds))
    st.write("MAPE", mean_absolute_percentage_error(y, Y_preds))
    # The R2 score was previously printed with an empty label.
    st.write("R2", r2_score(y, Y_preds))
    st.write("-- Cross Validate --")
    cv_results = cross_validate(estimator, x, y, cv=cv, return_train_score=True, n_jobs=3)
    for key, values in cv_results.items():
        st.write(key, values)
def model_LWPLSR(xcal_csv, ycal_csv, sep, hdr):
    """Fit a Jchemo locally weighted PLS regression on the calibration CSVs.

    The spectra and chemical values are loaded with ``pinard.utils.load_csv``,
    split into train/test sets (Kennard-Stone method, correlation metric,
    25% of the data kept for testing, fixed ``random_state=42``), and the
    training part is passed to ``Jchemo.lwplsr``.

    Args:
        xcal_csv: CSV source holding the NIRS spectra.
        ycal_csv: CSV source holding the matching chemical values.
        sep: CSV delimiter.
        hdr: ``'yes'`` when the first CSV column holds the sample indexes,
            anything else otherwise.

    Returns:
        The object returned by ``Jchemo.lwplsr`` so the caller can export it.
    """
    import julia
    from julia import Jchemo
    from pinard import utils
    from pinard.model_selection import train_test_split_idx
    # hdr says whether the first CSV column is an index column
    col = 0 if hdr == 'yes' else False
    # loading the csv
    x, y = utils.load_csv(xcal_csv, ycal_csv, autoremove_na=True, sep=sep, x_hdr=0, y_hdr=0, x_index_col=col, y_index_col=col)
    # Split data into training and test sets using the kennard_stone method and correlation metric, 25% of data is used for testing
    train_index, test_index = train_test_split_idx(x, y=y, method="kennard_stone", metric="correlation", test_size=0.25, random_state=42)
    # Assign data to training and test sets
    X_train, y_train, X_test, y_test = x[train_index], y[train_index], x[test_index], y[test_index]
    st.write("Size of train and test sets: train " + str(X_train.shape) + ' ' + str(y_train.shape) + ' / test ' + str(X_test.shape) + ' ' + str(y_test.shape))
    # `eucl` was a bare, undefined Python name (NameError at call time).
    # NOTE(review): Jchemo may expect a Julia Symbol (:eucl) rather than a
    # string here - confirm against the installed Jchemo version.
    trained = Jchemo.lwplsr(X_train, y_train, nlvdis=4, metric="eucl", k=10)
    # The fitted object was previously discarded, so the caller exported None.
    # NOTE(review): verify that the returned Julia object can be serialised
    # by pickle/joblib before relying on the export step.
    return trained
# predict module # predict module
def prediction(NIRS_csv, qsep, qhdr, model):
    """Predict values for the spectra in ``NIRS_csv`` with a loaded model.

    Args:
        NIRS_csv: path or file-like object readable by ``pandas.read_csv``.
        qsep: CSV delimiter.
        qhdr: ``'yes'`` when the first CSV column holds the sample indexes;
            any other value tells pandas not to use a column as index.
        model: object exposing a ``predict(X)`` method.

    Returns:
        Whatever ``model.predict`` returns for the loaded DataFrame.
    """
    # index_col=0 uses the first column as index; index_col=False disables it
    col = 0 if qhdr == 'yes' else False
    X_test = pd.read_csv(NIRS_csv, sep=qsep, index_col=col)
    return model.predict(X_test)
def list_files(mypath, import_type):
    """Return the model files in ``mypath`` saved with ``import_type``.

    A file matches when it is a regular file whose name ends with
    ``import_type + '.pkl'`` (for example ``..._pickle.pkl``).

    Args:
        mypath: directory to scan.
        import_type: serialisation backend name used as the filename suffix.

    Returns:
        Alphabetically sorted list of matching file names (not full paths).
    """
    from os import listdir
    from os.path import isfile, join
    suffix = import_type + '.pkl'
    # listdir() gives no ordering guarantee; sort so the choices shown to the
    # user stay in a stable order between reruns.
    return sorted(f for f in listdir(mypath) if f.endswith(suffix) and isfile(join(mypath, f)))
\ No newline at end of file
...@@ -3,4 +3,5 @@ requests>=2.24.0 ...@@ -3,4 +3,5 @@ requests>=2.24.0
Pillow>=8.4.0 Pillow>=8.4.0
protobuf>=3.19.0 protobuf>=3.19.0
watchdog>=2.1.8 watchdog>=2.1.8
pinard>=1.0 pinard>=1.0
\ No newline at end of file julia>=0.6.2
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment