diff --git a/src/packages.py b/src/packages.py
deleted file mode 100644
index 82d82684541a1974b4b0ffdcca0bcfb96e557cb4..0000000000000000000000000000000000000000
--- a/src/packages.py
+++ /dev/null
@@ -1,81 +0,0 @@
-## Data loading, handling, and preprocessing
-import os
-import glob
-import sys
-from pathlib import Path
-import csv
-import re
-import jcamp
-import random
-from datetime import datetime
-import numpy as np
-from shutil import rmtree, move, make_archive
-from pandas import DataFrame, read_csv, concat, Series, json_normalize
-from itertools import combinations
-from hashlib import md5
-from matplotlib import colors
-from matplotlib.colors import Normalize
-from abc import ABC, abstractmethod
-from typing import Optional, List
-from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
-from scipy.stats import skew, kurtosis
-from scipy.signal import savgol_filter, find_peaks_cwt, detrend
-import scipy as sc
-from kennard_stone import train_test_split as ks_train_test_split
-from kennard_stone import KFold as ks_KFold
-
-### Exploratory data analysis-Dimensionality reduction
-from umap.umap_ import UMAP
-from sklearn.decomposition import PCA, NMF
-from pandas.api.types import is_float_dtype
-from plotly.subplots import make_subplots
-from matplotlib.cm import ScalarMappable
-import streamlit.components.v1 as components
-# Clustering
-from sklearn.cluster import KMeans, HDBSCAN,AffinityPropagation
-from scipy.spatial.distance import euclidean, cdist
-from scipy.sparse.csgraph import minimum_spanning_tree
-from scipy.sparse import csgraph
-
-# Modelling
-from juliacall import Main as jl
-
-# from pinard.model_selection import train_test_split_idx
-
-from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, cross_validate, RandomizedSearchCV
-from sklearn.pipeline import Pipeline, FeatureUnion
-from sklearn.compose import TransformedTargetRegressor
-from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score
-from sklearn.cross_decomposition import PLSRegression
-from sklearn.linear_model import LinearRegression
-## Images and plots
-from PIL import Image
-import plotly.express as px
-import plotly.graph_objects as go
-import plotly.io as pio
-import matplotlib.pyplot as plt, mpld3
-import seaborn as sns
-# import matplotlib
-
-### Important Metrics
-from sklearn.metrics import pairwise_distances_argmin_min, adjusted_rand_score, adjusted_mutual_info_score
-
-## Web app construction
-import streamlit as st
-from st_pages import Page, Section, show_pages, add_page_title, hide_pages
-from tempfile import NamedTemporaryFile, TemporaryDirectory
-# help on streamlit input https://docs.streamlit.io/library/api-reference/widgets
-
-#Library for connecting to SQL DB
-import pyodbc
-
-#Library for reading the config file, which is in JSON
-import json
-
-# save models
-from joblib import dump, load, hash
-# import pickle as pkl
-
-from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, anneal
-
-st.set_option('deprecation.showPyplotGlobalUse', False)
\ No newline at end of file
diff --git a/src/pages/1-samples_selection.py b/src/pages/1-samples_selection.py
index e350dc9883bdbda1ee82b0a15fed4e701d26472f..2ceafca2b5521cbeb3bf56abb3bc83a50f900943 100644
--- a/src/pages/1-samples_selection.py
+++ b/src/pages/1-samples_selection.py
@@ -38,6 +38,7 @@ tcr = DataFrame()
 sam = DataFrame()
 sam1 = DataFrame()
 selected_samples = DataFrame()
+selected = []
 l1 = []
 color_palette = None
 dr_model = None # dimensionality reduction model
@@ -112,7 +113,6 @@ if not spectra.empty:
         spectra.index = spectra.index.where(~mask,
                 spectra.groupby(spectra.index).cumcount().add(1).astype(str).radd(spectra.index.astype(str) + '#'))
-        st.write(spectra.shape)
 
     if not spectra.empty:
         if not meta_data.empty:
             meta_data.index = [str(i) for i in spectra.index]
@@ -544,7 +544,7 @@ if selected:
     else:
-        st.write(meta_data_)
+        st.write(meta_data)
     st.write(DataFrame(result))
diff --git a/src/pages/2-model_creation.py b/src/pages/2-model_creation.py
index c4fad5dc4316e08e4389dc170f97e409d25000b3..1e88a2353cd1ddbc5b1acd82f714eda32a2c9833 100644
--- a/src/pages/2-model_creation.py
+++ b/src/pages/2-model_creation.py
@@ -31,7 +31,7 @@ filetype = c1.radio('Select files format:', options = ['csv', 'dx'] , horizontal
 x_block = DataFrame() # preallocate the spectral data block
 y_block = DataFrame() # preallocate the target(s) data block
 meta_data = DataFrame()
-
+y = DataFrame()
 with c1:
     match filetype:
         # load csv file
@@ -99,6 +99,9 @@ with c1:
                         stringio = StringIO(eval(f'{i}.getvalue().decode("utf-8")'))
                         xy_str += str(stringio.read())
                     file_name = str(xfile.name) + str(yfile.name)
+
+                    if None in [namesx, namesy]:
+                        st.warning('Warning: Ensure each row in one file matches the same sample in the other file to maintain correct x-y data alignment.')
 
 
 
@@ -123,371 +126,401 @@ with c1:
             st.info('Info: Load your file here!')
 
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~
-if x_block.shape[1]>0 and y_block.shape[1]>0 :
+if x_block.shape[1] > 0 and y_block.shape[1] > 0 :
+    if len(x_block.index)>len(set(x_block.index)):
+        st.warning("X-block: Duplicate sample IDs found. Suffixes (#1, #2, ...) have been added to duplicate IDs.")
+        x_block.index = x_block.index.where(~x_block.index.duplicated(keep=False) ,
+                                            x_block.groupby(x_block.index).cumcount().add(1).astype(str).radd(x_block.index.astype(str) + '#'))
+    if len(y_block.index)>len(set(y_block.index)):
+        st.warning("Y-block: Duplicate sample IDs found. Suffixes (#1, #2, ...) have been added to duplicate IDs.")
+        y_block.index = y_block.index.where(~y_block.index.duplicated(keep=False) ,
+                                            y_block.groupby(y_block.index).cumcount().add(1).astype(str).radd(y_block.index.astype(str) + '#'))
     y = DataFrame()
     if y_block.shape[1] > 1:
         options = [''] + y_block.columns.tolist()
-    else:
+    elif y_block.shape[1] == 1:
         options = y_block.columns.tolist()
+
+    # drop down list to select the target variable
     yname = c1.selectbox('Select a target:', options = options,
-                         disabled= True if len(options)==1 else False,
-                         format_func = fmt)
-    if yname:
-        y = y_block.loc[:, yname].dropna(axis=0) # 1d
+                         disabled= True if len(options)<=1 else False,
+                         format_func = fmt)
+    # define the target variable
+    if not x_block.empty and yname:
+        if len(y_block.loc[:, yname].dropna().index.intersection(x_block.dropna().index)) > 0:
+            y = y_block.loc[y_block.loc[:, yname].dropna().index.intersection(x_block.dropna().index), yname] # 1d
+            x_block = x_block.loc[y_block.loc[:, yname].dropna().index.intersection(x_block.dropna().index), :]
+        else:
+            st.error('X-Y blocks matching issue: X_block and Y_block have no common sample names!')
+
     else:
         c1.info('Info: Select the target analyte from the drop down list!')
-
-    if not y.empty:
-        if filetype =="csv":
-            # Find the intersection of index names
-            if not meta_y.empty and not meta_x.empty:
-                common_samples = meta_y.loc[y.index].index.intersection(meta_x.index)
-                if len(common_samples)>0:
+
+if not y.empty:
+    if len(y.index)>len(set(y.index)):
+        st.warning("Duplicate sample IDs found. 
Suffixes (#1, #2, ...) have been added to duplicate IDs.") + meta_y['names'] = y.index + mask = y.index.duplicated(keep=False) # Keep all duplicates (True for replicated) + # For the duplicated sample_ids, apply suffix (_1, _2, etc.) + y.index = y.index.where(~mask, + y.groupby(y.index).cumcount().add(1).astype(str).radd(y.index.astype(str) + '#')) + + + if filetype =="csv": + # Find the intersection of index names + if not meta_y.empty and not meta_x.empty: # both of xfile and yfile includes meta_data + common_samples = meta_y.loc[y.index].index.intersection(meta_x.index) + if len(common_samples) > 0: + lens = list(meta_y.columns) + list(meta_y.columns) + if len(lens) > len(set(lens)): + from collections import Counter + duplicates = [item for item, count in Counter(lens).items() if count > 1] + from pandas.util import hash_pandas_object + if hash_pandas_object(meta_y.loc[common_samples, duplicates]).sum() == hash_pandas_object(meta_x.loc[common_samples, duplicates]).sum(): + meta_data = concat([meta_y.loc[common_samples,:]], axis = 1) + elif len(lens) == len(set(lens)): meta_data = concat([meta_y.loc[common_samples,:], meta_x.loc[common_samples,:]], axis = 1) - else: - meta_data = DataFrame() - - elif not meta_y.empty and meta_x.empty: - meta_data = meta_y.loc[y.index, :] - elif not meta_y.empty and meta_x.empty: - common_samples = meta_y.loc[y.index].index.intersection(meta_x.index) - if len(common_samples)>0: - meta_data = meta_x.loc[common_samples, :] - else: - meta_data = DataFrame() - elif meta_y.empty and meta_x.empty: + else: meta_data = DataFrame() + elif not meta_y.empty and meta_x.empty: # only yfile that includes meta_data + meta_data = meta_y.loc[y.index, :] + elif meta_y.empty and not meta_x.empty: # only xfile that includes meta_data + common_samples = meta_x.loc[x_block.index].index.intersection(y_block.index) + if len(common_samples) > 0: + meta_data = meta_x.loc[common_samples, :] + else: + meta_data = DataFrame() + elif meta_y.empty and meta_x.empty: + meta_data = DataFrame() + ################################################### END : I- Data loading and preparation #################################################### + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ BEGIN : visualize and split the data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +st.subheader("I - Data visualization", divider = 'blue') +if not x_block.empty and not y.empty: + nwls = x_block.shape[1] + #### insight on loaded data + spectra_plot = plot_spectra(x_block, mean= True , xunits = 'Wavelength/Wavenumber', yunits = "Signal intensity") + from utils.miscellaneous import desc_stats + + c2, c3 = st.columns([1, .4]) + with c2: + st.pyplot(spectra_plot) ## Loaded graph + if st.session_state.interface =='advanced': + with st.container(): + values = st.slider('Select a range of values', min_value = 0, max_value = nwls, value = (0, nwls)) + + hash_ = ObjectHash(current= hash_, add= values) + x_block = x_block.iloc[:, values[0]:values[1]] + nwl = x_block.shape[1] -st.write(meta_data) - - - - - - - - - - - - # if not spectra.empty: - # st.success("Info: The data have been loaded successfully", icon = "✅") - - # if chem_data.shape[1]>0: - # yname = st.selectbox('Select the target analyte', options = ['']+chem_data.columns.tolist(), format_func = lambda x: x if x else "<Select>" ) - # if yname: - # measured = chem_data.loc[:, yname] > 0 - # y = chem_data.loc[:, yname].loc[measured] - # spectra = spectra.loc[measured] - - - # else: - # st.info('Info: Please select the target analyte from the dropdown list!') - # else: - # 
st.warning('Warning: your file includes no target variables to model !', icon = "⚠️") - - - - # ################################################### END : I- Data loading and preparation #################################################### - - - # if x_block.shape[1]>0 and y_block.shape[1]>0 : - # if y_block.shape[1] > 1: - # options = [''] + y_block.columns.tolist() - # else: - # options = y_block.columns.tolist() - # yname = c1.selectbox('Select a target:', options = options, - # disabled= True if len(options)==1 else False, - # format_func = fmt) - # if yname: - # y = y_block.loc[:, yname].dropna(axis=0) # 1d - # else: - # c1.info('Info: Select the target analyte from the drop down list!') - - -# if y in globals(): - -# ### warning -# if not y.empty: -# y.index = y.index.astype(str) -# duplicate_indices = y.index -# st.write(duplicate_indices) -# if not y.empty: -# if spectra.shape[0] == y.shape[0]: -# st.info('Info: X and Y have different number of rows') -# else: -# st.info('Info: X and Y have different number of rows') - -# if spectra.shape[0] >= y.shape[0]: -# if namesy == 0: -# pass - -# else : -# st.warning('No labels are provided for target, therefore, both target and spectra are considered well organized!') - - - + st.pyplot(plot_spectra(x_block.mean(), xunits = 'Wavelength/Wavenumber', yunits = "Signal intensity")) -# if spectra.shape[0] < y.shape[0]: -# st.write('The number of samples chemically analyzed exceeds the number of scanned samples!') -# y = DataFrame -# spectra = DataFrame -# else: -# st.error('Error: The data has not been loaded successfully, please consider tuning the dialect settings!') + from utils.miscellaneous import data_split + X_train, X_test, y_train, y_test, train_index, test_index = data_split(x= x_block, y= y) -# # Load .dx file - + with c3: + st.write('Loaded data summary') + stats = DataFrame([desc_stats(y_train), desc_stats(y_test), desc_stats(y)], index =[f'{yname} (Cal)', f'{yname} (Val)', f'{yname} (Total)'] ).round(2) + st.write(stats) + ## histogramms + target_plot = hist(y = y, y_train = y_train, y_test = y_test, target_name=yname) + st.pyplot(target_plot) + st.info('Info: 70/30 split ratio was used to split the dataset into calibration and prediction subsets') -# ################################################### BEGIN : visualize and split the data #################################################### -# st.subheader("I - Data visualization", divider = 'blue') -# if not spectra.empty and not y.empty: -# # if np.array(spectra.columns).dtype.kind in ['i', 'f']: -# # colnames = spectra.columns -# # else: -# # colnames = np.arange(spectra.shape[1]) +################################################### END : visualize and split the data ####################################################### -# #### insight on loaded data -# spectra_plot = plot_spectra(spectra, xunits = 'Wavelength/Wavenumber', yunits = "Signal intensity") -# from utils.miscellaneous import desc_stats +# ################################################### BEGIN : Create Model #################################################### +model_type = None # initialize the selected regression algorithm +Reg = None # initialize the regression model object +# intervalls_with_cols = DataFrame() -# # fig1, ax1 = plt.subplots( figsize = (12, 3)) -# # spectra.T.plot(legend = False, ax = ax1, linestyle = '-', linewidth = 0.6) -# # ax1.set_ylabel('Signal intensity') -# # ax1.margins(0) -# # plt.tight_layout() -# c2, c3 = st.columns([1, .4]) -# with c2: -# st.pyplot(spectra_plot) ######## Loaded graph 
-# if st.session_state.interface =='advanced': -# with st.container(): -# values = st.slider('Select a range of values', min_value = 0, max_value = nwls, value = (0, nwls)) - -# hash_ = ObjectHash(current= hash_, add= values) -# spectra = spectra.iloc[:, values[0]:values[1]] -# nwl = spectra.shape +st.subheader("II - Model creation", divider = 'blue') +if not x_block.empty and not y.empty: + c4, c5, c6 = st.columns([1, 1, 3]) + with c4: + # select type of supervised modelling problem + mode = c4.radio("The nature of the target variable :", options = ['Continuous', 'Categorical']) + hash_ = ObjectHash(current=hash_, add=mode) -# st.pyplot(plot_spectra(spectra, xunits = 'Wavelength/Wavenumber', yunits = "Signal intensity")) + match st.session_state["interface"]: + case 'advanced': + with c5: + if mode =="Continuous": + model_type = st.selectbox("Choose a modelling algorithm:", options = ["", "PLS", "LW-PLS", "TPE-iPLS"], + key = 12, format_func = lambda x: x+'R' if x else "<Select>", disabled=False) + elif mode == 'Categorical': + model_type = st.selectbox("Choose a modelling algorithm:", options = ["", "PLS", "LW-PLS", "TPE-iPLS"], + key = 12, format_func = lambda x: x+'DA' if x else "<Select>", disabled=False) - -# if np.array(spectra.columns).dtype.kind in ['i', 'f']: -# colnames = spectra.columns -# else: -# colnames = np.arange(spectra.shape[1]) - - - -# from utils.miscellaneous import data_split -# X_train, X_test, y_train, y_test, train_index, test_index = data_split(x=spectra, y=y) - - -# with c3: -# st.write('Loaded data summary') -# stats = DataFrame([desc_stats(y_train), desc_stats(y_test), desc_stats(y)], index =[f'{yname} (Cal)', f'{yname} (Val)', f'{yname} (Total)'] ).round(2) -# st.write(stats) -# ## histogramms -# target_plot = hist(y = y, y_train = y_train, y_test = y_test, target_name=yname) -# st.pyplot(target_plot) -# st.info('Info: 70/30 split ratio was used to split the dataset into calibration and prediction subsets') - - -# ################################################### END : visualize and split the data ####################################################### - - - - -# # if 'model_type' not in st.session_state: -# # st.cache_data.model_type = '' - -# # ################################################### BEGIN : Create Model #################################################### -# model_type = None # initialize the selected regression algorithm -# Reg = None # initialize the regression model object -# # intervalls_with_cols = DataFrame() - -# st.subheader("II - Model creation", divider = 'blue') -# if not spectra.empty and not y.empty: -# c4, c5, c6 = st.columns([1, 1, 3]) -# with c4: -# # select type of supervised modelling problem -# var_nature = ['Continuous', 'Categorical'] -# mode = c4.radio("The nature of the target variable :", options = var_nature) -# # p_hash(mode) -# match mode: -# case "Continuous": -# match st.session_state["interface"]: -# case 'advanced': -# reg_algo = ["", "PLS", "LW-PLS", "TPE-iPLS"] -# case 'simple': -# reg_algo = ["PLS"] - -# st.markdown(f'Example1: Quantifying the volume of nectar consumed by a pollinator during a foraging session.') -# st.markdown(f"Example2: Measure the sugar content, amino acids, or other compounds in nectar from different flower species.") -# case 'Categorical': -# reg_algo = ["", "PLS", "LW-PLS", "TPE-iPLS", 'LDA'] -# st.markdown(f"Example1: Classifying pollinators into categories such as bees, butterflies, moths, and beetles.") -# st.markdown(f"Example2: Classifying plants based on their health status, 
such as healthy, stressed, or diseased, using NIR spectral data.") -# with c5: -# dismod = True if st.session_state["interface"] == 'simple' else False -# model_type = c5.selectbox("Choose a modelling algorithm:", options = reg_algo, key = 12, format_func = lambda x: x if x else "<Select>", disabled=dismod) -# hash_ = ObjectHash(current= hash_, add= model_type) + + case 'simple': + if mode =="Continuous": + with c5: + model_type = st.selectbox("Choose a modelling algorithm:", options = ["PLS"], + key = 12, format_func = lambda x: x+'R' if x else "<Select>", disabled=True) + with c6: + st.markdown(f'Example1: Quantifying the volume of nectar consumed by a pollinator during a foraging session.') + st.markdown(f"Example2: Measure the sugar content, amino acids, or other compounds in nectar from different flower species.") + + elif mode == 'Categorical': + with c5: + model_type = st.selectbox("Choose a modelling algorithm:", options = ["PLS"], + key = 12, format_func = lambda x: x+'DA' if x else "<Select>", disabled=True) + with c6: + st.markdown(f"Example1: Classifying pollinators into categories such as bees, butterflies, moths, and beetles.") + st.markdown(f"Example2: Classifying plants based on their health status, such as healthy, stressed, or diseased, using NIR spectral data.") + hash_ = ObjectHash(current= hash_, add= [mode, model_type]) -# with c6: -# st.markdown("-------------") -# match model_type: -# case "PLS": -# st.markdown("#### For further details on the PLS (Partial Least Squares) algorithm, check the following reference:") -# st.markdown('##### https://www.tandfonline.com/doi/abs/10.1080/03610921003778225') + with c6: + st.markdown("-------------") + match model_type: + case "PLS": + st.markdown("#### For further details on the PLS (Partial Least Squares) algorithm, check the following reference:") + st.markdown('##### https://www.tandfonline.com/doi/abs/10.1080/03610921003778225') -# case "LW-PLS": -# st.markdown("#### For further details on the LW-PLS (Locally Weighted - Partial Least Squares) algorithm, check the following reference:") -# st.markdown('##### https://analyticalsciencejournals.onlinelibrary.wiley.com/doi/full/10.1002/cem.3117') + case "LW-PLS": + st.markdown("#### For further details on the LW-PLS (Locally Weighted - Partial Least Squares) algorithm, check the following reference:") + st.markdown('##### https://analyticalsciencejournals.onlinelibrary.wiley.com/doi/full/10.1002/cem.3117') -# case "TPE-iPLS": -# st.markdown("#### For further details on the TPE-iPLS (Tree-structured Parzen Estimator based interval-Partial Least Squares) algorithm, which is a wrapper method for interval selection, check the following references:") -# st.markdown("##### https://papers.nips.cc/paper_files/paper/2011/file/86e8f7ab32cfd12577bc2619bc635690-Paper.pdf") -# st.markdown('##### https://www.tandfonline.com/doi/abs/10.1080/03610921003778225') -# st.markdown('##### https://journals.sagepub.com/doi/abs/10.1366/0003702001949500') -# st.markdown("-------------") + case "TPE-iPLS": + st.markdown("#### For further details on the TPE-iPLS (Tree-structured Parzen Estimator based interval-Partial Least Squares) algorithm, which is a wrapper method for interval selection, check the following references:") + st.markdown("##### https://papers.nips.cc/paper_files/paper/2011/file/86e8f7ab32cfd12577bc2619bc635690-Paper.pdf") + st.markdown('##### https://www.tandfonline.com/doi/abs/10.1080/03610921003778225') + st.markdown('##### 
https://journals.sagepub.com/doi/abs/10.1366/0003702001949500') + st.markdown("-------------") # # if model_type != st.session_state.model_type: # # st.session_state.model_type = model_type # # increment() + + + # Training set preparation for cross-validation(CV) -# # p_hash(model_type) + with c5:# Model columns + def RequestingModelCreation(change): + global Reg + nb_folds = 3 + match model_type: + case 'PLS': + from utils.regress import Plsr + Reg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter = 100, cv = nb_folds) + + case 'TPE-iPLS': + from utils.regress import TpeIpls + Reg = TpeIpls(train = [X_train, y_train], test=[X_test, y_test], n_intervall = s, n_iter=it, cv = nb_folds) + + case 'LW-PLS': + folds = KF_CV.CV(X_train, y_train, nb_folds)# split train data into nb_folds for cross_validation + # export data to csv for Julia train/test + global x_train_np, y_train_np, x_test_np, y_test_np + data_to_work_with = ['x_train_np', 'y_train_np', 'x_test_np', 'y_test_np'] + x_train_np, y_train_np, x_test_np, y_test_np = X_train.to_numpy(), y_train.to_numpy(), X_test.to_numpy(), y_test.to_numpy() + # Cross-Validation calculation + d = {} + for i in range(nb_folds): + d["xtr_fold{0}".format(i+1)], d["ytr_fold{0}".format(i+1)], d["xte_fold{0}".format(i+1)], d["yte_fold{0}".format(i+1)] = np.delete(x_train_np, folds[list(folds)[i]], axis=0), np.delete(y_train_np, folds[list(folds)[i]], axis=0), x_train_np[folds[list(folds)[i]]], y_train_np[folds[list(folds)[i]]] + data_to_work_with.append("xtr_fold{0}".format(i+1)) + data_to_work_with.append("ytr_fold{0}".format(i+1)) + data_to_work_with.append("xte_fold{0}".format(i+1)) + data_to_work_with.append("yte_fold{0}".format(i+1)) + # check best pre-treatment with a global PLSR model + from utils.regress import Plsr + preReg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter=100) + temp_path = Path('temp/') + with open(temp_path / "lwplsr_preTreatments.json", "w+") as outfile: + json.dump(preReg.best_hyperparams_, outfile) + # export Xtrain, Xtest, Ytrain, Ytest and all CV folds to temp folder as csv files + for i in data_to_work_with: + if 'fold' in i: + j = d[i] + else: + j = globals()[i] + # st.write(j) + np.savetxt(temp_path / str(i + ".csv"), j, delimiter=",") + open(temp_path / 'model', 'w').close() + # run Julia Jchemo as subprocess + import subprocess + subprocess_path = Path("utils/") + subprocess.run([f"{sys.executable}", subprocess_path / "lwplsr_call.py"]) + # retrieve json results from Julia JChemo + try: + with open(temp_path / "lwplsr_outputs.json", "r") as outfile: + Reg_json = json.load(outfile) + # delete csv files + for i in data_to_work_with: os.unlink(temp_path / str(i + ".csv")) + # delete json file after import + os.unlink(temp_path / "lwplsr_outputs.json") + os.unlink(temp_path / "lwplsr_preTreatments.json") + os.unlink(temp_path / 'model') + # format result data into Reg object + pred = ['pred_data_train', 'pred_data_test']### keys of the dict + for i in range(nb_folds): + pred.append("CV" + str(i+1)) ### add cv folds keys to pred + -# # Training set preparation for cross-validation(CV) -# nb_folds = 3 + from utils.regress import LwplsObject + Reg = LwplsObject(Reg_json = Reg_json, pred = pred) + # reg_model = Reg.model_ + Reg.CV_results_ = DataFrame() + Reg.cv_data_ = {'YpredCV' : {}, 'idxCV' : {}} + # set indexes to Reg.pred_data (train, test, folds idx) + for i in range(len(pred)): + Reg.pred_data_[i] = Reg.pred_data_[i].T.reset_index().drop(columns = ['index']) + if i == 0: # data_train + 
# Reg.pred_data_[i] = np.array(Reg.pred_data_[i]) + Reg.pred_data_[i].index = list(y_train.index) + Reg.pred_data_[i] = Reg.pred_data_[i].iloc[:,0] + elif i == 1: # data_test + # Reg.pred_data_[i] = np.array(Reg.pred_data_[i]) + Reg.pred_data_[i].index = list(y_test.index) + Reg.pred_data_[i] = Reg.pred_data_[i].iloc[:,0] + else: + # CVi + Reg.pred_data_[i].index = folds[list(folds)[i-2]] + # Reg.CV_results_ = concat([Reg.CV_results_, Reg.pred_data_[i]]) + Reg.cv_data_['YpredCV']['Fold' + str(i-1)] = np.array(Reg.pred_data_[i]).reshape(-1) + Reg.cv_data_['idxCV']['Fold' + str(i-1)] = np.array(folds[list(folds)[i-2]]).reshape(-1) + + Reg.CV_results_= KF_CV.metrics_cv(y = y_train, ypcv = Reg.cv_data_['YpredCV'], folds = folds)[1] + #### cross validation results print + Reg.best_hyperparams_print = Reg.best_hyperparams_ + ## plots + Reg.cv_data_ = KF_CV().meas_pred_eq(y = np.array(y_train), ypcv = Reg.cv_data_['YpredCV'], folds = folds) + Reg.pretreated_spectra_ = preReg.pretreated_spectra_ + + Reg.best_hyperparams_print = {**preReg.best_hyperparams_, **Reg.best_hyperparams_} + Reg.best_hyperparams_ = {**preReg.best_hyperparams_, **Reg.best_hyperparams_} -# # Model creation-M20 columns -# with c5: -# @st.cache_data -# def RequestingModelCreation(change): -# # spectra_plot.savefig("./report/figures/spectra_plot.png") -# # target_plot.savefig("./report/figures/histogram.png") -# # st.session_state['hash_Reg'] = str(np.random.randint(2000000000)) -# folds = KF_CV.CV(X_train, y_train, nb_folds)# split train data into nb_folds for cross_validation - -# match model_type: -# case 'PLS': -# from utils.regress import Plsr -# Reg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter = 100, cv = nb_folds) -# # reg_model = Reg.model_ -# rega = Reg.selected_features_ + Reg.__hash__ = ObjectHash(current = hash_,add = Reg.best_hyperparams_print) + except FileNotFoundError as e: + Reg = None + for i in data_to_work_with: os.unlink(temp_path / str(i + ".csv")) -# case 'LW-PLS': -# # export data to csv for Julia train/test -# global x_train_np, y_train_np, x_test_np, y_test_np -# data_to_work_with = ['x_train_np', 'y_train_np', 'x_test_np', 'y_test_np'] -# x_train_np, y_train_np, x_test_np, y_test_np = X_train.to_numpy(), y_train.to_numpy(), X_test.to_numpy(), y_test.to_numpy() -# # Cross-Validation calculation -# d = {} -# for i in range(nb_folds): -# d["xtr_fold{0}".format(i+1)], d["ytr_fold{0}".format(i+1)], d["xte_fold{0}".format(i+1)], d["yte_fold{0}".format(i+1)] = np.delete(x_train_np, folds[list(folds)[i]], axis=0), np.delete(y_train_np, folds[list(folds)[i]], axis=0), x_train_np[folds[list(folds)[i]]], y_train_np[folds[list(folds)[i]]] -# data_to_work_with.append("xtr_fold{0}".format(i+1)) -# data_to_work_with.append("ytr_fold{0}".format(i+1)) -# data_to_work_with.append("xte_fold{0}".format(i+1)) -# data_to_work_with.append("yte_fold{0}".format(i+1)) -# # check best pre-treatment with a global PLSR model -# from utils.regress import Plsr -# preReg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter=100) -# temp_path = Path('temp/') -# with open(temp_path / "lwplsr_preTreatments.json", "w+") as outfile: -# json.dump(preReg.best_hyperparams_, outfile) -# # export Xtrain, Xtest, Ytrain, Ytest and all CV folds to temp folder as csv files -# for i in data_to_work_with: -# if 'fold' in i: -# j = d[i] -# else: -# j = globals()[i] -# # st.write(j) -# np.savetxt(temp_path / str(i + ".csv"), j, delimiter=",") -# open(temp_path / 'model', 'w').close() -# # run Julia Jchemo as 
subprocess -# import subprocess -# subprocess_path = Path("utils/") -# subprocess.run([f"{sys.executable}", subprocess_path / "lwplsr_call.py"]) -# # retrieve json results from Julia JChemo -# try: -# with open(temp_path / "lwplsr_outputs.json", "r") as outfile: -# Reg_json = json.load(outfile) -# # delete csv files -# for i in data_to_work_with: os.unlink(temp_path / str(i + ".csv")) -# # delete json file after import -# os.unlink(temp_path / "lwplsr_outputs.json") -# os.unlink(temp_path / "lwplsr_preTreatments.json") -# os.unlink(temp_path / 'model') -# # format result data into Reg object -# pred = ['pred_data_train', 'pred_data_test']### keys of the dict -# for i in range(nb_folds): -# pred.append("CV" + str(i+1)) ### add cv folds keys to pred + + st.write(Reg) + + s = st.number_input(label = 'Enter the maximum number of intervals', min_value = 1, + max_value = 6, value = 2, disabled= False if model_type=='TPE-iPLS' else True) + it = st.number_input(label = 'Enter the number of iterations', min_value = 2, + max_value = 500, value = 250, disabled= False if model_type=='TPE-iPLS' else True) + if model_type: + info = st.info('Info: The model is being created. This may take a few minutes.') + RequestingModelCreation(change = hash_) + st.write(Reg.__dict__.keys()) + + + # with c5: + # # @st.cache_data + # def RequestingModelCreation(change): + # # spectra_plot.savefig("./report/figures/spectra_plot.png") + # # target_plot.savefig("./report/figures/histogram.png") + # # st.session_state['hash_Reg'] = str(np.random.randint(2000000000)) + # folds = KF_CV.CV(X_train, y_train, nb_folds)# split train data into nb_folds for cross_validation + # match model_type: + # case 'PLS': + # from utils.regress import Plsr + # Reg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter = 100, cv = nb_folds) + # # reg_model = Reg.model_ + # rega = Reg.selected_features_ + + # case 'LW-PLS': + # # export data to csv for Julia train/test + # global x_train_np, y_train_np, x_test_np, y_test_np + # data_to_work_with = ['x_train_np', 'y_train_np', 'x_test_np', 'y_test_np'] + # x_train_np, y_train_np, x_test_np, y_test_np = X_train.to_numpy(), y_train.to_numpy(), X_test.to_numpy(), y_test.to_numpy() + # # Cross-Validation calculation + # d = {} + # for i in range(nb_folds): + # d["xtr_fold{0}".format(i+1)], d["ytr_fold{0}".format(i+1)], d["xte_fold{0}".format(i+1)], d["yte_fold{0}".format(i+1)] = np.delete(x_train_np, folds[list(folds)[i]], axis=0), np.delete(y_train_np, folds[list(folds)[i]], axis=0), x_train_np[folds[list(folds)[i]]], y_train_np[folds[list(folds)[i]]] + # data_to_work_with.append("xtr_fold{0}".format(i+1)) + # data_to_work_with.append("ytr_fold{0}".format(i+1)) + # data_to_work_with.append("xte_fold{0}".format(i+1)) + # data_to_work_with.append("yte_fold{0}".format(i+1)) + # # check best pre-treatment with a global PLSR model + # from utils.regress import Plsr + # preReg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter=100) + # temp_path = Path('temp/') + # with open(temp_path / "lwplsr_preTreatments.json", "w+") as outfile: + # json.dump(preReg.best_hyperparams_, outfile) + # # export Xtrain, Xtest, Ytrain, Ytest and all CV folds to temp folder as csv files + # for i in data_to_work_with: + # if 'fold' in i: + # j = d[i] + # else: + # j = globals()[i] + # # st.write(j) + # np.savetxt(temp_path / str(i + ".csv"), j, delimiter=",") + # open(temp_path / 'model', 'w').close() + # # run Julia Jchemo as subprocess + # import subprocess + # subprocess_path = Path("utils/") 
+ # subprocess.run([f"{sys.executable}", subprocess_path / "lwplsr_call.py"]) + # # retrieve json results from Julia JChemo + # try: + # with open(temp_path / "lwplsr_outputs.json", "r") as outfile: + # Reg_json = json.load(outfile) + # # delete csv files + # for i in data_to_work_with: os.unlink(temp_path / str(i + ".csv")) + # # delete json file after import + # os.unlink(temp_path / "lwplsr_outputs.json") + # os.unlink(temp_path / "lwplsr_preTreatments.json") + # os.unlink(temp_path / 'model') + # # format result data into Reg object + # pred = ['pred_data_train', 'pred_data_test']### keys of the dict + # for i in range(nb_folds): + # pred.append("CV" + str(i+1)) ### add cv folds keys to pred -# # global Reg -# # Reg = type('obj', (object,), {'model_' : Reg_json['model'], 'best_hyperparams_' : Reg_json['best_lwplsr_params'], -# # 'pred_data_' : [json_normalize(Reg_json[i]) for i in pred]}) -# # global Reg -# from utils.regress import LwplsObject -# Reg = LwplsObject(Reg_json = Reg_json, pred = pred) -# # reg_model = Reg.model_ -# Reg.CV_results_ = DataFrame() -# Reg.cv_data_ = {'YpredCV' : {}, 'idxCV' : {}} -# # set indexes to Reg.pred_data (train, test, folds idx) -# for i in range(len(pred)): -# Reg.pred_data_[i] = Reg.pred_data_[i].T.reset_index().drop(columns = ['index']) -# if i == 0: # data_train -# # Reg.pred_data_[i] = np.array(Reg.pred_data_[i]) -# Reg.pred_data_[i].index = list(y_train.index) -# Reg.pred_data_[i] = Reg.pred_data_[i].iloc[:,0] -# elif i == 1: # data_test -# # Reg.pred_data_[i] = np.array(Reg.pred_data_[i]) -# Reg.pred_data_[i].index = list(y_test.index) -# Reg.pred_data_[i] = Reg.pred_data_[i].iloc[:,0] -# else: -# # CVi -# Reg.pred_data_[i].index = folds[list(folds)[i-2]] -# # Reg.CV_results_ = concat([Reg.CV_results_, Reg.pred_data_[i]]) -# Reg.cv_data_['YpredCV']['Fold' + str(i-1)] = np.array(Reg.pred_data_[i]).reshape(-1) -# Reg.cv_data_['idxCV']['Fold' + str(i-1)] = np.array(folds[list(folds)[i-2]]).reshape(-1) - -# Reg.CV_results_= KF_CV.metrics_cv(y = y_train, ypcv = Reg.cv_data_['YpredCV'], folds = folds)[1] -# #### cross validation results print -# Reg.best_hyperparams_print = Reg.best_hyperparams_ -# ## plots -# Reg.cv_data_ = KF_CV().meas_pred_eq(y = np.array(y_train), ypcv = Reg.cv_data_['YpredCV'], folds = folds) -# Reg.pretreated_spectra_ = preReg.pretreated_spectra_ + + # global Reg + # from utils.regress import LwplsObject + # Reg = LwplsObject(Reg_json = Reg_json, pred = pred) + # # reg_model = Reg.model_ + # Reg.CV_results_ = DataFrame() + # Reg.cv_data_ = {'YpredCV' : {}, 'idxCV' : {}} + # # set indexes to Reg.pred_data (train, test, folds idx) + # for i in range(len(pred)): + # Reg.pred_data_[i] = Reg.pred_data_[i].T.reset_index().drop(columns = ['index']) + # if i == 0: # data_train + # # Reg.pred_data_[i] = np.array(Reg.pred_data_[i]) + # Reg.pred_data_[i].index = list(y_train.index) + # Reg.pred_data_[i] = Reg.pred_data_[i].iloc[:,0] + # elif i == 1: # data_test + # # Reg.pred_data_[i] = np.array(Reg.pred_data_[i]) + # Reg.pred_data_[i].index = list(y_test.index) + # Reg.pred_data_[i] = Reg.pred_data_[i].iloc[:,0] + # else: + # # CVi + # Reg.pred_data_[i].index = folds[list(folds)[i-2]] + # # Reg.CV_results_ = concat([Reg.CV_results_, Reg.pred_data_[i]]) + # Reg.cv_data_['YpredCV']['Fold' + str(i-1)] = np.array(Reg.pred_data_[i]).reshape(-1) + # Reg.cv_data_['idxCV']['Fold' + str(i-1)] = np.array(folds[list(folds)[i-2]]).reshape(-1) + + # Reg.CV_results_= KF_CV.metrics_cv(y = y_train, ypcv = Reg.cv_data_['YpredCV'], folds = folds)[1] + # 
#### cross validation results print + # Reg.best_hyperparams_print = Reg.best_hyperparams_ + # ## plots + # Reg.cv_data_ = KF_CV().meas_pred_eq(y = np.array(y_train), ypcv = Reg.cv_data_['YpredCV'], folds = folds) + # Reg.pretreated_spectra_ = preReg.pretreated_spectra_ -# Reg.best_hyperparams_print = {**preReg.best_hyperparams_, **Reg.best_hyperparams_} -# Reg.best_hyperparams_ = {**preReg.best_hyperparams_, **Reg.best_hyperparams_} + # Reg.best_hyperparams_print = {**preReg.best_hyperparams_, **Reg.best_hyperparams_} + # Reg.best_hyperparams_ = {**preReg.best_hyperparams_, **Reg.best_hyperparams_} -# Reg.__hash__ = ObjectHash(current = hash_,add = Reg.best_hyperparams_print) -# except FileNotFoundError as e: -# Reg = None -# for i in data_to_work_with: os.unlink(temp_path / str(i + ".csv")) + # Reg.__hash__ = ObjectHash(current = hash_,add = Reg.best_hyperparams_print) + # except FileNotFoundError as e: + # Reg = Nonefor i in data_to_work_with: os.unlink(temp_path / str(i + ".csv")) -# case 'TPE-iPLS': -# from utils.regress import TpeIpls -# Reg = TpeIpls(train = [X_train, y_train], test=[X_test, y_test], n_intervall = s, n_iter=it, cv = nb_folds) -# # reg_model = Reg.model_ - -# global intervalls, intervalls_with_cols -# intervalls = Reg.selected_features_.T.copy() -# intervalls_with_cols = Reg.selected_features_.T.copy().astype(str) - -# for i in range(intervalls.shape[0]): -# for j in range(intervalls.shape[1]): -# intervalls_with_cols.iloc[i,j] = spectra.columns[intervalls.iloc[i,j]] -# rega = Reg.selected_features_ - -# st.session_state.intervalls = Reg.selected_features_.T -# st.session_state.intervalls_with_cols = intervalls_with_cols -# return Reg - + + # return Reg + # Reg = RequestingModelCreation(change = hash_) @@ -500,10 +533,10 @@ st.write(meta_data) # else: # s, it = None, None # hash_ = ObjectHash( current = hash_,add = str(s)+str(it)) - -# remodel_button = st.button('re-model the data', key=4, help=None, type="primary", use_container_width=True, on_click=increment) + # if Reg: + # remodel_button = st.button('re-model the data', key=4, help=None, type="primary", use_container_width=True, on_click=increment) # hash_ = ObjectHash(current = hash_, add = st.session_state.counter) -# Reg = RequestingModelCreation(change = hash_) + # Reg = RequestingModelCreation(change = hash_) # reg_model = Reg.model_ # # hash_ = ObjectHash(current = hash_, add = Reg) # else: diff --git a/src/utils/data_parsing.py b/src/utils/data_parsing.py index f5484f18363fcd4313d780969d224b2c5418cce5..b33215744fb5b7ada247f7d38ebec812bd801dc8 100644 --- a/src/utils/data_parsing.py +++ b/src/utils/data_parsing.py @@ -83,11 +83,13 @@ def jcamp_parser(path, include, change = None): # Extract target concentrations if 'y_block' or 'all' is included if 'y_block' in include or 'all' in include: - targets_tuple[i] = conc(sample=block['concentrations'], pattern=pattern) + targets_tuple[i] = conc(sample=block['concentrations'], pattern= pattern) # Create DataFrame for target concentrations if 'y_block' in include or 'all' in include: - y_block = DataFrame(targets_tuple, index=elements_name, columns=idx).T + y_block = DataFrame(targets_tuple).T + y_block.columns = elements_name + y_block.index = idx else: y_block = DataFrame diff --git a/src/utils/visualize.py b/src/utils/visualize.py index 4a338d9c9071a8e105f6ac4229890c894a0b19a4..cd3078d086da607f5c5c696632ec573747476946 100644 --- a/src/utils/visualize.py +++ b/src/utils/visualize.py @@ -28,7 +28,7 @@ def pred_hist(pred): # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
predictions histogram ~~~~~~~~~~~~~~~~~~~~~~~~~~
 @st.cache_data
-def plot_spectra(specdf = None, color = None, cmap =None, xunits = None, yunits = None):
+def plot_spectra(specdf = None, color = None, cmap =None, xunits = None, yunits = None, mean = False):
     # pass
     import matplotlib.pyplot as plt
     import numpy as np
@@ -48,6 +48,8 @@ def plot_spectra(specdf = None, color = None, cmap =None, xunits = None, yunits
         for key, value in cmap.items():
             idx = color.index[color == key].tolist()
             specdf.loc[idx].T.plot(legend=False, ax = ax, color = value)
+    if mean:
+        specdf.mean().T.plot(legend=False, ax = ax, color = "black", linewidth = 5)
 
     ax.set_xlabel(xunits, fontsize=30)
 
@@ -76,7 +78,7 @@ def barhplot(metadf, cmap):
 @st.cache_data
 def hist(y, y_train, y_test, target_name = 'y'):
     fig, ax = plt.subplots(figsize = (5,2))
-    sns.histplot(y, color = "#004e9e", kde = True, label = str(target_name), ax = ax, fill = True)
+    sns.histplot(y, color = "#004e9e", kde = True, label = str(target_name) + " (Total)", ax = ax, fill = True)
     sns.histplot(y_train, color = "#2C6B6F", kde = True, label = str(target_name)+" (Cal)", ax = ax, fill = True)
     sns.histplot(y_test, color = "#d0f7be", kde = True, label = str(target_name)+" (Val)", ax = ax, fill = True)
     ax.set_xlabel(str(target_name))
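
Both pages now de-duplicate sample IDs with the same pandas idiom: `index.where` over a `duplicated(keep=False)` mask, with a `groupby(...).cumcount()` counter supplying the suffix. A minimal standalone sketch of that idiom (the `dedupe_index` helper name is hypothetical, not part of the patch):

```python
import pandas as pd

def dedupe_index(df: pd.DataFrame, sep: str = "#") -> pd.DataFrame:
    """Return a copy of df whose duplicated index labels get #1, #2, ... suffixes."""
    labels = df.index.astype(str)
    dup_mask = df.index.duplicated(keep=False)                             # True for every repeated label
    counts = pd.Series(labels).groupby(labels.to_numpy()).cumcount() + 1   # 1, 2, ... per occurrence of a label
    out = df.copy()
    out.index = [f"{lab}{sep}{cnt}" if dup else lab
                 for lab, cnt, dup in zip(labels, counts, dup_mask)]
    return out

# e.g. x_block = dedupe_index(x_block) keeps every spectrum while making the IDs unique
```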
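The new target-selection logic in `2-model_creation.py` keeps only the samples whose ID appears, with a non-missing value, in both the spectral block and the chosen target column. Roughly, under the same assumptions (DataFrames indexed by sample ID; `align_xy` is a hypothetical name):

```python
import pandas as pd

def align_xy(x_block: pd.DataFrame, y_block: pd.DataFrame, yname: str):
    """Intersect X and Y on sample IDs, dropping rows with missing values."""
    y = y_block[yname].dropna()
    common = y.index.intersection(x_block.dropna().index)
    if common.empty:
        raise ValueError("X_block and Y_block have no common sample names")
    return x_block.loc[common], y.loc[common]

# x_block, y = align_xy(x_block, y_block, yname)
```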
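When both csv files carry metadata, the page merges the two tables over the common samples and keeps a column that appears in both files only once, and only if the two files agree on its values. A sketch of that intent, comparing the columns shared between `meta_x` and `meta_y` via `hash_pandas_object` (the `merge_metadata` helper is hypothetical):

```python
import pandas as pd
from pandas.util import hash_pandas_object

def merge_metadata(meta_x: pd.DataFrame, meta_y: pd.DataFrame, common_samples) -> pd.DataFrame:
    """Combine per-sample metadata coming from the x- and y-files."""
    shared = meta_y.columns.intersection(meta_x.columns)
    if shared.empty:
        return pd.concat([meta_y.loc[common_samples], meta_x.loc[common_samples]], axis=1)
    agree = (hash_pandas_object(meta_y.loc[common_samples, shared]).sum()
             == hash_pandas_object(meta_x.loc[common_samples, shared]).sum())
    if not agree:
        return pd.DataFrame()                      # conflicting duplicate columns: keep no metadata
    extra = meta_x.columns.difference(shared)      # x-only columns appended after the y-file block
    return pd.concat([meta_y.loc[common_samples], meta_x.loc[common_samples, extra]], axis=1)
```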
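For LW-PLS, the calibration data and every cross-validation fold are written to csv so the Julia/Jchemo run (launched via `utils/lwplsr_call.py` as a subprocess) can pick them up. The fold export boils down to this pattern (sketch; `folds` is assumed to map fold names to validation row positions, as returned by the app's `KF_CV.CV`):

```python
import numpy as np

def export_folds(x_train: np.ndarray, y_train: np.ndarray, folds: dict, out_dir: str = "temp"):
    """Write the training/held-out partitions of each CV fold as csv files."""
    for k, idx in enumerate(folds.values(), start=1):
        np.savetxt(f"{out_dir}/xtr_fold{k}.csv", np.delete(x_train, idx, axis=0), delimiter=",")
        np.savetxt(f"{out_dir}/ytr_fold{k}.csv", np.delete(y_train, idx, axis=0), delimiter=",")
        np.savetxt(f"{out_dir}/xte_fold{k}.csv", x_train[idx], delimiter=",")
        np.savetxt(f"{out_dir}/yte_fold{k}.csv", y_train[idx], delimiter=",")
```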