diff --git a/src/common.py b/src/common.py
index 92d27f25c22f702ce991850b649e690bcf29e552..24e7991b1f99f245677ca80750f9a9bb7337f4aa 100644
--- a/src/common.py
+++ b/src/common.py
@@ -24,7 +24,7 @@ import plotly.express as px
 from tempfile import NamedTemporaryFile
 import numpy as np
 from datetime import datetime
-
+import json
 from utils.data_parsing import JcampParser, CsvParser
 from style.layout import UiComponents
 
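Note on the 2-model_creation.py refactor below: the page previously threaded a hand-rolled running digest, p_hash, through its widgets and loaders, passing it into @st.cache_data functions as a change argument to force cache invalidation. Because st.cache_data already hashes the arguments of the function it decorates, the patch retires the helper and comments out its call sites. For reference, a minimal stand-in for the retired helper is sketched here; it substitutes hashlib for the app's original hash_data (whose definition is not part of this patch), so treat it as illustrative rather than a verbatim restoration.

    import hashlib

    hash_ = ''

    def p_hash(add):
        # Fold each tracked value into one global running digest, as the
        # retired helper did; the digest then served as a manual cache key.
        global hash_
        hash_ = hashlib.sha256((hash_ + str(add)).encode()).hexdigest()
        return hash_

With the helper gone, the cached loaders key their caches on their own parameters (e.g. read_dx(tmp_path = tmp_path)), which is the idiomatic st.cache_data pattern. One loose end visible in the hunks below: RequestingModelCreation(change = hash_) still receives hash_, which is now the constant '' on every rerun, so as far as these hunks show that cache no longer invalidates when its inputs change.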
diff --git a/src/pages/2-model_creation.py b/src/pages/2-model_creation.py
index 26aa959d7ae221979b5c0264b75d780c7d3b82a8..c6384e619b13190708f9e44540185ebac82fb78a 100644
--- a/src/pages/2-model_creation.py
+++ b/src/pages/2-model_creation.py
@@ -8,12 +8,8 @@ st.set_page_config(page_title = "NIRS Utils", page_icon = ":goat:", layout = "wide")
 # layout
 UiComponents(pagespath = pages_folder, csspath= css_file,imgpath=image_path , header=True, sidebar= True, bgimg=False, colborders=True)
-
 hash_ = ''
-def p_hash(add):
-    global hash_
-    hash_ = hash_data(hash_+str(add))
-    return hash_
+
 
 # Initialize the variable in session state if it doesn't exist for st.cache_data
 if 'counter' not in st.session_state:
     st.session_state.counter = 0
@@ -89,7 +85,7 @@ match file:
        for i in ["xcal_csv", "ycal_csv"]:
            stringio = StringIO(eval(f'{i}.getvalue().decode("utf-8")'))
            xy_str += str(stringio.read())
-        p_hash([xy_str + str(xcal_csv.name) + str(ycal_csv.name), hdrx, sepx, hdry, sepy])
+        # p_hash([xy_str + str(xcal_csv.name) + str(ycal_csv.name), hdrx, sepx, hdry, sepy])
        # p_hash(add = )
 
        @st.cache_data
@@ -152,15 +148,17 @@ match file:
        tmp_path = tmp.name
        with open(tmp.name, 'r') as dd:
            dxdata = dd.read()
-            p_hash(str(dxdata)+str(data_file.name))
+            # p_hash(str(dxdata)+str(data_file.name))
 
        ## load and parse the temp dx file
        @st.cache_data
-        def dx_loader(change):
-            chem_data, spectra, meta_data, meta_data_st = read_dx(file = tmp_path)
-            os.unlink(tmp_path)
-            return chem_data, spectra, meta_data, meta_data_st
-        chem_data, spectra, meta_data, meta_data_st = dx_loader(change = hash_)
+        def read_dx(tmp_path):
+            M = JcampParser(path = tmp_path)
+            M.parse()
+            # chem_data, spectra, meta_data, meta_data_st = read_dx(file = tmp_path)
+            # os.unlink(tmp_path)
+            return M.chem_data, M.specs_df_, M.md_df_, M.md_df_st_
+        chem_data, spectra, meta_data, meta_data_st = read_dx(tmp_path = tmp_path)
 
        if not spectra.empty:
            st.success("Info: The data have been loaded successfully", icon = "✅")
@@ -191,13 +189,14 @@ match file:
 ################################################### BEGIN : visualize and split the data ####################################################
 st.subheader("I - Data visualization", divider = 'blue')
 if not spectra.empty and not y.empty:
-    p_hash(y)
-    p_hash(np.mean(spectra))
+    # p_hash(y)
+    # p_hash(np.mean(spectra))
    if np.array(spectra.columns).dtype.kind in ['i', 'f']:
        colnames = spectra.columns
    else:
        colnames = np.arange(spectra.shape[1])
-
+
+    from utils.miscellaneous import data_split
    X_train, X_test, y_train, y_test, train_index, test_index = data_split(x=spectra, y=y)
 
 
@@ -205,6 +204,7 @@ if not spectra.empty and not y.empty:
    #### insight on loaded data
    spectra_plot = plot_spectra(spectra, xunits = 'Wavelength/Wavenumber', yunits = "Signal intensity")
    target_plot = hist(y = y, y_train = y_train, y_test = y_test, target_name=yname)
+    from utils.miscellaneous import desc_stats
    stats = DataFrame([desc_stats(y_train), desc_stats(y_test), desc_stats(y)], index =['train', 'test', 'total'] ).round(2)
    # fig1, ax1 = plt.subplots( figsize = (12, 3))
 
@@ -241,7 +241,7 @@ if not spectra.empty and not y.empty:
    # select type of supervised modelling problem
    var_nature = ['Continuous', 'Categorical']
    mode = c4.radio("The nature of the target variable :", options = var_nature)
-    p_hash(mode)
+    # p_hash(mode)
    match mode:
        case "Continuous":
            reg_algo = ["", "PLS", "LW-PLS", "TPE-iPLS"]
@@ -276,7 +276,7 @@ if not spectra.empty and not y.empty:
 
            # st.session_state.model_type = model_type
            # increment()
-    p_hash(model_type)
+    # p_hash(model_type)
 
 
    # Training set preparation for cross-validation(CV)
@@ -293,6 +293,7 @@ if not spectra.empty and not y.empty:
 
        match model_type:
            case 'PLS':
+                from utils.regress import Plsr
                Reg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter = 100, cv = nb_folds)
                # reg_model = Reg.model_
                rega = Reg.selected_features_
@@ -412,10 +413,10 @@ if not spectra.empty and not y.empty:
                it = st.number_input(label = 'Enter the number of iterations', min_value = 2, max_value = 500, value = 250)
            else:
                s, it = None, None
-            p_hash(str(s)+str(it))
+            # p_hash(str(s)+str(it))
 
            remodel_button = st.button('re-model the data', key=4, help=None, type="primary", use_container_width=True, on_click=increment)
-            p_hash(st.session_state.counter)
+            # p_hash(st.session_state.counter)
            Reg = RequestingModelCreation(change = hash_)
            reg_model = Reg.model_
            hash_ = hash(Reg)
diff --git a/src/utils/__init__.py b/src/utils/__init__.py
index 40891d9ea67507a805eb89c2bac9ff8983d8cac3..bcb7a25cf83e706623706601c9f04fcd9276ca03 100644
--- a/src/utils/__init__.py
+++ b/src/utils/__init__.py
@@ -10,4 +10,3 @@ Here are all the classes to perform your analysis
 # from .clustering import *
 # from .samsel import *
 # from .regress import *
-# from .eval_metrics import *
\ No newline at end of file
diff --git a/src/utils/data_handling.py b/src/utils/data_handling.py
index 1c222742d66d71f3b931fe1279f4d6960eed0004..cfa413e3ebc8f999c907e261fb5eb6d7ebc4a08f 100644
--- a/src/utils/data_handling.py
+++ b/src/utils/data_handling.py
@@ -1,4 +1,6 @@
 from utils.eval_metrics import metrics
+import numpy as np
+from pandas import DataFrame
 
 ## try to automatically detect the field separator within the CSV
 def find_delimiter(filename):
@@ -86,6 +88,7 @@ class KF_CV:
    ### KFCV(dict) returns a testset indices/Fold
    @staticmethod
    def CV(x, y, n_folds:int):
+        from kennard_stone import KFold as ks_KFold
        test_folds = {}
        folds_name = [f'Fold{i+1}' for i in range(n_folds)]
        kf = ks_KFold(n_splits=n_folds, device='cpu')
@@ -132,12 +135,14 @@ class KF_CV:
            r = DataFrame()
            r['Predicted'] = ypcv[Fname]
            r['Measured'] = y[folds[Fname]]
+            from sklearn.linear_model import LinearRegression
            ols = LinearRegression().fit(DataFrame(y[folds[Fname]]), ypcv[Fname].reshape(-1,1))
            r.index = folds[Fname]
            r['Folds'] = [f'{Fname} (Predicted = {np.round(ols.intercept_[0], 2)} + {np.round(ols.coef_[0][0],2)} x Measured'] * r.shape[0]
            cvcv[i] = r
            coeff[Fname] = [ols.coef_[0][0], ols.intercept_[0]]
 
+        from pandas import concat
        data = concat(cvcv, axis = 0)
        data['index'] = [data.index[i][1] for i in range(data.shape[0])]
        data.index = data['index']
diff --git a/src/utils/eval_metrics.py b/src/utils/eval_metrics.py
index 5cba400d1b1c107838e27cebf2a2ff866aec0f27..4202feafe399c91f63d54f77068b7cf11d8bb523 100644
--- a/src/utils/eval_metrics.py
+++ b/src/utils/eval_metrics.py
@@ -1,4 +1,9 @@
+from pandas import DataFrame
+import numpy as np
+
+
+
 class metrics:
    from typing import Optional, List
    from pandas import DataFrame
 
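A pattern worth naming before the remaining files: the patch moves heavyweight third-party imports (kennard_stone, sklearn, pandas.concat, scipy.stats) from module scope into the function bodies that use them. Module import stays cheap, which matters for Streamlit page start-up, and import cycles between the utils modules are sidestepped; after the first call Python serves the module from sys.modules, so repeat calls cost only a dictionary lookup. A minimal sketch of the pattern, modelled on the patched KF_CV.CV — the fold-dict shape is taken from the surrounding code, and the device='cpu' argument the patch passes to KFold is omitted here:

    def cv_folds(x, n_folds: int):
        # Deferred import: resolved on first call, then cached in sys.modules.
        from kennard_stone import KFold as ks_KFold

        kf = ks_KFold(n_splits=n_folds)
        # Map a readable fold name to the test-set indices of each fold,
        # mirroring the {'Fold1': indices, ...} dict that KF_CV.CV builds.
        return {f'Fold{i + 1}': test_index
                for i, (_, test_index) in enumerate(kf.split(x))}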
diff --git a/src/utils/miscellaneous.py b/src/utils/miscellaneous.py
index 5a2ba421a43e0d7b1f7b1eedf18984890162582e..fd98143a5397415dc091388eaa9cdf963a815070 100644
--- a/src/utils/miscellaneous.py
+++ b/src/utils/miscellaneous.py
@@ -1,5 +1,6 @@
 import streamlit as st
-
+from pandas import DataFrame
+import numpy as np
 
 # predict module
 def prediction(NIRS_csv, qsep, qhdr, model):
@@ -21,16 +22,16 @@ def download_results(data, export_name):
 
 @st.cache_data(show_spinner =True)
 def data_split(x, y):
+    from kennard_stone import train_test_split
    # Split data into training and test sets using the kennard_stone method and correlation metric, 25% of data is used for testing
-    train_index, test_index = train_test_split_idx(x , y = y, method = "kennard_stone", metric = "correlation", test_size = 0.25, random_state = 42)
-    # Assign data to training and test sets
-    X_train, y_train = DataFrame(x.iloc[train_index,:]), y.iloc[train_index]
-    X_test, y_test = DataFrame(x.iloc[test_index,:]), y.iloc[test_index]
+    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 42)
+    train_index, test_index = X_train.index, X_test.index
    return X_train, X_test, y_train, y_test, train_index, test_index
 
 ## descriptive stat
 @st.cache_data(show_spinner =True)
 def desc_stats(x):
+    from scipy.stats import skew, kurtosis
    a = {}
    a['N samples'] = x.shape[0]
    a['Min'] = np.min(x)
diff --git a/src/utils/regress.py b/src/utils/regress.py
index 62c5f23322225da664721c8cab1b0928e262bde5..f4c3f9b6542369092e8cfd17c9a1ce7ea4cf8c2f 100644
--- a/src/utils/regress.py
+++ b/src/utils/regress.py
@@ -1,10 +1,15 @@
-from utils import metrics, Snv, No_transformation, KF_CV, sel_ratio
+import numpy as np
+from pandas import DataFrame
+from utils.eval_metrics import metrics
+from scipy.signal import savgol_filter
+from sklearn.cross_decomposition import PLSRegression
+from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, anneal
+
+from utils.data_handling import Snv, No_transformation, KF_CV, sel_ratio
 
 class Regmodel(object):
-    from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, anneal
    def __init__(self, train, test, n_iter, add_hyperparams = None, nfolds = 3, **kwargs):
-        from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, anneal
        self.SCORE = 100000000
        self._xc, self._xt, self._ytrain, self._ytest = train[0], test[0], train[1], test[1]
 
 
diff --git a/src/utils/visualize.py b/src/utils/visualize.py
index 5ac80c34c96ab0bd1df4308a4f9ee36581d50e37..a122dd867fa4c54970181b549ddd62fd2d84edab 100644
--- a/src/utils/visualize.py
+++ b/src/utils/visualize.py
@@ -1,5 +1,8 @@
 import streamlit as st
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
 
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ predictions histogram ~~~~~~~~~~~~~~~~~~~~~~~~~~
 @st.cache_data
 def pred_hist(pred):
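The data_split rewrite in miscellaneous.py above is the clearest behavioural change in this patch: the old code called train_test_split_idx(..., method = "kennard_stone", metric = "correlation", ...) to obtain index arrays and sliced the frames manually, while the new code delegates to kennard_stone.train_test_split, which mirrors the scikit-learn signature, and recovers the row indices from the returned frames. Two caveats worth recording: the Kennard-Stone selection is deterministic, so random_state = 42 is presumably kept for API compatibility only, and the old metric = "correlation" option is not forwarded, so the library's default distance metric now applies instead. A usage sketch under the same assumptions the patch itself makes (scikit-learn-style signature, pandas row labels preserved on the returned frames):

    import numpy as np
    from pandas import DataFrame, Series
    from kennard_stone import train_test_split

    x = DataFrame(np.random.default_rng(0).normal(size = (20, 5)))
    y = Series(np.arange(20.0))

    # Hold out 25% of samples, as the patched data_split() does.
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.25)
    train_index, test_index = X_train.index, X_test.index  # row labels survive the split
    assert len(train_index) == 15 and len(test_index) == 5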