Skip to content
Snippets Groups Projects
Commit 48636f7f authored by DIANE's avatar DIANE
Browse files

packages import

parent b7cf71c8
No related branches found
No related tags found
No related merge requests found
......@@ -24,7 +24,7 @@ import plotly.express as px
from tempfile import NamedTemporaryFile
import numpy as np
from datetime import datetime
import json
from utils.data_parsing import JcampParser, CsvParser
from style.layout import UiComponents
......
......@@ -8,12 +8,8 @@ st.set_page_config(page_title = "NIRS Utils", page_icon = ":goat:", layout = "wi
# layout
UiComponents(pagespath = pages_folder, csspath= css_file,imgpath=image_path ,
header=True, sidebar= True, bgimg=False, colborders=True)
hash_ = ''
def p_hash(add):
    """Fold *add* into the module-level running hash and return the new value.

    NOTE(review): mutates the module-global ``hash_`` via ``hash_data``
    (imported elsewhere in this file) — presumably used as a cache-busting
    key for ``st.cache_data``; confirm against the callers.
    """
    global hash_
    combined = hash_ + str(add)
    hash_ = hash_data(combined)
    return hash_
# Initialize the variable in session state if it doesn't exist for st.cache_data
if 'counter' not in st.session_state:
st.session_state.counter = 0
......@@ -89,7 +85,7 @@ match file:
for i in ["xcal_csv", "ycal_csv"]:
stringio = StringIO(eval(f'{i}.getvalue().decode("utf-8")'))
xy_str += str(stringio.read())
p_hash([xy_str + str(xcal_csv.name) + str(ycal_csv.name), hdrx, sepx, hdry, sepy])
# p_hash([xy_str + str(xcal_csv.name) + str(ycal_csv.name), hdrx, sepx, hdry, sepy])
# p_hash(add = )
@st.cache_data
......@@ -152,15 +148,17 @@ match file:
tmp_path = tmp.name
with open(tmp.name, 'r') as dd:
dxdata = dd.read()
p_hash(str(dxdata)+str(data_file.name))
# p_hash(str(dxdata)+str(data_file.name))
## load and parse the temp dx file
@st.cache_data
def dx_loader(change):
    # Cached wrapper: parse the temporary .dx file once per `change` value,
    # then delete the temp file so it is not re-read on cache hits.
    # NOTE(review): `change` is unused in the body — it only serves as the
    # st.cache_data cache key; confirm this is intentional.
    # NOTE(review): this calls read_dx(file=...), but the read_dx defined
    # just below takes a `tmp_path` parameter and does not unlink the file —
    # this looks like a leftover from a refactor; confirm which version
    # of the loader is actually intended to run.
    chem_data, spectra, meta_data, meta_data_st = read_dx(file = tmp_path)
    os.unlink(tmp_path)
    return chem_data, spectra, meta_data, meta_data_st
chem_data, spectra, meta_data, meta_data_st = dx_loader(change = hash_)
def read_dx(tmp_path):
    """Parse the JCAMP-DX file at *tmp_path*.

    Returns, in order: the parser's chemical data, the spectra DataFrame,
    the metadata DataFrame and the standardised metadata DataFrame.
    """
    parser = JcampParser(path = tmp_path)
    parser.parse()
    return parser.chem_data, parser.specs_df_, parser.md_df_, parser.md_df_st_
chem_data, spectra, meta_data, meta_data_st = read_dx(tmp_path = tmp_path)
if not spectra.empty:
st.success("Info: The data have been loaded successfully", icon = "")
......@@ -191,13 +189,14 @@ match file:
################################################### BEGIN : visualize and split the data ####################################################
st.subheader("I - Data visualization", divider = 'blue')
if not spectra.empty and not y.empty:
p_hash(y)
p_hash(np.mean(spectra))
# p_hash(y)
# p_hash(np.mean(spectra))
if np.array(spectra.columns).dtype.kind in ['i', 'f']:
colnames = spectra.columns
else:
colnames = np.arange(spectra.shape[1])
from utils.miscellaneous import data_split
X_train, X_test, y_train, y_test, train_index, test_index = data_split(x=spectra, y=y)
......@@ -205,6 +204,7 @@ if not spectra.empty and not y.empty:
#### insight on loaded data
spectra_plot = plot_spectra(spectra, xunits = 'Wavelength/Wavenumber', yunits = "Signal intensity")
target_plot = hist(y = y, y_train = y_train, y_test = y_test, target_name=yname)
from utils.miscellaneous import desc_stats
stats = DataFrame([desc_stats(y_train), desc_stats(y_test), desc_stats(y)], index =['train', 'test', 'total'] ).round(2)
# fig1, ax1 = plt.subplots( figsize = (12, 3))
......@@ -241,7 +241,7 @@ if not spectra.empty and not y.empty:
# select type of supervised modelling problem
var_nature = ['Continuous', 'Categorical']
mode = c4.radio("The nature of the target variable :", options = var_nature)
p_hash(mode)
# p_hash(mode)
match mode:
case "Continuous":
reg_algo = ["", "PLS", "LW-PLS", "TPE-iPLS"]
......@@ -276,7 +276,7 @@ if not spectra.empty and not y.empty:
# st.session_state.model_type = model_type
# increment()
p_hash(model_type)
# p_hash(model_type)
# Training set preparation for cross-validation(CV)
......@@ -293,6 +293,7 @@ if not spectra.empty and not y.empty:
match model_type:
case 'PLS':
from utils.regress import Plsr
Reg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter = 100, cv = nb_folds)
# reg_model = Reg.model_
rega = Reg.selected_features_
......@@ -412,10 +413,10 @@ if not spectra.empty and not y.empty:
it = st.number_input(label = 'Enter the number of iterations', min_value = 2, max_value = 500, value = 250)
else:
s, it = None, None
p_hash(str(s)+str(it))
# p_hash(str(s)+str(it))
remodel_button = st.button('re-model the data', key=4, help=None, type="primary", use_container_width=True, on_click=increment)
p_hash(st.session_state.counter)
# p_hash(st.session_state.counter)
Reg = RequestingModelCreation(change = hash_)
reg_model = Reg.model_
hash_ = hash(Reg)
......
......@@ -10,4 +10,3 @@ Here are all the classes to perform your analysis
# from .clustering import *
# from .samsel import *
# from .regress import *
# from .eval_metrics import *
\ No newline at end of file
from utils.eval_metrics import metrics
import numpy as np
from pandas import DataFrame
## try to automatically detect the field separator within the CSV
def find_delimiter(filename):
......@@ -86,6 +88,7 @@ class KF_CV:
### KFCV(dict) returns a testset indices/Fold
@staticmethod
def CV(x, y, n_folds:int):
from kennard_stone import KFold as ks_KFold
test_folds = {}
folds_name = [f'Fold{i+1}' for i in range(n_folds)]
kf = ks_KFold(n_splits=n_folds, device='cpu')
......@@ -132,12 +135,14 @@ class KF_CV:
r = DataFrame()
r['Predicted'] = ypcv[Fname]
r['Measured'] = y[folds[Fname]]
from sklearn.linear_model import LinearRegression
ols = LinearRegression().fit(DataFrame(y[folds[Fname]]), ypcv[Fname].reshape(-1,1))
r.index = folds[Fname]
r['Folds'] = [f'{Fname} (Predicted = {np.round(ols.intercept_[0], 2)} + {np.round(ols.coef_[0][0],2)} x Measured'] * r.shape[0]
cvcv[i] = r
coeff[Fname] = [ols.coef_[0][0], ols.intercept_[0]]
from pandas import concat
data = concat(cvcv, axis = 0)
data['index'] = [data.index[i][1] for i in range(data.shape[0])]
data.index = data['index']
......
from pandas import DataFrame
import numpy as np
class metrics:
from typing import Optional, List
from pandas import DataFrame
......
import streamlit as st
from pandas import DataFrame
import numpy as np
# predict module
def prediction(NIRS_csv, qsep, qhdr, model):
......@@ -21,16 +22,16 @@ def download_results(data, export_name):
@st.cache_data(show_spinner =True)
def data_split(x, y):
    """Split spectra `x` and target `y` into train/test sets (25 % test).

    Uses the Kennard-Stone algorithm so the training set spans the spectral
    space representatively rather than being drawn at random.

    Returns
    -------
    X_train, X_test, y_train, y_test : DataFrames/Series of the split data.
    train_index, test_index : the row indices assigned to each set.
    """
    from kennard_stone import train_test_split
    # Fix: removed leftover pre-refactor lines that called the undefined
    # `train_test_split_idx` (NameError at runtime) and whose results were
    # immediately overwritten by the call below.
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 42)
    train_index, test_index = X_train.index, X_test.index
    return X_train, X_test, y_train, y_test, train_index, test_index
## descriptive stat
@st.cache_data(show_spinner =True)
def desc_stats(x):
from scipy.stats import skew, kurtosis
a = {}
a['N samples'] = x.shape[0]
a['Min'] = np.min(x)
......
from utils import metrics, Snv, No_transformation, KF_CV, sel_ratio
import numpy as np
from pandas import DataFrame
from utils.eval_metrics import metrics
from scipy.signal import savgol_filter
from sklearn.cross_decomposition import PLSRegression
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, anneal
from utils.data_handling import Snv, No_transformation, KF_CV, sel_ratio
class Regmodel(object):
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, anneal
def __init__(self, train, test, n_iter, add_hyperparams = None, nfolds = 3, **kwargs):
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, anneal
self.SCORE = 100000000
self._xc, self._xt, self._ytrain, self._ytest = train[0], test[0], train[1], test[1]
......
import streamlit as st
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ predictions histogram ~~~~~~~~~~~~~~~~~~~~~~~~~~
@st.cache_data
def pred_hist(pred):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment