Merge remote-tracking branch 'origin/master'

29ea9c4a · Nicolas Barthes · a05d597b · fc727f39 · 29ea9c4a · 29ea9c4a
Commit 29ea9c4a authored 5 months ago by Nicolas Barthes
--- a/src/common.py
+++ b/src/common.py
@@ -17,6 +17,7 @@ pages_folder = Path("pages/")
 image_path = Path('./images/img-sky.jpg')
 import os
+import sys
 from shutil import rmtree
 from pandas import DataFrame, concat
 from PIL import Image

--- a/src/pages/2-model_creation.py
+++ b/src/pages/2-model_creation.py
@@ -54,24 +54,58 @@ y = DataFrame() # preallocate the target(s) data block
 match file:
    # load csv file
    case 'csv':
+        from utils.data_parsing import CsvParser
+        def read_csv(file = file, change = None, dec = None, sep= None, names = None, hdr = None):
+            """Parse a csv file with ``CsvParser`` and return the parser's result attributes.
+
+            Parameters
+            ----------
+            file : object handed to ``CsvParser(file=...)``. The default is the
+                enclosing ``file`` variable as evaluated when this function is
+                defined (inside ``case 'csv'``), so callers are expected to pass
+                the uploaded file explicitly.
+            change : not used in the body.
+                NOTE(review): presumably kept as a cache-invalidation key
+                (callers pass ``hash_``) -- confirm whether it is still needed.
+            dec : forwarded to ``CsvParser.parse`` as ``decimal``.
+            sep : forwarded to ``CsvParser.parse`` as ``separator``.
+            names : forwarded to ``CsvParser.parse`` as ``index_col``.
+            hdr : forwarded to ``CsvParser.parse`` as ``header``.
+
+            Returns
+            -------
+            tuple
+                ``(par.float, par.meta_data, par.meta_data_st_, par.df)`` read
+                from the parser after ``parse`` has run.
+            """
+            # NOTE(review): presumably clears previously generated files while keeping
+            # those with the listed extensions -- confirm against delete_files.
+            delete_files(keep = ['.py', '.pyc','.bib'])
+            # NOTE(review): this repeats the import made just before the function definition.
+            from utils.data_parsing import CsvParser
+            par = CsvParser(file= file)
+            par.parse(decimal = dec, separator = sep, index_col = names, header = hdr)
+            return par.float, par.meta_data, par.meta_data_st_, par.df
        with c1:
            # Load X-block data
            xcal_csv = st.file_uploader("Select NIRS Data", type = "csv", help = " :mushroom: select a csv matrix with samples as rows and lambdas as columns")
            if xcal_csv:
-                sepx = st.radio("Select separator (X file): " , options = [";", ","], key = 0,horizontal = True)
+                c1_1, c2_2 = st.columns([.5, .5])
-                hdrx = st.checkbox("Samples name (X file): ")
+                with c1_1:
-                colx = 0 if hdrx else False
+                    decx = st.radio('decimal(x):', options= [".", ","], horizontal = True)
+                    sepx = st.radio("separator(x):", options = [";", ","], horizontal = True)
+                with c2_2:
+                    phdrx = st.radio("header(x): ", options = ["yes", "no"], horizontal = True)
+                    pnamesx = st.radio("samples name(x):", options = ["yes", "no"], horizontal = True)
+                hdrx = 0 if phdrx =="yes" else None
+                namesx = 0 if pnamesx =="yes" else None
+                try:
+                    spectra, meta_data, md_df_st_, xfile = read_csv(file= xcal_csv, change = hash_, dec = decx, sep = sepx, names =namesx, hdr = hdrx)
+                    st.success('xfile has been loaded successfully')
+                except:
+                    st.error('Error: The xfile has not been loaded successfully, please consider tuning the dialect settings!')
            else:
                st.info('Info: Insert your spectral data file above!')
            # Load Y-block data
            ycal_csv = st.file_uploader("Select corresponding Chemical Data", type = "csv", help = " :mushroom: select a csv matrix with samples as rows and chemical values as a column")
            if ycal_csv:
-                sepy = st.radio("Select separator (Y file): ", options = [";",  ","], key = 2, horizontal = True)
+                c1_1, c2_2 = st.columns([.5, .5])
-                hdry = st.checkbox("samples name (Y file)?: ")
+                with c1_1:
-                coly = 0 if hdry else False
+                    decy = st.radio('decimal(y):', options= [".", ","], horizontal = True)
+                    sepy = st.radio("separator(y):", options = [";", ","], horizontal = True)
+                with c2_2:
+                    phdry = st.radio("header(y): ", options = ["yes", "no"], horizontal = True)
+                    pnamesy = st.radio("samples name(y):", options = ["yes", "no"], horizontal = True)
+                hdry = 0 if phdry =="yes" else None
+                namesy = 0 if pnamesy =="yes" else None
+                try:
+                    chem_data, meta_data, md_df_st_, yfile = read_csv(file= ycal_csv, change = hash_, dec = decy, sep = sepy, names =namesy, hdr = hdry)
+                    st.success('yfile has been loaded successfully')
+                except:
+                    st.error('Error: The yfile has not been loaded successfully, please consider tuning the dialect settings!')
            else:
                st.info('Info: Insert your target data file above!')
@@ -86,37 +120,27 @@ match file:
                    stringio = StringIO(eval(f'{i}.getvalue().decode("utf-8")'))
                    xy_str += str(stringio.read())
                # p_hash([xy_str + str(xcal_csv.name) + str(ycal_csv.name), hdrx, sepx, hdry, sepy])
-                # p_hash(add = )
+                hash_ = ObjectHash(current=hash_,add = xy_str)
-                @st.cache_data
-                def csv_loader(change):
-                    delete_files(keep = ['.py', '.pyc','.bib'])
-                    file_name = str(xcal_csv.name) +' and '+ str(ycal_csv.name)
-                    xfile = read_csv(xcal_csv, decimal = '.', sep = sepx, index_col = colx, header = 0)
-                    yfile =  read_csv(ycal_csv, decimal = '.', sep = sepy, index_col = coly)
-                    return xfile, yfile, file_name
-                xfile, yfile, file_name = csv_loader(change = hash_)
-                if yfile.shape[1]>0 and xfile.shape[1]>0 :
-                    # prepare x data
-                    try: 
-                        spectra, meta_data = col_cat(xfile)
-                    except:
-                        st.error('Error: The format of the X-file does not correspond to the expected dialect settings. To read the file correctly, please adjust the separator parameters.')
-                    spectra = DataFrame(spectra).astype(float)
-                    # prepare y data
-                    try:
-                        chem_data, idx = col_cat(yfile)
-                    except:
-                        st.error('Error: The format of the Y-file does not correspond to the expected dialect settings. To read the file correctly, please adjust the separator parameters.')
+                # xfile, yfile, file_name = csv_loader(change = hash_)
+                # yfile =  read_csv(file= ycal_csv, change = hash_)
+                if yfile.shape[1]>0 and xfile.shape[1]>0 :    
                    if 'chem_data' in globals():
-                        if chem_data.shape[1]>1:
+                        if chem_data.shape[1] > 1:
                            yname = c1.selectbox('Select a target', options = ['']+chem_data.columns.tolist(), format_func = lambda x: x if x else "<Select>")
                            if yname:
                                y = chem_data.loc[:, yname]
@@ -312,6 +336,7 @@ if not spectra.empty and not y.empty:
                        data_to_work_with.append("xte_fold{0}".format(i+1))
                        data_to_work_with.append("yte_fold{0}".format(i+1))
                    # check best pre-treatment with a global PLSR model
+                    from utils.regress import Plsr
                    preReg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter=100)
                    temp_path = Path('temp/')
                    with open(temp_path / "lwplsr_preTreatments.json", "w+") as outfile:
@@ -328,7 +353,7 @@ if not spectra.empty and not y.empty:
                    # run Julia Jchemo as subprocess
                    import subprocess
                    subprocess_path = Path("utils/")
-                    subprocess.run([f"{sys.executable}", subprocess_path / "LWPLSR_Call.py"])
+                    subprocess.run([f"{sys.executable}", subprocess_path / "lwplsr_call.py"])
                    # retrieve json results from Julia JChemo
                    try:
                        with open(temp_path / "lwplsr_outputs.json", "r") as outfile:
@@ -380,12 +405,13 @@ if not spectra.empty and not y.empty:
                        Reg.best_hyperparams_print = {**preReg.best_hyperparams_, **Reg.best_hyperparams_}
                        Reg.best_hyperparams_ = {**preReg.best_hyperparams_, **Reg.best_hyperparams_}
-                        Reg.__hash__ = hash_data(Reg.best_hyperparams_print)
+                        Reg.__hash__ = ObjectHash(current = hash_,add = Reg.best_hyperparams_print)
                    except FileNotFoundError as e:
                        Reg = None
                        for i in data_to_work_with: os.unlink(temp_path / str(i + ".csv"))
                case 'TPE-iPLS':
+                    from utils.regress import TpeIpls
                    Reg = TpeIpls(train = [X_train, y_train], test=[X_test, y_test], n_intervall = s, n_iter=it, cv = nb_folds)
                    # reg_model = Reg.model_
@@ -413,10 +439,10 @@ if not spectra.empty and not y.empty:
                it = st.number_input(label = 'Enter the number of iterations', min_value = 2, max_value = 500, value = 250)
            else:
                s, it = None, None
-            # p_hash(str(s)+str(it))
+            hash_ = ObjectHash( current = hash_,add = str(s)+str(it))
            remodel_button = st.button('re-model the data', key=4, help=None, type="primary", use_container_width=True, on_click=increment)
-            # p_hash(st.session_state.counter)
+            hash_ = ObjectHash(current = hash_, add = st.session_state.counter)
            Reg = RequestingModelCreation(change = hash_)
            reg_model = Reg.model_
            hash_ = hash(Reg)

--- a/src/utils/miscellaneous.py
+++ b/src/utils/miscellaneous.py
@@ -25,7 +25,7 @@ def data_split(x, y):
    from kennard_stone import train_test_split
    # Split data into training and test sets using the kennard_stone method and correlation metric, 25% of data is used for testing
    X_train, X_test, y_train, y_test  = train_test_split(x, y, test_size = 0.25, random_state = 42)
-    train_index, test_index = X_train.index, X_test.index
+    train_index, test_index = np.array(X_train.index), np.array(X_test.index)
    return X_train, X_test, y_train, y_test, train_index, test_index
 ## descriptive stat

--- a/src/utils/visualize.py
+++ b/src/utils/visualize.py
@@ -70,6 +70,7 @@ def reg_plot( meas, pred, train_idx, test_idx):
        meas[i] = np.array(meas[i]).reshape(-1, 1) 
        pred[i] = np.array(pred[i]).reshape(-1, 1)
+        from sklearn.linear_model import LinearRegression
        M = LinearRegression()
        M.fit(meas[i], pred[i])
        a1[i] = np.round(M.coef_[0][0],2)
@@ -107,6 +108,7 @@ def resid_plot( meas, pred, train_idx, test_idx):
    e = [np.subtract(meas[0] ,pred[0]), np.subtract(meas[1], pred[1])]
    for i in range(len(meas)):
+        from sklearn.linear_model import LinearRegression
        M = LinearRegression()
        M.fit( np.array(meas[i]).reshape(-1,1), np.array(e[i]).reshape(-1,1))
        a1[i] = np.round(M.coef_[0],2)