commit all

cce2ab51 · DIANE · 8a11ddaa · cce2ab51 · cce2ab51 · cce2ab51
Commit cce2ab51 authored 5 months ago by DIANE
--- a/src/data/hash/cat.exe
+++ b/src/data/hash/cat.exe
--- a/src/data/hash/grep.exe
+++ b/src/data/hash/grep.exe
--- a/src/data/hash/hash.txt
+++ b/src/data/hash/hash.txt
--- a/src/data/models/.gitignore
+++ b/src/data/models/.gitignore
--- a/src/data/models/model_sd_2024_06_07__created_on_Xcal_and_Ycal_data_.pkl
+++ b/src/data/models/model_sd_2024_06_07__created_on_Xcal_and_Ycal_data_.pkl
--- a/src/data/models/model_sd_2024_06_07__on_Xcal_and_Ycal_data_Wavelengths_index.csv
+++ b/src/data/models/model_sd_2024_06_07__on_Xcal_and_Ycal_data_Wavelengths_index.csv
--- a/src/data/params/.gitignore
+++ b/src/data/params/.gitignore
--- a/src/data/params/Preprocessing.json
+++ b/src/data/params/Preprocessing.json
--- a/src/data/predictions/.gitignore
+++ b/src/data/predictions/.gitignore
--- a/docs/Clustering.md
+++ b/docs/Clustering.md
 # Clustering Methods
 ## K-Means clustering
-::: src.utils.KMEANS_.Sk_Kmeans
+::: src.utils.KMEANS_.SkKmeans
 ## HDBSCAN clustering
 ::: src.utils.HDBSCAN_Clustering.Hdbscan
--- a/docs/model_creation.md
+++ b/docs/model_creation.md
 # Models creation
 ## PLSR from Pinard (scikit learn)
-::: src.utils.KMEANS_.Sk_Kmeans
+::: src.utils.KMEANS_.SkKmeans
 ## lwPlsR from Jchemo (Julia)
 ::: src.utils.LWPLSR_.LWPLSR
\ No newline at end of file
--- a/src/Untitled-1.ipynb
+++ b/src/Untitled-1.ipynb
--- a/src/common.py
+++ b/src/common.py
@@ -35,5 +35,5 @@ from utils.data_parsing import *
 from utils.hash import *
 from utils.visualize import *
 from utils.miscellaneous import ObjectHash
-from utils.samsel import RDM, KS
+from utils.samsel import Samplers
 from report import report
\ No newline at end of file
--- a/src/form_data.json
+++ b/src/form_data.json
-{"meta_project": "cs", "meta_sample_species": "cs", "meta_sample_category": "Animal", "meta_sample_pretreatment": "Powder", "meta_machine_ID": "cs", "meta_sample_sub_category": "Leaf litter", "meta_sample_humidity": "Fresh", "meta_scan_place": "Pace"}
+{"meta_project": "ds", "meta_sample_species": "ds", "meta_sample_category": "Other", "meta_sample_pretreatment": "Pastile", "meta_machine_ID": "ds", "meta_sample_sub_category": "Animal part", "meta_sample_humidity": "Wet", "meta_scan_place": "Pace"}
\ No newline at end of file
--- a/src/pages/1-samples_selection.py
+++ b/src/pages/1-samples_selection.py
--- a/src/pages/2-model_creation.py
+++ b/src/pages/2-model_creation.py
@@ -9,7 +9,6 @@ st.set_page_config(page_title = "NIRS Utils", page_icon = ":goat:", layout = "wi
 UiComponents(pagespath = pages_folder, csspath= css_file,imgpath=image_path ,
             header=True, sidebar= True, bgimg=False, colborders=True)
 hash_ = ''
 # Initialize the variable in session state if it doesn't exist for st.cache_data
 if 'counter' not in st.session_state:
    st.session_state.counter = 0
@@ -18,13 +17,6 @@ def increment():
 # ####################################  Methods ##############################################
-def delete_files(keep):
-    supp = []
-    # Walk through the directory
-    for root, dirs, files in os.walk('report/', topdown=False):
-        for file in files:
-            if file != 'logo_cefe.png' and not any(file.endswith(ext) for ext in keep):
-                os.remove(os.path.join(root, file))
 class lw:
    def __init__(self, Reg_json, pred):
@@ -35,9 +27,9 @@ class lw:
 ################ clean the results dir #############
-delete_files(keep = ['.py', '.pyc','.bib'])
+HandleItems.delete_files(keep = ['.py', '.pyc','.bib'])
 for i in ['model', 'dataset', 'figures']:
-    dirpath = Path('./report/out/')/i
+    dirpath = Path('./report/results/')/i
    if not dirpath.exists():
        dirpath.mkdir(parents=True, exist_ok=True)
 # ####################################### page preamble #######################################
@@ -48,7 +40,7 @@ c0.image("./images/model_creation.png", use_column_width = True) # graphical abs
 ################################################################# Begin : I- Data loading and preparation ######################################
 files_format = ['csv', 'dx'] # Supported files format
-file = c1.radio('Select files format:', options = files_format,horizontal = True) # Select a file format
+file = c1.radio('Select files format:', options = files_format, horizontal = True) # Select a file format
 spectra = DataFrame() # preallocate the spectral data block
 y = DataFrame() # preallocate the target(s) data block
@@ -57,12 +49,13 @@ match file:
    # load csv file
    case 'csv':
        from utils.data_parsing import CsvParser
+        # @st.cache_data
        def read_csv(file = file, change = None, dec = None, sep= None, names = None, hdr = None):
-            delete_files(keep = ['.py', '.pyc','.bib'])
+            HandleItems.delete_files(keep = ['.py', '.pyc','.bib'])
            from utils.data_parsing import CsvParser
            par = CsvParser(file= file)
            par.parse(decimal = dec, separator = sep, index_col = names, header = hdr)
-            return par.float, par.meta_data, par.meta_data_st_, par.df
+            return par.float, par.meta_data, par.meta_data_st_, par.df, par.rownames
        with c1:
            # Load X-block data
@@ -79,9 +72,8 @@ match file:
                hdrx = 0 if phdrx =="yes" else None
                namesx = 0 if pnamesx =="yes" else None
                try:
-                    spectra, _, _, xfile = read_csv(file= xcal_csv, change = hash_, dec = decx, sep = sepx, names =namesx, hdr = hdrx)
+                    spectra, _, meta_spec, xfile, spec_labels = read_csv(file = xcal_csv, change = hash_, dec = decx, sep = sepx, names = namesx, hdr = hdrx)
-                    N,P = spectra.shape
+                    N_specs, nwls = spectra.shape
-                    st.success('xfile has been loaded successfully')
                except:
                    st.error('Error: The xfile has not been loaded successfully, please consider tuning the dialect settings!')
@@ -105,8 +97,7 @@ match file:
                hdry = 0 if phdry =="yes" else None
                namesy = 0 if pnamesy =="yes" else None
                try:
-                    chem_data, _, _, yfile = read_csv(file= ycal_csv, change = hash_, dec = decy, sep = sepy, names =namesy, hdr = hdry)
+                    chem_data, _, _, yfile, y_labels = read_csv(file= ycal_csv, change = hash_, dec = decy, sep = sepy, names =namesy, hdr = hdry)
-                    st.success('yfile has been loaded successfully')
                except:
                    st.error('Error: The yfile has not been loaded successfully, please consider tuning the dialect settings!')
@@ -124,27 +115,18 @@ match file:
                    xy_str += str(stringio.read())
                # p_hash([xy_str + str(xcal_csv.name) + str(ycal_csv.name), hdrx, sepx, hdry, sepy])
                hash_ = ObjectHash(current=hash_,add = xy_str)
                file_name = str(xcal_csv.name) + str(ycal_csv.name)
                # yfile =  read_csv(file= ycal_csv, change = hash_)
-                if yfile.shape[1]>0 and xfile.shape[1]>0 :    
+                if yfile.shape[1]>0 and xfile.shape[1]>0 :
                    if 'chem_data' in globals():
                        if chem_data.shape[1] > 1:
-                            yname = c1.selectbox('Select a target', options = ['']+chem_data.columns.tolist(), format_func = lambda x: x if x else "<Select>")
+                            yname = c1.selectbox('Select a target', options = [''] + chem_data.columns.tolist(), format_func = lambda x: x if x else "<Select>")
                            if yname:
                                y = chem_data.loc[:, yname]
                            else:
@@ -155,8 +137,27 @@ match file:
                    ### warning
                    if not y.empty:
-                        if spectra.shape[0] != y.shape[0]:
+                        y.index = y.index.astype(str)
-                            st.error('Error: X and Y have different sample size')
+                        duplicate_indices = y.index
+                        st.write(duplicate_indices)
+                    if not y.empty:
+                        if spectra.shape[0] == y.shape[0]:
+                            st.info('Info: X and Y have the same number of rows')
+                        else:
+                            st.info('Info: X and Y have different number of rows')
+                        if spectra.shape[0] >= y.shape[0]:
+                            if namesy == 0:
+                                pass
+                            else :
+                                st.warning('No labels are provided for target, therefore, both target and spectra are considered well organized!')
+                        if spectra.shape[0] < y.shape[0]:
+                            st.write('The number of samples chemically analyzed exceeds the number of scanned samples!')
                            y = DataFrame
                            spectra = DataFrame
@@ -178,7 +179,7 @@ match file:
                        # p_hash(str(dxdata)+str(data_file.name))
                ## load and parse the temp dx file
-                @st.cache_data
+                # @st.cache_data
                def read_dx(tmp_path):
                    M = JcampParser(path = tmp_path)
                    M.parse()
@@ -238,21 +239,21 @@ if not spectra.empty and not y.empty:
        st.pyplot(spectra_plot) ######## Loaded graph
        if st.session_state.interface =='advanced':
            with st.container():
-                values = st.slider('Select a range of values', min_value = 0, max_value = 100, value = (0, P))
+                values = st.slider('Select a range of values', min_value = 0, max_value = nwls, value = (0, nwls))
-            hash_ = ObjectHash(current=hash_, add= values)
-            spectra = spectra.iloc[:,values[0]:values[1]]
+            hash_ = ObjectHash(current= hash_, add= values)
+            spectra = spectra.iloc[:, values[0]:values[1]]
            nwl = spectra.shape
+            st.pyplot(plot_spectra(spectra, xunits = 'Wavelength/Wavenumber', yunits = "Signal intensity"))
-            if np.array(spectra.columns).dtype.kind in ['i', 'f']:
-                colnames = spectra.columns
-            else:
-                colnames = np.arange(spectra.shape[1])
+        if np.array(spectra.columns).dtype.kind in ['i', 'f']:
+            colnames = spectra.columns
+        else:
+            colnames = np.arange(spectra.shape[1])
-            hash_ = ObjectHash(current= hash_, add=values)
-            st.pyplot(plot_spectra(spectra, xunits = 'Wavelength/Wavenumber', yunits = "Signal intensity"))
    from utils.miscellaneous import data_split
    X_train, X_test, y_train, y_test, train_index, test_index = data_split(x=spectra, y=y)
@@ -510,7 +511,7 @@ if Reg:
        st.write(Reg.best_hyperparams_print)
        @st.cache_data(show_spinner =False)
        def preprocessings(change):
-            with open('report/out/Preprocessing.json', "w") as outfile:
+            with open('report/results/Preprocessing.json', "w") as outfile:
                json.dump(Reg.best_hyperparams_, outfile)
        preprocessings(change=hash_)
@@ -670,17 +671,17 @@ if Reg:
            match file:
                # load csv file
                case 'csv':
-                    xfile.to_csv('report/out/dataset/'+ xcal_csv.name, sep = ';', encoding = 'utf-8', mode = 'a')
+                    xfile.to_csv('report/results/dataset/'+ xcal_csv.name, sep = ';', encoding = 'utf-8', mode = 'a')
-                    yfile.to_csv('report/out/dataset/'+ ycal_csv.name, sep = ';', encoding = 'utf-8', mode = 'a')
+                    yfile.to_csv('report/results/dataset/'+ ycal_csv.name, sep = ';', encoding = 'utf-8', mode = 'a')
                case 'dx':
-                    with open('report/out/dataset/'+data_file.name, 'w') as dd:
+                    with open('report/results/dataset/'+data_file.name, 'w') as dd:
                        dd.write(dxdata)
-            with open('./report/out/model/'+ model_type + '.pkl','wb') as f:# export model
+            with open('./report/results/model/'+ model_type + '.pkl','wb') as f:# export model
                from joblib import dump
                dump(reg_model, f)
-            figpath =Path('./report/out/figures/')
+            figpath =Path('./report/results/figures/')
            spectra_plot.savefig(figpath / "spectra_plot.png")
            target_plot.savefig(figpath / "histogram.png")
            imp_fig.savefig(figpath / "variable_importance.png")
@@ -688,11 +689,11 @@ if Reg:
            fig0.write_image(figpath / "meas_vs_pred_cv_onebyone.png")
            measured_vs_predicted.savefig(figpath / 'measured_vs_predicted.png')
            residuals_plot.savefig(figpath / 'residuals_plot.png')
-            # with open('report/out/Preprocessing.json', "w") as outfile:
+            # with open('report/results/Preprocessing.json', "w") as outfile:
            #     json.dump(Reg.best_hyperparams_, outfile)
            if model_type == 'TPE-iPLS': # export selected wavelengths
-                wlfilename = './report/out/model/'+ model_type+'-selected_wavelengths.xlsx'
+                wlfilename = './report/results/model/'+ model_type+'-selected_wavelengths.xlsx'
                all = concat([intervalls_with_cols.T, Reg.selected_features_], axis = 0,  ignore_index=True).T
                all.columns=['wl_from','wl_to','idx_from', 'idx_to']
                all.to_excel(wlfilename)
@@ -701,7 +702,7 @@ if Reg:
            if Path("./report/report.tex").exists():
                report.generate_report(change = hash_)
            if Path("./report/report.pdf").exists():
-                move("./report/report.pdf", "./report/out/report.pdf")
+                move("./report/report.pdf", "./report/results/report.pdf")
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~            ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
            # pklfile = {'model_': Reg.model_,"model_type" : model_type, 'training_data':{'raw-spectra':spectra,'target':y, },
@@ -720,7 +721,7 @@ if Reg:
            else:
                pklfile['selected-wls'] = {'idx':None, "wls":None }
-            with open('./report/out/file_system.pkl', "wb") as pkl:
+            with open('./report/results/file_system.pkl', "wb") as pkl:
                dump(pklfile, pkl)
            return change
@@ -733,7 +734,7 @@ if Reg:
-                if len(os.listdir('./report/out/figures/'))>2:
+                if len(os.listdir('./report/results/figures/'))>2:
-                    make_archive(base_name="./report/Results", format="zip", base_dir="out", root_dir = "./report")# create a zip file
+                    make_archive(base_name="./report/Results", format="zip", base_dir="results", root_dir = "./report")# create a zip file
                    move("./report/Results.zip", f"./report/{tempdirname}/Results.zip")# put the inside the temp dir
                    with open(f"./report/{tempdirname}/Results.zip", "rb") as f:
@@ -746,4 +747,4 @@ if Reg:
                args = None, kwargs = None,type = "primary",use_container_width = True, disabled = disabled_down)
-    delete_files(keep = ['.py', '.pyc','.bib'])
+    HandleItems.delete_files(keep = ['.py', '.pyc','.bib'])
--- a/src/pages/3-prediction.py
+++ b/src/pages/3-prediction.py
@@ -25,23 +25,14 @@ hash_ = ''
 #     hash_ = hash_data(hash_+str(add))
 #     return hash_
-dirpath = Path('report/out/model')
+dirpath = Path('report/results/model')
 if dirpath.exists() and dirpath.is_dir():
    rmtree(dirpath)
 if 'Predict' not in st.session_state:
    st.session_state['Predict'] = False
-# ####################################  Methods ##############################################
-# empty temp figures
+####################################  Methods ##############################################
-def delete_files(keep):
-    supp = []
-    # Walk through the directory
-    for root, dirs, files in os.walk('report/', topdown=False):
-        for file in files:
-            if file != 'logo_cefe.png' and not any(file.endswith(ext) for ext in keep):
-                os.remove(os.path.join(root, file))
-###################################################################
 st.header("Prediction making using a previously developed model")
 c1, c2 = st.columns([2, 1])
 c1.image("./images/prediction making.png", use_column_width=True)
@@ -140,7 +131,7 @@ with c2:
                def read_csv(file = None, change = None, dec = None, sep= None, names = None, hdr = None):
-                    delete_files(keep = ['.py', '.pyc','.bib'])
+                    HandleItems.delete_files(keep = ['.py', '.pyc','.bib'])
                    from utils.data_parsing import CsvParser
                    if file is not None:
                        par = CsvParser(file= file)
@@ -354,15 +345,15 @@ if not pred_data.empty:# Load the model with joblib
                match test:
                    # load csv file
                    case 'csv':
-                        df.to_csv('report/out/dataset/'+ new_data.name, sep = ';', encoding = 'utf-8', mode = 'a')
+                        df.to_csv('report/results/dataset/'+ new_data.name, sep = ';', encoding = 'utf-8', mode = 'a')
                    case 'dx':
-                        with open('report/out/dataset/'+new_data.name, 'w') as dd:
+                        with open('report/results/dataset/'+new_data.name, 'w') as dd:
                            dd.write(dxdata)
-                prepspectraplot.savefig('./report/out/figures/raw_spectra.png')
+                prepspectraplot.savefig('./report/results/figures/preprocessed_spectra.png')
-                rawspectraplot.savefig('./report/out/figures/preprocessed_spectra.png')
+                rawspectraplot.savefig('./report/results/figures/raw_spectra.png')
-                hist.savefig('./report/out/figures/histogram.png')
+                hist.savefig('./report/results/figures/histogram.png')
-                result.round(4).to_csv('./report/out/The_analysis_result.csv', sep = ";")
+                result.round(4).to_csv('./report/results/The_analysis_result.csv', sep = ";")
                return change
            preparing_results_for_downloading(change = hash_)
@@ -372,7 +363,7 @@ if not pred_data.empty:# Load the model with joblib
                from tempfile import TemporaryDirectory
                with  TemporaryDirectory( prefix="results", dir="./report") as temp_dir:# create a temp directory
                    tempdirname = os.path.split(temp_dir)[1]
-                    if len(os.listdir('./report/out/figures/'))==3:
+                    if len(os.listdir('./report/results/figures/'))==3:
-                        make_archive(base_name="./report/Results", format="zip", base_dir="out", root_dir = "./report")# create a zip file
+                        make_archive(base_name="./report/Results", format="zip", base_dir="results", root_dir = "./report")# create a zip file
                        move("./report/Results.zip", f"./report/{tempdirname}/Results.zip")# put the inside the temp dir
                        with open(f"./report/{tempdirname}/Results.zip", "rb") as f:

--- a/src/report/out/.gitkeep
+++ b/src/report/out/.gitkeep
--- a/src/report/out/dataset/.gitkeep
+++ b/src/report/out/dataset/.gitkeep
--- a/src/report/out/figures/.gitkeep
+++ b/src/report/out/figures/.gitkeep