diff --git a/src/pages/1-samples_selection.py b/src/pages/1-samples_selection.py index 3314d99a0c20720b5e245e18b8cc15ff50d0dc04..80891e047da9447f9141e11b0fb30722c61b53dc 100644 --- a/src/pages/1-samples_selection.py +++ b/src/pages/1-samples_selection.py @@ -78,6 +78,9 @@ selected_samples_idx = [] +hash_ = '' + + if not file: c2.info('Info: Please load data file !') @@ -89,26 +92,34 @@ else: match extension: case 'csv':# Load .csv file with c2: - psep = st.radio("Select csv separator - _detected_: ", options = [";", ","], horizontal=True, key=9) - phdr = st.radio("indexes column in csv? - _detected_: ", options = ["no", "yes"], horizontal=True, key=31) + c2_1, c2_2 = st.columns([.5, .5]) + with c2_1: + dec = st.radio('decimal:', options= [".", ","], horizontal = True) + sep = st.radio("separator:", options = [";", ","], horizontal = True) + with c2_2: + phdr = st.radio("header: ", options = ["yes", "no"], horizontal = True) + pnames = st.radio("samples name:", options = ["yes", "no"], horizontal = True) + + hdr = 0 if phdr =="yes" else None + names = 0 if pnames =="yes" else None + hash_ = ObjectHash(current=hash_, add= [userfilename, hdr, names, dec, sep]) - if phdr == 'yes':col = 0 - else:col = False - from io import StringIO stringio = StringIO(file.getvalue().decode("utf-8")) data_str = str(stringio.read()) @st.cache_data - def csv_loader(file = file): + def csv_loader(file = file, change = None): from utils.data_parsing import CsvParser - par = CsvParser(file=file ) - float_data, non_float = par.parse() + par = CsvParser(file= file) + float_data, non_float = par.parse(decimal = dec, separator = sep, index_col = names, header = hdr) return float_data, non_float + try : - spectra, meta_data = csv_loader(file= file) + spectra, meta_data = csv_loader(file= file, change = hash_) st.success("The data have been loaded successfully", icon="✅") + except: st.error('''Error: The format of the file does not correspond to the expected dialect settings. 
To read the file correctly, please adjust the separator parameters.''') @@ -125,10 +136,12 @@ else: ## load and parse the temp dx file @st.cache_data - def read_dx(tmp_path): + def read_dx(tmp_path, change = None): M = JcampParser(path = tmp_path) M.parse() - return M.chem_data, M.specs_df_, M.md_df_, M.md_df_st_ + return M.chem_data, M.specs_df_, M.meta_data, M.meta_data_st_ + + hash_ = ObjectHash(current=hash_, add= dxdata) _, spectra, meta_data, md_df_st_ = read_dx(tmp_path = tmp_path) st.success("The data have been loaded successfully", icon="✅") diff --git a/src/utils/data_parsing.py b/src/utils/data_parsing.py index 9e536071725fd6f4647c34e80be24bfd6f0c4df4..35e7790090a2b80ead55abd8b0f4c9b7ff6bd2a3 100644 --- a/src/utils/data_parsing.py +++ b/src/utils/data_parsing.py @@ -1,4 +1,6 @@ import jcamp as jc +import numpy as np +from tempfile import NamedTemporaryFile class JcampParser: '''This module is designed to help retrieve spectral data as well as metadata of smaples from jcamp file''' @@ -13,7 +15,6 @@ class JcampParser: self.__wl = self.__list_of_blocks[0]["x"] # Wavelengths/frequencies/range def parse(self): - import numpy as np # Start retreiving the data specs = np.zeros((self.__nb, len(self.__list_of_blocks[0]["y"])), dtype=float) # preallocate a np matrix for sotoring spectra self.idx = np.arange(self.__nb) # This list is designed to store samples name @@ -75,7 +76,6 @@ class JcampParser: ### Method for retrieving the concentration of a single sample def conc(self,sample): import re - import numpy as np prep = '\n'.join(line for line in sample.split('\n') if "NCU" not in line and "<<undef>>" not in line) c = [] for match in re.findall(self.pattern, prep): @@ -87,16 +87,16 @@ class JcampParser: def specs_df_(self): return self.spectra @property - def md_df_(self): + def meta_data_st_(self): me = self.metadata_.drop("concentrations", axis = 1) - me = me.drop(me.columns[(me == '').all()], axis = 1) - return me + me = me.drop(me.columns[(me == '').all()], 
axis = 1).applymap(lambda x: x.upper() if isinstance(x, str) else x) + meta_data_st = me.loc[:,me.nunique(axis=0) > 1] + return meta_data_st + @property - def md_df_st_(self): - rt = ['origin','date'] - cl = self.metadata_.loc[:,rt] - return cl - + def meta_data(self): + return self.metadata_.drop("concentrations", axis = 1) + @property def chem_data_(self): return self.chem_data @@ -105,94 +105,103 @@ class JcampParser: class CsvParser: import clevercsv - import numpy as np def __init__(self, file): with NamedTemporaryFile(delete = False, suffix = ".csv") as tmp: - tmp.write(file.read()) - self.file = tmp.name + tmp.write(file.read()) + self.file = tmp.name + + def parse(self, decimal, separator, index_col, header): + from pandas import read_csv + df = read_csv(self.file, decimal = decimal, sep = separator, index_col = index_col, header = header) + if len(set(df.index))<df.shape[0]: + df = read_csv(self.file, decimal = decimal, sep = separator, index_col = None, header = header) + + float, non_float = df.select_dtypes(include='float'), df.select_dtypes(exclude='float') + return float, non_float + - def parse(self): - import pandas as pd + # def parse(self): + # import pandas as pd - dec_dia = ['.', ','] - sep_dia = [',', ';'] - dec, sep = [], [] + # dec_dia = ['.', ','] + # sep_dia = [',', ';'] + # dec, sep = [], [] - with open(self.file, mode = 'r') as csvfile: - lines = [csvfile.readline() for i in range(3)] - for i in lines: - for j in range(2): - dec.append(i.count(dec_dia[j])) - sep.append(i.count(sep_dia[j])) + # with open(self.file, mode = 'r') as csvfile: + # lines = [csvfile.readline() for i in range(3)] + # for i in lines: + # for j in range(2): + # dec.append(i.count(dec_dia[j])) + # sep.append(i.count(sep_dia[j])) - if dec[0] != dec[2]: - header = 0 - else: - header = 0 + # if dec[0] != dec[2]: + # header = 0 + # else: + # header = 0 - semi = np.sum([sep[2*i+1] for i in range(3)]) - commas = np.sum([sep[2*i] for i in range(3)]) + # semi = 
np.sum([sep[2*i+1] for i in range(3)]) + # commas = np.sum([sep[2*i] for i in range(3)]) - if semi>commas:separator = ';' - elif semi<commas: separator = ',' + # if semi>commas:separator = ';' + # elif semi<commas: separator = ',' - elif semi ==0 and commas == 0: separator = ';' + # elif semi ==0 and commas == 0: separator = ';' - commasdec = np.sum([dec[2*i+1] for i in range(1,3)]) - dot = np.sum([dec[2*i] for i in range(1,3)]) - if commasdec>dot:decimal = ',' - elif commasdec<=dot:decimal = '.' + # commasdec = np.sum([dec[2*i+1] for i in range(1,3)]) + # dot = np.sum([dec[2*i] for i in range(1,3)]) + # if commasdec>dot:decimal = ',' + # elif commasdec<=dot:decimal = '.' - if decimal == separator or len(np.unique(dec)) <= 2: - decimal = "." + # if decimal == separator or len(np.unique(dec)) <= 2: + # decimal = "." - df = pd.read_csv(self.file, decimal=decimal, sep=separator, header=None, index_col=None) - try: - rat = np.mean(df.iloc[0,50:60]/df.iloc[5,50:60])>10 - header = 0 if rat or np.nan else None - except: - header = 0 - - from pandas.api.types import is_float_dtype - - if is_float_dtype(df.iloc[1:,0]): - index_col = None - else: - try: - te = df.iloc[1:,0].to_numpy().astype(float).dtype + # df = pd.read_csv(self.file, decimal=decimal, sep=separator, header=None, index_col=None) + # try: + # rat = np.mean(df.iloc[0,50:60]/df.iloc[5,50:60])>10 + # header = 0 if rat or np.nan else None + # except: + # header = 0 + + # from pandas.api.types import is_float_dtype + + # if is_float_dtype(df.iloc[1:,0]): + # index_col = None + # else: + # try: + # te = df.iloc[1:,0].to_numpy().astype(float).dtype - except: - te = set(df.iloc[1:,0]) - - if len(te) == df.shape[0]-1: - index_col = 0 - elif len(te) < df.shape[0]-1: - index_col = None - else: - index_col = None - - # index_col = 0 if len(set(df.iloc[1:,0])) == df.shape[0]-1 and is_float_dtype(df.iloc[:,0])==False else None - df = pd.read_csv(self.file, decimal=decimal, sep=separator, header=header, index_col=index_col) 
- # st.write(decimal, separator, index_col, header) + # except: + # te = set(df.iloc[1:,0]) + + # if len(te) == df.shape[0]-1: + # index_col = 0 + # elif len(te) < df.shape[0]-1: + # index_col = None + # else: + # index_col = None + + # # index_col = 0 if len(set(df.iloc[1:,0])) == df.shape[0]-1 and is_float_dtype(df.iloc[:,0])==False else None + # df = pd.read_csv(self.file, decimal=decimal, sep=separator, header=header, index_col=index_col) + # # st.write(decimal, separator, index_col, header) - if df.select_dtypes(exclude='float').shape[1] >0: - non_float = df.select_dtypes(exclude='float') + # if df.select_dtypes(exclude='float').shape[1] >0: + # non_float = df.select_dtypes(exclude='float') - else: - non_float = pd.DataFrame() + # else: + # non_float = pd.DataFrame() - if df.select_dtypes(include='float').shape[1] >0: - float_data = df.select_dtypes(include='float') + # if df.select_dtypes(include='float').shape[1] >0: + # float_data = df.select_dtypes(include='float') - else: - float_data = pd.DataFrame() - return float_data, non_float + # else: + # float_data = pd.DataFrame() + # return float_data, non_float diff --git a/src/utils/miscellaneous.py b/src/utils/miscellaneous.py index fd98143a5397415dc091388eaa9cdf963a815070..db29a4aa7c7819581cc8e7b2cb14f338ef4ed34b 100644 --- a/src/utils/miscellaneous.py +++ b/src/utils/miscellaneous.py @@ -87,7 +87,7 @@ def ObjectHash(current = None, add = None): elif current == None and add != None: object = DatatoStr(add) elif current != None and add == None: - object = DatatoStr(current) + object = DatatoStr(current) # Compute the MD5 hash