Skip to content
Snippets Groups Projects
data_parsing.py 10.2 KiB
Newer Older
  • Learn to ignore specific revisions
  • DIANE's avatar
    DIANE committed
    import jcamp as jc
    
    DIANE's avatar
    DIANE committed
    import numpy as np
    from tempfile import NamedTemporaryFile
    
    DIANE's avatar
    DIANE committed
    
    class JcampParser:
        """Retrieve spectra, sample metadata and concentrations from a JCAMP file.

        The file is read once in the constructor with ``jcamp.jcamp_readfile``.
        Calling :meth:`parse` then builds three public attributes:

        * ``spectra``   -- one row per block, one column per x value (columns
          are in the reverse of the file's x order).
        * ``metadata_`` -- one row per block holding the header fields kept by
          :meth:`parse`, including the raw ``concentrations`` text.
        * ``chem_data`` -- one row per block and one float column per entry
          found in the concentrations text of the first block.
        """

        # Regular expression matching one "(name,value,rest" concentration
        # entry: group 1 is the entry name, group 2 an unsigned decimal value.
        # Defined on the class (rather than inside ``parse``) so that ``conc``
        # can be called before ``parse`` without raising AttributeError.
        pattern = r"\(([^,]+),(\d+(\.\d+)?),([^)]+)"

        def __init__(self, path):
            """Read the JCAMP file located at ``path``.

            Parameters
            ----------
            path : str
                Path handed unchanged to ``jcamp.jcamp_readfile``.

            Raises
            ------
            KeyError, IndexError
                If the parsed file has no ``'blocks'``/``'children'`` entries
                or its list of children is empty.
            """
            self.__path = path
            self.__dxfile = jc.jcamp_readfile(self.__path)

            # Number of blocks; the original author equates one block with
            # one scanned sample.
            self.__nb = self.__dxfile['blocks']
            # All blocks, stored as a list of dict-like records.
            self.__list_of_blocks = self.__dxfile['children']
            # x axis of the first block, reused as the axis of every spectrum.
            self.__wl = self.__list_of_blocks[0]["x"]

        @staticmethod
        def _strip_undefined(text):
            """Drop the lines of ``text`` containing "NCU" or "<<undef>>"."""
            return '\n'.join(
                line for line in text.split('\n')
                if "NCU" not in line and "<<undef>>" not in line
            )

        def parse(self):
            """Build ``spectra``, ``metadata_`` and ``chem_data`` from the blocks.

            Raises
            ------
            KeyError
                If a block lacks one of the header fields read below.
            IndexError
                If a block's ``'spectrometer/data system'`` field has fewer
                than nine ``'\\n$$'``-separated parts, or parts 6/8 have no
                ``'='``.
            ValueError
                If the blocks do not all have the same number of y values or
                concentration entries.
            """
            # Local imports: done once per call instead of once per block.
            import re
            from pandas import DataFrame

            blocks = self.__list_of_blocks

            # Preallocate the spectra matrix: one row per block, width taken
            # from the first block.
            specs = np.zeros((self.__nb, len(blocks[0]["y"])), dtype=float)
            self.idx = np.arange(self.__nb)

            meta = {}
            for i in range(self.__nb):
                block = blocks[i]
                specs[i] = block['y']

                # Split the composite field once; positions 0, 6 and 8 are
                # the parts this parser keeps.
                system_fields = block['spectrometer/data system'].split('\n$$')

                # Only this subset of the block header is collected.
                meta[f'{i}'] = {
                    'name': block['title'],
                    'origin': block['origin'],
                    'date': block['date'],
                    'spectrometer': system_fields[0],
                    'n_scans': system_fields[6].split('=')[1],
                    'resolution': system_fields[8].split('=')[1],
                    'xunits': block['xunits'],
                    'yunits': block['yunits'],
                    'firstx': block['firstx'],
                    'lastx': block['lastx'],
                    'npoints': block['npoints'],
                    'concentrations': block['concentrations'],
                }
            self.__met = meta

            self.metadata_ = DataFrame(self.__met).T
            # Rows and column labels are reversed together, so each value
            # stays attached to its own x value.
            self.spectra = DataFrame(
                np.fliplr(specs),
                columns=self.__wl[::-1],
                index=self.metadata_['name'],
            )

            # Column names of the chemical table come from the first block.
            first_block_text = self._strip_undefined(blocks[0]['concentrations'])
            elements_name = [
                match[0] for match in re.findall(self.pattern, first_block_text)
            ]

            # One array of value strings per block, keyed like metadata_ rows.
            raw_concentrations = self.metadata_['concentrations']
            cc = {
                key: self.conc(text)
                for key, text in raw_concentrations.items()
            }

            self.chem_data = DataFrame(cc, index=elements_name).T.astype(float)
            self.chem_data.index = self.metadata_['name']

        def conc(self, sample):
            """Return the concentration values of a single sample.

            Parameters
            ----------
            sample : str
                Raw concentrations text of one block.

            Returns
            -------
            numpy.ndarray
                The values captured by group 2 of ``pattern``, as strings, in
                order of appearance; lines containing "NCU" or "<<undef>>"
                are ignored.
            """
            import re

            prep = self._strip_undefined(sample)
            return np.array([match[1] for match in re.findall(self.pattern, prep)])

        @property
        def specs_df_(self):
            """The ``spectra`` DataFrame built by :meth:`parse`."""
            return self.spectra

        @property
        def meta_data_st_(self):
            """Metadata restricted to the columns that vary between blocks.

            The ``concentrations`` column and every column made only of empty
            strings are removed, string values are upper-cased, and only the
            columns holding more than one distinct value are returned.
            """
            from pandas import DataFrame

            me = self.metadata_.drop("concentrations", axis=1)
            me = me.drop(me.columns[(me == '').all()], axis=1)
            # DataFrame.applymap is deprecated since pandas 2.1 in favour of
            # DataFrame.map; fall back to applymap on older versions.
            elementwise = me.map if hasattr(DataFrame, "map") else me.applymap
            me = elementwise(lambda x: x.upper() if isinstance(x, str) else x)
            return me.loc[:, me.nunique(axis=0) > 1]

        @property
        def meta_data(self):
            """The ``metadata_`` DataFrame without its ``concentrations`` column."""
            return self.metadata_.drop("concentrations", axis=1)

        @property
        def chem_data_(self):
            """The ``chem_data`` DataFrame built by :meth:`parse`."""
            return self.chem_data
        
    
    
    class CsvParser:
        """Parse an uploaded CSV stream into float and non-float column sets.

        The stream content is copied to a temporary ``.csv`` file whose path
        is stored in ``self.file``. The file is deleted when :meth:`close` is
        called or, failing that, when the parser is garbage-collected or the
        interpreter exits.
        """

        # clevercsv is not used by this class. The name is kept as a class
        # attribute for backward compatibility, but a missing package no
        # longer prevents the class from being defined.
        try:
            import clevercsv
        except ImportError:
            clevercsv = None

        def __init__(self, file):
            """Copy the content of ``file`` to a temporary CSV file.

            Parameters
            ----------
            file : file-like
                Object with a ``read()`` method returning ``bytes`` or ``str``;
                ``str`` content is written as UTF-8.
            """
            import weakref

            # Read before creating the temp file so that a failing read()
            # does not leave an empty file behind.
            data = file.read()
            if isinstance(data, str):
                data = data.encode("utf-8")

            with NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
                tmp.write(data)
                self.file = tmp.name

            # delete=False keeps the file available for repeated parse()
            # calls; the finalizer removes it once the parser is gone. The
            # callback is a plain function, so it holds no reference to self.
            self._finalizer = weakref.finalize(
                self, type(self)._remove_file, self.file
            )

        @staticmethod
        def _remove_file(path):
            """Delete ``path``, ignoring the case where it is already gone."""
            import os

            try:
                os.unlink(path)
            except FileNotFoundError:
                # Already removed (e.g. by the caller); nothing left to do.
                pass

        def close(self):
            """Delete the temporary file now; calling it again is a no-op."""
            self._finalizer()

        def parse(self, decimal, separator, index_col, header):
            """Read the temporary CSV file and split its columns by dtype.

            Parameters
            ----------
            decimal : str
                Decimal mark, passed to ``pandas.read_csv`` as ``decimal``.
            separator : str
                Field delimiter, passed to ``pandas.read_csv`` as ``sep``.
            index_col : int, str or None
                Column to use as index. If the resulting index holds repeated
                values, the file is read again without an index column.
            header : int or None
                Row to use for column names, passed as ``header``.

            Returns
            -------
            tuple of pandas.DataFrame
                ``(float_columns, other_columns)`` as selected by
                ``select_dtypes(include='float')`` and
                ``select_dtypes(exclude='float')``.
            """
            from pandas import read_csv

            shared = dict(decimal=decimal, sep=separator, header=header)
            df = read_csv(self.file, index_col=index_col, **shared)
            if len(set(df.index)) < df.shape[0]:
                # Repeated labels: fall back to the default positional index.
                df = read_csv(self.file, index_col=None, **shared)

            # Named float_data rather than `float` to avoid shadowing the builtin.
            float_data = df.select_dtypes(include='float')
            non_float = df.select_dtypes(exclude='float')
            return float_data, non_float
             
    
    DIANE's avatar
    DIANE committed
        # def parse(self):
        #     import pandas as pd
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
        #     dec_dia = ['.', ',']
        #     sep_dia = [',', ';']
        #     dec, sep = [], []
    
    DIANE's avatar
    DIANE committed
            
    
    DIANE's avatar
    DIANE committed
        #     with open(self.file, mode = 'r') as csvfile:
        #         lines = [csvfile.readline() for i in range(3)]
        #         for i in lines:
        #             for j in range(2):
        #                 dec.append(i.count(dec_dia[j]))
        #                 sep.append(i.count(sep_dia[j]))
    
    DIANE's avatar
    DIANE committed
        
    
    DIANE's avatar
    DIANE committed
        #     if dec[0] != dec[2]:
        #         header = 0
        #     else:
        #         header = 0
    
    DIANE's avatar
    DIANE committed
        #     semi = np.sum([sep[2*i+1] for i in range(3)])
        #     commas = np.sum([sep[2*i] for i in range(3)])
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
        #     if semi>commas:separator = ';'
        #     elif semi<commas: separator = ','
    
    DIANE's avatar
    DIANE committed
            
    
    DIANE's avatar
    DIANE committed
        #     elif semi ==0 and commas == 0: separator = ';'
    
    DIANE's avatar
    DIANE committed
        #     commasdec = np.sum([dec[2*i+1] for i in range(1,3)])
        #     dot = np.sum([dec[2*i] for i in range(1,3)])
        #     if commasdec>dot:decimal = ','
        #     elif commasdec<=dot:decimal = '.'
    
    DIANE's avatar
    DIANE committed
            
    
    DIANE's avatar
    DIANE committed
        #     if decimal == separator or len(np.unique(dec)) <= 2:
        #         decimal = "."
    
    DIANE's avatar
    DIANE committed
            
    
    DIANE's avatar
    DIANE committed
        #     df = pd.read_csv(self.file, decimal=decimal, sep=separator, header=None, index_col=None)
        #     try:
        #         rat = np.mean(df.iloc[0,50:60]/df.iloc[5,50:60])>10
        #         header = 0 if rat or np.nan else None
        #     except:
        #         header = 0
    
        #     from pandas.api.types import is_float_dtype
    
        #     if is_float_dtype(df.iloc[1:,0]):
        #         index_col = None
        #     else:
        #         try:
        #             te = df.iloc[1:,0].to_numpy().astype(float).dtype
    
    DIANE's avatar
    DIANE committed
                    
    
    DIANE's avatar
    DIANE committed
        #         except:
        #             te = set(df.iloc[1:,0])
    
        #         if len(te) == df.shape[0]-1:
        #             index_col = 0
        #         elif len(te) < df.shape[0]-1:
        #             index_col = None
        #         else:
        #             index_col = None
    
        #     # index_col = 0 if len(set(df.iloc[1:,0])) == df.shape[0]-1 and is_float_dtype(df.iloc[:,0])==False else None
        #     df = pd.read_csv(self.file, decimal=decimal, sep=separator, header=header, index_col=index_col)
        #     # st.write(decimal, separator, index_col, header)
    
    DIANE's avatar
    DIANE committed
            
    
    DIANE's avatar
    DIANE committed
        #     if df.select_dtypes(exclude='float').shape[1] >0:
        #         non_float = df.select_dtypes(exclude='float')
    
    DIANE's avatar
    DIANE committed
                
    
    DIANE's avatar
    DIANE committed
        #     else:
        #         non_float = pd.DataFrame()
    
    DIANE's avatar
    DIANE committed
        #     if df.select_dtypes(include='float').shape[1] >0:
        #         float_data = df.select_dtypes(include='float')
    
    DIANE's avatar
    DIANE committed
                
    
    DIANE's avatar
    DIANE committed
        #     else:
        #         float_data = pd.DataFrame()
        #     return float_data, non_float
    
    DIANE's avatar
    DIANE committed
                
    
    
    
    
    
    
    # ############## new function
    # def csv_loader(file):
    #     import clevercsv
    #     import numpy as np
    #     import pandas as pd
    
    #     dec_dia = ['.',',']
    #     sep_dia = [',',';']
    #     dec, sep = [], []
    #     with open(file, mode = 'r') as csvfile:
    #         lines = [csvfile.readline() for i in range(3)]
    #         for i in lines:
    #             for j in range(2):
    #                 dec.append(i.count(dec_dia[j]))
    #                 sep.append(i.count(sep_dia[j]))
                
    #     if dec[0] != dec[2]:
    #         header = 0
    #     else:
    #         header = 0
    
    
    #     semi = np.sum([sep[2*i+1] for i in range(3)])
    #     commas = np.sum([sep[2*i] for i in range(3)])
    
    #     if semi>commas:separator = ';'
    #     elif semi<commas: separator = ','
        
    #     elif semi ==0 and commas == 0: separator = ';'
        
    
    #     commasdec = np.sum([dec[2*i+1] for i in range(1,3)])
    #     dot = np.sum([dec[2*i] for i in range(1,3)])
    #     if commasdec>dot:decimal = ','
    #     elif commasdec<=dot:decimal = '.'
        
    #     if decimal == separator or len(np.unique(dec)) <= 2:
    #         decimal = "."
        
    #     df = pd.read_csv(file, decimal=decimal, sep=separator, header=None, index_col=None)
    #     try:
    #         rat = np.mean(df.iloc[0,50:60]/df.iloc[5,50:60])>10
    #         header = 0 if rat or np.nan else None
    #     except:
    #         header = 0
    
    #     from pandas.api.types import is_float_dtype
    
    #     if is_float_dtype(df.iloc[1:,0]):
    #         index_col = None
    #     else:
    #         try:
    #             te = df.iloc[1:,0].to_numpy().astype(float).dtype
                
    #         except:
    #             te = set(df.iloc[1:,0])
    
    #         if len(te) == df.shape[0]-1:
    #             index_col = 0
    #         elif len(te) < df.shape[0]-1:
    #             index_col = None
    #         else:
    #             index_col = None
    
    #     # index_col = 0 if len(set(df.iloc[1:,0])) == df.shape[0]-1 and is_float_dtype(df.iloc[:,0])==False else None
    #     df = pd.read_csv(file, decimal=decimal, sep=separator, header=header, index_col=index_col)
    #     # st.write(decimal, separator, index_col, header)
        
    #     if df.select_dtypes(exclude='float').shape[1] >0:
    #         non_float = df.select_dtypes(exclude='float')
            
    #     else:
    #         non_float = pd.DataFrame()
    
    
    #     if df.select_dtypes(include='float').shape[1] >0:
    #         float_data = df.select_dtypes(include='float')
            
    #     else:
    #         float_data = pd.DataFrame()
    #     return float_data, non_float