# data_parsing.py

from packages import *
import jcamp as jc

class JcampParser:
    """Retrieve spectra, sample metadata and concentrations from a JCAMP file.

    The file is read with ``jc.jcamp_readfile``; each entry of its
    ``'children'`` list is treated as one scanned sample.

    Attributes set by ``__init__``:
        idx: ``np.arange`` over the number of blocks.
        metadata_: DataFrame with one row per block (row labels ``'0'``,
            ``'1'``, ...) and one column per collected header field,
            including the raw ``'concentrations'`` text.
        spectra: DataFrame of the ``'y'`` values, one row per block, indexed
            by sample name. Columns are the ``'x'`` values of the FIRST
            block; both values and labels are stored in reversed order
            relative to the file.
        pattern: regular expression used to pick ``(name,value,...`` entries
            out of a concentrations text.
        chem_data: float DataFrame of concentrations, one row per sample and
            one column per element name found in the first block.
    """

    def __init__(self, path):
        """Parse the JCAMP file at *path* and build the public DataFrames.

        Args:
            path: location of the file, passed unchanged to
                ``jc.jcamp_readfile``.

        Raises:
            KeyError: if a block lacks one of the header fields read below.
            IndexError: if the file has no block, or if a
                'spectrometer/data system' header has fewer '$$' entries
                than the positions read below.
        """
        self.__path = path
        self.__dxfile = jc.jcamp_readfile(self.__path)

        # Access samples data
        self.__nb = self.__dxfile['blocks']  # Total number of blocks = number of scanned samples
        self.__list_of_blocks = self.__dxfile['children']  # All blocks, in file order
        self.__wl = self.__list_of_blocks[0]["x"]  # Wavelengths/frequencies/range

        # Every spectrum is stored on a row whose length is taken from the
        # first block.
        specs = np.zeros((self.__nb, len(self.__list_of_blocks[0]["y"])), dtype=float)
        self.idx = np.arange(self.__nb)
        self.__met = {}
        # Iterate by declared block count (not by len of the list) so that a
        # file announcing more blocks than it contains still fails loudly.
        for i in range(self.__nb):
            block = self.__list_of_blocks[i]
            specs[i] = block['y']

            # The header is split on '\n$$' once per block. Entries 6 and 8
            # are read as '<key>=<value>' pairs.
            # NOTE(review): assumes a fixed layout of this header - confirm
            # against the instrument export format.
            system_fields = block['spectrometer/data system'].split('\n$$')

            # Other header fields (e.g. 'time', 'xfactor', 'yfactor',
            # 'deltax', 'miny', 'maxy') are not collected.
            self.__met[f'{i}'] = {
                'name': block['title'],
                'origin': block['origin'],
                'date': block['date'],
                'spectrometer': system_fields[0],
                'n_scans': system_fields[6].split('=')[1],
                'resolution': system_fields[8].split('=')[1],
                'xunits': block['xunits'],
                'yunits': block['yunits'],
                'firstx': block['firstx'],
                'lastx': block['lastx'],
                'npoints': block['npoints'],
                'concentrations': block['concentrations'],
            }

        self.metadata_ = DataFrame(self.__met).T
        # Spectra and their column labels are both reversed, so each value
        # stays paired with its own x label.
        self.spectra = DataFrame(
            np.fliplr(specs),
            columns=self.__wl[::-1],
            index=self.metadata_['name'],
        )

        #### Concentrations
        self.pattern = r"\(([^,]+),(\d+(\.\d+)?),([^)]+)"

        ## Names of the analyzed chemical elements, taken from the first block
        first_block_text = self._drop_unreported_lines(
            self.__list_of_blocks[0]['concentrations']
        )
        elements_name = [
            match[0] for match in re.findall(self.pattern, first_block_text)
        ]

        ## Concentrations of every sample, keyed by the metadata row label
        per_sample = {
            label: self.conc(raw_text)
            for label, raw_text in self.metadata_['concentrations'].items()
        }

        ### Dataframe containing chemical data
        self.chem_data = DataFrame(per_sample, index=elements_name).T.astype(float)
        self.chem_data.index = self.metadata_['name']

    @staticmethod
    def _drop_unreported_lines(text):
        """Return *text* without the lines containing "NCU" or "<<undef>>"."""
        return '\n'.join(
            line
            for line in text.split('\n')
            if "NCU" not in line and "<<undef>>" not in line
        )

    def conc(self, sample):
        """Return the concentration values of a single sample.

        Args:
            sample: raw concentrations text of one block.

        Returns:
            A numpy array holding, as strings and in order of appearance, the
            second captured group of every ``self.pattern`` match found after
            lines containing "NCU" or "<<undef>>" have been removed.
        """
        prep = self._drop_unreported_lines(sample)
        return np.array([match[1] for match in re.findall(self.pattern, prep)])

    @property
    def specs_df_(self):
        """The spectra DataFrame (same object as ``self.spectra``)."""
        return self.spectra

    @property
    def md_df_(self):
        """Metadata without 'concentrations' and without all-empty columns.

        A column is dropped when every one of its values equals ``''``.
        """
        me = self.metadata_.drop("concentrations", axis=1)
        me = me.drop(me.columns[(me == '').all()], axis=1)
        return me

    @property
    def md_df_st_(self):
        """Only the 'origin' and 'date' columns of the metadata."""
        rt = ['origin', 'date']
        return self.metadata_.loc[:, rt]

    @property
    def chem_data_(self):
        """The concentrations DataFrame (same object as ``self.chem_data``)."""
        return self.chem_data
    

class CsvParser:
    """Placeholder for a CSV parser; it currently holds no state or behavior."""

    def __init__(self) -> None:
        """Create an empty parser; nothing is initialised."""