import re
from tempfile import NamedTemporaryFile

import jcamp as jc
import numpy as np


def _uppercase_strings(frame):
    """Return a copy of *frame* in which every ``str`` cell is upper-cased.

    Non-string cells are passed through untouched.
    """
    def to_upper(value):
        return value.upper() if isinstance(value, str) else value

    # ``DataFrame.applymap`` is deprecated in favour of ``DataFrame.map``
    # (pandas >= 2.1). Look the method up on the class, not the instance, so a
    # column that happens to be called "map" cannot shadow it.
    elementwise = getattr(type(frame), "map", None) or type(frame).applymap
    return elementwise(frame, to_upper)


class JcampParser:
    """Retrieve spectral data and sample metadata from a JCAMP file.

    The file is read with ``jcamp.jcamp_readfile`` in the constructor;
    ``parse`` then builds the ``spectra``, ``metadata_`` and ``chem_data``
    DataFrames that the properties expose.
    """

    # One concentration entry: "(" name "," number "," rest-up-to-")".
    # Group 0 is the element name, group 1 the numeric value.
    # Defined on the class so ``conc`` is usable before ``parse`` has run.
    pattern = r"\(([^,]+),(\d+(\.\d+)?),([^)]+)"

    def __init__(self, path):
        """Read the JCAMP file at *path* and keep references to its blocks."""
        self.__path = path
        self.__dxfile = jc.jcamp_readfile(self.__path)

        # Total number of blocks = total number of scanned samples.
        self.__nb = self.__dxfile['blocks']
        # All blocks, one per sample.
        self.__list_of_blocks = self.__dxfile['children']
        # Wavelengths/frequencies/range, taken from the first block.
        self.__wl = self.__list_of_blocks[0]["x"]

    @staticmethod
    def _measured_lines(text):
        """Return *text* without the lines containing ``NCU`` or ``<<undef>>``."""
        return '\n'.join(
            line for line in text.split('\n')
            if "NCU" not in line and "<<undef>>" not in line
        )

    def parse(self):
        """Build ``spectra``, ``metadata_`` and ``chem_data`` from the blocks.

        Sets:
            idx: ``np.arange`` over the number of blocks.
            metadata_: one row per block (row labels are ``'0'``, ``'1'``, ...).
            spectra: one row per block, indexed by sample name; the columns
                are the x values of the first block in reversed order.
            chem_data: float concentrations, one row per sample and one
                column per element found in the first block.
        """
        from pandas import DataFrame  # imported lazily, as elsewhere in this file

        blocks = self.__list_of_blocks
        # Preallocate the spectra matrix; every block is expected to have as
        # many y values as the first one.
        specs = np.zeros((self.__nb, len(blocks[0]["y"])), dtype=float)
        self.idx = np.arange(self.__nb)
        self.__met = {}
        for i in range(self.__nb):
            block = blocks[i]
            specs[i] = block['y']
            # The header is a '\n$$'-separated list; the scan count and the
            # resolution are read positionally (fields 6 and 8) as "key=value".
            system_fields = block['spectrometer/data system'].split('\n$$')
            self.__met[str(i)] = {
                'name': block['title'],
                'origin': block['origin'],
                'date': block['date'],
                'spectrometer': system_fields[0],
                'n_scans': system_fields[6].split('=')[1],
                'resolution': system_fields[8].split('=')[1],
                'xunits': block['xunits'],
                'yunits': block['yunits'],
                'firstx': block['firstx'],
                'lastx': block['lastx'],
                'npoints': block['npoints'],
                'concentrations': block['concentrations'],
            }

        self.metadata_ = DataFrame(self.__met).T
        # Spectra and x axis are both reversed, so columns stay aligned.
        self.spectra = DataFrame(
            np.fliplr(specs),
            columns=self.__wl[::-1],
            index=self.metadata_['name'],
        )

        # Concentrations: element names come from the first block only.
        first_block = self._measured_lines(blocks[0]['concentrations'])
        elements_name = [match[0] for match in re.findall(self.pattern, first_block)]

        raw_concentrations = self.metadata_['concentrations']
        cc = {key: self.conc(text) for key, text in raw_concentrations.items()}

        # DataFrame containing the chemical data, re-indexed by sample name.
        self.chem_data = DataFrame(cc, index=elements_name).T.astype(float)
        self.chem_data.index = self.metadata_['name']

    def conc(self, sample):
        """Return the concentration values of a single sample.

        Args:
            sample: the raw ``concentrations`` text of one block.

        Returns:
            A NumPy array holding the numeric field of every matched entry,
            still as strings (``parse`` converts them to float afterwards).
        """
        prep = self._measured_lines(sample)
        return np.array([match[1] for match in re.findall(self.pattern, prep)])

    @property
    def specs_df_(self):
        """Spectra DataFrame built by ``parse``."""
        return self.spectra

    @property
    def meta_data_st_(self):
        """Metadata reduced to the columns that vary between samples.

        The ``concentrations`` column and columns that are entirely empty
        strings are dropped, string cells are upper-cased, and only columns
        with more than one distinct value are kept.
        """
        me = self.metadata_.drop("concentrations", axis=1)
        me = _uppercase_strings(me.drop(me.columns[(me == '').all()], axis=1))
        return me.loc[:, me.nunique(axis=0) > 1]

    @property
    def meta_data(self):
        """Metadata DataFrame without the ``concentrations`` column."""
        return self.metadata_.drop("concentrations", axis=1)

    @property
    def chem_data_(self):
        """Chemical (concentration) DataFrame built by ``parse``."""
        return self.chem_data


class CsvParser:
    """Copy an uploaded CSV stream to a temporary file and parse it with pandas."""

    # NOTE(review): not used by any method visible here; kept because a
    # class-body import is reachable as ``CsvParser.clevercsv``.
    import clevercsv

    def __init__(self, file):
        """Persist the content of *file* (an object with ``read()``) to disk.

        The temporary file is created with ``delete=False`` and its path is
        stored in ``self.file``; nothing in this class removes it afterwards.
        """
        data = file.read()
        # The temporary file is opened in binary mode, so text-mode streams
        # must be encoded before being written.
        if isinstance(data, str):
            data = data.encode("utf-8")
        with NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
            tmp.write(data)
            self.file = tmp.name

    def parse(self, decimal, separator, index_col, header):
        """Read the CSV and split it into float and non-float columns.

        The arguments are forwarded to ``pandas.read_csv`` (``separator`` as
        ``sep``). If the resulting index contains duplicates the file is read
        again with ``index_col=None``.

        Sets:
            df: the full DataFrame.
            float: the float-typed columns of ``df``.
            non_float: all remaining columns of ``df``.
        """
        from pandas import read_csv  # imported lazily, as elsewhere in this file

        self.df = read_csv(self.file, decimal=decimal, sep=separator,
                           index_col=index_col, header=header)
        if len(set(self.df.index)) < self.df.shape[0]:
            self.df = read_csv(self.file, decimal=decimal, sep=separator,
                               index_col=None, header=header)

        self.float = self.df.select_dtypes(include='float')
        self.non_float = self.df.select_dtypes(exclude='float')

    @property
    def meta_data_st_(self):
        """Non-float columns, upper-cased, keeping only those that vary."""
        me = _uppercase_strings(self.non_float)
        return me.loc[:, me.nunique(axis=0) > 1]

    @property
    def meta_data(self):
        """Non-float columns exactly as read by ``parse``."""
        return self.non_float


# NOTE(review): legacy commented-out auto-detecting parser, retained below.
# def parse(self):
#     import pandas as pd
#     dec_dia = ['.', ',']
#     sep_dia = [',', ';']
#     dec, sep = [], []
#     with open(self.file, mode = 'r') as csvfile:
#         lines = [csvfile.readline() for i in range(3)]
#     for i in lines:
#         for j in range(2):
#             dec.append(i.count(dec_dia[j]))
#             sep.append(i.count(sep_dia[j]))
#     if dec[0] != dec[2]:
#         header = 0
#     else:
#         header = 0
#     semi = np.sum([sep[2*i+1] for i in range(3)])
#     commas = np.sum([sep[2*i] for i in range(3)])
#     if semi>commas:separator = ';'
#     elif semi<commas: separator = ','
#     elif semi ==0 and commas == 0: separator = ';'
#     commasdec = np.sum([dec[2*i+1] for i in range(1,3)])
#     dot = np.sum([dec[2*i] for i in range(1,3)])
#     if commasdec>dot:decimal = ','
#     elif commasdec<=dot:decimal = '.'
#     if decimal == separator or len(np.unique(dec)) <= 2:
#         decimal = "."
#     df = pd.read_csv(self.file, decimal=decimal, sep=separator, header=None, index_col=None)
#     try:
#         rat = np.mean(df.iloc[0,50:60]/df.iloc[5,50:60])>10
#         header = 0 if rat or np.nan else None
#     except:
#         header = 0
#     from pandas.api.types import is_float_dtype
#     if is_float_dtype(df.iloc[1:,0]):
#         index_col = None
#     else:
#         try:
#             te = df.iloc[1:,0].to_numpy().astype(float).dtype
#         except:
#             te = set(df.iloc[1:,0])
#         if len(te) == df.shape[0]-1:
#             index_col = 0
#         elif len(te) < df.shape[0]-1:
#             index_col = None
#         else:
#             index_col = None
#     # index_col = 0 if len(set(df.iloc[1:,0])) == df.shape[0]-1 and is_float_dtype(df.iloc[:,0])==False else None
#     df = pd.read_csv(self.file, decimal=decimal, sep=separator, header=header, index_col=index_col)
#     # st.write(decimal, separator, index_col, header)
#     if df.select_dtypes(exclude='float').shape[1] >0:
#         non_float = df.select_dtypes(exclude='float')
#     else:
#         non_float = pd.DataFrame()
#     if df.select_dtypes(include='float').shape[1] >0:
#         float_data = df.select_dtypes(include='float')
#     else:
#         float_data = pd.DataFrame()
#     return float_data, non_float
# ############## new function
# def csv_loader(file):
#     import clevercsv
#     import numpy as np
#     import pandas as pd
#     dec_dia = ['.',',']
#     sep_dia = [',',';']
#     dec, sep = [], []
#     with open(file, mode = 'r') as csvfile:
#         lines = [csvfile.readline() for i in range(3)]
#     for i in lines:
#         for j in range(2):
#             dec.append(i.count(dec_dia[j]))
#             sep.append(i.count(sep_dia[j]))
#     if dec[0] != dec[2]:
#         header = 0
#     else:
#         header = 0
#     semi = np.sum([sep[2*i+1] for i in range(3)])
#     commas = np.sum([sep[2*i] for i in range(3)])
#     if semi>commas:separator = ';'
#     elif semi<commas: separator = ','
#     elif semi ==0 and commas == 0: separator = ';'
#     commasdec = np.sum([dec[2*i+1] for i in range(1,3)])
#     dot = np.sum([dec[2*i] for i in range(1,3)])
#     if commasdec>dot:decimal = ','
#     elif commasdec<=dot:decimal = '.'
#     if decimal == separator or len(np.unique(dec)) <= 2:
#         decimal = "."
#     df = pd.read_csv(file, decimal=decimal, sep=separator, header=None, index_col=None)
#     try:
#         rat = np.mean(df.iloc[0,50:60]/df.iloc[5,50:60])>10
#         header = 0 if rat or np.nan else None
#     except:
#         header = 0
#     from pandas.api.types import is_float_dtype
#     if is_float_dtype(df.iloc[1:,0]):
#         index_col = None
#     else:
#         try:
#             te = df.iloc[1:,0].to_numpy().astype(float).dtype
#         except:
#             te = set(df.iloc[1:,0])
#         if len(te) == df.shape[0]-1:
#             index_col = 0
#         elif len(te) < df.shape[0]-1:
#             index_col = None
#         else:
#             index_col = None
#     # index_col = 0 if len(set(df.iloc[1:,0])) == df.shape[0]-1 and is_float_dtype(df.iloc[:,0])==False else None
#     df = pd.read_csv(file, decimal=decimal, sep=separator, header=header, index_col=index_col)
#     # st.write(decimal, separator, index_col, header)
#     if df.select_dtypes(exclude='float').shape[1] >0:
#         non_float = df.select_dtypes(exclude='float')
#     else:
#         non_float = pd.DataFrame()
#     if df.select_dtypes(include='float').shape[1] >0:
#         float_data = df.select_dtypes(include='float')
#     else:
#         float_data = pd.DataFrame()
#     return float_data, non_float