Skip to content
Snippets Groups Projects
DATA_HANDLING.py 2.23 KiB
Newer Older
  • Learn to ignore specific revisions
  • DIANE's avatar
    DIANE committed
    from Packages import * 
    
    
    ## try to automatically detect the field separator within the CSV
    def find_delimiter(filename):
        """Guess the field separator used in a delimited text file.

        Reads at most the first 5000 characters of ``filename`` and hands that
        sample to ``csv.Sniffer``.

        Parameters
        ----------
        filename : path-like
            Path of the file to inspect.

        Returns
        -------
        str
            The delimiter character reported by the sniffer.

        Raises
        ------
        csv.Error
            Propagated from ``csv.Sniffer.sniff`` when no delimiter can be
            determined from the sample.
        """
        with open(filename) as handle:
            # a bounded sample is enough for the sniffer and avoids loading big files
            sample = handle.read(5000)
        dialect = csv.Sniffer().sniff(sample)
        return dialect.delimiter
    
    def find_col_index(filename):
        """Tell whether the first column of a CSV looks like an index column.

        A small preview of the file is parsed (the first three lines are
        skipped, then three rows are read) using the separator returned by
        ``find_delimiter``. The first column is reported as an index column
        whenever its dtype is anything other than ``float64``.

        Parameters
        ----------
        filename : path-like
            Path of the CSV file to inspect.

        Returns
        -------
        str
            ``'yes'`` if the first column's dtype is not ``float64``,
            otherwise ``'no'``.
        """
        with open(filename) as handle:
            separator = str(find_delimiter(filename))
            preview = pd.read_csv(handle, skiprows=3, nrows=3, index_col=False, sep=separator)
        first_column_dtype = preview.iloc[:, 0].dtypes
        if first_column_dtype != np.float64:
            return 'yes'
        return 'no'
    
    
    # detection of columns categories and scaling
    def col_cat(data_import):
        """Split a DataFrame into its numerical and categorical columns.

        Columns whose dtype is exactly ``float64`` or ``int64`` are treated as
        numerical; every other column is treated as categorical. Missing values
        in the numerical part are replaced by the mean of their column.

        Parameters
        ----------
        data_import : pandas.DataFrame
            The table to split.

        Returns
        -------
        numerical_data : pandas.DataFrame
            The numerical columns with NaNs filled by the column mean. When the
            input has no numerical column, a single all-zero placeholder column
            (named ``0``) sharing the input's index is returned instead.
        categorical_data : pandas.DataFrame
            The remaining columns, or an empty DataFrame when there are none.
        """
        # detect numerical and categorical columns in the csv
        numerical_columns_list = []
        categorical_columns_list = []
        for name in data_import.columns:
            column = data_import[name]
            if column.dtype == np.dtype("float64") or column.dtype == np.dtype("int64"):
                numerical_columns_list.append(column)
            else:
                categorical_columns_list.append(column)

        if not numerical_columns_list:
            # The placeholder has to be a Series: pd.concat refuses plain lists.
            numerical_columns_list.append(pd.Series(0, index=data_import.index))

        if categorical_columns_list:
            categorical_data = pd.concat(categorical_columns_list, axis=1)
        else:
            # Return an empty DataFrame *instance* (not the class) so callers
            # can rely on the usual DataFrame attributes.
            categorical_data = pd.DataFrame()

        # Create numerical data matrix from the numerical columns list and
        # fill na with the mean of the column
        numerical_data = pd.concat(numerical_columns_list, axis=1)
        numerical_data = numerical_data.apply(lambda col: col.fillna(col.mean()))

        return numerical_data, categorical_data
    
    
    DIANE's avatar
    DIANE committed
    
    
    def list_files(mypath, import_type):
        """List the pickled model files of a given type found in a directory.

        A directory entry is kept when it is a regular file and its name ends
        with ``import_type + '.pkl'``. ``listdir``, ``isfile`` and ``join`` are
        taken from the module's star import (presumably ``os`` / ``os.path`` --
        confirm against the ``Packages`` module).

        Parameters
        ----------
        mypath : str
            Directory to scan (not recursive).
        import_type : str
            Suffix that must precede the ``.pkl`` extension in the file name.

        Returns
        -------
        list of str
            Matching file names, in the order ``listdir`` yields them. When
            nothing matches, a one-element list holding a user-facing message
            is returned instead.
        """
        suffix = import_type + '.pkl'
        # use a distinct local name so the function itself is not shadowed
        model_files = [f for f in listdir(mypath) if isfile(join(mypath, f)) and f.endswith(suffix)]
        if not model_files:
            return ['Please, create a model before - no model available yet']
        return model_files
    
    
    
    def standardize(X):
        """Scale the columns of ``X`` with ``StandardScaler``.

        The scaler is fitted on ``X`` and applied to it in one step; the result
        is wrapped in a new DataFrame that reuses the index and column labels
        of ``X``. ``StandardScaler`` is presumably scikit-learn's, made
        available through the module's star import -- confirm.

        Parameters
        ----------
        X : pandas.DataFrame
            Data to scale; must expose ``index`` and ``columns``.

        Returns
        -------
        pandas.DataFrame
            The scaled values, labelled like ``X``.
        """
        scaler = StandardScaler()
        scaled_values = scaler.fit_transform(X)
        return pd.DataFrame(scaled_values, index=X.index, columns=X.columns)
    
    def MinMaxScale(X):
        """Rescale the columns of ``X`` with ``MinMaxScaler`` over ``(0, 1)``.

        The scaler is fitted on ``X`` and applied to it in one step; the result
        is wrapped in a new DataFrame that reuses the index and column labels
        of ``X``. ``MinMaxScaler`` is presumably scikit-learn's, made available
        through the module's star import -- confirm.

        Parameters
        ----------
        X : pandas.DataFrame
            Data to scale; must expose ``index`` and ``columns``.

        Returns
        -------
        pandas.DataFrame
            The rescaled values, labelled like ``X``.
        """
        scaler = MinMaxScaler(feature_range=(0,1))
        rescaled_values = scaler.fit_transform(X)
        return pd.DataFrame(rescaled_values, index=X.index, columns=X.columns)