# Source code for hed.tools.util.data_util

""" Data handling utilities involving dataframes. """

import pandas as pd
import numpy as np
from hed.errors.exceptions import HedFileError


def add_columns(df, column_list, value='n/a'):
    """ Add specified columns to df if not there.

    Parameters:
        df (DataFrame):  Pandas dataframe.
        column_list (list): List of columns to append to the dataframe.
        value (str):  Default fill value for the column.

    Notes:
        - The columns are added in place and nothing is returned.
        - Missing columns are appended in the order they appear in column_list.
        - Columns already present in df are left untouched.

    """
    existing = set(df.columns)
    for col in column_list:
        # Walk column_list itself (not a set difference) so the resulting column order is deterministic.
        if col not in existing:
            df[col] = value
            existing.add(col)
def check_match(ds1, ds2, numeric=False):
    """ Check two Pandas data series have the same values.

    Parameters:
        ds1 (DataSeries): Pandas data series to check.
        ds2 (DataSeries): Pandas data series to check.
        numeric (bool): If true, treat as numeric and do close-to comparison.

    Returns:
        str or list: An error message describing the mismatch, or an empty list if the series match.

    Notes:
        - Values are compared position by position; the index labels of the two series need not agree.
        - Reported positions are the index labels of ds1 at which the values differ.
        - In numeric mode, entries that cannot be converted to numbers are treated as NaN and two NaNs match.

    """
    if len(ds1.index) != len(ds2.index):
        return f"First series has length {len(ds1.index)} and {len(ds2.index)} events"
    if numeric:
        close_test = np.isclose(pd.to_numeric(ds1, errors='coerce'), pd.to_numeric(ds2, errors='coerce'),
                                equal_nan=True)
        differs = np.logical_not(close_test)
    else:
        # Compare the underlying arrays: comparing the Series directly raises ValueError
        # when the two series are not identically labeled.
        differs = ds1.map(str).to_numpy() != ds2.map(str).to_numpy()
    if differs.any():
        return f"Series differ at positions {list(ds1.index[differs])}"
    return []
def delete_columns(df, column_list):
    """ Remove the listed columns from a dataframe, ignoring names that are absent.

    Parameters:
        df (DataFrame): Pandas dataframe from which to delete columns.
        column_list (list): List of candidate column names for deletion.

    Notes:
        - The dataframe is modified in place; nothing is returned.
        - Names in column_list that are not columns of df are silently skipped.

    """
    # Only names that are really columns of df are handed to drop, so it never raises for a missing column.
    removable = set(df.columns).intersection(column_list)
    df.drop(columns=list(removable), inplace=True)
def delete_rows_by_column(df, value, column_list=None):
    """ Drop every row in which one of the searched columns equals value.

    Parameters:
        df (DataFrame): Pandas dataframe from which to delete rows.
        value (str): Specified value to indicate row should be deleted.
        column_list (list): List of columns to search for value. If empty or None, all columns are searched.

    Notes:
        - Both the cell contents and value are converted to strings before they are compared.
        - Rows are removed in place; nothing is returned.
        - Names in column_list that are not columns of df are ignored.

    """
    target = str(value)
    search_cols = list(set(column_list) & set(df.columns)) if column_list else list(df)
    for name in search_cols:
        # The mask is recomputed for each column because earlier iterations may already have removed rows.
        matches = df[name].map(str) == target
        df.drop(df.loc[matches].index, axis=0, inplace=True)
def get_eligible_values(values, values_included):
    """ Return a list of the items from values that are in values_included or None if no values_included.

    Parameters:
        values (list): List of strings against which to test.
        values_included (list): List of items to be selected from values if they are present.

    Returns:
        list: List of selected values, in values_included order, or None if values_included is empty or None.

    """
    if not values_included:
        return None
    # Build the lookup set once instead of once per candidate.
    available = frozenset(values)
    return [x for x in values_included if x in available]
def get_key_hash(key_tuple):
    """ Compute a hash from the string forms of a sequence of key values.

    Parameters:
        key_tuple (tuple, list): The key values in the correct order for lookup.

    Returns:
        int: A hash key for the tuple.

    """
    # Every entry is converted to str first, so values with the same string form give the same hash.
    as_strings = tuple(map(str, key_tuple))
    return hash(as_strings)
def get_new_dataframe(data):
    """ Produce a fresh dataframe from a tsv filename or from an existing dataframe.

    Parameters:
        data (DataFrame or str): DataFrame or filename representing a tsv file.

    Returns:
        DataFrame: The contents of the tsv file if data is a string, or a copy of data if it is a DataFrame.

    :raises HedFileError:
        - If data is neither a string nor a DataFrame.

    Notes:
        - The file is read as tab-separated with the first line as the header.
        - In addition to the pandas default missing-value markers, "," and "null" are read as missing.
        - Errors raised by pandas while reading the file are not caught here.

    """
    if isinstance(data, pd.DataFrame):
        return data.copy()
    if isinstance(data, str):
        return pd.read_csv(data, delimiter='\t', header=0, keep_default_na=True, na_values=[",", "null"])
    raise HedFileError("BadDataFrame", "get_new_dataframe could not extract DataFrame from data", "")
def get_row_hash(row, key_list):
    """ Compute a hash key from the values a row holds in the key columns.

    Parameters:
        row (DataSeries):  A Pandas data series corresponding to a row in a spreadsheet.
        key_list (list):   List of column names to create the hash value from.

    Returns:
        int: Hash key constructed from the entries of row in the columns specified by key_list.

    :raises HedFileError:
        - If row doesn't have all the columns in key_list.

    Notes:
        - Missing entries are replaced by 'n/a' and all entries are converted to strings before hashing.

    """
    _, absent = separate_values(list(row.index.values), key_list)
    if absent:
        raise HedFileError("lookup_row", f"row must have all keys, missing{str(absent)}", "")
    key_values = row[key_list].fillna('n/a').astype(str)
    return get_key_hash(key_values)
def get_value_dict(tsv_path, key_col='file_basename', value_col='sampling_rate'):
    """ Build a lookup dictionary from two columns of a tsv file.

    Parameters:
        tsv_path (str): Path to a tsv file with a header row to be read into a DataFrame.
        key_col (str): Name of the column which should be the key.
        value_col (str): Name of the column which should be the value.

    Returns:
        dict: Dictionary with key_col values as the keys and the corresponding value_col values as the values.

    :raises HedFileError:
        - If the same key_col value appears in more than one row.
        - Whatever get_new_dataframe raises for tsv_path is passed through.

    """
    frame = get_new_dataframe(tsv_path)
    lookup = {}
    for _, record in frame.iterrows():
        key = record[key_col]
        # A repeated key would silently overwrite an earlier value, so it is treated as an error.
        if key in lookup:
            raise HedFileError("DuplicateKeyInValueDict", "The key column must have unique values", "")
        lookup[key] = record[value_col]
    return lookup
def make_info_dataframe(col_info, selected_col):
    """ Build a dataframe listing the sorted values recorded for one column.

    Parameters:
        col_info (dict): Dictionary of dictionaries of column values and counts.
        selected_col (str): Name of the column used as top level key for col_info.

    Returns:
        DataFrame: A dataframe with a single column named selected_col holding the sorted keys of
                   col_info[selected_col]. None is returned if selected_col is not a top-level key
                   of col_info or if its dictionary is empty.

    Notes:
        - Only the keys of the inner dictionary are used; the associated counts are not included.

    """
    value_counts = col_info.get(selected_col)
    if not value_counts:
        return None
    ordered_values = sorted(value_counts.keys())
    return pd.DataFrame(ordered_values, columns=[selected_col])
def replace_values(df, values=None, replace_value='n/a', column_list=None):
    """ Replace string values in specified columns.

    Parameters:
        df (DataFrame): Dataframe whose values will be replaced.
        values (list, None):  List of strings to replace. If None, only empty strings are replaced.
        replace_value (str): String replacement value.
        column_list (list, None): List of columns in which to do replacement. If None all columns are processed.

    Returns:
        int: number of values replaced.

    Notes:
        - Cell contents and the entries of values are converted to strings before they are compared.
        - The replacement is done in place.
        - Names in column_list that are not columns of df are ignored.
        - Each cell is tested once against all of values, so a cell is never counted more than once.

    """
    if column_list:
        cols = list(set(column_list).intersection(df.columns))
    else:
        cols = list(df)
    targets = {str(v) for v in values} if values else {''}

    num_replaced = 0
    for col in cols:
        # A positional boolean mask keeps rows that merely share an index label with a match from being changed.
        mask = df[col].map(str).isin(targets).to_numpy()
        hits = int(mask.sum())
        if hits:
            df.loc[mask, col] = replace_value
            num_replaced += hits
    return num_replaced
def reorder_columns(data, col_order, skip_missing=True):
    """ Return a new dataframe holding the requested columns in the requested order.

    Parameters:
        data (DataFrame, str): Dataframe or filename of dataframe whose columns are to be reordered.
        col_order (list): List of column names in desired order.
        skip_missing (bool): If true, col_order columns missing from data are skipped, otherwise error.

    Returns:
        DataFrame: A new reordered dataframe.

    :raises HedFileError:
        - If col_order contains columns not in data and skip_missing is False.
        - Whatever get_new_dataframe raises for data is passed through.

    Notes:
        - Columns of data that are not named in col_order are not part of the result.

    """
    frame = get_new_dataframe(data)
    found, absent = separate_values(frame.columns.values.tolist(), col_order)
    if not skip_missing and absent:
        raise HedFileError("MissingKeys", f"Events file must have columns {str(absent)}", "")
    return frame[found]
def separate_values(values, target_values):
    """ Get target values from the target_values list.

    Parameters:
        values (list): List of values to be tested.
        target_values (list): List of desired values.

    Returns:
        tuple:
            list: Target values present in values.
            list: Target values missing from values.

    Notes:
        - Both returned lists follow the order of target_values.
        - The list of missing values contains each missing value only once.
        - If values is empty or None, target_values itself is returned as the list of missing values.

    """
    if not target_values:
        return [], []
    elif not values:
        return [], target_values
    # Build the lookup set once instead of once per target value.
    available = frozenset(values)
    present_values = [x for x in target_values if x in available]
    # dict.fromkeys removes duplicates while keeping first-seen order, so the result is deterministic.
    missing_values = [x for x in dict.fromkeys(target_values) if x not in available]
    return present_values, missing_values
def get_indices(df, column, start, stop):
    """ Pair up positions of start markers with the following stop markers in a column.

    Parameters:
        df (DataFrame): Dataframe containing the column to scan.
        column (str): Name of the column whose values are tested.
        start (list): Values that mark the beginning of an interval (tested with `in`).
        stop (list): Values that mark the end of an interval (tested with `in`).

    Returns:
        list: List of (start_position, stop_position) tuples of row positions (not index labels).

    Notes:
        - The first interval begins at the first start marker.
        - Each interval ends at the first stop marker strictly after its start.
        - The next interval begins at the first start marker at or after the previous stop.
        - An empty list is returned if the column contains no start marker.

    """
    from bisect import bisect_left, bisect_right

    col_values = df[column].tolist()
    # Both position lists come from enumerate, so they are already sorted and can be searched with bisect.
    start_event = [i for i, v in enumerate(col_values) if v in start]
    end_event = [i for i, v in enumerate(col_values) if v in stop]

    pairs = []
    if not start_event:
        return pairs

    next_start = start_event[0]
    while True:
        end_pos = bisect_right(end_event, next_start)   # first stop strictly after the start
        if end_pos == len(end_event):
            break
        next_end = end_event[end_pos]
        pairs.append((next_start, next_end))
        start_pos = bisect_left(start_event, next_end)  # first start at or after that stop
        if start_pos == len(start_event):
            break
        next_start = start_event[start_pos]
    return pairs
def _find_next(v, lst):
    """ Return the smallest element of lst that is strictly greater than v.

    Parameters:
        v: Value that the result must exceed.
        lst (list): Candidate values.

    Returns:
        The smallest element of lst greater than v.

    :raises IndexError:
        - If no element of lst is greater than v.

    """
    larger = sorted(x for x in lst if x > v)
    # Indexing (rather than min) keeps the IndexError raised when nothing qualifies.
    return larger[0]
def tuple_to_range(tuple_list, inclusion):
    """ Expand (first, last) pairs into lists of consecutive integers.

    Parameters:
        tuple_list (list): List of tuples whose first two entries are the bounds of a range.
        inclusion (list): Two entries controlling the end points. If the first is 'exclude', the lower
                          bound is left out of the range. If the second is 'include', the upper bound
                          is added to the range.

    Returns:
        list: A list containing one list of integers for each tuple in tuple_list.

    Notes:
        - With any other entries in inclusion, the usual range behavior applies: lower bound in, upper bound out.

    """
    lower_shift = 1 if inclusion[0] == 'exclude' else 0
    upper_shift = 1 if inclusion[1] == 'include' else 0
    return [list(range(bounds[0] + lower_shift, bounds[1] + upper_shift)) for bounds in tuple_list]
def _find_next_start(v, lst):
    """ Return the smallest element of lst that is greater than or equal to v.

    Parameters:
        v: Lower bound for the result.
        lst (list): Candidate values.

    Returns:
        The smallest element of lst that is at least v.

    :raises IndexError:
        - If every element of lst is smaller than v.

    """
    at_or_after = sorted(x for x in lst if x >= v)
    # Indexing (rather than min) keeps the IndexError raised when nothing qualifies.
    return at_or_after[0]