# Source code for hed.tools.util.data_util

""" Data handling utilities involving dataframes. """

import pandas as pd
import numpy as np
from hed.errors.exceptions import HedFileError


def add_columns(df, column_list, value='n/a'):
    """ Add specified columns to df if not there.

    Parameters:
        df (DataFrame):      Pandas dataframe, modified in place.
        column_list (list):  List of columns to append to the dataframe.
        value (str):         Default fill value for the column.

    Notes:
        - Columns already present in df are left untouched.
        - Missing columns are appended in the order in which they appear in column_list.

    """

    # Track names as they are added so a repeated entry in column_list is only added once.
    existing = set(df.columns)
    for col in column_list:
        if col not in existing:
            df[col] = value
            existing.add(col)


def check_match(ds1, ds2, numeric=False):
    """ Check two Pandas data series have the same values.

    Parameters:
        ds1 (DataSeries):      Pandas data series to check.
        ds2 (DataSeries):      Pandas data series to check.
        numeric (bool):        If true, treat as numeric and do close-to comparison.

    Returns:
        str or list: A message describing the mismatch, or an empty list if the series match.

    Notes:
        - Values are compared by position, so the two series do not need identical index labels.
        - The positions reported in the message are the index labels of ds1 where the values differ.
        - In numeric mode, entries that cannot be converted to numbers become NaN and two NaNs compare equal.
        - In non-numeric mode, the entries are compared after conversion to str.

    """

    if len(ds1.index) != len(ds2.index):
        return f"First series has length {len(ds1.index)} and second series has length {len(ds2.index)}"
    if numeric:
        differs = np.logical_not(np.asarray(
            np.isclose(pd.to_numeric(ds1, errors='coerce'), pd.to_numeric(ds2, errors='coerce'),
                       equal_nan=True)))
    else:
        # Compare the underlying arrays: comparing the Series directly aligns on labels and
        # raises when the two indices are not identical.
        differs = np.asarray(ds1.map(str), dtype=object) != np.asarray(ds2.map(str), dtype=object)
    if differs.any():
        return f"Series differ at positions {list(ds1.index[differs])}"
    return []


def delete_columns(df, column_list):
    """ Remove the listed columns from a dataframe, ignoring names it does not have.

    Parameters:
        df (DataFrame):      Pandas dataframe from which to delete columns.
        column_list (list):  List of candidate column names for deletion.

    Notes:
        - The dataframe is modified in place and nothing is returned.
        - Names in column_list that are not columns of df are silently skipped.

    """

    present = set(df.columns)
    doomed = [name for name in set(column_list) if name in present]
    df.drop(labels=doomed, axis="columns", inplace=True)


def delete_rows_by_column(df, value, column_list=None):
    """ Drop every row that has the given value in one of the searched columns.

    Parameters:
        df (DataFrame):      Pandas dataframe from which to delete rows.
        value (str):         Specified value to indicate row should be deleted.
        column_list (list):  List of columns to search for value. If empty or None, all columns are searched.

    Notes:
        - Both the cell contents and value are converted to str before they are compared.
        - Names in column_list that are not columns of df are ignored.
        - Rows are removed in place by index label and nothing is returned.

    """
    target = str(value)
    if column_list:
        search_cols = list(set(df.columns).intersection(column_list))
    else:
        search_cols = list(df)

    for name in search_cols:
        hits = df[name].map(str) == target
        df.drop(df.loc[hits].index, axis=0, inplace=True)


def get_eligible_values(values, values_included):
    """ Return a list of the items from values that are in values_included or None if no values_included.

    Parameters:
        values (list): List of strings against which to test.
        values_included (list): List of items to be selected from values if they are present.

    Returns:
        list:  List of selected values, in values_included order, or None if values_included is empty or None.

    """

    if not values_included:
        return None
    # Build the lookup set once rather than once per candidate.
    available = frozenset(values)
    return [x for x in values_included if x in available]


def get_key_hash(key_tuple):
    """ Compute a hash from the string forms of a sequence of key values.

    Parameters:
        key_tuple (tuple, list):  The key values in the correct order for lookup.

    Returns:
        int:  Hash of the tuple formed by converting each key value to str.

    """

    as_strings = tuple(map(str, key_tuple))
    return hash(as_strings)


def get_new_dataframe(data):
    """ Return a fresh dataframe built from a tsv filename or copied from a dataframe.

    Parameters:
        data (DataFrame or str):  DataFrame or filename representing a tsv file.

    Returns:
        DataFrame:  A copy of data if it is a DataFrame, otherwise the contents of the
             tab-separated file named by data, read with its first line as the header.

    Notes:
        - When reading a file, "," and "null" are treated as missing values in addition
          to the pandas defaults.
        - Errors raised by pandas while reading the file are not caught here.

    :raises HedFileError:
        - If data is neither a string nor a DataFrame.

    """

    if isinstance(data, pd.DataFrame):
        return data.copy()
    if isinstance(data, str):
        return pd.read_csv(data, delimiter='\t', header=0, keep_default_na=True, na_values=[",", "null"])
    raise HedFileError("BadDataFrame", "get_new_dataframe could not extract DataFrame from data", "")


def get_row_hash(row, key_list):
    """ Compute a hash key from the values of the key columns of a row.

    Parameters:
        row (DataSeries):  A Pandas data series corresponding to a row in a spreadsheet.
        key_list (list):   List of column names to create the hash value from.

    Returns:
        int: Hash key (as produced by get_key_hash) built from the entries of row in the
             columns given by key_list, with missing entries replaced by 'n/a'.

    :raises HedFileError:
        - If row doesn't have all the columns in key_list.

    """
    _, absent = separate_values(list(row.index.values), key_list)
    if absent:
        raise HedFileError("lookup_row", f"row must have all keys, missing{str(absent)}", "")
    # Normalize missing entries so they hash the same way as an explicit 'n/a'.
    key_values = row[key_list].fillna('n/a')
    return get_key_hash(key_values.astype(str))


def get_value_dict(tsv_path, key_col='file_basename', value_col='sampling_rate'):
    """ Get a dictionary of two columns of a dataframe.

    Parameters:
        tsv_path (str):   Path to a tsv file with a header row to be read into a DataFrame.
                          (The value is handed to get_new_dataframe, which also accepts a DataFrame.)
        key_col (str):    Name of the column which should be the key.
        value_col (str):  Name of the column which should be the value.

    Returns:
        dict:  Dictionary with key_col values as the keys and the corresponding value_col values as the values.

    :raises HedFileError:
        - When the key column contains a repeated value.
        - When get_new_dataframe cannot produce a DataFrame from tsv_path.

    """

    value_dict = {}
    df = get_new_dataframe(tsv_path)
    # Walk only the two columns of interest: iterating whole rows converts every row to a
    # single dtype, which turns integer keys into floats whenever another column is float.
    for key, value in zip(df[key_col], df[value_col]):
        if key in value_dict:
            raise HedFileError("DuplicateKeyInValueDict", "The key column must have unique values", "")
        value_dict[key] = value
    return value_dict


def make_info_dataframe(col_info, selected_col):
    """ Build a one-column dataframe of the sorted values recorded for a column.

    Parameters:
        col_info (dict):      Dictionary of dictionaries of column values and counts.
        selected_col (str):   Name of the column used as top level key for col_info.

    Returns:
        dataframe:  A dataframe with a single column named selected_col that holds the sorted
                    keys of col_info[selected_col]. The counts are not included.
                    The returned value is None if selected_col is not a top-level key in col_info
                    or if its dictionary is empty.

    """
    value_counts = col_info.get(selected_col, None)
    if value_counts:
        ordered_values = sorted(value_counts.keys())
        return pd.DataFrame(ordered_values, columns=[selected_col])
    return None


def replace_values(df, values=None, replace_value='n/a', column_list=None):
    """ Replace string values in specified columns.

    Parameters:
        df (DataFrame):            Dataframe whose values will be replaced.
        values (list, None):       List of strings to replace. If None, only empty strings are replaced.
        replace_value (str):       String replacement value.
        column_list (list, None):  List of columns in which to do replacement. If None all columns are processed.

    Returns:
        int: number of values replaced.

    Notes:
        - The replacement is done in place.
        - Cell contents and the entries of values are compared after conversion to str.
        - Names in column_list that are not columns of df are ignored.
        - Each cell is counted at most once, even if replace_value is itself one of values.

    """

    if column_list:
        cols = list(set(column_list).intersection(set(list(df))))
    else:
        cols = list(df)
    targets = {str(value) for value in values} if values else {''}

    num_replaced = 0
    for col in cols:
        # A boolean mask selects exactly the matching rows; selecting by index label would
        # also overwrite non-matching rows that happen to share a label with a matching one.
        value_mask = df[col].map(str).isin(targets)
        count = int(value_mask.sum())
        if count:
            df.loc[value_mask, col] = replace_value
            num_replaced += count
    return num_replaced


def reorder_columns(data, col_order, skip_missing=True):
    """ Return a new dataframe holding the requested columns in the requested order.

    Parameters:
        data (DataFrame, str):      Dataframe or filename of dataframe whose columns are to be reordered.
        col_order (list):           List of column names in desired order.
        skip_missing (bool):        If true, col_order columns missing from data are skipped, otherwise error.

    Returns:
        DataFrame:                  A new dataframe with only the col_order columns that are present in data.

    :raises HedFileError:
        - If col_order contains columns not in data and skip_missing is False.
        - If get_new_dataframe cannot produce a dataframe from data.

    """
    frame = get_new_dataframe(data)
    found, absent = separate_values(frame.columns.values.tolist(), col_order)
    if absent and not skip_missing:
        raise HedFileError("MissingKeys", f"Events file must have columns {str(absent)}", "")
    return frame[found]


def separate_values(values, target_values):
    """ Get target values from the target_values list.

    Parameters:
        values (list):          List of values to be tested.
        target_values (list):   List of desired values.

    Returns:
        tuples:
            list:  Target values present in values.
            list:  Target values missing from values.

    Notes:
        - Both returned lists follow the order of target_values.
        - The list of present values keeps repeated entries of target_values; the list of
          missing values contains each missing value once.
        - If target_values is empty or None, two empty lists are returned.
        - If values is empty or None, every target value is reported as missing.

    """

    if not target_values:
        return [], []
    # Build the lookup set once rather than once per target value.
    available = frozenset(values) if values else frozenset()
    present_values = [x for x in target_values if x in available]
    # dict.fromkeys removes repeats while keeping first-seen order.
    missing_values = list(dict.fromkeys(x for x in target_values if x not in available))
    return present_values, missing_values


def get_indices(df, column, start, stop):
    """ Pair up positions of start and stop values in a column.

    Parameters:
        df (DataFrame):   Dataframe containing the column to scan.
        column (str):     Name of the column whose values are tested.
        start:            Container of values marking a start (tested with `in`).
        stop:             Container of values marking a stop (tested with `in`).

    Returns:
        list:  List of (start_position, stop_position) tuples of 0-based row positions.

    Notes:
        - Pairing begins at the first start position. Each start is paired with the first stop
          position strictly after it, and the next start is the first start position at or
          after that stop.
        - An empty list is returned if there are no start positions or no stop after the first start.

    """
    from bisect import bisect_left, bisect_right

    column_values = df[column].tolist()
    # Both position lists are in ascending order because they come from enumerate.
    start_event = [i for (i, v) in enumerate(column_values) if v in start]
    end_event = [i for (i, v) in enumerate(column_values) if v in stop]

    lst = []
    if not start_event:
        return lst

    next_start = start_event[0]
    while True:
        end_pos = bisect_right(end_event, next_start)   # first stop strictly after the start
        if end_pos == len(end_event):
            break
        next_end = end_event[end_pos]
        lst.append((next_start, next_end))
        start_pos = bisect_left(start_event, next_end)  # first start at or after that stop
        if start_pos == len(start_event):
            break
        next_start = start_event[start_pos]

    return lst


def _find_next(v, lst):
    """ Return the smallest element of lst that is strictly greater than v.

    :raises IndexError:
        - If no element of lst is greater than v.

    """
    for candidate in sorted(lst):
        if candidate > v:
            return candidate
    raise IndexError("list index out of range")


def tuple_to_range(tuple_list, inclusion):
    """ Expand (start, stop) pairs into lists of consecutive integers.

    Parameters:
        tuple_list (list):  List of tuples whose first two entries are the start and stop integers.
        inclusion (list):   Two entries controlling the end points. If the first is 'exclude', the
                            start value is left out. If the second is 'include', the stop value is
                            included.

    Returns:
        list:  One list of integers per tuple in tuple_list.

    Notes:
        - With no adjustment each pair expands like range(start, stop): start included, stop excluded.

    """
    start_shift = 1 if inclusion[0] == 'exclude' else 0
    stop_shift = 1 if inclusion[1] == 'include' else 0
    return [list(range(tup[0] + start_shift, tup[1] + stop_shift)) for tup in tuple_list]


def _find_next_start(v, lst):
    """ Return the smallest element of lst that is greater than or equal to v.

    :raises IndexError:
        - If every element of lst is less than v.

    """
    for candidate in sorted(lst):
        if candidate >= v:
            return candidate
    raise IndexError("list index out of range")