Source code for hed.tools.remodeling.operations.split_rows_op

""" Split rows in a tabular file into multiple rows based on a column. """

import numpy as np
import pandas as pd
from hed.tools.remodeling.operations.base_op import BaseOp


class SplitRowsOp(BaseOp):
    """ Split rows in a tabular file into multiple rows based on parameters.

    Required remodeling parameters:
        - **anchor_column** (*str*): The column in which the names of new items are stored.
        - **new_events** (*dict*): Mapping of new values based on values in the original row.
        - **remove_parent_row** (*bool*): If true, the original row that was split is removed.

    Notes:
        Each key of new_events is the value written to the anchor column for the rows it
        generates. Each value is a dict with the entries:

        - **onset_source** (*list*): Numbers and/or column names added to the row onset.
        - **duration** (*list*): Numbers and/or column names summed to give the duration.
        - **copy_columns** (*list*): Names of columns whose values are copied to the new rows
          (treated as empty if omitted).

    """

    PARAMS = {
        "operation": "split_rows",
        "required_parameters": {
            "anchor_column": str,
            "new_events": dict,
            "remove_parent_row": bool
        },
        "optional_parameters": {}
    }

    def __init__(self, parameters):
        """ Constructor for the split rows operation.

        Parameters:
            parameters (dict): Dictionary with the parameter values for required and optional parameters.

        :raises KeyError:
            - If a required parameter is missing.
            - If an unexpected parameter is provided.

        :raises TypeError:
            - If a parameter has the wrong type.

        """
        super().__init__(self.PARAMS, parameters)
        self.anchor_column = parameters['anchor_column']
        self.new_events = parameters['new_events']
        self.remove_parent_row = parameters['remove_parent_row']

    def do_op(self, dispatcher, df, name, sidecar=None):
        """ Split a row representing a particular event into multiple rows.

        Parameters:
            dispatcher (Dispatcher): Manages the operation I/O.
            df (DataFrame): The DataFrame to be remodeled.
            name (str): Unique identifier for the dataframe -- often the original file path.
            sidecar (Sidecar or file-like): Not needed for this operation.

        Returns:
            Dataframe: A new dataframe after processing, sorted by onset.

        :raises TypeError:
            - If bad onset or duration.

        """
        df_new = df.copy()
        if self.anchor_column not in df_new.columns:
            df_new[self.anchor_column] = np.nan
        df_list = [] if self.remove_parent_row else [df_new]
        self._split_rows(df, df_list)
        if not df_list:
            # Parents removed and no new events requested: the result is an empty
            # frame with the same columns (pd.concat cannot take an empty list).
            df_list.append(df_new.iloc[0:0])
        df_ret = pd.concat(df_list, axis=0, ignore_index=True)
        df_ret["onset"] = pd.to_numeric(df_ret["onset"])
        # A stable sort keeps rows with equal onsets in concatenation order
        # (parent rows first, then new events in new_events order).
        df_ret = df_ret.sort_values('onset', kind='mergesort').reset_index(drop=True)
        return df_ret

    def _split_rows(self, df, df_list):
        """ Create the new rows for each entry of new_events and append them to df_list.

        Parameters:
            df (DataFrame): The DataFrame to be split.
            df_list (list): List of DataFrames to which one DataFrame of new rows is appended
                for each new event type (modified in place).

        :raises TypeError:
            - If an onset or duration specifier is neither numeric nor a column name.

        """
        for event, event_parms in self.new_events.items():
            add_events = pd.DataFrame([], columns=df.columns)
            # Assigning the onset Series first gives add_events the same index as df.
            add_events['onset'] = self._create_onsets(df, event_parms['onset_source'])
            add_events[self.anchor_column] = event
            self._add_durations(df, add_events, event_parms['duration'])
            for column in event_parms.get('copy_columns') or []:
                add_events[column] = df[column]
            # Rows whose onset could not be computed do not produce a new event.
            add_events = add_events.dropna(axis='rows', subset=['onset'])
            df_list.append(add_events)

    @staticmethod
    def _add_durations(df, add_events, duration_sources):
        """ Set the duration column of add_events to the sum of the duration sources.

        Parameters:
            df (DataFrame): The dataframe supplying values for column-name sources.
            add_events (DataFrame): The dataframe of new rows (its duration column is set in place).
            duration_sources (list): Numbers and/or names of columns in df to be summed.

        :raises TypeError:
            - If a duration source is neither numeric nor a column name of df.

        """
        add_events['duration'] = 0
        for duration in duration_sources:
            if isinstance(duration, (float, int)):
                add_events['duration'] = add_events['duration'].add(duration)
            elif isinstance(duration, str) and duration in list(df.columns):
                # Non-numeric entries become NaN rather than raising.
                add_events['duration'] = add_events['duration'].add(pd.to_numeric(df[duration], errors='coerce'))
            else:
                raise TypeError("BadDurationInModel",
                                f"Remodeling duration {str(duration)} must either be numeric or a column name", "")

    @staticmethod
    def _create_onsets(df, onset_source):
        """ Create a vector of onsets for the new events.

        Parameters:
            df (DataFrame): The dataframe to process.
            onset_source (list): Numbers and/or names of columns in df to add to the onset column.

        Returns:
            Series: Onsets of the new events, one per row of df (NaN where not computable).

        :raises TypeError:
            - If one of the onset specifiers is invalid.

        """
        onsets = pd.to_numeric(df['onset'], errors='coerce')
        for onset in onset_source:
            if isinstance(onset, (float, int)):
                onsets = onsets + onset
            elif isinstance(onset, str) and onset in list(df.columns):
                onsets = onsets.add(pd.to_numeric(df[onset], errors='coerce'))
            else:
                raise TypeError("BadOnsetInModel",
                                f"Remodeling onset {str(onset)} must either be numeric or a column name.", "")
        return onsets