# Source code for hed.tools.remodeling.operations.factor_hed_tags_op
""" Create tabular file factors from tag queries. """
import pandas as pd
import numpy as np
from hed.tools.remodeling.operations.base_op import BaseOp
from hed.models.tabular_input import TabularInput
from hed.models.sidecar import Sidecar
from hed.models.df_util import get_assembled
from hed.tools.analysis.analysis_util import get_expression_parsers, search_strings
from hed.tools.analysis.event_manager import EventManager
class FactorHedTagsOp(BaseOp):
    """ Create tabular file factors from tag queries.

    Required remodeling parameters:
        - **queries** (*list*): Queries to be applied successively as filters.
        - **query_names** (*list*): Column names for the query factors.
        - **remove_types** (*list*): Structural HED tags to be removed.

    Optional remodeling parameters:
        - **expand_context** (*bool*): Expand the context if True.

    Notes:
        - If factor column names are not provided, *query1*, *query2*, ... are used.
        - When the context is expanded, the effect of events for temporal extent is accounted for.
        - Context expansion is not implemented in the current version.

    """

    # Specification handed to BaseOp.__init__ together with the actual parameters.
    PARAMS = {
        "operation": "factor_hed_tags",
        "required_parameters": {
            "queries": list,
            "query_names": list,
            "remove_types": list
        },
        "optional_parameters": {
            "expand_context": bool
        }
    }

    def __init__(self, parameters):
        """ Constructor for the factor HED tags operation.

        Parameters:
            parameters (dict): Actual values of the parameters for the operation.

        :raises KeyError:
            - If a required parameter is missing.
            - If an unexpected parameter is provided.

        :raises TypeError:
            - If a parameter has the wrong type.

        :raises ValueError:
            - If the specification is missing a valid operation.
            - If the length of query names is not empty and not same length as queries.
            - If there are duplicate query names.

        """
        super().__init__(self.PARAMS, parameters)
        self.queries = parameters['queries']
        self.remove_types = parameters['remove_types']
        # Optional parameter: stored for completeness; do_op does not use it (see class Notes).
        self.expand_context = parameters.get('expand_context', False)
        # get_expression_parsers returns the parsers and the query names that are actually used,
        # so self.query_names is set from its result rather than directly from the parameters.
        self.expression_parsers, self.query_names = get_expression_parsers(self.queries,
                                                                           query_names=parameters['query_names'])

    def do_op(self, dispatcher, df, name, sidecar=None):
        """ Factor the column using HED tag queries.

        Parameters:
            dispatcher (Dispatcher): Manages the operation I/O.
            df (DataFrame): The DataFrame to be remodeled.
            name (str): Unique identifier for the dataframe -- often the original file path.
            sidecar (Sidecar or file-like): Only needed for HED operations.

        Returns:
            Dataframe: A new dataframe after processing.

        :raises ValueError:
            - If a name for a new query factor column is already a column.

        """
        # Wrap anything that is not already a Sidecar (e.g. a path or file-like object).
        if sidecar and not isinstance(sidecar, Sidecar):
            sidecar = Sidecar(sidecar)
        # Work on a copy so the caller's DataFrame is never modified.
        input_data = TabularInput(df.copy(), sidecar=sidecar, name=name)

        # A factor column must not clobber an existing column.
        column_names = list(df.columns)
        for query_name in self.query_names:
            if query_name in column_names:
                raise ValueError("QueryNameAlreadyColumn",
                                 f"Query [{query_name}]: is already a column name of the data frame")

        df_list = [input_data.dataframe]
        # NOTE(review): the EventManager instance is not used below; its construction is kept
        # in case it validates the input -- confirm before removing.
        _event_man = EventManager(input_data, dispatcher.hed_schema)
        hed_strings, _ = get_assembled(input_data, sidecar, dispatcher.hed_schema, extra_def_dicts=None,
                                       join_columns=True, shrink_defs=False, expand_defs=True)
        df_factors = search_strings(hed_strings, self.expression_parsers, query_names=self.query_names)
        if len(df_factors.columns) > 0:
            df_list.append(df_factors)
        df_new = pd.concat(df_list, axis=1)
        # np.nan rather than np.NaN: the capitalized alias was removed in NumPy 2.0.
        df_new.replace('n/a', np.nan, inplace=True)
        return df_new