""" Utilities for assembly, analysis, and searching. """
import pandas as pd
from hed.models.tabular_input import TabularInput
from hed.tools.util.data_util import separate_values
from hed.models.hed_tag import HedTag
from hed.models.hed_group import HedGroup
from hed.models import df_util
from hed.models import QueryParser
def assemble_hed(data_input, sidecar, schema, columns_included=None, expand_defs=False):
    """ Build a dataframe holding the assembled HED annotations of a tabular input.

    Parameters:
        data_input (TabularInput): The tabular input file whose HED annotations are to be assembled.
        sidecar (Sidecar): Sidecar from which the definitions are obtained.
        schema (HedSchema): The HED schema used to get the definitions and to expand them.
        columns_included (list or None): Names of additional columns to carry over from the input dataframe.
            If separate_values reports no eligible columns, only the assembled HED column is returned.
        expand_defs (bool): If True, df_util.expand_defs is applied to the assembled strings.

    Returns:
        tuple:
            - DataFrame: The eligible columns (if any) plus a 'HED_assembled' column.
            - The definitions object returned by sidecar.get_def_dict.

    """
    # Only the first value returned by separate_values (the eligible columns) is needed here.
    present_columns, _ = separate_values(list(data_input.dataframe.columns), columns_included)
    assembled = data_input.series_a
    definitions = sidecar.get_def_dict(hed_schema=schema)
    if expand_defs:
        # NOTE(review): the return value is ignored, so this presumably expands in place -- confirm.
        df_util.expand_defs(assembled, schema, definitions)

    # Keep in mind that the assembled strings are a Series at this point; the rest of
    # the function should probably also be modified to take advantage of that.
    if present_columns:
        df = data_input.dataframe[present_columns].copy(deep=True)
        df['HED_assembled'] = assembled
    else:
        df = pd.DataFrame({"HED_assembled": assembled})
    return df, definitions
def get_expression_parsers(queries, query_names=None):
    """ Return a list of expression parsers and the matching list of query names.

    Parameters:
        queries (list): A list of query strings or QueryParser objects.
        query_names (list or None): Column names for the results of the queries.
            If missing or empty, the names query_0, query_1, etc. are generated.

    Returns:
        tuple:
            - list: One parser per query. Strings are converted with QueryParser; other objects are passed through.
            - list: The query names (either those passed in or the generated ones).

    :raises ValueError:
        - If the number of query names differs from the number of queries.
        - If the query names contain duplicates.
        - If a query is empty or a query string cannot be parsed.

    """
    if not query_names:
        query_names = [f"query_{index}" for index in range(len(queries))]
    elif len(queries) != len(query_names):
        raise ValueError("QueryNamesLengthBad",
                         f"The query_names length {len(query_names)} must be empty or equal "
                         f"to the queries length {len(queries)}.")
    elif len(set(query_names)) != len(query_names):
        raise ValueError("DuplicateQueryNames", f"The query names {str(query_names)} list has duplicates")

    expression_parsers = []
    for index, query in enumerate(queries):
        if not query:
            raise ValueError("BadQuery", f"Query [{index}]: {query} cannot be empty")
        if isinstance(query, str):
            try:
                next_query = QueryParser(query)
            except Exception as err:
                # Chain the original error so the reason the parse failed is not lost.
                raise ValueError("BadQuery", f"Query [{index}]: {query} cannot be parsed") from err
        else:
            # Anything that is not a string is assumed to already be a parser.
            next_query = query
        expression_parsers.append(next_query)
    return expression_parsers, query_names
def search_strings(hed_strings, queries, query_names=None):
    """ Return a DataFrame of 0/1 factors giving the results of queries on HED strings.

    Parameters:
        hed_strings (list): A list of HedString objects to be searched.
            NOTE(review): each entry is passed directly to the parser's search method -- confirm
            that None and empty entries are handled there and come out as 0.
        queries (list): A list of query strings or QueryParser objects.
        query_names (list or None): Column names for the results of the queries. If missing or empty,
            names are generated by get_expression_parsers.

    Returns:
        DataFrame: One row per HED string and one column per query; an entry is 1 where the query matched, else 0.

    :raises ValueError:
        - If query names are invalid or duplicated, or a query cannot be parsed.

    """
    parsers, query_names = get_expression_parsers(queries, query_names=query_names)
    df_factors = pd.DataFrame(0, index=range(len(hed_strings)), columns=query_names)
    # get_expression_parsers guarantees one name per parser, so the two lists pair up exactly.
    for column, parser in zip(query_names, parsers):
        for row, hed in enumerate(hed_strings):
            if parser.search(hed):
                df_factors.at[row, column] = 1
    return df_factors
# def get_assembled_strings(table, hed_schema=None, expand_defs=False):
# """ Return HED string objects for a tabular file.
#
# Parameters:
# table (TabularInput): The input file to be searched.
# hed_schema (HedSchema or HedSchemaGroup): If provided, the HedStrings are converted to canonical form.
# expand_defs (bool): If True, definitions are expanded when the events are assembled.
#
# Returns:
# list: A list of HedString objects.
#
# """
# hed_list = list(table.iter_dataframe(hed_ops=[hed_schema], return_string_only=True,
# expand_defs=expand_defs, remove_definitions=True))
# return hed_list
#
# def search_tabular(data_input, sidecar, hed_schema, query, extra_def_dicts=None, columns_included=None):
# """ Return a dataframe with results of query.
#
# Parameters:
# data_input (TabularInput): The tabular input file (e.g., events) to be searched.
# hed_schema (HedSchema or HedSchemaGroup): The schema(s) under which to make the query.
# query (str or list): The str query or list of string queries to make.
# columns_included (list or None): List of names of columns to include
#
# Returns:
# DataFrame or None: A DataFrame with the results of the query or None if no events satisfied the query.
#
# """
#
# eligible_columns, missing_columns = separate_values(list(data_input.dataframe.columns), columns_included)
# hed_list, definitions = df_util.get_assembled(data_input, sidecar, hed_schema, extra_def_dicts=None, join_columns=True,
# shrink_defs=False, expand_defs=True)
# expression = QueryParser(query)
# hed_tags = []
# row_numbers = []
# for index, next_item in enumerate(hed_list):
# match = expression.search(next_item)
# if not match:
# continue
# hed_tags.append(next_item)
# row_numbers.append(index)
#
# if not row_numbers:
# df = None
# elif not eligible_columns:
# df = pd.DataFrame({'row_number': row_numbers, 'HED_assembled': hed_tags})
# else:
# df = data_input.dataframe.iloc[row_numbers][eligible_columns].reset_index()
# df.rename(columns={'index': 'row_number'})
# return df
# def remove_defs(hed_strings):
# """ This removes any def or Def-expand from a list of HedStrings.
#
# Parameters:
# hed_strings (list): A list of HedStrings
#
# Returns:
# list: A list of the removed Defs.
#
# """
# def_groups = [[] for i in range(len(hed_strings))]
# for index, hed in enumerate(hed_strings):
# def_groups[index] = extract_defs(hed)
# return def_groups
#
#
# def extract_defs(hed_string_obj):
# """ This removes any def or Def-expand from a single HedString.
#
# Parameters:
# hed_string_obj (HedString): A HedString
#
# Returns:
# list: A list of the removed Defs.
#
# Notes:
# - the hed_string_obj passed in no longer has definitions.
#
# """
# to_remove = []
# to_append = []
# tuples = hed_string_obj.find_def_tags(recursive=True, include_groups=3)
# for tup in tuples:
# if len(tup[2].children) == 1:
# to_append.append(tup[0])
# else:
# to_append.append(tup[2])
# to_remove.append(tup[2])
# hed_string_obj.remove(to_remove)
# return to_append
def hed_to_str(contents, remove_parentheses=False):
    """ Convert HED contents to a string.

    Parameters:
        contents (None, str, list, HedTag, or HedGroup): The contents to be converted.
        remove_parentheses (bool): If True, a HedGroup with exactly one child is converted
            by _handle_remove rather than by str. Has no effect on None, str, or HedTag contents.

    Returns:
        str: The string form of the contents.
            - None gives the empty string and a str is returned unchanged.
            - A list is converted element by element (falsy elements are skipped) and joined with commas.

    :raises TypeError:
        - If contents is not None, a str, a list, a HedTag, or a HedGroup.

    """
    if contents is None:
        return ''
    if isinstance(contents, str):
        return contents
    if isinstance(contents, HedTag):
        return str(contents)
    if isinstance(contents, list):
        # Falsy elements (None, empty strings, empty lists) contribute nothing to the result.
        converted = [hed_to_str(element, remove_parentheses) for element in contents if element]
        return ",".join(converted)
    if not isinstance(contents, HedGroup):
        raise TypeError("ContentsWrongClass",
                        "hed_to_str accepts only contents that can be converted to string.")
    if not remove_parentheses or len(contents.children) != 1:
        return str(contents)
    return _handle_remove(contents)
def _handle_remove(contents):
if contents.is_group or isinstance(contents.children[0], HedTag):
return str(contents.children[0])
child = contents.children[0]
if child.is_group and len(child.children) == 1:
return str(child.children[0])
return str(child)