# Source code for hed.tools.analysis.analysis_util

""" Utilities for assembly, analysis, and searching. """

import pandas as pd
from hed.models.tabular_input import TabularInput
from hed.tools.util.data_util import separate_values
from hed.models.hed_tag import HedTag
from hed.models.hed_group import HedGroup
from hed.models import df_util
from hed.models import QueryParser


def assemble_hed(data_input, sidecar, schema, columns_included=None, expand_defs=False):
    """ Build a dataframe holding the assembled HED annotations of a tabular file.

    Parameters:
        data_input (TabularInput): The tabular input whose HED annotations are assembled.
        sidecar (Sidecar):  Sidecar from which the definitions are obtained.
        schema (HedSchema):  HED schema passed to the sidecar and to the definition expansion.
        columns_included (list or None):  Names of additional columns to carry over.
            Only names that separate_values reports as present in the input dataframe are used.
        expand_defs (bool): If True, df_util.expand_defs is applied to the assembled strings.

    Returns:
        tuple:
            - DataFrame: A 'HED_assembled' column, preceded by the eligible extra columns if any.
            - The definitions object returned by sidecar.get_def_dict.

    Notes:
        - The return value of df_util.expand_defs is ignored, so the expansion is presumably
          done in place on the series -- confirm against df_util.

    """

    available_columns = list(data_input.dataframe.columns)
    eligible_columns, _ = separate_values(available_columns, columns_included)

    # The assembled annotations arrive as a pandas Series, one entry per row.
    assembled = data_input.series_a
    definitions = sidecar.get_def_dict(hed_schema=schema)
    if expand_defs:
        df_util.expand_defs(assembled, schema, definitions)

    if eligible_columns:
        # Deep copy so that adding the new column never touches the input dataframe.
        df = data_input.dataframe[eligible_columns].copy(deep=True)
        df['HED_assembled'] = assembled
    else:
        df = pd.DataFrame({"HED_assembled": assembled})
    return df, definitions


def get_expression_parsers(queries, query_names=None):
    """ Return a list of expression parsers and the corresponding query names.

    Parameters:
        queries (list):  A list of query strings or QueryParser objects.
        query_names (list or None): Column names for the results of the queries.
            If empty or None, the names query_0, query_1, etc. are generated.

    Returns:
        tuple:
            - list: One parser per query. String queries are converted with QueryParser;
              any other non-empty entry is passed through unchanged.
            - list: The query names (either those passed in or the generated ones).

    :raises ValueError:
        - If query_names is given but its length differs from that of queries.
        - If query_names contains duplicates.
        - If a query is empty or a query string cannot be parsed.

    """
    if not query_names:
        query_names = [f"query_{index}" for index in range(len(queries))]
    elif len(queries) != len(query_names):
        raise ValueError("QueryNamesLengthBad",
                         f"The query_names length {len(query_names)} must be empty or equal "
                         f"to the queries length {len(queries)}.")
    elif len(set(query_names)) != len(query_names):
        raise ValueError("DuplicateQueryNames", f"The query names {str(query_names)} list has duplicates")

    expression_parsers = []
    for index, query in enumerate(queries):
        if not query:
            raise ValueError("BadQuery", f"Query [{index}]: {query} cannot be empty")
        if isinstance(query, str):
            try:
                next_query = QueryParser(query)
            except Exception as ex:
                # Chain the original error so the cause of the parse failure is not lost.
                raise ValueError("BadQuery", f"Query [{index}]: {query} cannot be parsed") from ex
        else:
            # Anything that is not a string is used as an already-built parser.
            next_query = query
        expression_parsers.append(next_query)
    return expression_parsers, query_names


def search_strings(hed_strings, queries, query_names=None):
    """ Return a DataFrame of 0/1 factors giving the results of each query on each string.

    Parameters:
        hed_strings (list):  The items to be searched, one per output row.
        queries (list):  A list of query strings or QueryParser objects.
        query_names (list): Column names for the query results.
            If missing, get_expression_parsers generates query_0, query_1, etc.

    Returns:
        DataFrame: One row per entry of hed_strings and one column per query.
            A cell is 1 when the parser's search of that entry is truthy and 0 otherwise.

    :raises ValueError:
        - If get_expression_parsers rejects the queries or the query names.

    """

    parsers, query_names = get_expression_parsers(queries, query_names=query_names)
    row_count = len(hed_strings)

    # Start from all zeros and flip only the cells where a query matches.
    df_factors = pd.DataFrame(0, index=range(row_count), columns=query_names)
    for column_name, parser in zip(query_names, parsers):
        for row, hed_item in enumerate(hed_strings):
            if parser.search(hed_item):
                df_factors.at[row, column_name] = 1
    return df_factors

# def get_assembled_strings(table, hed_schema=None, expand_defs=False):
#     """ Return HED string objects for a tabular file.
# 
#     Parameters:
#         table (TabularInput): The input file to be searched.
#         hed_schema (HedSchema or HedschemaGroup): If provided the HedStrings are converted to canonical form.
#         expand_defs (bool): If True, definitions are expanded when the events are assembled.
# 
#     Returns:
#         list: A list of HedString objects.
# 
#     """
#     hed_list = list(table.iter_dataframe(hed_ops=[hed_schema], return_string_only=True,
#                                          expand_defs=expand_defs, remove_definitions=True))
#     return hed_list
# 

# def search_tabular(data_input, sidecar, hed_schema, query, extra_def_dicts=None, columns_included=None):
#     """ Return a dataframe with results of query.
# 
#     Parameters:
#         data_input (TabularInput): The tabular input file (e.g., events) to be searched.
#         hed_schema (HedSchema or HedSchemaGroup):  The schema(s) under which to make the query.
#         query (str or list):     The str query or list of string queries to make.
#         columns_included (list or None):  List of names of columns to include
# 
#     Returns:
#         DataFrame or None: A DataFrame with the results of the query or None if no events satisfied the query.
# 
#     """
# 
#     eligible_columns, missing_columns = separate_values(list(data_input.dataframe.columns), columns_included)
#     hed_list, definitions = df_util.get_assembled(data_input, sidecar, hed_schema, extra_def_dicts=None, join_columns=True,
#                                                   shrink_defs=False, expand_defs=True)
#     expression = QueryParser(query)
#     hed_tags = []
#     row_numbers = []
#     for index, next_item in enumerate(hed_list):
#         match = expression.search(next_item)
#         if not match:
#             continue
#         hed_tags.append(next_item)
#         row_numbers.append(index)
# 
#     if not row_numbers:
#         df = None
#     elif not eligible_columns:
#         df = pd.DataFrame({'row_number': row_numbers, 'HED_assembled': hed_tags})
#     else:
#         df = data_input.dataframe.iloc[row_numbers][eligible_columns].reset_index()
#         df.rename(columns={'index': 'row_number'})
#     return df


# def remove_defs(hed_strings):
#     """ This removes any def or Def-expand from a list of HedStrings.
#
#     Parameters:
#         hed_strings (list):  A list of HedStrings
#
#     Returns:
#         list: A list of the removed Defs.
#
#     """
#     def_groups = [[] for i in range(len(hed_strings))]
#     for index, hed in enumerate(hed_strings):
#         def_groups[index] = extract_defs(hed)
#     return def_groups
#
#
# def extract_defs(hed_string_obj):
#     """ This removes any def or Def-expand from a list of HedStrings.
#
#     Parameters:
#         hed_string_obj (HedString):  A HedString
#
#     Returns:
#         list: A list of the removed Defs.
#
#     Notes:
#         - the hed_string_obj passed in no longer has definitions.
#
#     """
#     to_remove = []
#     to_append = []
#     tuples = hed_string_obj.find_def_tags(recursive=True, include_groups=3)
#     for tup in tuples:
#         if len(tup[2].children) == 1:
#             to_append.append(tup[0])
#         else:
#             to_append.append(tup[2])
#         to_remove.append(tup[2])
#     hed_string_obj.remove(to_remove)
#     return to_append


def hed_to_str(contents, remove_parentheses=False):
    """ Convert HED contents of several possible types to a string.

    Parameters:
        contents (None, str, HedTag, list, or HedGroup): The item to convert.
        remove_parentheses (bool): If True, a HedGroup with exactly one child is converted
            by _handle_remove rather than by str.

    Returns:
        str: '' for None, the string itself for a str, str(contents) for a HedTag, and a
            comma-joined conversion of the truthy elements for a list. A HedGroup is
            converted with str unless remove_parentheses applies.

    :raises TypeError:
        - If contents is none of the supported types.

    """

    if contents is None:
        return ''
    if isinstance(contents, str):
        return contents
    if isinstance(contents, HedTag):
        return str(contents)
    if isinstance(contents, list):
        # Falsy elements (None, empty strings, ...) are dropped before joining.
        return ",".join(hed_to_str(item, remove_parentheses) for item in contents if item)
    if not isinstance(contents, HedGroup):
        raise TypeError("ContentsWrongClass", "OnsetGroup excepts contents that can be converted to string.")
    if remove_parentheses and len(contents.children) == 1:
        return _handle_remove(contents)
    return str(contents)


def _handle_remove(contents):
    """ Return the string form of the only child of contents, unwrapping one extra level if possible.

    Parameters:
        contents (HedGroup): A group whose first child is converted; hed_to_str calls this
            only when contents has exactly one child.

    Returns:
        str: The string form of the first child, or of that child's own single child when
            contents is not flagged as a group and its first child is a one-element group.

    """
    if not contents.is_group:
        inner = contents.children[0]
        if not isinstance(inner, HedTag):
            # The only child is itself a container: peel it when it wraps a single item.
            if inner.is_group and len(inner.children) == 1:
                return str(inner.children[0])
            return str(inner)
    return str(contents.children[0])