Source code for hed.tools.analysis.annotation_util

""" Utilities to facilitate annotation of events in BIDS. """

import re
from pandas import DataFrame
from hed.errors.exceptions import HedFileError


[docs]def check_df_columns(df, required_cols=('column_name', 'column_value', 'description', 'HED')): """ Return a list of the specified columns that are missing from a dataframe. Parameters: df (DataFrame): Spreadsheet to check the columns of. required_cols (tuple): List of column names that must be present. Returns: list: List of column names that are missing. """ missing_cols = [] column_list = list(df.columns.values) for col in required_cols: if col not in column_list: missing_cols.append(col) return missing_cols
[docs]def df_to_hed(dataframe, description_tag=True): """ Create sidecar-like dictionary from a 4-column dataframe. Parameters: dataframe (DataFrame): A four-column Pandas DataFrame with specific columns. description_tag (bool): If True description tag is included. Returns: dict: A dictionary compatible with BIDS JSON tabular file that includes HED. Notes: - The DataFrame must have the columns with names: column_name, column_value, description, and HED. """ df = dataframe.fillna('n/a') missing_cols = check_df_columns(df) if missing_cols: raise HedFileError("RequiredColumnsMissing", f"Columns {str(missing_cols)} are missing from dataframe", "") hed_dict = {} for index, row in df.iterrows(): if row['HED'] == 'n/a' and row['description'] == 'n/a': continue if row['column_value'] == 'n/a': hed_dict[row['column_name']] = _get_value_entry(row['HED'], row['description'], description_tag=description_tag) continue cat_dict = hed_dict.get(row['column_name'], {}) _update_cat_dict(cat_dict, row['column_value'], row['HED'], row['description'], description_tag=description_tag) hed_dict[row['column_name']] = cat_dict return hed_dict
[docs]def extract_tags(hed_string, search_tag): """ Extract all instances of specified tag from a tag_string. Parameters: hed_string (str): Tag string from which to extract tag. search_tag (str): HED tag to extract. Returns: tuple: - str: Tag string without the tags. - list: A list of the tags that were extracted, for example descriptions. """ extracted = [] remainder = "" back_piece = hed_string while back_piece: ind = back_piece.find(search_tag) if ind == -1: remainder = _update_remainder(remainder, back_piece) break first_pos = _find_last_pos(back_piece[:ind]) remainder = _update_remainder(remainder, trim_back(back_piece[:first_pos])) next_piece = back_piece[first_pos:] last_pos = _find_first_pos(next_piece) extracted.append(trim_back(next_piece[:last_pos])) back_piece = trim_front(next_piece[last_pos:]) return remainder, extracted
[docs]def generate_sidecar_entry(column_name, column_values=None): """ Create a sidecar column dictionary for column. Parameters: column_name (str): Name of the column. column_values (list): List of column values. Returns: dict: A dictionary representing a template for a sidecar entry. """ name_label = re.sub(r'[^A-Za-z0-9-]+', '_', column_name) sidecar_entry = {"Description": f"Description for {column_name}", "HED": ""} if not column_values: sidecar_entry["HED"] = f"(Label/{name_label}, Label/#)" else: levels = {} hed = {} for column_value in column_values: if column_value == "n/a": continue value_label = re.sub(r'[^A-Za-z0-9-]+', '_', column_value) levels[column_value] = f"Here describe column value {column_value} of column {column_name}" hed[column_value] = f"(Label/{name_label}, Label/{value_label})" sidecar_entry["Levels"] = levels sidecar_entry["HED"] = hed return sidecar_entry
[docs]def hed_to_df(sidecar_dict, col_names=None): """ Return a 4-column dataframe of HED portions of sidecar. Parameters: sidecar_dict (dict): A dictionary conforming to BIDS JSON events sidecar format. col_names (list, None): A list of the cols to include in the flattened side car. Returns: DataFrame: Four-column spreadsheet representing HED portion of sidecar. Notes: - The returned DataFrame has columns: column_name, column_value, description, and HED. """ if not col_names: col_names = sidecar_dict.keys() column_name = [] column_value = [] column_description = [] hed_tags = [] for col_key, col_dict in sidecar_dict.items(): if col_key not in col_names or not isinstance(col_dict, dict) or 'HED' not in col_dict: continue elif 'Levels' in col_dict or isinstance(col_dict['HED'], dict): keys, values, descriptions, tags = _flatten_cat_col(col_key, col_dict) else: keys, values, descriptions, tags = _flatten_val_col(col_key, col_dict) column_name = column_name + keys column_value = column_value + values column_description = column_description + descriptions hed_tags = hed_tags + tags data = {"column_name": column_name, "column_value": column_value, "description": column_description, "HED": hed_tags} dataframe = DataFrame(data).astype(str) return dataframe
[docs]def merge_hed_dict(sidecar_dict, hed_dict): """ Update a JSON sidecar based on the hed_dict values. Parameters: sidecar_dict (dict): Dictionary representation of a BIDS JSON sidecar. hed_dict(dict): Dictionary derived from a dataframe representation of HED in sidecar. """ for key, value_dict in hed_dict.items(): if key not in sidecar_dict: sidecar_dict[key] = value_dict continue sidecar_dict[key]['HED'] = value_dict['HED'] if isinstance(value_dict['HED'], str) and value_dict.get('Description', "n/a") != "n/a": sidecar_dict[key]['Description'] = value_dict['Description'] continue if isinstance(value_dict['HED'], dict) and 'Levels' in value_dict: sidecar_dict[key]['Levels'] = value_dict['Levels']
[docs]def trim_back(tag_string): """ Return a trimmed copy of tag_string. Parameters: tag_string (str): A tag string to be trimmed. Returns: str: A copy of tag_string that has been trimmed. Notes: - The trailing blanks and commas are removed from the copy. """ last_pos = 0 for ind, char in enumerate(reversed(tag_string)): if char not in [',', ' ']: last_pos = ind break return_str = tag_string[:(len(tag_string)-last_pos)] return return_str
[docs]def trim_front(tag_string): """ Return a copy of tag_string with leading blanks and commas removed. Parameters: tag_string (str): A tag string to be trimmed. Returns: str: A copy of tag_string that has been trimmed. """ first_pos = len(tag_string) for ind, char in enumerate(tag_string): if char not in [',', ' ']: first_pos = ind break return_str = tag_string[first_pos:] return return_str
def _find_first_pos(tag_string): """ Return the position of the first comma or closing parenthesis in tag_string. Parameters: tag_string (str): String to be analyzed Returns: int: Position of first comma or closing parenthesis or length of tag_string if none. """ for ind, char in enumerate(tag_string): if char in [',', ')']: return ind return len(tag_string) def _find_last_pos(tag_string): """ Find the position of the last comma, blank, or opening parenthesis in tag_string. Parameters: tag_string (str): String to be analyzed Returns: int: Position of last comma or opening parenthesis or 0 if none. """ for index, char in enumerate(reversed(tag_string)): if char in [',', ' ', '(']: return len(tag_string) - index return 0 def _flatten_cat_col(col_key, col_dict): """ Flatten a sidecar entry corresponding to a categorical column. Parameters: col_key (str): Name of the column. col_dict (dict): Dictionary corresponding to categorical of column (must include HED key). Returns: list: A list of keys list: A list of values. list: A list of descriptions. list: A list of HED tag strings. """ keys = [] values = [] descriptions = [] tags = [] hed_dict = col_dict['HED'] level_dict = col_dict.get('Levels', {}) for col_value, entry_value in hed_dict.items(): keys.append(col_key) values.append(col_value) remainder, extracted = extract_tags(entry_value, 'Description/') if remainder: tags.append(remainder) else: tags.append('n/a') if extracted: descriptions.append(_tag_list_to_str(extracted, "Description/")) else: descriptions.append(level_dict.get(col_value, 'n/a')) return keys, values, descriptions, tags def _flatten_val_col(col_key, col_dict): """ Flatten a sidecar entry corresponding to a value column. Parameters: col_key (str): Name of the column. col_dict (dict): Dictionary corresponding to value of column (must include HED key). Returns: list: A one-element list containing the name of the column. list: The list ['n/a']. list: A one-element list containing the description. list: A one-element list containing the HED string. """ tags, extracted = extract_tags(col_dict['HED'], 'Description/') if extracted: description = _tag_list_to_str(extracted, removed_tag="Description/") else: description = col_dict.get('Description', 'n/a') return [col_key], ['n/a'], [description], [tags] # def _get_row_tags(row, description_tag=True): # """ Return the HED string associated with row, possibly without the description. # # Parameters: # row (DataSeries): Pandas data frame containing a row of a tagging spreadsheet. # description_tag (bool): If True, include any Description tags in the returned string. # # Returns: # str: A HED string extracted from the row. # str: A string representing the description (without the Description tag). # # Notes: # If description_tag is True the entire tag string is included with description. # If there was a description extracted, it is appended to any existing description. # # """ # remainder, extracted = extract_tags(row['HED'], 'Description/') # if description_tag: # tags = row["HED"] # else: # tags = remainder # # if row["description"] != 'n/a': # description = row["description"] # else: # description = "" # if extracted: # description = " ".join([description, extracted]) # return tags, description def _get_value_entry(hed_entry, description_entry, description_tag=True): """ Return a HED dictionary representing a value entry in a HED tagging spreadsheet. Parameters: hed_entry (str): The string found in the HED column of the row. description_entry (str): The string found in the description column of the row. description_tag (bool): If True, include the description column as part of the HED entry. Returns: dict: A dictionary with containing only HED and Description keys (as in for a value column of a JSON sidecar.) """ value_dict = {} tags = "" if hed_entry and hed_entry != 'n/a': tags = hed_entry if description_entry and description_entry != 'n/a': value_dict['Description'] = description_entry if description_tag and tags: tags = tags + ", Description/" + description_entry elif description_tag and not tags: tags = "Description/" + description_entry if tags: value_dict["HED"] = tags return value_dict def _tag_list_to_str(extracted, removed_tag=None): """ Return a concatenation of the strings in extracted, with removed_tag prefix deleted. Parameters: extracted (list): List of tag strings to be concatenated. removed_tag (str, None): A HED tag prefix to be removed before concatenation. Returns: (str) concatenated string Note: This function is designed to concatenate strings containing Description tags into a single description. """ if not removed_tag: return " ".join(extracted) str_list = [] for ind, item in enumerate(extracted): ind = item.lower().find(removed_tag.lower()) if ind >= 0: str_list.append(item[ind+len(removed_tag):]) else: str_list.append(item) return " ".join(str_list) def _update_cat_dict(cat_dict, value_entry, hed_entry, description_entry, description_tag=True): """ Update a category entry in the sidecar dictionary based on a row of the spreadsheet. Parameters: cat_dict (dict): A dictionary representing a category column in a JSON sidecar. value_entry (str): The value of the key in the category dictionary. hed_entry (str): HED tag string corresponding to the key. description_entry (str): The description column for the Level entry and possible as a Description tag. description_tag (bool): If True then the description entry is used for Level and as Description tag. Returns: dict: An updated dictionary representing a category column. """ value_dict = _get_value_entry(hed_entry, description_entry, description_tag) if 'Description' in value_dict: level_part = cat_dict.get('Levels', {}) level_part[value_entry] = value_dict['Description'] cat_dict['Levels'] = level_part if 'HED' in value_dict: hed_part = cat_dict.get('HED', {}) hed_part[value_entry] = value_dict['HED'] cat_dict['HED'] = hed_part def _update_remainder(remainder, update_piece): """ Update remainder with update piece. Parameters: remainder (str): A tag string without trailing comma. update_piece (str): A tag string to be appended. Returns: str: A concatenation of remainder and update_piece, paying attention to separating commas. """ if not update_piece: return remainder elif not remainder: return update_piece elif remainder.endswith('(') or update_piece.startswith(')'): return remainder + update_piece else: return remainder + ", " + update_piece