Source code for hed.models.string_util
""" Utilities for manipulating HedString objects. """
import re
from hed.models.hed_string import HedString
def gather_descriptions(hed_string):
    """Pull the description tags out of a HedString and merge their text.

    Parameters:
        hed_string (HedString): The string to search. It is modified in place.

    Returns:
        str: The extension of every description tag found, each terminated by a
            period (one is appended when the extension does not already end
            with one), joined with single spaces.

    Side effect:
        The description tags that were found are removed from hed_string.
    """
    found_tags = hed_string.find_tags({"description"}, recursive=True, include_groups=0)

    sentences = []
    for tag in found_tags:
        text = tag.extension
        # Make every description read as a complete sentence.
        if not text.endswith("."):
            text = text + "."
        sentences.append(text)
    combined = " ".join(sentences)

    hed_string.remove(found_tags)
    return combined
def split_base_tags(hed_string, base_tags, remove_group=False):
    """ Move the tags matching base_tags out of a HedString into a new HedString.

    Parameters:
        hed_string (HedString): The input HedString object to be split. It is modified in place.
        base_tags (list of str): The base tags to look for. Each one is case-folded before searching.
            This is matching the base tag NOT all the terms above it.
        remove_group (bool, optional): Flag indicating whether to remove the parent group
            of a matching tag rather than just the tag. Defaults to False.

    Returns:
        tuple: A tuple containing two HedString objects:
            - The input hed_string, now holding only the remaining tags.
            - A new HedString (same schema) holding the tags or groups that were removed.
    """
    folded_tags = [name.casefold() for name in base_tags]

    if remove_group:
        # include_groups=2 yields (tag, group) pairs.
        pairs = hed_string.find_tags(folded_tags, recursive=True, include_groups=2)
        matches = []
        for tag, group in pairs:
            # When the containing group is itself a HedString, take only the tag
            # instead of the whole group.
            matches.append(tag if isinstance(group, HedString) else group)
    else:
        matches = hed_string.find_tags(folded_tags, recursive=True, include_groups=0)

    if matches:
        hed_string.remove(matches)

    extracted = HedString("", hed_string._schema, _contents=matches)
    return hed_string, extracted
def split_def_tags(hed_string, def_names, remove_group=False):
    """ Move the def tags named in def_names out of a HedString into a new HedString.

    This does NOT handle def-expand tags currently.

    Parameters:
        hed_string (HedString): The input HedString object to be split. It is modified in place.
        def_names (list of str): A list of def names to search for. Can optionally include a value.
            Each name is prefixed with "def/" and case-folded before searching.
        remove_group (bool, optional): Flag indicating whether to remove the parent group
            of a matching tag rather than just the tag. Defaults to False.

    Returns:
        tuple: A tuple containing two HedString objects:
            - The input hed_string, now holding only the remaining tags.
            - A new HedString (same schema) holding the def tags or groups that were removed.
    """
    patterns = [f"def/{name}".casefold() for name in def_names]

    if remove_group:
        # include_groups=2 yields (tag, group) pairs.
        pairs = hed_string.find_wildcard_tags(patterns, recursive=True, include_groups=2)
        matches = []
        for tag, group in pairs:
            # When the containing group is itself a HedString, take only the tag
            # instead of the whole group.
            matches.append(tag if isinstance(group, HedString) else group)
    else:
        matches = hed_string.find_wildcard_tags(patterns, recursive=True, include_groups=0)

    if matches:
        hed_string.remove(matches)

    extracted = HedString("", hed_string._schema, _contents=matches)
    return hed_string, extracted
# Patterns used by cleanup_empties, compiled once at import time.
_CLEANUP_LEADING_COMMAS = re.compile(r'^\s*,+')
_CLEANUP_TRAILING_COMMA = re.compile(r',\s*$')
_CLEANUP_REPEATED_COMMAS = re.compile(r',\s*,+')
_CLEANUP_EMPTY_PARENS = re.compile(r'\(\s*\)')
# Parentheses holding nothing but commas and whitespace.  A single character
# class is used (rather than several adjacent overlapping quantifiers) so a
# failed match on an unclosed "(" does not backtrack polynomially.
_CLEANUP_BLANK_PARENS = re.compile(r'\([,\s]*\)')
_CLEANUP_FILLER_BEFORE_CLOSE = re.compile(r'[\s,]+\)')
_CLEANUP_COMMAS_AFTER_OPEN = re.compile(r'\(\s*,+')


def cleanup_empties(string_in: str) -> str:
    """ Remove empty groups and stray commas from a comma/parenthesis delimited string.

    The clean-up steps are repeated until the string stops changing, because
    removing one empty group can expose another (e.g. "(())" or "((), ())").

    Parameters:
        string_in (str): The text to clean up.

    Returns:
        str: The text with:
            - parentheses that contain only whitespace and/or commas removed,
            - commas at the very start and end of the string removed,
            - runs of commas (optionally separated by whitespace) collapsed to one,
            - whitespace and commas directly before a ")" removed,
            - commas directly after a "(" removed,
            - surrounding whitespace stripped.

    Examples:
        >>> cleanup_empties("A, (), B")
        'A, B'
        >>> cleanup_empties("(A, ())")
        '(A)'
    """
    result = string_in
    previous_result = None

    while result != previous_result:
        previous_result = result

        # Step 1: Remove empty parentheses.
        result = _CLEANUP_EMPTY_PARENS.sub('', result)

        # Step 2: Remove parentheses containing only commas/whitespace.  Running
        # this right after step 1 lets nested empties such as "(())" collapse
        # within a single pass.
        result = _CLEANUP_BLANK_PARENS.sub('', result)

        # Step 3: Remove leading and trailing commas.
        result = _CLEANUP_LEADING_COMMAS.sub('', result)
        result = _CLEANUP_TRAILING_COMMA.sub('', result)

        # Step 4: Collapse multiple commas inside.
        result = _CLEANUP_REPEATED_COMMAS.sub(',', result)

        # Step 5: Remove trailing commas/whitespace inside parentheses.
        result = _CLEANUP_FILLER_BEFORE_CLOSE.sub(')', result)

    # Finally drop commas that directly follow an opening parenthesis.
    result = _CLEANUP_COMMAS_AFTER_OPEN.sub('(', result)
    return result.strip()