Source code for hed.models.hed_string

"""
This module is used to split tags in a HED string.
"""
import copy
from hed.models.hed_group import HedGroup
from hed.models.hed_tag import HedTag
from hed.models.model_constants import DefTagNames


[docs]class HedString(HedGroup): """ A HED string. """ OPENING_GROUP_CHARACTER = '(' CLOSING_GROUP_CHARACTER = ')'
[docs] def __init__(self, hed_string, hed_schema, def_dict=None, _contents=None): """ Constructor for the HedString class. Parameters: hed_string (str): A HED string consisting of tags and tag groups. hed_schema (HedSchema): The schema to use to identify tags. def_dict(DefinitionDict or None): The def dict to use to identify def/def expand tags. _contents ([HedGroup and/or HedTag] or None): Create a HedString from this exact list of children. Does not make a copy. Notes: - The HedString object parses its component tags and groups into a tree-like structure. """ if _contents is not None: contents = _contents else: try: contents = self.split_into_groups(hed_string, hed_schema, def_dict) except ValueError: contents = [] super().__init__(hed_string, contents=contents, startpos=0, endpos=len(hed_string)) self._schema = hed_schema self._from_strings = None self._def_dict = def_dict
[docs] @classmethod def from_hed_strings(cls, hed_strings): """ Factory for creating HedStrings via combination. Parameters: hed_strings (list or None): A list of HedString objects to combine. This takes ownership of their children. Returns: new_string(HedString): The newly combined HedString """ if not hed_strings: raise TypeError("Passed an empty list to from_hed_strings") new_string = HedString.__new__(HedString) hed_string = ",".join([group._hed_string for group in hed_strings]) contents = [child for sub_string in hed_strings for child in sub_string.children] first_schema = hed_strings[0]._schema first_dict = hed_strings[0]._def_dict new_string.__init__(hed_string=hed_string, _contents=contents, hed_schema=first_schema, def_dict=first_dict) new_string._from_strings = hed_strings return new_string
@property def is_group(self): """ Always False since the underlying string is not a group with parentheses. """ return False def _calculate_to_canonical_forms(self, hed_schema): """ Identify all tags using the given schema. Parameters: hed_schema (HedSchema, HedSchemaGroup): The schema to use to validate/convert tags. Returns: list: A list of issues found while converting the string. Each issue is a dictionary. """ validation_issues = [] for tag in self.get_all_tags(): validation_issues += tag._calculate_to_canonical_forms(hed_schema) return validation_issues def __deepcopy__(self, memo): # check if the object has already been copied if id(self) in memo: return memo[id(self)] # create a new instance of HedString class, and direct copy all parameters new_string = self.__class__.__new__(self.__class__) new_string.__dict__.update(self.__dict__) # add the new object to the memo dictionary memo[id(self)] = new_string # Deep copy the attributes that need it(most notably, we don't copy schema/schema entry) new_string._original_children = copy.deepcopy(self._original_children, memo) new_string._from_strings = copy.deepcopy(self._from_strings, memo) new_string.children = copy.deepcopy(self.children, memo) return new_string
[docs] def copy(self): """ Return a deep copy of this string. Returns: HedString: The copied group. """ return_copy = copy.deepcopy(self) return return_copy
[docs] def remove_definitions(self): """ Remove definition tags and groups from this string. This does not validate definitions and will blindly removing invalid ones as well. """ definition_groups = self.find_top_level_tags({DefTagNames.DEFINITION_KEY}, include_groups=1) if definition_groups: self.remove(definition_groups)
[docs] def shrink_defs(self): """ Replace def-expand tags with def tags This does not validate them and will blindly shrink invalid ones as well. Returns: self """ for def_expand_tag, def_expand_group in self.find_tags({DefTagNames.DEF_EXPAND_KEY}, recursive=True): expanded_parent = def_expand_group._parent if expanded_parent: def_expand_tag.short_base_tag = DefTagNames.DEF_ORG_KEY def_expand_tag._parent = expanded_parent expanded_parent.replace(def_expand_group, def_expand_tag) return self
[docs] def expand_defs(self): """ Replace def tags with def-expand tags This does very minimal validation Returns: self """ def_tags = self.find_def_tags(recursive=True, include_groups=0) replacements = [] for tag in def_tags: if tag.expandable and not tag.expanded: replacements.append((tag, tag.expandable)) for tag, group in replacements: tag_parent = tag._parent tag_parent.replace(tag, group) tag._parent = group tag.short_base_tag = DefTagNames.DEF_EXPAND_KEY return self
[docs] def get_as_original(self): """ Return the original form of this string. Returns: str: The string with all the tags in their original form. Notes: Potentially with some extraneous spaces removed on returned string. """ return self.get_as_form("org_tag")
[docs] @staticmethod def split_into_groups(hed_string, hed_schema, def_dict=None): """ Split the HED string into a parse tree. Parameters: hed_string (str): A hed string consisting of tags and tag groups to be processed. hed_schema (HedSchema): HED schema to use to identify tags. def_dict(DefinitionDict): The definitions to identify Returns: list: A list of HedTag and/or HedGroup. :raises ValueError: - The string is significantly malformed, such as mismatched parentheses. Notes: - The parse tree consists of tag groups, tags, and delimiters. """ current_tag_group = [[]] input_tags = HedString.split_hed_string(hed_string) for is_hed_tag, (startpos, endpos) in input_tags: if is_hed_tag: new_tag = HedTag(hed_string, hed_schema, (startpos, endpos), def_dict) current_tag_group[-1].append(new_tag) else: string_portion = hed_string[startpos:endpos] delimiter_index = 0 for i, char in enumerate(string_portion): if not char.isspace(): delimiter_index = i break delimiter_char = string_portion[delimiter_index] if delimiter_char is HedString.OPENING_GROUP_CHARACTER: current_tag_group.append(HedGroup(hed_string, startpos + delimiter_index)) if delimiter_char is HedString.CLOSING_GROUP_CHARACTER: # if prev_delimiter == ",": # raise ValueError(f"Closing parentheses in hed string {hed_string}") # Terminate existing group, and save it off. paren_end = startpos + delimiter_index + 1 if len(current_tag_group) > 1: new_group = current_tag_group.pop() new_group._endpos = paren_end current_tag_group[-1].append(new_group) else: raise ValueError(f"Closing parentheses in hed string {hed_string}") # Comma delimiter issues are ignored and assumed already validated currently. if len(current_tag_group) != 1: raise ValueError(f"Unmatched opening parentheses in hed string {hed_string}") return current_tag_group[0]
def _get_org_span(self, tag_or_group): """ If this tag or group was in the original hed string, find its original span. Parameters: tag_or_group (HedTag or HedGroup): The hed tag to locate in this string. Returns: int or None: Starting position of the given item in the original string. int or None: Ending position of the given item in the original string. Notes: - If the hed tag or group was not in the original string, returns (None, None). """ if self._from_strings: return self._get_org_span_from_strings(tag_or_group) if self.check_if_in_original(tag_or_group): return tag_or_group.span return None, None def _get_org_span_from_strings(self, tag_or_group): """A different case of the above, to handle if this was created from hed string objects.""" found_string = None string_start_index = 0 for string in self._from_strings: if string.check_if_in_original(tag_or_group): found_string = string break # Add 1 for comma string_start_index += string.span[1] + 1 if not found_string: return None, None return tag_or_group.span[0] + string_start_index, tag_or_group.span[1] + string_start_index
[docs] @staticmethod def split_hed_string(hed_string): """ Split a HED string into delimiters and tags. Parameters: hed_string (str): The HED string to split. Returns: list: A list of tuples where each tuple is (is_hed_tag, (start_pos, end_pos)). Notes: - The tuple format is as follows - is_hed_tag (bool): A (possible) hed tag if true, delimiter if not. - start_pos (int): Index of start of string in hed_string. - end_pos (int): Index of end of string in hed_string - This function does not validate tags or delimiters in any form. """ tag_delimiters = ",()" current_spacing = 0 found_symbol = True result_positions = [] tag_start_pos = None last_end_pos = 0 for i, char in enumerate(hed_string): if char == " ": current_spacing += 1 continue if char in tag_delimiters: if found_symbol: # view_string = hed_string[last_end_pos: i] if last_end_pos != i: result_positions.append((False, (last_end_pos, i))) last_end_pos = i elif not found_symbol: found_symbol = True last_end_pos = i - current_spacing # view_string = hed_string[tag_start_pos: last_end_pos] result_positions.append((True, (tag_start_pos, last_end_pos))) current_spacing = 0 tag_start_pos = None continue # If we have a current delimiter, end it here. if found_symbol and last_end_pos is not None: # view_string = hed_string[last_end_pos: i] if last_end_pos != i: result_positions.append((False, (last_end_pos, i))) last_end_pos = None found_symbol = False current_spacing = 0 if tag_start_pos is None: tag_start_pos = i if last_end_pos is not None and len(hed_string) != last_end_pos: # view_string = hed_string[last_end_pos: len(hed_string)] result_positions.append((False, (last_end_pos, len(hed_string)))) if tag_start_pos is not None: # view_string = hed_string[tag_start_pos: len(hed_string)] result_positions.append((True, (tag_start_pos, len(hed_string) - current_spacing))) if current_spacing: result_positions.append((False, (len(hed_string) - current_spacing, len(hed_string)))) return result_positions
[docs] def validate(self, allow_placeholders=True, error_handler=None): """ Validate the string using the schema Parameters: allow_placeholders(bool): allow placeholders in the string error_handler(ErrorHandler or None): the error handler to use, creates a default one if none passed Returns: issues (list of dict): A list of issues for hed string """ from hed.validator import HedValidator validator = HedValidator(self._schema, def_dicts=self._def_dict) return validator.validate(self, allow_placeholders=allow_placeholders, error_handler=error_handler)
[docs] def find_top_level_tags(self, anchor_tags, include_groups=2): """ Find top level groups with an anchor tag. A max of 1 tag located per top level group. Parameters: anchor_tags (container): A list/set/etc of short_base_tags to find groups by. include_groups (0, 1 or 2): Parameter indicating what return values to include. If 0: return only tags. If 1: return only groups. If 2 or any other value: return both. Returns: list or tuple: The returned result depends on include_groups: """ top_level_tags = [] for group in self.groups(): for tag in group.tags(): if tag.short_base_tag.lower() in anchor_tags: top_level_tags.append((tag, group)) # Only capture a max of 1 per group. These are implicitly unique. break if include_groups == 0 or include_groups == 1: return [tag[include_groups] for tag in top_level_tags] return top_level_tags
[docs] def remove_refs(self): """ This removes any refs(tags contained entirely inside curly braces) from the string. This does NOT validate the contents of the curly braces. This is only relevant when directly editing sidecar strings. Tools will naturally ignore these. """ ref_tags = [tag for tag in self.get_all_tags() if tag.is_column_ref()] if ref_tags: self.remove(ref_tags)