"""
This module is used to validate the HED tags as strings.
"""
import re
from hed.errors.error_reporter import ErrorHandler
from hed.models.model_constants import DefTagNames
from hed.schema import HedKey
from hed.errors.error_types import ValidationErrors
from hed.validator import tag_validator_util
class TagValidator:
    """ Validation for individual HED tags. """

    CAMEL_CASE_EXPRESSION = r'([A-Z]+\s*[a-z-]*)+'
    INVALID_STRING_CHARS = '[]{}~'
    # Curly braces are dropped from the invalid set when placeholders are allowed.
    INVALID_STRING_CHARS_PLACEHOLDERS = '[]~'
    OPENING_GROUP_CHARACTER = '('
    CLOSING_GROUP_CHARACTER = ')'
    COMMA = ','

    # The # sign is allowed by default as it is specifically checked for separately.
    DEFAULT_ALLOWED_PLACEHOLDER_CHARS = ".+-^ _#"
    # Placeholder characters are checked elsewhere, but by default allowed.
    TAG_ALLOWED_CHARS = "-_/"

    def __init__(self, hed_schema):
        """ Constructor for the TagValidator class.

        Parameters:
            hed_schema (HedSchema): A HedSchema object.
        """
        self._hed_schema = hed_schema

        # Dict contains all the value portion validators for value class. e.g. "is this a number?"
        self._value_unit_validators = self._register_default_value_validators()

    # ==========================================================================
    # Top level validator functions
    # =========================================================================+
    def run_hed_string_validators(self, hed_string_obj, allow_placeholders=False):
        """ Basic high level checks of the hed string.

        Parameters:
            hed_string_obj (HedString): A HED string.
            allow_placeholders (bool): Allow placeholder and curly brace characters.

        Returns:
            list: The validation issues associated with a HED string. Each issue is a dictionary.

        Notes:
            - Used for basic invalid characters or bad delimiters.
        """
        original_string = hed_string_obj.get_original_hed_string()

        validation_issues = []
        validation_issues += self.check_invalid_character_issues(original_string, allow_placeholders)
        validation_issues += self.check_count_tag_group_parentheses(original_string)
        validation_issues += self.check_delimiter_issues_in_hed_string(original_string)
        # NOTE(review): check_tag_formatting is not defined in the visible part of this class;
        # confirm it is provided elsewhere, otherwise this loop raises AttributeError.
        for tag in hed_string_obj.get_all_tags():
            validation_issues += self.check_tag_formatting(tag)
        return validation_issues

    def run_validate_tag_characters(self, original_tag, allow_placeholders):
        """ Basic character validation of tags.

        Parameters:
            original_tag (HedTag): A original tag.
            allow_placeholders (bool): Allow value class or extensions to be placeholders rather than a specific value.

        Returns:
            list: The validation issues associated with the characters. Each issue is dictionary.
        """
        return self.check_tag_invalid_chars(original_tag, allow_placeholders)

    def run_individual_tag_validators(self, original_tag, allow_placeholders=False,
                                      is_definition=False):
        """ Runs the hed_ops on the individual tags.

        Parameters:
            original_tag (HedTag): A original tag.
            allow_placeholders (bool): Allow value class or extensions to be placeholders rather than a specific value.
            is_definition (bool): This tag is part of a Definition, not a normal line.

        Returns:
            list: The validation issues associated with the tags. Each issue is dictionary.

        Notes:
            - Schema-dependent checks only run when a schema was supplied to the constructor.
            - The capitalization check always runs.
        """
        validation_issues = []
        if self._hed_schema:
            validation_issues += self.check_tag_exists_in_schema(original_tag)

            # Only one of the value/extension checks applies to any given tag.
            if original_tag.is_unit_class_tag():
                validation_issues += self.check_tag_unit_class_units_are_valid(original_tag)
            elif original_tag.is_value_class_tag():
                validation_issues += self.check_tag_value_class_valid(original_tag)
            elif original_tag.extension:
                validation_issues += self.check_for_invalid_extension_chars(original_tag)

            if not allow_placeholders:
                validation_issues += self.check_for_placeholder(original_tag, is_definition)
            validation_issues += self.check_tag_requires_child(original_tag)
        validation_issues += self.check_capitalization(original_tag)
        return validation_issues

    def run_tag_level_validators(self, original_tag_list, is_top_level, is_group):
        """ Run hed_ops at each level in a HED string.

        Parameters:
            original_tag_list (list): A list containing the original HedTags.
            is_top_level (bool): If True, this group is a "top level tag group".
            is_group (bool): If true, group is contained by parenthesis.

        Returns:
            list: The validation issues associated with each level in a HED string.

        Notes:
            - This is for the top-level, all groups, and nested groups.
            - This can contain definitions, Onset, etc tags.
        """
        validation_issues = []
        validation_issues += self.check_tag_level_issue(original_tag_list, is_top_level, is_group)
        return validation_issues

    # ==========================================================================
    # Mostly internal functions to check individual types of errors
    # =========================================================================+
    def check_invalid_character_issues(self, hed_string, allow_placeholders):
        """ Report invalid characters.

        Parameters:
            hed_string (str): A hed string.
            allow_placeholders (bool): Allow placeholder and curly brace characters.

        Returns:
            list: Validation issues. Each issue is a dictionary.

        Notes:
            - Invalid tag characters are defined by TagValidator.INVALID_STRING_CHARS or
              TagValidator.INVALID_STRING_CHARS_PLACEHOLDERS.
            - Any character with a code point above 127 is also reported.
        """
        if allow_placeholders:
            invalid_chars = TagValidator.INVALID_STRING_CHARS_PLACEHOLDERS
        else:
            invalid_chars = TagValidator.INVALID_STRING_CHARS

        validation_issues = []
        for index, character in enumerate(hed_string):
            if character in invalid_chars or ord(character) > 127:
                validation_issues += self._report_invalid_character_error(hed_string, index)
        return validation_issues

    def check_count_tag_group_parentheses(self, hed_string):
        """ Report unmatched parentheses.

        Parameters:
            hed_string (str): A hed string.

        Returns:
            list: A list of validation issues. Each issue is a dictionary.

        Notes:
            - Only the counts of opening and closing parentheses are compared, not their order.
        """
        validation_issues = []
        number_open_parentheses = hed_string.count(TagValidator.OPENING_GROUP_CHARACTER)
        number_closed_parentheses = hed_string.count(TagValidator.CLOSING_GROUP_CHARACTER)
        if number_open_parentheses != number_closed_parentheses:
            validation_issues += ErrorHandler.format_error(ValidationErrors.PARENTHESES_MISMATCH,
                                                           opening_parentheses_count=number_open_parentheses,
                                                           closing_parentheses_count=number_closed_parentheses)
        return validation_issues

    def check_delimiter_issues_in_hed_string(self, hed_string):
        """ Report missing commas or commas in value tags.

        Parameters:
            hed_string (str): A hed string.

        Returns:
            list: A validation issues list. Each issue is a dictionary.

        Notes:
            - Whitespace is accumulated into the current tag text but otherwise ignored.
            - Scanning stops at the first missing comma found after a closing parenthesis.
        """
        last_non_empty_valid_character = ''
        last_non_empty_valid_index = 0
        current_tag = ''
        issues = []

        for i, current_character in enumerate(hed_string):
            current_tag += current_character
            if not current_character.strip():
                continue

            if TagValidator._character_is_delimiter(current_character):
                if current_tag.strip() == current_character:
                    # A comma with nothing before it: an empty tag.
                    issues += ErrorHandler.format_error(ValidationErrors.TAG_EMPTY, source_string=hed_string,
                                                        char_index=i)
                    current_tag = ''
                    # Deliberately skip updating the "last valid character" trackers below.
                    continue
                current_tag = ''
            elif current_character == self.OPENING_GROUP_CHARACTER:
                if current_tag.strip() == self.OPENING_GROUP_CHARACTER:
                    current_tag = ''
                else:
                    # Text directly before "(" means the separating comma is missing.
                    issues += ErrorHandler.format_error(ValidationErrors.COMMA_MISSING, tag=current_tag)
            elif last_non_empty_valid_character == "," and current_character == self.CLOSING_GROUP_CHARACTER:
                # ",)" -- an empty tag at the end of a group.
                issues += ErrorHandler.format_error(ValidationErrors.TAG_EMPTY, source_string=hed_string,
                                                    char_index=i)
            elif TagValidator._comma_is_missing_after_closing_parentheses(last_non_empty_valid_character,
                                                                          current_character):
                issues += ErrorHandler.format_error(ValidationErrors.COMMA_MISSING, tag=current_tag[:-1])
                break

            last_non_empty_valid_character = current_character
            last_non_empty_valid_index = i

        # A trailing comma leaves an empty tag at the end of the string.
        if TagValidator._character_is_delimiter(last_non_empty_valid_character):
            issues += ErrorHandler.format_error(ValidationErrors.TAG_EMPTY,
                                                char_index=last_non_empty_valid_index,
                                                source_string=hed_string)
        return issues

    # Matches runs of two or more spaces/tabs/slashes, or a leading or trailing slash.
    # NOTE(review): not referenced by any method visible in this class; presumably meant for
    # check_tag_formatting -- confirm.
    pattern_doubleslash = re.compile(r"([ \t/]{2,}|^/|/$)")

    def check_tag_invalid_chars(self, original_tag, allow_placeholders):
        """ Report invalid characters in the given tag.

        Parameters:
            original_tag (HedTag): The original tag that is used to report the error.
            allow_placeholders (bool): Allow placeholder characters(#) if True.

        Returns:
            list: Validation issues. Each issue is a dictionary.
        """
        validation_issues = self._check_invalid_prefix_issues(original_tag)
        allowed_chars = self.TAG_ALLOWED_CHARS
        if allow_placeholders:
            allowed_chars += "#"
        validation_issues += self._check_invalid_chars(original_tag.org_base_tag, allowed_chars, original_tag)
        return validation_issues

    def check_tag_exists_in_schema(self, original_tag):
        """ Report invalid tag or doesn't take a value.

        Parameters:
            original_tag (HedTag): The original tag that is used to report the error.

        Returns:
            list: Validation issues. Each issue is a dictionary.

        Notes:
            - Basic tags and takes-value tags produce no issues.
            - Any other tag is reported either as an invalid extension or as an extended tag,
              depending on whether it has the extensionAllowed attribute.
        """
        validation_issues = []
        if original_tag.is_basic_tag() or original_tag.is_takes_value_tag():
            return validation_issues

        is_extension_tag = original_tag.has_attribute(HedKey.ExtensionAllowed)
        if not is_extension_tag:
            actual_error = None
            if "#" in original_tag.extension:
                actual_error = ValidationErrors.PLACEHOLDER_INVALID
            validation_issues += ErrorHandler.format_error(ValidationErrors.TAG_EXTENSION_INVALID, tag=original_tag,
                                                           actual_error=actual_error)
        else:
            validation_issues += ErrorHandler.format_error(ValidationErrors.TAG_EXTENDED, tag=original_tag,
                                                           index_in_tag=len(original_tag.org_base_tag),
                                                           index_in_tag_end=None)
        return validation_issues

    def _check_value_class(self, original_tag, stripped_value, report_as, error_code=None):
        """ Returns any issues found if this is a value tag.

        Parameters:
            original_tag (HedTag): The tag whose value is being validated.
            stripped_value (str or None): The value portion with units removed.
            report_as (HedTag): The tag the issues are reported against.
            error_code (str): If given, a second VALUE_INVALID issue is added with this as actual_error.

        Returns:
            list: Validation issues. Each issue is a dictionary.
        """
        validation_issues = []
        if original_tag.is_takes_value_tag() and \
                not self._validate_value_class_portion(original_tag, stripped_value):
            validation_issues += ErrorHandler.format_error(ValidationErrors.VALUE_INVALID, report_as)
            if error_code:
                validation_issues += ErrorHandler.format_error(ValidationErrors.VALUE_INVALID,
                                                               report_as, actual_error=error_code)
        return validation_issues

    def _check_units(self, original_tag, bad_units, report_as):
        """ Returns an issue noting this is either bad units, or missing units.

        Parameters:
            original_tag (HedTag): The tag whose units are being reported.
            bad_units (bool): True reports UNITS_INVALID, False reports UNITS_MISSING.
            report_as (HedTag): The tag the issue is reported against.

        Returns:
            list: The result of ErrorHandler.format_error for the chosen error.
        """
        if bad_units:
            tag_unit_class_units = original_tag.get_tag_unit_class_units()
            validation_issue = ErrorHandler.format_error(ValidationErrors.UNITS_INVALID,
                                                         tag=report_as, units=tag_unit_class_units)
        else:
            default_unit = original_tag.default_unit
            validation_issue = ErrorHandler.format_error(ValidationErrors.UNITS_MISSING,
                                                         tag=report_as, default_unit=default_unit)
        return validation_issue

    def check_tag_unit_class_units_are_valid(self, original_tag, report_as=None, error_code=None):
        """ Report incorrect unit class or units.

        Parameters:
            original_tag (HedTag): The original tag that is used to report the error.
            report_as (HedTag): Report errors as coming from this tag, rather than original_tag.
            error_code (str): Override error codes to this.

        Returns:
            list: Validation issues. Each issue is a dictionary.

        Notes:
            - Nothing is reported when the tag is not a unit class tag or a unit was recognized.
        """
        validation_issues = []
        if original_tag.is_unit_class_tag():
            stripped_value, unit = original_tag.get_stripped_unit_value()
            if not unit:
                # Todo: in theory this should separately validate the number and the units, for units
                # that are prefixes like $. Right now those are marked as unit invalid AND value_invalid.
                # A space in the extension means something unit-like was given but not recognized.
                bad_units = " " in original_tag.extension
                report_as = report_as if report_as else original_tag

                if bad_units:
                    stripped_value = stripped_value.split(" ")[0]

                validation_issues += self._check_value_class(original_tag, stripped_value, report_as, error_code)
                validation_issues += self._check_units(original_tag, bad_units, report_as)

                # We don't want to give this overall error twice.
                # The emptiness guard keeps the [0] lookup from raising IndexError.
                if error_code and validation_issues and \
                        not any(error_code == issue['code'] for issue in validation_issues):
                    new_issue = validation_issues[0].copy()
                    new_issue['code'] = error_code
                    validation_issues += [new_issue]

        return validation_issues

    def check_tag_value_class_valid(self, original_tag, report_as=None, error_code=None):
        """ Report an invalid value portion.

        Parameters:
            original_tag (HedTag): The original tag that is used to report the error.
            report_as (HedTag): Report errors as coming from this tag, rather than original_tag.
            error_code (str): Override error codes to this.

        Returns:
            list: Validation issues.
        """
        validation_issues = []
        if not self._validate_value_class_portion(original_tag, original_tag.extension):
            validation_issues += ErrorHandler.format_error(ValidationErrors.VALUE_INVALID,
                                                           report_as if report_as else original_tag,
                                                           actual_error=error_code)
        return validation_issues

    def check_tag_requires_child(self, original_tag):
        """ Report if tag has the requireChild attribute (HedKey.RequireChild).

        Parameters:
            original_tag (HedTag): The original tag that is used to report the error.

        Returns:
            list: Validation issues. Each issue is a dictionary.
        """
        validation_issues = []
        if original_tag.has_attribute(HedKey.RequireChild):
            validation_issues += ErrorHandler.format_error(ValidationErrors.TAG_REQUIRES_CHILD,
                                                           tag=original_tag)
        return validation_issues

    def check_for_invalid_extension_chars(self, original_tag):
        """ Report invalid characters in extension/value.

        Parameters:
            original_tag (HedTag): The original tag that is used to report the error.

        Returns:
            list: Validation issues. Each issue is a dictionary.
        """
        allowed_chars = self.TAG_ALLOWED_CHARS
        allowed_chars += self.DEFAULT_ALLOWED_PLACEHOLDER_CHARS
        allowed_chars += " "
        # The extension starts one character (the separating slash) past the base tag.
        return self._check_invalid_chars(original_tag.extension, allowed_chars, original_tag,
                                         starting_index=len(original_tag.org_base_tag) + 1)

    def check_capitalization(self, original_tag):
        """ Report warning if incorrect tag capitalization.

        Parameters:
            original_tag (HedTag): The original tag used to report the warning.

        Returns:
            list: Validation issues. Each issue is a dictionary.

        Notes:
            - At most one warning is reported per tag.
        """
        validation_issues = []
        tag_names = original_tag.org_base_tag.split("/")
        for tag_name in tag_names:
            correct_tag_name = tag_name.capitalize()
            if tag_name != correct_tag_name and not re.search(self.CAMEL_CASE_EXPRESSION, tag_name):
                validation_issues += ErrorHandler.format_error(ValidationErrors.STYLE_WARNING,
                                                               tag=original_tag)
                break
        return validation_issues

    def check_tag_level_issue(self, original_tag_list, is_top_level, is_group):
        """ Report tags incorrectly positioned in hierarchy.

        Parameters:
            original_tag_list (list): HedTags containing the original tags.
            is_top_level (bool): If True, this group is a "top level tag group".
            is_group (bool): If true group should be contained by parenthesis.

        Returns:
            list: Validation issues. Each issue is a dictionary.

        Notes:
            - Top-level groups can contain definitions, Onset, etc tags.
        """
        validation_issues = []
        top_level_tags = [tag for tag in original_tag_list if
                          tag.base_tag_has_attribute(HedKey.TopLevelTagGroup)]
        tag_group_tags = [tag for tag in original_tag_list if
                          tag.base_tag_has_attribute(HedKey.TagGroup)]

        for tag_group_tag in tag_group_tags:
            if not is_group:
                validation_issues += ErrorHandler.format_error(ValidationErrors.HED_TAG_GROUP_TAG,
                                                               tag=tag_group_tag)

        for top_level_tag in top_level_tags:
            if not is_top_level:
                actual_code = None
                if top_level_tag.short_base_tag == DefTagNames.DEFINITION_ORG_KEY:
                    actual_code = ValidationErrors.DEFINITION_INVALID
                elif top_level_tag.short_base_tag in {DefTagNames.ONSET_ORG_KEY, DefTagNames.OFFSET_ORG_KEY}:
                    actual_code = ValidationErrors.ONSET_OFFSET_INSET_ERROR

                # Definition/Onset/Offset tags get an extra issue carrying the more specific code,
                # in addition to the generic one below.
                if actual_code:
                    validation_issues += ErrorHandler.format_error(ValidationErrors.HED_TOP_LEVEL_TAG,
                                                                   tag=top_level_tag,
                                                                   actual_error=actual_code)
                validation_issues += ErrorHandler.format_error(ValidationErrors.HED_TOP_LEVEL_TAG,
                                                               tag=top_level_tag)

        if is_top_level and len(top_level_tags) > 1:
            validation_issues += ErrorHandler.format_error(ValidationErrors.HED_MULTIPLE_TOP_TAGS,
                                                           tag=top_level_tags[0],
                                                           multiple_tags=top_level_tags[1:])
        return validation_issues

    # ==========================================================================
    # Private utility functions
    # =========================================================================+
    def _check_invalid_prefix_issues(self, original_tag):
        """ Check for invalid schema namespace.

        Parameters:
            original_tag (HedTag): The tag whose schema_namespace is checked.

        Returns:
            list: Validation issues. Each issue is a dictionary.
        """
        issues = []
        schema_namespace = original_tag.schema_namespace
        # The final character is excluded from the alphabetic check
        # (presumably the trailing ':' separator -- TODO confirm).
        if schema_namespace and not schema_namespace[:-1].isalpha():
            issues += ErrorHandler.format_error(ValidationErrors.TAG_NAMESPACE_PREFIX_INVALID,
                                                tag=original_tag, tag_namespace=schema_namespace)
        return issues

    def _validate_value_class_portion(self, original_tag, portion_to_validate):
        """ Check a value against the value classes of a tag.

        Parameters:
            original_tag (HedTag): The tag supplying the value classes.
            portion_to_validate (str or None): The value to validate.

        Returns:
            bool: False if the portion is None, otherwise the result of validate_value_class_type.
        """
        if portion_to_validate is None:
            return False

        value_class_types = original_tag.value_classes
        return self.validate_value_class_type(portion_to_validate, value_class_types)

    def _report_invalid_character_error(self, hed_string, index):
        """ Report an invalid character.

        Parameters:
            hed_string (str): The HED string that caused the error.
            index (int): The index of the invalid character in the HED string.

        Returns:
            list: A singleton list with a dictionary representing the error.

        Notes:
            - A tilde is reported as TILDES_UNSUPPORTED rather than CHARACTER_INVALID.
        """
        error_type = ValidationErrors.CHARACTER_INVALID
        character = hed_string[index]
        if character == "~":
            error_type = ValidationErrors.TILDES_UNSUPPORTED
        return ErrorHandler.format_error(error_type, char_index=index,
                                         source_string=hed_string)

    @staticmethod
    def _comma_is_missing_after_closing_parentheses(last_non_empty_character, current_character):
        """ Checks if missing comma after a closing parentheses.

        Parameters:
            last_non_empty_character (str): The last non-empty string in the HED string.
            current_character (str): The current character in the HED string.

        Returns:
            bool: True if a comma is missing after a closing parentheses. False, if otherwise.

        Notes:
            - This is a helper function for the check_delimiter_issues_in_hed_string function.
        """
        return last_non_empty_character == TagValidator.CLOSING_GROUP_CHARACTER and \
            not (TagValidator._character_is_delimiter(current_character)
                 or current_character == TagValidator.CLOSING_GROUP_CHARACTER)

    @staticmethod
    def _character_is_delimiter(character):
        """ Checks if the character is a delimiter.

        Parameters:
            character (str): A string character.

        Returns:
            bool: Returns true if the character is a delimiter. False, if otherwise.

        Notes:
            - A delimiter is a comma.
        """
        return character == TagValidator.COMMA

    def check_for_placeholder(self, original_tag, is_definition=False):
        """ Report invalid placeholder characters.

        Parameters:
            original_tag (HedTag): The HedTag to be checked.
            is_definition (bool): If True, placeholders are allowed.

        Returns:
            list: Validation issues. Each issue is a dictionary.

        Notes:
            - Invalid placeholder may appear in the extension/value portion of a tag.
            - One issue is reported for every '#' found.
        """
        validation_issues = []
        if not is_definition:
            # Offset of the extension within the full tag: base tag plus the separating slash.
            starting_index = len(original_tag.org_base_tag) + 1
            for i, character in enumerate(original_tag.extension):
                if character == "#":
                    validation_issues += ErrorHandler.format_error(ValidationErrors.INVALID_TAG_CHARACTER,
                                                                   tag=original_tag,
                                                                   index_in_tag=starting_index + i,
                                                                   index_in_tag_end=starting_index + i + 1,
                                                                   actual_error=ValidationErrors.PLACEHOLDER_INVALID)
        return validation_issues

    def _check_invalid_chars(self, check_string, allowed_chars, source_tag, starting_index=0):
        """ Report every character that is neither alphanumeric nor explicitly allowed.

        Parameters:
            check_string (str): The text to scan.
            allowed_chars (str): The non-alphanumeric characters that are permitted.
            source_tag (HedTag): The tag the issues are reported against.
            starting_index (int): The offset of check_string within the tag, used for reported indices.

        Returns:
            list: Validation issues. Each issue is a dictionary.
        """
        validation_issues = []
        for i, character in enumerate(check_string):
            if character.isalnum():
                continue
            if character in allowed_chars:
                continue
            # Todo: Remove this patch when clock times and invalid characters are more properly checked
            if character == ":":
                continue
            validation_issues += ErrorHandler.format_error(ValidationErrors.INVALID_TAG_CHARACTER,
                                                           tag=source_tag, index_in_tag=starting_index + i,
                                                           index_in_tag_end=starting_index + i + 1)
        return validation_issues

    @staticmethod
    def _register_default_value_validators():
        """ Build the default mapping of value class name to validator function.

        Returns:
            dict: Keys are value class names, values are callables from tag_validator_util.
        """
        validator_dict = {
            tag_validator_util.DATE_TIME_VALUE_CLASS: tag_validator_util.is_date_time,
            tag_validator_util.NUMERIC_VALUE_CLASS: tag_validator_util.validate_numeric_value_class,
            tag_validator_util.TEXT_VALUE_CLASS: tag_validator_util.validate_text_value_class,
            tag_validator_util.NAME_VALUE_CLASS: tag_validator_util.validate_text_value_class
        }
        return validator_dict

    def validate_value_class_type(self, unit_or_value_portion, valid_types):
        """ Report invalid unit or valid class values.

        Parameters:
            unit_or_value_portion (str): The value portion to validate.
            valid_types (list): The names of value class or unit class types (e.g. dateTime or dateTimeClass).

        Returns:
            type_valid (bool): True if this is one of the valid_types validators.

        Notes:
            - Types with no registered validator are skipped, so the result is False if no
              type has a validator.
        """
        for unit_class_type in valid_types:
            valid_func = self._value_unit_validators.get(unit_class_type)
            if valid_func:
                if valid_func(unit_or_value_portion):
                    return True
        return False