Source code for hed.validator.tag_validator

"""
This module is used to validate the HED tags as strings.

"""

import re
from hed.errors.error_reporter import ErrorHandler
from hed.models.model_constants import DefTagNames
from hed.schema import HedKey
from hed.errors.error_types import ValidationErrors
from hed.validator import tag_validator_util


[docs]class TagValidator:
    """ Validation for individual HED tags. """

    CAMEL_CASE_EXPRESSION = r'([A-Z]+\s*[a-z-]*)+'
    INVALID_STRING_CHARS = '[]{}~'
    INVALID_STRING_CHARS_PLACEHOLDERS = '[]~'
    OPENING_GROUP_CHARACTER = '('
    CLOSING_GROUP_CHARACTER = ')'
    COMMA = ','

    # # sign is allowed by default as it is specifically checked for separately.
    DEFAULT_ALLOWED_PLACEHOLDER_CHARS = ".+-^ _#"
    # Placeholder characters are checked elsewhere, but by default allowed
    TAG_ALLOWED_CHARS = "-_/"

[docs]    def __init__(self, hed_schema):
        """Constructor for the Tag_Validator class.

        Parameters:
            hed_schema (HedSchema): A HedSchema object.

        Returns:
            TagValidator: A Tag_Validator object.

        """
        self._hed_schema = hed_schema

        # Dict contains all the value portion validators for value class.  e.g. "is this a number?"
        self._value_unit_validators = self._register_default_value_validators()

    # ==========================================================================
    # Top level validator functions
    # =========================================================================+
[docs]    def run_hed_string_validators(self, hed_string_obj, allow_placeholders=False):
        """Basic high level checks of the hed string

        Parameters:
            hed_string_obj (HedString): A HED string.
            allow_placeholders: Allow placeholder and curly brace characters

        Returns:
            list: The validation issues associated with a HED string. Each issue is a dictionary.

        Notes:
            - Used for basic invalid characters or bad delimiters.

         """
        validation_issues = []
        validation_issues += self.check_invalid_character_issues(hed_string_obj.get_original_hed_string(),
                                                                 allow_placeholders)
        validation_issues += self.check_count_tag_group_parentheses(hed_string_obj.get_original_hed_string())
        validation_issues += self.check_delimiter_issues_in_hed_string(hed_string_obj.get_original_hed_string())
        for tag in hed_string_obj.get_all_tags():
            validation_issues += self.check_tag_formatting(tag)
        return validation_issues

[docs]    def run_validate_tag_characters(self, original_tag, allow_placeholders):
        """ Basic character validation of tags

        Parameters:
            original_tag (HedTag): A original tag.
            allow_placeholders (bool): Allow value class or extensions to be placeholders rather than a specific value.

        Returns:
            list: The validation issues associated with the characters. Each issue is dictionary.

        """
        return self.check_tag_invalid_chars(original_tag, allow_placeholders)

[docs]    def run_individual_tag_validators(self, original_tag, allow_placeholders=False,
                                      is_definition=False):
        """ Runs the hed_ops on the individual tags.

        Parameters:
            original_tag (HedTag): A original tag.
            allow_placeholders (bool): Allow value class or extensions to be placeholders rather than a specific value.
            is_definition (bool): This tag is part of a Definition, not a normal line.

        Returns:
            list: The validation issues associated with the tags. Each issue is dictionary.

         """
        validation_issues = []
        # validation_issues += self.check_tag_invalid_chars(original_tag, allow_placeholders)
        if self._hed_schema:
            validation_issues += self.check_tag_exists_in_schema(original_tag)
            if original_tag.is_unit_class_tag():
                validation_issues += self.check_tag_unit_class_units_are_valid(original_tag)
            elif original_tag.is_value_class_tag():
                validation_issues += self.check_tag_value_class_valid(original_tag)
            elif original_tag.extension:
                validation_issues += self.check_for_invalid_extension_chars(original_tag)

            if not allow_placeholders:
                validation_issues += self.check_for_placeholder(original_tag, is_definition)
            validation_issues += self.check_tag_requires_child(original_tag)
        validation_issues += self.check_capitalization(original_tag)
        return validation_issues

[docs]    def run_tag_level_validators(self, original_tag_list, is_top_level, is_group):
        """ Run hed_ops at each level in a HED string.

        Parameters:
            original_tag_list (list): A list containing the original HedTags.
            is_top_level (bool): If True, this group is a "top level tag group".
            is_group (bool): If true, group is contained by parenthesis.

        Returns:
            list: The validation issues associated with each level in a HED string.

        Notes:
            - This is for the top-level, all groups, and nested groups.
            - This can contain definitions, Onset, etc tags.

        """
        validation_issues = []
        validation_issues += self.check_tag_level_issue(original_tag_list, is_top_level, is_group)
        return validation_issues

[docs]    def run_all_tags_validators(self, tags):
        """ Validate the multi-tag properties in a hed string.

        Parameters:
            tags (list): A list containing the HedTags in a HED string.

        Returns:
            list: The validation issues associated with the tags in a HED string. Each issue is a dictionary.

        Notes:
            - Multi-tag properties include required tags.

        """
        validation_issues = []
        if self._hed_schema:
            validation_issues += self.check_for_required_tags(tags)
            validation_issues += self.check_multiple_unique_tags_exist(tags)
        return validation_issues

    # ==========================================================================
    # Mostly internal functions to check individual types of errors
    # =========================================================================+
[docs]    def check_invalid_character_issues(self, hed_string, allow_placeholders):
        """ Report invalid characters.

        Parameters:
            hed_string (str): A hed string.
            allow_placeholders: Allow placeholder and curly brace characters

        Returns:
            list: Validation issues. Each issue is a dictionary.

        Notes:
            - Invalid tag characters are defined by TagValidator.INVALID_STRING_CHARS or
                                                    TagValidator.INVALID_STRING_CHARS_PLACEHOLDERS
        """
        validation_issues = []
        invalid_dict = TagValidator.INVALID_STRING_CHARS
        if allow_placeholders:
            invalid_dict = TagValidator.INVALID_STRING_CHARS_PLACEHOLDERS
        for index, character in enumerate(hed_string):
            if character in invalid_dict or ord(character) > 127:
                validation_issues += self._report_invalid_character_error(hed_string, index)

        return validation_issues

[docs]    def check_count_tag_group_parentheses(self, hed_string):
        """ Report unmatched parentheses.

        Parameters:
            hed_string (str): A hed string.

        Returns:
            list: A list of validation list. Each issue is a dictionary.
        """
        validation_issues = []
        number_open_parentheses = hed_string.count('(')
        number_closed_parentheses = hed_string.count(')')
        if number_open_parentheses != number_closed_parentheses:
            validation_issues += ErrorHandler.format_error(ValidationErrors.PARENTHESES_MISMATCH,
                                                           opening_parentheses_count=number_open_parentheses,
                                                           closing_parentheses_count=number_closed_parentheses)
        return validation_issues

[docs]    def check_delimiter_issues_in_hed_string(self, hed_string):
        """ Report missing commas or commas in value tags.

        Parameters:
            hed_string (str): A hed string.

        Returns:
            list: A validation issues list. Each issue is a dictionary.
        """
        last_non_empty_valid_character = ''
        last_non_empty_valid_index = 0
        current_tag = ''
        issues = []

        for i, current_character in enumerate(hed_string):
            current_tag += current_character
            if not current_character.strip():
                continue
            if TagValidator._character_is_delimiter(current_character):
                if current_tag.strip() == current_character:
                    issues += ErrorHandler.format_error(ValidationErrors.TAG_EMPTY, source_string=hed_string,
                                                        char_index=i)
                    current_tag = ''
                    continue
                current_tag = ''
            elif current_character == self.OPENING_GROUP_CHARACTER:
                if current_tag.strip() == self.OPENING_GROUP_CHARACTER:
                    current_tag = ''
                else:
                    issues += ErrorHandler.format_error(ValidationErrors.COMMA_MISSING, tag=current_tag)
            elif last_non_empty_valid_character == "," and current_character == self.CLOSING_GROUP_CHARACTER:
                issues += ErrorHandler.format_error(ValidationErrors.TAG_EMPTY, source_string=hed_string,
                                                    char_index=i)
            elif TagValidator._comma_is_missing_after_closing_parentheses(last_non_empty_valid_character,
                                                                          current_character):
                issues += ErrorHandler.format_error(ValidationErrors.COMMA_MISSING, tag=current_tag[:-1])
                break
            last_non_empty_valid_character = current_character
            last_non_empty_valid_index = i
        if TagValidator._character_is_delimiter(last_non_empty_valid_character):
            issues += ErrorHandler.format_error(ValidationErrors.TAG_EMPTY,
                                                char_index=last_non_empty_valid_index,
                                                source_string=hed_string)
        return issues

    pattern_doubleslash = re.compile(r"([ \t/]{2,}|^/|/$)")

[docs]    def check_tag_formatting(self, original_tag):
        """ Report repeated or erroneous slashes.

        Parameters:
            original_tag (HedTag): The original tag that is used to report the error.

        Returns:
            list: Validation issues. Each issue is a dictionary.
        """
        validation_issues = []
        for match in self.pattern_doubleslash.finditer(original_tag.org_tag):
            validation_issues += ErrorHandler.format_error(ValidationErrors.NODE_NAME_EMPTY,
                                                           tag=original_tag,
                                                           index_in_tag=match.start(),
                                                           index_in_tag_end=match.end())

        return validation_issues

[docs]    def check_tag_invalid_chars(self, original_tag, allow_placeholders):
        """ Report invalid characters in the given tag.

        Parameters:
            original_tag (HedTag): The original tag that is used to report the error.
            allow_placeholders (bool): Allow placeholder characters(#) if True.

        Returns:
            list: Validation issues. Each issue is a dictionary.
        """
        validation_issues = self._check_invalid_prefix_issues(original_tag)
        allowed_chars = self.TAG_ALLOWED_CHARS
        if allow_placeholders:
            allowed_chars += "#"
        validation_issues += self._check_invalid_chars(original_tag.org_base_tag, allowed_chars, original_tag)
        return validation_issues

[docs]    def check_tag_exists_in_schema(self, original_tag):
        """ Report invalid tag or doesn't take a value.

        Parameters:
            original_tag (HedTag): The original tag that is used to report the error.

        Returns:
            list: Validation issues. Each issue is a dictionary.
        """
        validation_issues = []
        if original_tag.is_basic_tag() or original_tag.is_takes_value_tag():
            return validation_issues

        is_extension_tag = original_tag.has_attribute(HedKey.ExtensionAllowed)
        if not is_extension_tag:
            actual_error = None
            if "#" in original_tag.extension:
                actual_error = ValidationErrors.PLACEHOLDER_INVALID
            validation_issues += ErrorHandler.format_error(ValidationErrors.TAG_EXTENSION_INVALID, tag=original_tag,
                                                           actual_error=actual_error)
        else:
            validation_issues += ErrorHandler.format_error(ValidationErrors.TAG_EXTENDED, tag=original_tag,
                                                           index_in_tag=len(original_tag.org_base_tag),
                                                           index_in_tag_end=None)
        return validation_issues

    def _check_value_class(self, original_tag, stripped_value, report_as, error_code=None):
        """Returns any issues found if this is a value tag"""
        validation_issues = []
        if original_tag.is_takes_value_tag() and \
                not self._validate_value_class_portion(original_tag, stripped_value):
            validation_issues += ErrorHandler.format_error(ValidationErrors.VALUE_INVALID, report_as)
            if error_code:
                validation_issues += ErrorHandler.format_error(ValidationErrors.VALUE_INVALID,
                                                               report_as, actual_error=error_code)
        return validation_issues

    def _check_units(self, original_tag, bad_units, report_as):
        """Returns an issue noting this is either bad units, or missing units"""
        if bad_units:
            tag_unit_class_units = original_tag.get_tag_unit_class_units()
            validation_issue = ErrorHandler.format_error(ValidationErrors.UNITS_INVALID,
                                                         tag=report_as, units=tag_unit_class_units)
        else:
            default_unit = original_tag.default_unit
            validation_issue = ErrorHandler.format_error(ValidationErrors.UNITS_MISSING,
                                                         tag=report_as, default_unit=default_unit)
        return validation_issue

[docs]    def check_tag_unit_class_units_are_valid(self, original_tag, report_as=None, error_code=None):
        """ Report incorrect unit class or units.

        Parameters:
            original_tag (HedTag): The original tag that is used to report the error.
            report_as (HedTag): Report errors as coming from this tag, rather than original_tag.
            error_code (str): Override error codes to this
        Returns:
            list: Validation issues. Each issue is a dictionary.
        """
        validation_issues = []
        if original_tag.is_unit_class_tag():
            stripped_value, unit = original_tag.get_stripped_unit_value()
            if not unit:
                # Todo: in theory this should separately validate the number and the units, for units
                # that are prefixes like $.  Right now those are marked as unit invalid AND value_invalid.
                bad_units = " " in original_tag.extension
                report_as = report_as if report_as else original_tag

                if bad_units:
                    stripped_value = stripped_value.split(" ")[0]

                validation_issues += self._check_value_class(original_tag, stripped_value, report_as, error_code)
                validation_issues += self._check_units(original_tag, bad_units, report_as)

                # We don't want to give this overall error twice
                if error_code and not any(error_code == issue['code'] for issue in validation_issues):
                    new_issue = validation_issues[0].copy()
                    new_issue['code'] = error_code
                    validation_issues += [new_issue]

        return validation_issues

[docs]    def check_tag_value_class_valid(self, original_tag, report_as=None, error_code=None):
        """ Report an invalid value portion.

        Parameters:
            original_tag (HedTag): The original tag that is used to report the error.
            report_as (HedTag): Report errors as coming from this tag, rather than original_tag.
            error_code (str): Override error codes to this
        Returns:
            list: Validation issues.
        """
        validation_issues = []
        if not self._validate_value_class_portion(original_tag, original_tag.extension):
            validation_issues += ErrorHandler.format_error(ValidationErrors.VALUE_INVALID,
                                                           report_as if report_as else original_tag,
                                                           actual_error=error_code)

        return validation_issues

[docs]    def check_tag_requires_child(self, original_tag):
        """ Report if tag is a leaf with 'requiredTag' attribute.

        Parameters:
            original_tag (HedTag): The original tag that is used to report the error.

        Returns:
            list: Validation issues. Each issue is a dictionary.
        """
        validation_issues = []
        if original_tag.has_attribute(HedKey.RequireChild):
            validation_issues += ErrorHandler.format_error(ValidationErrors.TAG_REQUIRES_CHILD,
                                                           tag=original_tag)
        return validation_issues

[docs]    def check_for_invalid_extension_chars(self, original_tag):
        """Report invalid characters in extension/value.

        Parameters:
            original_tag (HedTag): The original tag that is used to report the error.

        Returns:
            list: Validation issues. Each issue is a dictionary.
        """
        allowed_chars = self.TAG_ALLOWED_CHARS
        allowed_chars += self.DEFAULT_ALLOWED_PLACEHOLDER_CHARS
        allowed_chars += " "
        return self._check_invalid_chars(original_tag.extension, allowed_chars, original_tag,
                                         starting_index=len(original_tag.org_base_tag) + 1)

[docs]    def check_capitalization(self, original_tag):
        """Report warning if incorrect tag capitalization.

        Parameters:
            original_tag (HedTag): The original tag used to report the warning.

        Returns:
            list: Validation issues. Each issue is a dictionary.
        """
        validation_issues = []
        tag_names = original_tag.org_base_tag.split("/")
        for tag_name in tag_names:
            correct_tag_name = tag_name.capitalize()
            if tag_name != correct_tag_name and not re.search(self.CAMEL_CASE_EXPRESSION, tag_name):
                validation_issues += ErrorHandler.format_error(ValidationErrors.STYLE_WARNING,
                                                               tag=original_tag)
                break
        return validation_issues

[docs]    def check_tag_level_issue(self, original_tag_list, is_top_level, is_group):
        """ Report tags incorrectly positioned in hierarchy.

        Parameters:
            original_tag_list (list): HedTags containing the original tags.
            is_top_level (bool): If True, this group is a "top level tag group"
            is_group (bool): If true group should be contained by parenthesis

        Returns:
            list: Validation issues. Each issue is a dictionary.

        Notes:
            - Top-level groups can contain definitions, Onset, etc tags.
        """
        validation_issues = []
        top_level_tags = [tag for tag in original_tag_list if
                          tag.base_tag_has_attribute(HedKey.TopLevelTagGroup)]
        tag_group_tags = [tag for tag in original_tag_list if
                          tag.base_tag_has_attribute(HedKey.TagGroup)]
        for tag_group_tag in tag_group_tags:
            if not is_group:
                validation_issues += ErrorHandler.format_error(ValidationErrors.HED_TAG_GROUP_TAG,
                                                               tag=tag_group_tag)
        for top_level_tag in top_level_tags:
            if not is_top_level:
                actual_code = None
                if top_level_tag.short_base_tag == DefTagNames.DEFINITION_ORG_KEY:
                    actual_code = ValidationErrors.DEFINITION_INVALID
                elif top_level_tag.short_base_tag in {DefTagNames.ONSET_ORG_KEY, DefTagNames.OFFSET_ORG_KEY}:
                    actual_code = ValidationErrors.ONSET_OFFSET_INSET_ERROR

                if actual_code:
                    validation_issues += ErrorHandler.format_error(ValidationErrors.HED_TOP_LEVEL_TAG,
                                                                   tag=top_level_tag,
                                                                   actual_error=actual_code)
                validation_issues += ErrorHandler.format_error(ValidationErrors.HED_TOP_LEVEL_TAG,
                                                               tag=top_level_tag)

        if is_top_level and len(top_level_tags) > 1:
            validation_issues += ErrorHandler.format_error(ValidationErrors.HED_MULTIPLE_TOP_TAGS,
                                                           tag=top_level_tags[0],
                                                           multiple_tags=top_level_tags[1:])

        return validation_issues

[docs]    def check_for_required_tags(self, tags):
        """ Report missing required tags.

        Parameters:
            tags (list): HedTags containing the tags.

        Returns:
            list: Validation issues. Each issue is a dictionary.

        """
        validation_issues = []
        required_prefixes = self._hed_schema.get_tags_with_attribute(HedKey.Required)
        for required_prefix in required_prefixes:
            if not any(tag.long_tag.lower().startswith(required_prefix.lower()) for tag in tags):
                validation_issues += ErrorHandler.format_error(ValidationErrors.REQUIRED_TAG_MISSING,
                                                               tag_namespace=required_prefix)
        return validation_issues

[docs]    def check_multiple_unique_tags_exist(self, tags):
        """ Report if multiple identical unique tags exist

            A unique Term can only appear once in a given HedString.
            Unique terms are terms with the 'unique' property in the schema.

        Parameters:
            tags (list): HedTags containing the tags.

        Returns:
            list: Validation issues. Each issue is a dictionary.
        """
        validation_issues = []
        unique_prefixes = self._hed_schema.get_tags_with_attribute(HedKey.Unique)
        for unique_prefix in unique_prefixes:
            unique_tag_prefix_bool_mask = [x.long_tag.lower().startswith(unique_prefix.lower()) for x in tags]
            if sum(unique_tag_prefix_bool_mask) > 1:
                validation_issues += ErrorHandler.format_error(ValidationErrors.TAG_NOT_UNIQUE,
                                                               tag_namespace=unique_prefix)
        return validation_issues

    # ==========================================================================
    # Private utility functions
    # =========================================================================+
    def _check_invalid_prefix_issues(self, original_tag):
        """Check for invalid schema namespace."""
        issues = []
        schema_namespace = original_tag.schema_namespace
        if schema_namespace and not schema_namespace[:-1].isalpha():
            issues += ErrorHandler.format_error(ValidationErrors.TAG_NAMESPACE_PREFIX_INVALID,
                                                tag=original_tag, tag_namespace=schema_namespace)
        return issues

    def _validate_value_class_portion(self, original_tag, portion_to_validate):
        if portion_to_validate is None:
            return False

        value_class_types = original_tag.value_classes
        return self.validate_value_class_type(portion_to_validate, value_class_types)

    def _report_invalid_character_error(self, hed_string, index):
        """ Report an invalid character.

        Parameters:
            hed_string (str): The HED string that caused the error.
            index (int): The index of the invalid character in the HED string.

        Returns:
            list: A singleton list with a dictionary representing the error.

        """
        error_type = ValidationErrors.CHARACTER_INVALID
        character = hed_string[index]
        if character == "~":
            error_type = ValidationErrors.TILDES_UNSUPPORTED
        return ErrorHandler.format_error(error_type, char_index=index,
                                         source_string=hed_string)

    @staticmethod
    def _comma_is_missing_after_closing_parentheses(last_non_empty_character, current_character):
        """ Checks if missing comma after a closing parentheses.

        Parameters:
            last_non_empty_character (str): The last non-empty string in the HED string.
            current_character (str): The current character in the HED string.

        Returns:
            bool: True if a comma is missing after a closing parentheses. False, if otherwise.

        Notes:
            - This is a helper function for the find_missing_commas_in_hed_string function.

        """
        return last_non_empty_character == TagValidator.CLOSING_GROUP_CHARACTER and \
            not (TagValidator._character_is_delimiter(current_character)
                 or current_character == TagValidator.CLOSING_GROUP_CHARACTER)

    @staticmethod
    def _character_is_delimiter(character):
        """ Checks if the character is a delimiter.

        Parameters:
            character (str): A string character.

        Returns:
            bool: Returns true if the character is a delimiter. False, if otherwise.

        Notes:
            -  A delimiter is a comma.

        """
        return character == TagValidator.COMMA

[docs]    def check_for_placeholder(self, original_tag, is_definition=False):
        """ Report invalid placeholder characters.

        Parameters:
            original_tag (HedTag):  The HedTag to be checked
            is_definition (bool): If True, placeholders are allowed.

        Returns:
            list: Validation issues. Each issue is a dictionary.

        Notes:
            - Invalid placeholder may appear in the extension/value portion of a tag.

        """
        validation_issues = []
        if not is_definition:
            starting_index = len(original_tag.org_base_tag) + 1
            for i, character in enumerate(original_tag.extension):
                if character == "#":
                    validation_issues += ErrorHandler.format_error(ValidationErrors.INVALID_TAG_CHARACTER,
                                                                   tag=original_tag,
                                                                   index_in_tag=starting_index + i,
                                                                   index_in_tag_end=starting_index + i + 1,
                                                                   actual_error=ValidationErrors.PLACEHOLDER_INVALID)

        return validation_issues

    def _check_invalid_chars(self, check_string, allowed_chars, source_tag, starting_index=0):
        validation_issues = []
        for i, character in enumerate(check_string):
            if character.isalnum():
                continue
            if character in allowed_chars:
                continue
            # Todo: Remove this patch when clock times and invalid characters are more properly checked
            if character == ":":
                continue
            validation_issues += ErrorHandler.format_error(ValidationErrors.INVALID_TAG_CHARACTER,
                                                           tag=source_tag, index_in_tag=starting_index + i,
                                                           index_in_tag_end=starting_index + i + 1)
        return validation_issues

    @staticmethod
    def _register_default_value_validators():
        validator_dict = {
            tag_validator_util.DATE_TIME_VALUE_CLASS: tag_validator_util.is_date_time,
            tag_validator_util.NUMERIC_VALUE_CLASS: tag_validator_util.validate_numeric_value_class,
            tag_validator_util.TEXT_VALUE_CLASS: tag_validator_util.validate_text_value_class,
            tag_validator_util.NAME_VALUE_CLASS: tag_validator_util.validate_text_value_class
        }

        return validator_dict

[docs]    def validate_value_class_type(self, unit_or_value_portion, valid_types):
        """ Report invalid unit or valid class values.

        Parameters:
            unit_or_value_portion (str): The value portion to validate.
            valid_types (list): The names of value class or unit class types (e.g. dateTime or dateTimeClass).

        Returns:
            type_valid (bool): True if this is one of the valid_types validators.

        """
        for unit_class_type in valid_types:
            valid_func = self._value_unit_validators.get(unit_class_type)
            if valid_func:
                if valid_func(unit_or_value_portion):
                    return True
        return False