Source code for hed.validator.util.char_util

""" Classes responsible for basic character validation of a string or tag."""
import json
import re
import os

from hed.errors.error_reporter import ErrorHandler
from hed.errors.error_types import ValidationErrors

# Name of the JSON file, located in the same directory as this module, that
# CharRexValidator loads; the code reads its "class_chars", "char_regex" and
# "class_words" sections.
CLASS_REX_FILENAME = 'class_regex.json'


class CharValidator:
    """Class responsible for basic character level validation of a string or tag."""

    # The "#" sign is allowed by default as it is specifically checked for separately.
    DEFAULT_ALLOWED_PLACEHOLDER_CHARS = ".+-^ _#"
    # Placeholder characters are checked elsewhere, but by default allowed
    TAG_ALLOWED_CHARS = "-_/"
    # Characters that are never valid in a HED string.
    INVALID_STRING_CHARS = '[]{}~'
    # Same as above, but curly braces are tolerated when placeholders are allowed.
    INVALID_STRING_CHARS_PLACEHOLDERS = '[]~'

    def __init__(self, modern_allowed_char_rules=False):
        """Does basic character validation for HED strings/tags.

        Parameters:
            modern_allowed_char_rules(bool): If True, use 8.3 style rules for unicode characters.
        """
        self._validate_characters = modern_allowed_char_rules

    def check_invalid_character_issues(self, hed_string, allow_placeholders):
        """ Report invalid characters.

        Parameters:
            hed_string (str): A HED string.
            allow_placeholders (bool): Allow placeholder and curly brace characters.

        Returns:
            list: Validation issues. Each issue is a dictionary.

        Notes:
            - Invalid tag characters are defined by self.INVALID_STRING_CHARS or
              self.INVALID_STRING_CHARS_PLACEHOLDERS.
            - With the modern rules enabled, any non-printable character is also invalid;
              otherwise any character with a code point above 127 is invalid.
        """
        if allow_placeholders:
            invalid_chars = self.INVALID_STRING_CHARS_PLACEHOLDERS
        else:
            invalid_chars = self.INVALID_STRING_CHARS

        # The rule set is fixed for the whole string, so pick the predicate once
        # instead of re-testing the flag for every character.
        if self._validate_characters:
            def is_disallowed(character):
                return not character.isprintable()
        else:
            def is_disallowed(character):
                return ord(character) > 127

        validation_issues = []
        for index, character in enumerate(hed_string):
            if character in invalid_chars or is_disallowed(character):
                validation_issues += self._report_invalid_character_error(hed_string, index)
        return validation_issues

    def check_tag_invalid_chars(self, original_tag, allow_placeholders):
        """ Report invalid characters in the given tag.

        Parameters:
            original_tag (HedTag): The original tag that is used to report the error.
            allow_placeholders (bool): Allow placeholder characters(#) if True.

        Returns:
            list: Validation issues. Each issue is a dictionary.
        """
        validation_issues = self._check_invalid_prefix_issues(original_tag)
        allowed_chars = self.TAG_ALLOWED_CHARS
        if allow_placeholders:
            allowed_chars += "#"
        validation_issues += self._check_invalid_chars(original_tag.org_base_tag, allowed_chars, original_tag)
        return validation_issues

    def check_for_invalid_extension_chars(self, original_tag, validate_text, error_code=None, index_offset=0):
        """Report invalid characters in extension/value.

        Parameters:
            original_tag (HedTag): The original tag that is used to report the error.
            validate_text (str): the text we want to validate, if not the full extension.
            error_code(str): The code to override the error as.  Again mostly for def/def-expand tags.
            index_offset(int): Offset into the extension validate_text starts at.

        Returns:
            list: Validation issues. Each issue is a dictionary.
        """
        # The explicit space is kept even though the default placeholder set already
        # contains one, so a subclass overriding that constant keeps the same behavior.
        allowed_chars = self.TAG_ALLOWED_CHARS + self.DEFAULT_ALLOWED_PLACEHOLDER_CHARS + " "
        # Reported indexes are relative to the whole tag: skip the base tag plus one
        # separator character, then apply the caller's offset.
        starting_index = len(original_tag.org_base_tag) + 1 + index_offset
        return self._check_invalid_chars(validate_text, allowed_chars, original_tag,
                                         starting_index=starting_index, error_code=error_code)

    @staticmethod
    def _check_invalid_chars(check_string, allowed_chars, source_tag, starting_index=0, error_code=None):
        """ Helper for checking for invalid characters.

        Parameters:
            check_string (str): String to be checked for invalid characters.
            allowed_chars (str): Characters allowed in string.
            source_tag (HedTag): Tag from which the string came from.
            starting_index (int): Starting index of check_string within the tag.
            error_code (str): The code to override the error as.  Again mostly for def/def-expand tags.

        Returns:
            list: List of dictionaries with validation issues.
        """
        validation_issues = []
        for i, character in enumerate(check_string):
            if character.isalnum():
                continue
            if character in allowed_chars:
                continue
            # Todo: Remove this patch when clock times and invalid characters are more properly checked
            if character == ":":
                continue
            validation_issues += ErrorHandler.format_error(ValidationErrors.INVALID_TAG_CHARACTER,
                                                           tag=source_tag, index_in_tag=starting_index + i,
                                                           index_in_tag_end=starting_index + i + 1,
                                                           actual_error=error_code)
        return validation_issues

    @staticmethod
    def _check_invalid_prefix_issues(original_tag):
        """Check for invalid schema namespace.

        Parameters:
            original_tag (HedTag): Tag to look

        Returns:
            list: List of dictionaries with validation issues.
        """
        issues = []
        schema_namespace = original_tag.schema_namespace
        # The last character is dropped before the alphabetic test
        # (presumably the trailing ":" separator - confirm against HedTag).
        if schema_namespace and not schema_namespace[:-1].isalpha():
            issues += ErrorHandler.format_error(ValidationErrors.TAG_NAMESPACE_PREFIX_INVALID,
                                                tag=original_tag, tag_namespace=schema_namespace)
        return issues

    @staticmethod
    def _report_invalid_character_error(hed_string, index):
        """ Report an invalid character.

        Parameters:
            hed_string (str): The HED string that caused the error.
            index (int): The index of the invalid character in the HED string.

        Returns:
            list: A singleton list with a dictionary representing the error.
        """
        error_type = ValidationErrors.CHARACTER_INVALID
        character = hed_string[index]
        # Tildes get their own, more specific error type.
        if character == "~":
            error_type = ValidationErrors.TILDES_UNSUPPORTED
        return ErrorHandler.format_error(error_type, char_index=index, source_string=hed_string)
class CharRexValidator(CharValidator):
    """Character validator that also checks text against regular expressions loaded from JSON.

    The expressions are read once, at construction time, from the file named by
    CLASS_REX_FILENAME in this module's directory.
    """

    def __init__(self, modern_allowed_char_rules=False):
        """Does basic character validation for HED strings/tags.

        Parameters:
            modern_allowed_char_rules(bool): If True, use 8.3 style rules for unicode characters.
        """
        super().__init__(modern_allowed_char_rules)
        self._rex_dict = self._get_rex_dict()

    def get_problem_chars(self, input_string, class_name):
        """Find the characters of a string that are not allowed for the given class.

        Parameters:
            input_string (str): The text to check, one character at a time.
            class_name (str): Key into the "class_chars" section of the loaded regex dictionary.

        Returns:
            list: Tuples of (index, character) for every character that matches none of the
                  character regexes listed for class_name. Empty if class_name has no entry
                  (or an empty entry) in "class_chars".

        Raises:
            KeyError: If a character class listed for class_name is missing from "char_regex".
        """
        # Retrieve the allowed character classes for the given class_name
        allowed_classes = self._rex_dict["class_chars"].get(class_name, [])
        if not allowed_classes:
            return []

        # One combined pattern that accepts a character from any allowed class;
        # compiled once, outside the per-character loop.
        char_regex = self._rex_dict["char_regex"]
        combined_regex = "|".join([char_regex[char_class] for char_class in allowed_classes])
        compiled_regex = re.compile(combined_regex)

        return [(index, char) for index, char in enumerate(input_string)
                if not compiled_regex.match(char)]

    def is_valid_value(self, input_string, class_name):
        """Check a value against the word-level regex of the given class.

        Parameters:
            input_string (str): The text to check.
            class_name (str): Key into the "class_words" section of the loaded regex dictionary.

        Returns:
            bool or re.Match: True if class_name has no regex in "class_words". Otherwise the
                              (truthy) match object when the regex matches, or False when it
                              does not.

        Notes:
            - re.match is used, so the pattern is anchored only at the start of input_string;
              whether the whole string must match depends on the pattern itself.
        """
        class_regex = self._rex_dict["class_words"].get(class_name, [])
        if not class_regex:
            return True
        # A failed match yields None; normalize that to False and pass a match object through.
        return re.match(class_regex, input_string) or False

    @staticmethod
    def _get_rex_dict():
        """Load the regex dictionary from the JSON file next to this module.

        Returns:
            dict: The parsed contents of CLASS_REX_FILENAME.
        """
        current_dir = os.path.dirname(os.path.abspath(__file__))
        json_path = os.path.realpath(os.path.join(current_dir, CLASS_REX_FILENAME))
        with open(json_path, 'r', encoding='utf-8') as f:
            return json.load(f)
if __name__ == "__main__":
    # Manual smoke-test scaffold; it only defines sample inputs and does no work
    # until the example call below is uncommented.

    # Example input string
    input_string = "Hello World123"

    # Class name (e.g., "nameClass" or "testClass")
    class_name = "nameClass"

    # Call the method and print the result
    # problem_indices = CharRexValidator().get_problem_chars(input_string, class_name)
    # print(problem_indices)