Source code for hed.validator.sidecar_validator

import copy
import re
from hed.errors import ErrorHandler, ErrorContext, SidecarErrors, DefinitionErrors, ColumnErrors
from hed.models import ColumnType
from hed import HedString
from hed import Sidecar
from hed.models.column_metadata import ColumnMetadata
from hed.errors.error_reporter import sort_issues
from hed.models.model_constants import DefTagNames
from hed.errors.error_reporter import check_for_any_errors


# TODO: Add/improve validation for definitions being in known columns (right now it just assumes they aren't).
class SidecarValidator:
    """ Validates the structure, column references, and HED strings of a sidecar against a HED schema. """

    # Column names that may not be used as a sidecar column (reported as SIDECAR_HED_USED_COLUMN).
    reserved_column_names = ["HED"]
    # Category keys that may not carry a HED annotation (reported as SIDECAR_NA_USED).
    reserved_category_values = ["n/a"]

    def __init__(self, hed_schema):
        """ Constructor for the SidecarValidator class.

        Parameters:
            hed_schema (HedSchema): HED schema object to use for validation.
        """
        self._schema = hed_schema

    def validate(self, sidecar, extra_def_dicts=None, name=None, error_handler=None):
        """ Validate the input data using the schema.

        Parameters:
            sidecar (Sidecar): Input data to be validated.
            extra_def_dicts (list or DefinitionDict): Extra def dicts in addition to sidecar.
            name (str): The name to report this sidecar as.
            error_handler (ErrorHandler): Error context to use.  Creates a new one if None.

        Returns:
            issues (list of dict): A list of issues associated with each level in the HED string.

        Notes:
            - If the structure or column-reference checks report any errors, validation stops early
              and the (unsorted) issues found so far are returned.
            - Every error context pushed by this method is popped again before it returns, so a
              caller-supplied error_handler is left with the context stack it came in with.
        """
        # Imported locally, as in the original code (presumably to avoid a circular import).
        from hed.validator import HedValidator
        issues = []
        if error_handler is None:
            error_handler = ErrorHandler()

        error_handler.push_error_context(ErrorContext.FILE_NAME, name)
        issues += self.validate_structure(sidecar, error_handler=error_handler)
        issues += self._validate_refs(sidecar, error_handler)

        # only allowed early out, something is very wrong with structure or refs
        if check_for_any_errors(issues):
            error_handler.pop_error_context()  # FILE_NAME
            return issues

        sidecar_def_dict = sidecar.get_def_dict(hed_schema=self._schema, extra_def_dicts=extra_def_dicts)
        hed_validator = HedValidator(self._schema,
                                     def_dicts=sidecar_def_dict,
                                     definitions_allowed=True)

        issues += sidecar._extract_definition_issues
        issues += sidecar_def_dict.issues

        # Maps column name -> one entry per HED string holding the definition tags found in it.
        definition_checks = {}
        for column_data in sidecar:
            column_name = column_data.column_name
            column_data = column_data._get_unvalidated_data()
            hed_strings = column_data.get_hed_strings()
            # The key name is only reported when the column has more than one HED string.
            has_key_context = len(hed_strings) > 1
            error_handler.push_error_context(ErrorContext.SIDECAR_COLUMN_NAME, column_name)
            for key_name, hed_string in hed_strings.items():
                new_issues = []
                if has_key_context:
                    error_handler.push_error_context(ErrorContext.SIDECAR_KEY_NAME, key_name)
                hed_string_obj = HedString(hed_string, hed_schema=self._schema, def_dict=sidecar_def_dict)
                hed_string_obj.remove_refs()

                error_handler.push_error_context(ErrorContext.HED_STRING, hed_string_obj)
                new_issues += hed_validator.run_basic_checks(hed_string_obj, allow_placeholders=True)
                new_issues += hed_validator.run_full_string_checks(hed_string_obj)

                def_check_list = definition_checks.setdefault(column_name, [])
                def_check_list.append(hed_string_obj.find_tags({DefTagNames.DEFINITION_KEY}, recursive=True,
                                                               include_groups=0))
                # Might refine this later - for now just skip checking placeholder counts in definition columns.
                if not def_check_list[-1]:
                    new_issues += self._validate_pound_sign_count(hed_string_obj, column_type=column_data.column_type)

                # Attach the context while the HED string (and key name, if any) are still on the
                # stack, then pop in reverse push order.  Popping first dropped the HED string
                # context from issues in multi-key columns.
                error_handler.add_context_and_filter(new_issues)
                issues += new_issues
                error_handler.pop_error_context()  # HED_STRING
                if has_key_context:
                    error_handler.pop_error_context()  # SIDECAR_KEY_NAME
            error_handler.pop_error_context()  # SIDECAR_COLUMN_NAME

        issues += self._check_definitions_bad_spot(definition_checks, error_handler)
        issues = sort_issues(issues)
        # Balance the FILE_NAME push above, mirroring the early-out path.
        error_handler.pop_error_context()  # FILE_NAME
        return issues

    def validate_structure(self, sidecar, error_handler):
        """ Validate the raw structure of this sidecar.

        Parameters:
            sidecar (Sidecar): The sidecar to validate.
            error_handler (ErrorHandler): The error handler to use for error context.

        Returns:
            issues (list): A list of issues found with the structure.
        """
        all_validation_issues = []
        for column_name, dict_for_entry in sidecar.loaded_dict.items():
            error_handler.push_error_context(ErrorContext.SIDECAR_COLUMN_NAME, column_name)
            all_validation_issues += self._validate_column_structure(column_name, dict_for_entry, error_handler)
            error_handler.pop_error_context()
        return all_validation_issues

    def _validate_refs(self, sidecar, error_handler):
        """ Validate the curly-brace column references used in the sidecar's HED strings.

        Parameters:
            sidecar (Sidecar): The sidecar whose HED strings are checked.
            error_handler (ErrorHandler): The error handler to use for error context.

        Returns:
            list: Issues for malformed braces, references to unknown columns, columns that
                  reference themselves, and references to columns that contain references.
        """
        # Work on a copy so adding "HED" can never alter a list owned by the sidecar.
        possible_column_refs = list(sidecar.all_hed_columns)
        if "HED" not in possible_column_refs:
            possible_column_refs.append("HED")

        issues = []
        # Maps column name -> every column reference found in that column's HED strings.
        found_column_references = {}
        for column_data in sidecar:
            column_name = column_data.column_name
            hed_strings = column_data.get_hed_strings()
            error_handler.push_error_context(ErrorContext.SIDECAR_COLUMN_NAME, column_name)
            matches = []
            for key_name, hed_string in hed_strings.items():
                new_issues = []
                if len(hed_strings) > 1:
                    error_handler.push_error_context(ErrorContext.SIDECAR_KEY_NAME, key_name)

                error_handler.push_error_context(ErrorContext.HED_STRING,
                                                 HedString(hed_string, hed_schema=self._schema))
                invalid_locations = self._find_non_matching_braces(hed_string)
                for loc in invalid_locations:
                    bad_symbol = hed_string[loc]
                    new_issues += error_handler.format_error_with_context(ColumnErrors.MALFORMED_COLUMN_REF,
                                                                          column_name, loc, bad_symbol)

                sub_matches = re.findall(r"\{([a-z_\-0-9]+)\}", hed_string, re.IGNORECASE)
                matches.append(sub_matches)
                for match in sub_matches:
                    if match not in possible_column_refs:
                        new_issues += error_handler.format_error_with_context(ColumnErrors.INVALID_COLUMN_REF,
                                                                              match)

                # These issues were created with format_error_with_context, so they already carry
                # the HED string / key context; the pops are in reverse push order.
                error_handler.pop_error_context()  # HED_STRING
                if len(hed_strings) > 1:
                    error_handler.pop_error_context()  # SIDECAR_KEY_NAME
                error_handler.add_context_and_filter(new_issues)
                issues += new_issues
            error_handler.pop_error_context()  # SIDECAR_COLUMN_NAME

            references = [match for sublist in matches for match in sublist]
            if references:
                found_column_references[column_name] = references
            if column_name in references:
                issues += error_handler.format_error_with_context(ColumnErrors.SELF_COLUMN_REF, column_name)

        # A referenced column may not itself contain references.
        for column_name, refs in found_column_references.items():
            for ref in refs:
                if ref in found_column_references and ref != column_name:
                    issues += error_handler.format_error_with_context(ColumnErrors.NESTED_COLUMN_REF,
                                                                      column_name, ref)

        return issues

    @staticmethod
    def _find_non_matching_braces(hed_string):
        """ Find the positions of curly braces that are unmatched or nested.

        Parameters:
            hed_string (str): The raw string to scan.

        Returns:
            list: Indexes into hed_string of each offending brace: an opening brace that is followed
                  by another opening brace before being closed, a closing brace with no open brace,
                  or an opening brace still open at the end of the string.
        """
        bad_indexes = []
        open_brace_index = -1  # -1 means no brace is currently open.

        for i, char in enumerate(hed_string):
            if char == '{':
                if open_brace_index >= 0:  # Nested brace detected
                    bad_indexes.append(open_brace_index)
                open_brace_index = i
            elif char == '}':
                if open_brace_index >= 0:
                    open_brace_index = -1
                else:
                    bad_indexes.append(i)

        if open_brace_index >= 0:
            bad_indexes.append(open_brace_index)

        return bad_indexes

    @staticmethod
    def _check_for_key(key, data):
        """ Return True if key is a dictionary key anywhere inside nested dicts/lists in data.

        Parameters:
            key (str): The dictionary key to look for.
            data (any): The data to search.  Anything other than a dict or list yields False.

        Returns:
            bool: True if any dict in data (at any depth) has key as one of its keys.
        """
        if isinstance(data, dict):
            return SidecarValidator._check_dict(key, data)
        elif isinstance(data, list):
            return SidecarValidator._check_list(key, data)
        return False

    @staticmethod
    def _check_dict(key, data_dict):
        """ Return True if key is in data_dict or in any dict nested within its values. """
        if key in data_dict:
            return True
        return any(SidecarValidator._check_for_key(key, sub_data) for sub_data in data_dict.values())

    @staticmethod
    def _check_list(key, data_list):
        """ Return True if key is a key of any dict nested within the items of data_list. """
        return any(SidecarValidator._check_for_key(key, sub_data) for sub_data in data_list)

    def _validate_column_structure(self, column_name, dict_for_entry, error_handler):
        """ Checks primarily for type errors such as expecting a string and getting a list in a json sidecar.

        Parameters:
            column_name (str): The name of the column being checked.
            dict_for_entry (dict): The raw sidecar entry for this column.
            error_handler (ErrorHandler): Sets the context for the error reporting.  Cannot be None.

        Returns:
            list: Issues in performing the operations. Each issue is a dictionary.
        """
        val_issues = []
        # A reserved name is reported on its own; no further checks are run for that column.
        if column_name in self.reserved_column_names:
            val_issues += error_handler.format_error_with_context(SidecarErrors.SIDECAR_HED_USED_COLUMN)
            return val_issues

        column_type = ColumnMetadata._detect_column_type(dict_for_entry=dict_for_entry, basic_validation=False)
        if column_type is None:
            val_issues += error_handler.format_error_with_context(SidecarErrors.UNKNOWN_COLUMN_TYPE,
                                                                  column_name=column_name)
        elif column_type == ColumnType.Ignore:
            # An ignored column must not contain a "HED" key at any nesting depth.
            found_hed = self._check_for_key("HED", dict_for_entry)
            if found_hed:
                val_issues += error_handler.format_error_with_context(SidecarErrors.SIDECAR_HED_USED)
        elif column_type == ColumnType.Categorical:
            val_issues += self._validate_categorical_column(column_name, dict_for_entry, error_handler)

        return val_issues

    def _validate_categorical_column(self, column_name, dict_for_entry, error_handler):
        """ Validates a categorical column in a json sidecar.

        Parameters:
            column_name (str): The name of the column being checked.
            dict_for_entry (dict): The raw sidecar entry for this column.  Must have a "HED" key.
            error_handler (ErrorHandler): Sets the context for the error reporting.

        Returns:
            list: Issues for blank HED strings, non-string HED values, and reserved category keys.
        """
        val_issues = []
        raw_hed_dict = dict_for_entry["HED"]
        if not raw_hed_dict:
            val_issues += error_handler.format_error_with_context(SidecarErrors.BLANK_HED_STRING)

        for key_name, hed_string in raw_hed_dict.items():
            error_handler.push_error_context(ErrorContext.SIDECAR_KEY_NAME, key_name)
            # At most one issue is reported per key: blank first, then wrong type, then reserved key.
            if not hed_string:
                val_issues += error_handler.format_error_with_context(SidecarErrors.BLANK_HED_STRING)
            elif not isinstance(hed_string, str):
                val_issues += error_handler.format_error_with_context(SidecarErrors.WRONG_HED_DATA_TYPE,
                                                                      given_type=type(hed_string),
                                                                      expected_type="str")
            elif key_name in self.reserved_category_values:
                val_issues += error_handler.format_error_with_context(SidecarErrors.SIDECAR_NA_USED, column_name)
            error_handler.pop_error_context()

        return val_issues

    def _validate_pound_sign_count(self, hed_string, column_type):
        """ Check if a given HED string in the column has the correct number of pound signs.

        Parameters:
            hed_string (HedString): HED string to be checked.  It is deep-copied, so it is not modified.
            column_type (ColumnType): The type of the column, which determines the expected count.

        Returns:
            list: Issues due to pound sign errors. Each issue is a dictionary.

        Notes:
            Normally the number of # should be either 0 or 1, but sometimes will be higher due to the
            presence of definition tags.
        """
        # Make a copy without definitions to check placeholder count.
        expected_count, error_type = ColumnMetadata.expected_pound_sign_count(column_type)
        hed_string_copy = copy.deepcopy(hed_string)
        hed_string_copy.remove_definitions()
        hed_string_copy.shrink_defs()

        if hed_string_copy.lower().count("#") != expected_count:
            return ErrorHandler.format_error(error_type, pound_sign_count=str(hed_string_copy).count("#"))

        return []

    def _check_definitions_bad_spot(self, definition_checks, error_handler):
        """ Report definitions in columns where only some of the HED strings contain definitions.

        Parameters:
            definition_checks (dict): Maps column name to a list with one entry per HED string in that
                                      column; each entry holds the definition tags found in the string.
            error_handler (ErrorHandler): The error handler to use for error context.

        Returns:
            list: One BAD_DEFINITION_LOCATION issue per definition found in a mixed column.
        """
        issues = []
        # This could be simplified now
        for col_name, has_def in definition_checks.items():
            error_handler.push_error_context(ErrorContext.SIDECAR_COLUMN_NAME, col_name)
            # Exactly one truth value means every string has definitions, or none does.
            def_check = {bool(d) for d in has_def}
            if len(def_check) != 1:
                flat_def_list = [d for defs in has_def for d in defs]
                for d in flat_def_list:
                    issues += error_handler.format_error_with_context(DefinitionErrors.BAD_DEFINITION_LOCATION, d)
            error_handler.pop_error_context()

        return issues