Source code for hed.tools.remodeling.operations.summarize_hed_validation_op

""" Validate the HED tags in a dataset and report errors. """

import os
from hed.errors import ErrorSeverity, ErrorHandler, get_printable_issue_string
from hed.models.sidecar import Sidecar
from hed.models.tabular_input import TabularInput
from hed.tools.remodeling.operations.base_op import BaseOp
from hed.tools.remodeling.operations.base_summary import BaseSummary


class SummarizeHedValidationOp(BaseOp):
    """ Remodeling operation that validates HED annotations and records the issues in a summary.

    Required remodeling parameters:
        - **summary_name** (*str*): The name of the summary.
        - **summary_filename** (*str*): Base filename of the summary.

    Optional remodeling parameters:
        - **append_timecode** (*bool*): Stored on the operation (default False); not used directly in this class.
        - **check_for_warnings** (*bool*): If true include warnings as well as errors (default False).

    The purpose of this op is to produce a summary of the HED validation errors in a file.

    """

    # Parameter specification handed to the BaseOp constructor.
    PARAMS = {
        "operation": "summarize_hed_validation",
        "required_parameters": {
            "summary_name": str,
            "summary_filename": str
        },
        "optional_parameters": {
            "append_timecode": bool,
            "check_for_warnings": bool
        }
    }

    SUMMARY_TYPE = 'hed_validation'

    def __init__(self, parameters):
        """ Constructor for the summarize hed validation operation.

        Parameters:
            parameters (dict): Dictionary with the parameter values for required and optional parameters.

        :raises KeyError:
            - If a required parameter is missing.
            - If an unexpected parameter is provided.

        :raises TypeError:
            - If a parameter has the wrong type.

        """
        super().__init__(self.PARAMS, parameters)
        # Optional settings fall back to False when they are not supplied.
        self.check_for_warnings = parameters.get('check_for_warnings', False)
        self.append_timecode = parameters.get('append_timecode', False)
        # Required settings are indexed directly.
        self.summary_filename = parameters['summary_filename']
        self.summary_name = parameters['summary_name']

    def do_op(self, dispatcher, df, name, sidecar=None):
        """ Validate the dataframe with the accompanying sidecar, if any.

        Parameters:
            dispatcher (Dispatcher): Manages the operation I/O.
            df (DataFrame): The DataFrame to be validated.
            name (str): Unique identifier for the dataframe -- often the original file path.
            sidecar (Sidecar or file-like): Usually needed unless only HED tags in HED column of event file.

        Returns:
            DataFrame: A copy of df.

        Side effect:
            Creates the summary for this operation in dispatcher.summary_dicts if needed and updates it.

        """
        df_copy = df.copy()
        validation_summary = dispatcher.summary_dicts.get(self.summary_name)
        if not validation_summary:
            # First file seen for this summary name: create the summary and register it.
            validation_summary = HedValidationSummary(self)
            dispatcher.summary_dicts[self.summary_name] = validation_summary
        update_info = {
            'df': dispatcher.post_proc_data(df_copy),
            'name': name,
            'schema': dispatcher.hed_schema,
            'sidecar': sidecar
        }
        validation_summary.update_summary(update_info)
        return df_copy
class HedValidationSummary(BaseSummary):
    """ Accumulates HED validation issues for individual tabular files and merges them into a dataset summary.

    Per-file results are stored in self.summary_dict, keyed by the file name passed to update_summary.

    """

    def __init__(self, sum_op):
        """ Constructor for the validation summary.

        Parameters:
            sum_op (BaseOp): The operation that created this summary; must have a check_for_warnings attribute.

        """
        super().__init__(sum_op)
        self.check_for_warnings = sum_op.check_for_warnings

    def _get_result_string(self, name, result, indent=BaseSummary.DISPLAY_INDENT):
        """ Return a formatted string with the summary for the indicated name.

        Parameters:
            name (str): Identifier (usually the filename) of the individual file.
            result (dict): The dictionary of the summary results indexed by name.
            indent (str): A string containing spaces used for indentation (usually 3 spaces).

        Returns:
            str - The results in a printable format ready to be saved to a text file.

        Notes:
            - This gets the error list from "sidecar_issues" and "event_issues".
            - A merged result reports counts only; an individual result lists the issues themselves.
            - Missing entries in "Specifics" are treated as empty rather than raising KeyError.

        """
        specifics = result.get("Specifics", {})
        sidecar_issues = specifics.get('sidecar_issues', {})
        event_issues = specifics.get('event_issues', {})
        sum_list = [f"{name}: [{len(specifics.get('sidecar_files', []))} sidecar files, "
                    f"{len(specifics.get('event_files', []))} event files]"]
        if specifics.get('is_merged'):
            sum_list.extend(self.get_error_list(sidecar_issues, count_only=True, indent=indent))
            sum_list.extend(self.get_error_list(event_issues, count_only=True, indent=indent))
        else:
            # List the sidecar issues exactly once (they were previously appended twice on sidecar failure).
            sum_list.extend(self.get_error_list(sidecar_issues, count_only=False, indent=indent*2))
            # update_summary does not validate the events when the sidecar has errors, so the
            # (empty) event issue list is only reported when the sidecar passed.
            if not specifics.get('sidecar_had_issues', False):
                sum_list.extend(self.get_error_list(event_issues, count_only=False, indent=indent*2))
        return "\n".join(sum_list)

    def update_summary(self, new_info):
        """ Update the summary for a given tabular input file.

        Parameters:
            new_info (dict): A dictionary with the parameters needed to update a summary.

        Notes:
            - The summary needs a "name" str, a "schema", a "df", and optionally a "sidecar".
            - A "sidecar" that is not already a Sidecar is wrapped in one, named by its path basename.
            - The tabular data is only validated when the sidecar has no error-level issues.

        """
        sidecar = new_info.get('sidecar', None)
        if sidecar and not isinstance(sidecar, Sidecar):
            sidecar = Sidecar(files=new_info['sidecar'], name=os.path.basename(sidecar))
        results = self._get_sidecar_results(sidecar, new_info, self.check_for_warnings)
        if not results['sidecar_had_issues']:
            input_data = TabularInput(new_info['df'], sidecar=sidecar)
            issues = input_data.validate(new_info['schema'])
            if not self.check_for_warnings:
                issues = ErrorHandler.filter_issues_by_severity(issues, ErrorSeverity.ERROR)
            # Each issue is converted separately so that the stored list has one string per issue.
            issues = [get_printable_issue_string([issue], skip_filename=True) for issue in issues]
            results['event_issues'][new_info["name"]] = issues
            results['total_event_issues'] = len(issues)
        self.summary_dict[new_info["name"]] = results

    def get_details_dict(self, summary_info):
        """ Return the summary details built from the summary_info.

        Parameters:
            summary_info (dict): Dictionary of issues (in the format produced by get_empty_results).

        Returns:
            dict: Dictionary with keys "Name", "Total events", "Total files", "Files", and "Specifics",
                  where "Specifics" holds the summary_info that was passed in.

        """
        event_files = summary_info.get("event_files", [])
        return {"Name": "", "Total events": "n/a",
                "Total files": len(event_files),
                "Files": event_files,
                "Specifics": summary_info}

    def merge_all_info(self):
        """ Create a dictionary containing all the errors in the dataset.

        Returns:
            dict - dictionary of issues organized into sidecar_issues and event_issues.

        """
        results = self.get_empty_results()
        results["is_merged"] = True
        for key, ind_results in self.summary_dict.items():
            HedValidationSummary._update_sidecar_results(results, ind_results)
            results["event_files"].append(key)
            HedValidationSummary._update_events_results(results, ind_results)
        return results

    @staticmethod
    def _update_events_results(results, ind_results):
        """ Fold the event issues of one file into the merged results (modifies results in place).

        Parameters:
            results (dict): The merged results being accumulated.
            ind_results (dict): The results for an individual file.

        Notes:
            - Merged event issues hold a string per file: either the issue count or a note
              that validation was incomplete because of sidecar issues.

        """
        results["total_event_issues"] += ind_results["total_event_issues"]
        for ikey, errors in ind_results["event_issues"].items():
            if ind_results["sidecar_had_issues"]:
                results["event_issues"][ikey] = \
                    f"Validation incomplete due to {ind_results['total_sidecar_issues']} sidecar issues"
            else:
                results["event_issues"][ikey] = f"{len(errors)}"

    @staticmethod
    def _update_sidecar_results(results, ind_results):
        """ Fold the sidecar issues of one file into the merged results (modifies results in place).

        Parameters:
            results (dict): The merged results being accumulated.
            ind_results (dict): The results for an individual file.

        """
        # NOTE(review): totals and sidecar_files accumulate once per event file, while sidecar_issues
        # is keyed by sidecar name -- a sidecar shared by several files is counted repeatedly. Confirm intent.
        results["total_sidecar_issues"] += ind_results["total_sidecar_issues"]
        results["sidecar_files"] = results["sidecar_files"] + ind_results["sidecar_files"]
        for ikey, errors in ind_results["sidecar_issues"].items():
            results["sidecar_issues"][ikey] = errors

    @staticmethod
    def get_empty_results():
        """ Return a new results dictionary with empty lists/dicts, zero counts, and False flags.

        Returns:
            dict: The empty results structure used for both individual and merged results.

        """
        return {"event_files": [], "total_event_issues": 0, "event_issues": {}, "is_merged": False,
                "sidecar_files": [], "total_sidecar_issues": 0, "sidecar_issues": {},
                "sidecar_had_issues": False}

    @staticmethod
    def get_error_list(error_dict, count_only=False, indent=BaseSummary.DISPLAY_INDENT):
        """ Convert a dictionary of issues into a list of printable lines.

        Parameters:
            error_dict (dict): Maps a file name to a list of issue strings or to an already summarized value.
            count_only (bool): If True, report only a count (or the summarized value) for each key.
            indent (str): Currently unused; retained so existing callers keep working.

        Returns:
            list: The lines (str) describing the issues for each key.

        """
        error_list = []
        for key, item in error_dict.items():
            if count_only and isinstance(item, list):
                error_list.append(f"{key}: {len(item)} issues")
            elif count_only:
                # Merged results hold either a numeric count or a complete message; only a
                # count needs the " issues" suffix (avoids "... sidecar issues issues").
                text = str(item)
                suffix = " issues" if text.isdigit() else ""
                error_list.append(f"{key}: {text}{suffix}")
            elif not len(item):
                error_list.append(f"{key} has no issues")
            else:
                error_list.append(f"{key}:")
                if isinstance(item, str):
                    # A summarized string is emitted as one line rather than split into characters.
                    error_list.append(item)
                else:
                    error_list.extend(item)
        return error_list

    @staticmethod
    def _format_errors(error_list, name, errors, indent):
        """ Append a header and one formatted line per error to error_list (modifies error_list in place).

        Parameters:
            error_list (list): The list of output lines being built.
            name (str): The name used in the header line.
            errors (list): The error dictionaries to be formatted.
            indent (str): The indentation prefix for the header (doubled for each error line).

        """
        error_list.append(f"{indent}{name} issues:")
        for this_item in errors:
            error_list.append(f"{indent * 2}{HedValidationSummary._format_error(this_item)}")

    @staticmethod
    def _format_error(error):
        """ Return a one-line description of an error dictionary.

        Parameters:
            error (dict): An error with 'code' and 'message' entries and optional location entries.

        Returns:
            str: The code, an optional bracketed list of locations, and the message.

        """
        error_str = error['code']
        error_locations = []
        HedValidationSummary.update_error_location(error_locations, "row", "ec_row", error)
        HedValidationSummary.update_error_location(error_locations, "column", "ec_column", error)
        HedValidationSummary.update_error_location(error_locations, "sidecar column",
                                                   "ec_sidecarColumnName", error)
        HedValidationSummary.update_error_location(error_locations, "sidecar key", "ec_sidecarKeyName", error)
        location_str = ",".join(error_locations)
        if location_str:
            error_str = error_str + f"[{location_str}]"
        error_str = error_str + f": {error['message']}"
        return error_str

    @staticmethod
    def update_error_location(error_locations, location_name, location_key, error):
        """ Append a "name=value" location to error_locations if the error has the location key.

        Parameters:
            error_locations (list): The list of location strings being built (modified in place).
            location_name (str): The display name of the location.
            location_key (str): The key in the error dictionary holding the location.
            error (dict): The error dictionary; the first element of the location entry is displayed.

        """
        if location_key in error:
            error_locations.append(f"{location_name}={error[location_key][0]}")

    @staticmethod
    def _get_sidecar_results(sidecar, new_info, check_for_warnings):
        """ Create the results for one file, filled in with the sidecar validation results.

        Parameters:
            sidecar (Sidecar or None): The sidecar to validate, if any.
            new_info (dict): Dictionary with at least the "name" and "schema" entries.
            check_for_warnings (bool): If True, warnings are reported as well as errors.

        Returns:
            dict: Results in the format of get_empty_results, with an empty event issue list for the file.

        Notes:
            - "sidecar_had_issues" is set only by error-level issues, even when warnings are reported.

        """
        results = HedValidationSummary.get_empty_results()
        results["event_files"].append(new_info["name"])
        results["event_issues"][new_info["name"]] = []
        if sidecar:
            results["sidecar_files"].append(sidecar.name)
            results["sidecar_issues"][sidecar.name] = []
            sidecar_issues = sidecar.validate(new_info['schema'])
            filtered_issues = ErrorHandler.filter_issues_by_severity(sidecar_issues, ErrorSeverity.ERROR)
            if filtered_issues:
                results["sidecar_had_issues"] = True
            if not check_for_warnings:
                sidecar_issues = filtered_issues
            str_issues = [get_printable_issue_string([issue], skip_filename=True) for issue in sidecar_issues]
            results['sidecar_issues'][sidecar.name] = str_issues
            results['total_sidecar_issues'] = len(sidecar_issues)
        return results