""" Module hed.schema.schema_compare: functions for comparing two HED schemas section by section. """

from hed.schema.hed_schema import HedSchema, HedKey
from hed.schema.hed_schema_constants import HedSectionKey

# Key under which compare_schemas reports differences that live outside the regular sections:
# the header attributes, the prologue, and the epilogue.  This pseudo-section is still in design.
MiscSection = "misc"

# One row per schema section: (section key, singular display name, plural display name).
_SECTION_DISPLAY_NAMES = (
    (HedSectionKey.Tags, "Tag", "Tags"),
    (HedSectionKey.Units, "Unit", "Units"),
    (HedSectionKey.UnitClasses, "Unit Class", "Unit Classes"),
    (HedSectionKey.ValueClasses, "Value Class", "Value Classes"),
    (HedSectionKey.UnitModifiers, "Unit Modifier", "Unit Modifiers"),
    (HedSectionKey.Properties, "Property", "Properties"),
    (HedSectionKey.Attributes, "Attribute", "Attributes"),
)

# Singular display name for an entry of each section (used in per-entry output lines).
SectionEntryNames = {section_key: singular for section_key, singular, _ in _SECTION_DISPLAY_NAMES}

# Plural display name for each section (used in the summary header).
SectionEntryNamesPlural = {section_key: plural for section_key, _, plural in _SECTION_DISPLAY_NAMES}


def find_matching_tags(schema1, schema2, output='raw', sections=(HedSectionKey.Tags,),
                       include_summary=True):
    """
    Compare the tags in two library schemas.  This finds tags with the same term.

    Entries are paired by name, so an entry counts as matching when both schemas contain it,
    whether or not its contents are equal.  The comparison uses the default attribute_filter of
    compare_schemas.

    Parameters:
        schema1 (HedSchema): The first schema to be compared.
        schema2 (HedSchema): The second schema to be compared.
        output (str): 'raw' (default) returns a dict of python objects:
                      {section_key: {name: (entry1, entry2)}}
                      'string' returns a single string
                      'dict' returns a json style dictionary
        sections(list or None): the list of sections to compare.  By default, just the tags section.
                        If None, checks all sections.
        include_summary(bool): If True, adds the 'summary' dict to the dict return option, and prints it with the
                               string option.  Lists the names of all the entries found in both schemas.
    Returns:
        dict, json style dict, or str: The entries present in both schemas, in the requested format.
    """
    matches, _, _, unequal_entries = compare_schemas(schema1, schema2, sections=sections)

    # Entries with the same name but different contents still count as matching terms.
    for section_key, section_dict in matches.items():
        section_dict.update(unequal_entries[section_key])

    # matches now already holds the unequal entries, so summarizing unequal_entries as well
    # would list each of those names twice.
    header_summary = _get_tag_name_summary((matches,))

    if output == 'string':
        final_string = ""
        if include_summary:
            final_string += _pretty_print_header(header_summary)
        if sections is None:
            sections = HedSectionKey
        for section_key in sections:
            # .get() skips keys that have no entry dict here (e.g. MiscSection) instead of raising KeyError.
            entries = matches.get(section_key)
            if not entries:
                continue
            type_name = SectionEntryNames[section_key]
            final_string += f"{type_name} differences:\n"
            final_string += _pretty_print_diff_all(entries, type_name=type_name) + "\n"
        return final_string
    elif output == 'dict':
        output_dict = {}
        if include_summary:
            output_dict["summary"] = {str(key): value for key, value in header_summary.items()}

        for section_name, section_entries in matches.items():
            output_dict[str(section_name)] = {
                key: _dict_diff_entries(entry1, entry2) for key, (entry1, entry2) in section_entries.items()
            }
        return output_dict
    return matches


def compare_differences(schema1, schema2, output='raw', attribute_filter=None, sections=(HedSectionKey.Tags,),
                        include_summary=True):
    """
    Compare the tags in two schemas, this finds any differences

    Parameters:
        schema1 (HedSchema): The first schema to be compared.
        schema2 (HedSchema): The second schema to be compared.
        output (str): 'raw' (default) returns a tuple of python object dicts with raw results.
                      'string' returns a single string
                      'dict' returns a json-style python dictionary that can be converted to JSON
        attribute_filter (str, optional): The attribute to filter entries by.
                                          Entries without this attribute are skipped.
                                          The most common use would be HedKey.InLibrary
                                          If it evaluates to False, no filtering is performed.
        sections(list or None): the list of sections to compare.  By default, just the tags section.
                If None, checks all sections including header, prologue, and epilogue.
        include_summary(bool): If True, adds the 'summary' dict to the dict return option, and prints it with the
                               string option.  Lists the names of all the nodes that are missing or different.

    Returns:
        tuple, str or dict:
        - Tuple with dict entries (not_in_schema1, not_in_schema2, unequal_entries).
        - Formatted string with the output ready for printing.
        - A Python dictionary with the output ready to be converted to JSON (for web output).

    Notes: The underlying dictionaries are:
        - not_in_schema1(dict): Entries present in schema2 but not in schema1.
        - not_in_schema2(dict): Entries present in schema1 but not in schema2.
        - unequal_entries(dict): Entries that differ between the two schemas.

        Header, prologue, and epilogue differences are only part of the 'raw' and 'dict' outputs;
        the 'string' output reports the named schema sections only.

    """
    _, not_in_1, not_in_2, unequal_entries = compare_schemas(schema1, schema2, attribute_filter=attribute_filter,
                                                             sections=sections)

    if sections is None:
        sections = HedSectionKey

    header_summary = _get_tag_name_summary((not_in_1, not_in_2, unequal_entries))
    if output == 'string':
        final_string = ""
        if include_summary:
            final_string += _pretty_print_header(header_summary)
            if not final_string:
                # An empty summary means no named section has any difference to report.
                return final_string
            final_string = ("Overall summary:\n================\n" + final_string +
                            "\n\n\nSummary details:\n================\n\n")
        for section_key in sections:
            if section_key not in SectionEntryNames:
                # e.g. MiscSection: it has no display name and no not_in_1/not_in_2 entries,
                # so looking it up below would raise KeyError.
                continue
            val1, val2, val3 = unequal_entries[section_key], not_in_1[section_key], not_in_2[section_key]
            type_name = SectionEntryNames[section_key]
            if val1 or val2 or val3:
                final_string += f"{type_name} differences:\n"
                if val1:
                    final_string += _pretty_print_diff_all(val1, type_name=type_name) + "\n"
                if val2:
                    final_string += _pretty_print_missing_all(val2, "Schema1", type_name) + "\n"
                if val3:
                    final_string += _pretty_print_missing_all(val3, "Schema2", type_name) + "\n"
                final_string += "\n\n"
        return final_string
    elif output == 'dict':
        output_dict = {}
        if include_summary:
            output_dict["summary"] = {str(key): value for key, value in header_summary.items()}

        # Entries in both schemas that differ: keep only the fields that differ.
        output_dict["unequal"] = {
            str(section_name): {key: _dict_diff_entries(entry1, entry2)
                                for key, (entry1, entry2) in section_entries.items()}
            for section_name, section_entries in unequal_entries.items()
        }

        # Entries missing from one schema: dump the whole entry.
        for result_key, missing_entries in (("not_in_1", not_in_1), ("not_in_2", not_in_2)):
            output_dict[result_key] = {
                str(section_name): {key: _entry_to_dict(entry) for key, entry in section_entries.items()}
                for section_name, section_entries in missing_entries.items()
            }
        return output_dict
    return not_in_1, not_in_2, unequal_entries


def compare_schemas(schema1, schema2, attribute_filter=HedKey.InLibrary, sections=(HedSectionKey.Tags,)):
    """
    Compare two schemas section by section.
    The function records matching entries, entries present in one schema but not in the other, and unequal entries.

    Entries are paired by their short_tag_name in the tags section and by their name in every other section.

    Parameters:
        schema1 (HedSchema): The first schema to be compared.
        schema2 (HedSchema): The second schema to be compared.
        attribute_filter (str, optional): The attribute to filter entries by.
                                        Entries without this attribute are skipped.
                                        The most common use would be HedKey.InLibrary
                                        If it evaluates to False, no filtering is performed.
        sections(list or None): the list of sections to compare.  By default, just the tags section.
                        If None, checks all sections including header, prologue, and epilogue.
                        Include MiscSection in the list to compare header, prologue, and epilogue.

    Returns:
        tuple: A tuple containing four dictionaries, each keyed by section:
            - matches(dict): Entries present in both schemas and are equal, as (entry1, entry2) pairs.
            - not_in_schema1(dict): Entries present in schema2 but not in schema1.
            - not_in_schema2(dict): Entries present in schema1 but not in schema2.
            - unequal_entries(dict): Entries present in both schemas but are not equal, as (entry1, entry2) pairs.
              Header, prologue, and epilogue differences are stored here under MiscSection as pairs of strings.
    """
    matches = {}
    not_in_schema1 = {}
    not_in_schema2 = {}
    unequal_entries = {}

    if sections is None or MiscSection in sections:
        misc_differences = {}
        # Fetch each header once; the values are needed for both the comparison and the report.
        header1 = schema1.get_save_header_attributes()
        header2 = schema2.get_save_header_attributes()
        if header1 != header2:
            misc_differences['header_attributes'] = (str(header1), str(header2))
        if schema1.prologue != schema2.prologue:
            misc_differences['prologue'] = (schema1.prologue, schema2.prologue)
        if schema1.epilogue != schema2.epilogue:
            misc_differences['epilogue'] = (schema1.epilogue, schema2.epilogue)
        unequal_entries[MiscSection] = misc_differences

    for section_key in HedSectionKey:
        if sections is not None and section_key not in sections:
            continue

        # The attribute whose value identifies an entry within this section.
        name_attribute = 'short_tag_name' if section_key == HedSectionKey.Tags else 'name'

        entries1 = {getattr(entry, name_attribute): entry for entry in schema1[section_key].all_entries
                    if not attribute_filter or entry.has_attribute(attribute_filter)}
        entries2 = {getattr(entry, name_attribute): entry for entry in schema2[section_key].all_entries
                    if not attribute_filter or entry.has_attribute(attribute_filter)}

        not_in_schema2[section_key] = {key: entry for key, entry in entries1.items() if key not in entries2}
        not_in_schema1[section_key] = {key: entry for key, entry in entries2.items() if key not in entries1}

        # Classify every shared name with a single comparison per pair.
        equal_pairs = {}
        unequal_pairs = {}
        for key, entry1 in entries1.items():
            if key not in entries2:
                continue
            entry2 = entries2[key]
            if entry1 == entry2:
                equal_pairs[key] = (entry1, entry2)
            else:
                unequal_pairs[key] = (entry1, entry2)
        unequal_entries[section_key] = unequal_pairs
        matches[section_key] = equal_pairs

    return matches, not_in_schema1, not_in_schema2, unequal_entries


def _get_tag_name_summary(tag_dicts):
    """
    Collects the entry names found in a group of per-section result dictionaries.

    Parameters:
        tag_dicts (iterable of dict): Dictionaries mapping a section key to a dict keyed by entry name.
                                      Anything stored under MiscSection is ignored.

    Returns:
        dict: One list of names per HedSectionKey (empty when nothing was found for that section).
    """
    summary = {}
    for known_section in HedSectionKey:
        summary[known_section] = []

    for tag_dict in tag_dicts:
        for section_key in tag_dict:
            # The misc pseudo-section holds header/prologue/epilogue values, not named entries.
            if section_key != MiscSection:
                summary[section_key].extend(tag_dict[section_key].keys())

    return summary


def _pretty_print_header(summary_dict):
    
    output_string = ""
    first_entry = True
    for section_key, tag_names in summary_dict.items():
        if not tag_names:
            continue
        type_name = SectionEntryNamesPlural[section_key]
        if not first_entry:
            output_string += "\n"
        output_string += f"{type_name}: "

        output_string += ", ".join(sorted(tag_names))

        output_string += "\n"
        first_entry = False
    return output_string


def _pretty_print_entry(entry):
    """ Returns the contents of a HedSchemaEntry object as a list of strings.

    Parameters:
        entry (HedSchemaEntry): The HedSchemaEntry object to be displayed.

    Returns:
        List of strings representing the entry.
    """
    # Initialize the list with the name of the entry
    output = [f"\tName: {entry.name}"]

    # Add the description to the list if it exists
    if entry.description is not None:
        output.append(f"\tDescription: {entry.description}")

    # Iterate over all attributes and add them to the list
    for attr_key, attr_value in entry.attributes.items():
        output.append(f"\tAttribute: {attr_key} - Value: {attr_value}")

    return output


def _entry_to_dict(entry):
    """
    Returns the contents of a HedSchemaEntry object as a dictionary.

    Parameters:
        entry (HedSchemaEntry): The HedSchemaEntry object to be displayed.

    Returns:
        Dictionary representing the entry.
    """
    output = {
        "Name": entry.name,
        "Description": entry.description,
        "Attributes": entry.attributes
    }
    return output


def _dict_diff_entries(entry1, entry2):
    """
    Returns the differences between two HedSchemaEntry objects as a dictionary.

    Parameters:
        entry1 (HedSchemaEntry or str): The first entry.
        entry2 (HedSchemaEntry or str): The second entry.

    Returns:
        Dictionary representing the differences.
    """
    diff_dict = {}

    if isinstance(entry1, str):
        # Handle special case ones like prologue
        if entry1 != entry2:
            diff_dict["value"] = {
                "Schema1": entry1,
                "Schema2": entry2
            }
    else:
        if entry1.name != entry2.name:
            diff_dict["name"] = {
                "Schema1": entry1.name,
                "Schema2": entry2.name
            }

        # Checking if both entries have the same description
        if entry1.description != entry2.description:
            diff_dict["description"] = {
                "Schema1": entry1.description,
                "Schema2": entry2.description
            }

        # Comparing attributes
        for attr in set(entry1.attributes.keys()).union(entry2.attributes.keys()):
            if entry1.attributes.get(attr) != entry2.attributes.get(attr):
                diff_dict[attr] = {
                    "Schema1": entry1.attributes.get(attr),
                    "Schema2": entry2.attributes.get(attr)
                }

    return diff_dict


def _pretty_print_diff_entry(entry1, entry2):
    """
    Describes how two entries differ as tab-indented lines of text.

    Parameters:
        entry1 (HedSchemaEntry): The first entry.
        entry2 (HedSchemaEntry): The second entry.

    Returns:
        list of str: For every difference reported by _dict_diff_entries, a line naming the field
                     followed by one further-indented line per schema with that schema's value.
    """
    lines = []
    for field, per_schema in _dict_diff_entries(entry1, entry2).items():
        lines.append(f"\t{field}:")
        lines.extend(f"\t\t{schema}: {val}" for schema, val in per_schema.items())
    return lines


def _pretty_print_diff_all(entries, type_name=""):
    """
    Formats the differences between pairs of HedSchemaEntry objects.

    Parameters:
        entries (dict): A dictionary where each key maps to a pair of HedSchemaEntry objects.
        type_name(str): The type to identify this as, such as Tag
    Returns:
        diff_string(str): The differences found in the dict
    """
    output = []
    if not type_name.endswith(" "):
        type_name += " "
    if not entries:
        return ""
    for key, (entry1, entry2) in entries.items():
        output.append(f"{type_name}'{key}':")
        output += _pretty_print_diff_entry(entry1, entry2)
        output.append("")

    return "\n".join(output)


def _pretty_print_missing_all(entries, schema_name, type_name):
    """
    Formats the missing entries from schema_name.

    Parameters:
        entries (dict): A dictionary where each key maps to a pair of HedSchemaEntry objects.
        schema_name(str): The name these entries are missing from
        type_name(str): The type to identify this as, such as Tag
    Returns:
        diff_string(str): The differences found in the dict
    """
    output = []
    if not entries:
        return ""
    if not type_name.endswith(" "):
        type_name += " "
    for key, entry in entries.items():
        output.append(f"{type_name}'{key}' not in '{schema_name}':")
        output += _pretty_print_entry(entry)
        output.append("")

    return "\n".join(output)