Source code for hed.schema.hed_schema_io

""" Utilities for loading and outputting HED schema. """
import os
import json
import functools

from hed.schema.schema_io.xml2schema import SchemaLoaderXML
from hed.schema.schema_io.wiki2schema import SchemaLoaderWiki
from hed.schema.schema_io.df2schema import SchemaLoaderDF
from hed.schema import hed_cache

from hed.errors.exceptions import HedFileError, HedExceptions
from hed.schema.schema_io import schema_util
from hed.schema.hed_schema_group import HedSchemaGroup
from hed.schema.schema_header_util import validate_version_string
from collections import defaultdict
from urllib.error import URLError

MAX_MEMORY_CACHE = 40


def load_schema_version(xml_version=None, xml_folder=None):
    """ Return a HedSchema or HedSchemaGroup extracted from xml_version

    Parameters:
        xml_version (str or list): List or str specifying which official HED schemas to use.
            A json str format is also supported, based on the output of HedSchema.get_formatted_version
            Basic format: `[schema_namespace:][library_name_]X.Y.Z`.
            An empty list is treated the same as no version at all.
        xml_folder (str): Path to a folder containing schema.

    Returns:
        HedSchema or HedSchemaGroup: A single HedSchema when the request resolves to one namespace,
            otherwise a HedSchemaGroup named after the comma joined schema versions.

    :raises HedFileError:
        - The xml_version is not valid (including a json string that cannot be parsed).
        - The specified version cannot be found or loaded
        - Other fatal errors loading the schema (These are unlikely if you are not editing them locally)
        - The prefix is invalid
    """
    # Check if we start and end with a square bracket, or double quote. This might be valid json
    if xml_version and isinstance(xml_version, str) and \
            ((xml_version[0], xml_version[-1]) in [('[', ']'), ('"', '"')]):
        try:
            xml_version = json.loads(xml_version)
        except json.decoder.JSONDecodeError as e:
            raise HedFileError(HedExceptions.CANNOT_PARSE_JSON, str(e), xml_version) from e

    # An empty list (given directly or decoded from "[]") must not reach the lru_cache wrapped
    # loader: lists are unhashable, so it would fail with a TypeError rather than a HedFileError.
    # Normalizing to None lets the loader report the usual "no version" error.
    if isinstance(xml_version, list) and not xml_version:
        xml_version = None

    if isinstance(xml_version, list):
        # One combined version string per namespace; each is loaded (and cached) separately.
        xml_versions = parse_version_list(xml_version)
        schemas = [_load_schema_version(xml_version=version, xml_folder=xml_folder)
                   for version in xml_versions.values()]
        if len(schemas) == 1:
            return schemas[0]

        name = ",".join([schema.version for schema in schemas])
        return HedSchemaGroup(schemas, name=name)

    return _load_schema_version(xml_version=xml_version, xml_folder=xml_folder)
def load_schema(hed_path, schema_namespace=None, schema=None, name=None):
    """ Load a schema from the given file or URL path.

    Parameters:
        hed_path (str): A filepath or url to open a schema from.
            If loading a TSV file, this should be a single filename where:
                Template: basename.tsv, where files are named basename_Struct.tsv, basename_Tag.tsv, etc.
        schema_namespace (str or None): The name_prefix all tags in this schema will accept.
        schema(HedSchema or None): A hed schema to merge this new file into
                                   It must be a with-standard schema with the same value.
                                   Not supported when loading TSV files.
        name(str or None): User supplied identifier for this schema

    Returns:
        HedSchema: The loaded schema.

    :raises HedFileError:
        - Empty path passed
        - The url could not be read
        - Unknown extension
        - A schema to merge into was passed while loading a TSV file
        - Any fatal issues when loading the schema.
    """
    if not hed_path:
        raise HedFileError(HedExceptions.FILE_NOT_FOUND, "Empty file path passed to HedSchema.load_file",
                           filename=hed_path)

    is_url = hed_cache._check_if_url(hed_path)
    if is_url:
        try:
            file_as_string = schema_util.url_to_string(hed_path)
        except URLError as e:
            raise HedFileError(HedExceptions.URL_ERROR, str(e), hed_path) from e
        # The format is taken from the url's extension. The merge target is forwarded so a url
        # load honors the schema parameter the same way the local file branches below do.
        hed_schema = from_string(file_as_string, schema_format=os.path.splitext(hed_path.lower())[1],
                                 schema=schema, name=name)
    elif hed_path.lower().endswith(".xml"):
        hed_schema = SchemaLoaderXML.load(hed_path, schema=schema, name=name)
    elif hed_path.lower().endswith(".mediawiki"):
        hed_schema = SchemaLoaderWiki.load(hed_path, schema=schema, name=name)
    elif hed_path.lower().endswith(".tsv"):
        if schema is not None:
            raise HedFileError(HedExceptions.INVALID_HED_FORMAT,
                               "Cannot pass a schema to merge into spreadsheet loading currently.", filename=name)
        hed_schema = SchemaLoaderDF.load_spreadsheet(filenames=hed_path, name=name)
    else:
        raise HedFileError(HedExceptions.INVALID_EXTENSION, "Unknown schema extension", filename=hed_path)

    if schema_namespace:
        hed_schema.set_schema_prefix(schema_namespace=schema_namespace)

    return hed_schema
def from_string(schema_string, schema_format=".xml", schema_namespace=None, schema=None, name=None):
    """ Build a schema from text that is already in memory.

    Parameters:
        schema_string (str): The full contents of an XML or mediawiki schema file.
        schema_format (str): Extension naming the format of schema_string.
            Recognized endings: .xml, .mediawiki
        schema_namespace (str, None): The name_prefix all tags in this schema will accept.
        schema(HedSchema or None): A hed schema to merge this new file into
                                   It must be a with-standard schema with the same value.
        name(str or None): User supplied identifier for this schema

    Returns:
        (HedSchema): The loaded schema.

    :raises HedFileError:
        - If empty string or invalid extension is passed.
        - Other fatal formatting issues with file

    Notes:
        - The loader is chosen from the ending of schema_format.
    """
    if not schema_string:
        raise HedFileError(HedExceptions.BAD_PARAMETERS, "Empty string passed to HedSchema.from_string",
                           filename=name)

    # Callers may hand over Windows line endings; the loaders are given plain new lines.
    if isinstance(schema_string, str):
        schema_string = schema_string.replace("\r\n", "\n")

    # Choose the loader first, then make a single load call.
    if schema_format.endswith(".xml"):
        loader = SchemaLoaderXML
    elif schema_format.endswith(".mediawiki"):
        loader = SchemaLoaderWiki
    else:
        raise HedFileError(HedExceptions.INVALID_EXTENSION, f"Unknown schema extension {schema_format}",
                           filename=name)

    hed_schema = loader.load(schema_as_string=schema_string, schema=schema, name=name)
    if not schema_namespace:
        return hed_schema

    hed_schema.set_schema_prefix(schema_namespace=schema_namespace)
    return hed_schema
def from_dataframes(schema_data, schema_namespace=None, name=None):
    """ Create a schema from a dict of spreadsheet contents.

    Parameters:
        schema_data (dict): A dict of DF_SUFFIXES:file_as_string_or_df
            Should have an entry for all values of DF_SUFFIXES.
        schema_namespace (str, None): The name_prefix all tags in this schema will accept.
        name(str or None): User supplied identifier for this schema

    Returns:
        (HedSchema): The loaded schema.

    :raises HedFileError:
        - Empty/invalid parameters (schema_data is empty or is not a dict)
    """
    has_usable_data = schema_data and isinstance(schema_data, dict)
    if not has_usable_data:
        raise HedFileError(HedExceptions.BAD_PARAMETERS, "Empty or non dict value passed to HedSchema.from_dataframes",
                           filename=name)

    loaded_schema = SchemaLoaderDF.load_spreadsheet(schema_as_strings_or_df=schema_data, name=name)
    if not schema_namespace:
        return loaded_schema

    loaded_schema.set_schema_prefix(schema_namespace=schema_namespace)
    return loaded_schema
# If this is actually used, we could easily add other versions/update this one
def get_hed_xml_version(xml_file_path):
    """ Read the version of a HED XML schema file.

    Parameters:
        xml_file_path (str): The path to a HED XML file.

    Returns:
        str: The version number of the HED XML file.

    :raises HedFileError:
        - There is an issue loading the schema
    """
    # Constructing the XML loader on the path is enough; the version is read off its schema.
    return SchemaLoaderXML(xml_file_path).schema.version
[docs]def parse_version_list(xml_version_list): """Takes a list of xml versions and returns a dictionary split by prefix e.g. ["score", "testlib"] will return {"": "score, testlib"} e.g. ["score", "testlib", "ol:otherlib"] will return {"": "score, testlib", "ol:": "otherlib"} Parameters: xml_version_list (list): List of str specifying which hed schemas to use Returns: HedSchema or HedSchemaGroup: The schema or schema group extracted. """ out_versions = defaultdict(list) for version in xml_version_list: schema_namespace = "" if version and ":" in version: schema_namespace, _, version = version.partition(":") if not isinstance(version, str): raise HedFileError(HedExceptions.SCHEMA_VERSION_INVALID, f"Must specify a schema version by number, found no version on {xml_version_list} schema.", filename=None) if version in out_versions[schema_namespace]: raise HedFileError(HedExceptions.SCHEMA_DUPLICATE_LIBRARY, f"Attempting to load the same library '{version}' twice: {out_versions[schema_namespace]}", filename=None) out_versions[schema_namespace].append(version) out_versions = {key: ",".join(value) if not key else f"{key}:" + ",".join(value) for key, value in out_versions.items()} return out_versions
@functools.lru_cache(maxsize=MAX_MEMORY_CACHE)
def _load_schema_version(xml_version=None, xml_folder=None):
    """ Return the schema for one version string, loading any comma separated extras into it.

    Results are memoized by functools.lru_cache (at most MAX_MEMORY_CACHE entries), so both
    arguments must be hashable.

    Parameters:
        xml_version (str): HED version format string. Expected format: '[schema_namespace:][library_name_]X.Y.Z'
            Further versions can be added comma separated after the version number/library name.
            e.g. "lib:library_x.y.z,otherlibrary_x.y.z" will load "library" and "otherlibrary" into "lib:"
            The schema namespace must be the same and not repeated if loading multiple merged schemas.
        xml_folder (str): Path to a folder containing schema.

    Returns:
        HedSchema: The schema loaded for the first listed version; every further version is loaded
            with that schema passed as the merge target.

    :raises HedFileError:
        - The xml_version is not valid.
        - The specified version cannot be found or loaded
        - Multiple schemas are being loaded with the same prefix, and they have duplicate tags
        - Other fatal errors loading the schema (These are unlikely if you are not editing them locally)
        - The prefix is invalid
    """
    schema_namespace = ""
    # The full, unsplit request is what gets passed on as the schema's name.
    name = xml_version
    if xml_version:
        if ":" in xml_version:
            schema_namespace, _, xml_version = xml_version.partition(":")

    if xml_version:
        xml_versions = xml_version.split(",")
    else:
        # Add a blank entry to generate an error if we have no xml version
        xml_versions = [""]

    first_schema = _load_schema_version_sub(xml_versions[0], schema_namespace, xml_folder=xml_folder, name=name)
    filenames = [os.path.basename(first_schema.filename)]
    for version in xml_versions[1:]:
        # The return value is not used here; first_schema is handed in as the schema to merge into.
        _load_schema_version_sub(version, schema_namespace, xml_folder=xml_folder, schema=first_schema, name=name)

        # Detect duplicate errors when merging schemas in the same namespace
        # NOTE(review): the filename reported below is read from first_schema, not from the result
        # of the call above -- confirm the merge updates first_schema.filename as intended.
        current_filename = os.path.basename(first_schema.filename)
        duplicate_name = first_schema.has_duplicates()
        if duplicate_name:
            issues = first_schema.check_compliance(check_for_warnings=False)
            filename_string = ",".join(filenames)
            msg = (f"A duplicate tag, '{duplicate_name}', was detected in the schema file '{current_filename}'. "
                   f"Previously loaded schemas include: {filename_string}. "
                   f"To resolve this, consider prefixing the final schema during load: "
                   f"custom_prefix:schema_version.")
            raise HedFileError(HedExceptions.SCHEMA_DUPLICATE_NAMES, msg, first_schema.filename, issues)
        filenames.append(current_filename)
    return first_schema


def _load_schema_version_sub(xml_version, schema_namespace="", xml_folder=None, schema=None, name=""):
    """ Locate and load a single schema version, refreshing the cache if it is not found at first.

    Parameters:
        xml_version (str): HED version format string. Expected format: '[library_name_]X.Y.Z'
            Any text before the last underscore is taken as the library name.
        schema_namespace(str): Namespace to add this schema to, default none
        xml_folder (str): Path to a folder containing schema.
        schema(HedSchema or None): A hed schema to merge this new file into
                                   It must be a with-standard schema with the same value.
        name(str): Identifier passed on to load_schema; also used as the filename of the
                   invalid version error.

    Returns:
        HedSchema: The requested HedSchema object.

    :raises HedFileError:
        - The xml_version is not valid.
        - The specified version cannot be found or loaded
        - Other fatal errors loading the schema (These are unlikely if you are not editing them locally)
        - The prefix is invalid
    """
    library_name = None

    if not xml_version:
        out_name = schema_namespace if schema_namespace else "standard"
        raise HedFileError(HedExceptions.SCHEMA_VERSION_INVALID,
                           f"Must specify a schema version by number, found no version on {out_name} schema.",
                           filename=None)

    # Keep the unsplit request for the "not found" message further down.
    save_version = xml_version
    if "_" in xml_version:
        library_name, _, xml_version = xml_version.rpartition("_")

    # A truthy result from validate_version_string is treated as "invalid" here.
    if validate_version_string(xml_version):
        raise HedFileError(HedExceptions.SCHEMA_VERSION_INVALID,
                           f"Must specify a schema version by number, found no version on {xml_version} schema.",
                           filename=name)
    try:
        # 1. Try fully local copy
        final_hed_xml_file = hed_cache.get_hed_version_path(xml_version, library_name, xml_folder)
        if not final_hed_xml_file:
            hed_cache.cache_local_versions(xml_folder)
            # 2. Cache the schemas included in hedtools and try local again
            final_hed_xml_file = hed_cache.get_hed_version_path(xml_version, library_name, xml_folder)
        # If the path is still empty, load_schema raises FILE_NOT_FOUND, which the handler
        # below uses as the signal to refresh the cache.
        hed_schema = load_schema(final_hed_xml_file, schema=schema, name=name)
    except HedFileError as e:
        if e.code == HedExceptions.FILE_NOT_FOUND:
            # Cache all schemas if we haven't recently.
            hed_cache.cache_xml_versions(cache_folder=xml_folder)
            # 3. See if we got a copy from online
            final_hed_xml_file = hed_cache.get_hed_version_path(xml_version, library_name, xml_folder)
            # 4. Finally check for a pre-release one
            if not final_hed_xml_file:
                final_hed_xml_file = hed_cache.get_hed_version_path(xml_version, library_name, xml_folder,
                                                                    check_prerelease=True)
            if not final_hed_xml_file:
                raise HedFileError(HedExceptions.FILE_NOT_FOUND,
                                   f"HED version '{save_version}' not found in cache: {hed_cache.get_cache_directory()}",
                                   filename=xml_folder)
            hed_schema = load_schema(final_hed_xml_file, schema=schema, name=name)
        else:
            # Any other failure is not a lookup problem; pass it on unchanged.
            raise e

    if schema_namespace:
        hed_schema.set_schema_prefix(schema_namespace=schema_namespace)

    return hed_schema