# Source code for hed.schema.schema_io.df_util

import csv
import os

import pandas as pd

from hed.errors import HedFileError, HedExceptions
from hed.schema import hed_schema_df_constants as constants
from hed.schema.hed_schema_constants import HedKey
from hed.schema.hed_cache import get_library_data
from hed.schema.schema_io.text_util import parse_attribute_string, _parse_header_attributes_line

UNKNOWN_LIBRARY_VALUE = 0


def save_dataframes(base_filename, dataframe_dict):
    """ Write out the dataframes using the provided suffixes.

        Does not validate contents or suffixes.

        If base_filename has a .tsv suffix, save directly to the indicated location.
        If base_filename is a directory (does NOT have a .tsv suffix), save the contents
        into a directory of that name, with the subfiles named the same,
        e.g. HED8.3.0/HED8.3.0_Tag.tsv.

    Parameters:
        base_filename (str): The base filename to use.  Output is {base_filename}_{suffix}.tsv.
            See DF_SUFFIXES for all expected names.
        dataframe_dict (dict of str: pd.DataFrame): The dataframes to save out.  No validation is done.
    """
    if base_filename.lower().endswith(".tsv"):
        base, base_ext = os.path.splitext(base_filename)
        base_dir, base_name = os.path.split(base)
    else:
        # Assumed to be a directory name
        base_dir = base_filename
        base_filename = os.path.split(base_dir)[1]
        base = os.path.join(base_dir, base_filename)
    os.makedirs(base_dir, exist_ok=True)
    for suffix, dataframe in dataframe_dict.items():
        filename = f"{base}_{suffix}.tsv"
        with open(filename, mode='w', encoding='utf-8') as opened_file:
            dataframe.to_csv(opened_file, sep='\t', index=False, header=True,
                             quoting=csv.QUOTE_NONE, lineterminator="\n")
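

# Editor's sketch of typical usage, not part of the module.  The "Tag" suffix and the
# column names below are illustrative assumptions; the real suffixes and columns come
# from hed.schema.hed_schema_df_constants.
def _demo_save_dataframes():
    tag_df = pd.DataFrame({"hedId": ["HED_0000102"], "Name": ["Event"]}, dtype=str)
    # A .tsv base name: one file is written per suffix, next to the base path.
    save_dataframes("HED8.3.0/HED8.3.0.tsv", {"Tag": tag_df})
    # -> HED8.3.0/HED8.3.0_Tag.tsv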


def convert_filenames_to_dict(filenames, include_prefix_dfs=False):
    """ Infer filename meaning based on suffix, e.g. _Tag for the tags sheet.

    Parameters:
        filenames (str or None or list or dict): The filenames to convert to a suffix: filename dict.
            If a string with a .tsv suffix: use it as the base path, expanding to one {base}_{suffix}.tsv per suffix.
            If a string with no .tsv suffix: treat it as a folder containing the separate .tsv files.
        include_prefix_dfs (bool): If True, include the prefixes and external annotation dataframes.

    Returns:
        filename_dict (str: str): The required suffix to filename mapping.
    """
    result_filenames = {}
    dataframe_names = constants.DF_SUFFIXES_OMN if include_prefix_dfs else constants.DF_SUFFIXES
    if isinstance(filenames, str):
        if filenames.endswith(".tsv"):
            base, base_ext = os.path.splitext(filenames)
        else:
            # Load as foldername/foldername_suffix.tsv
            base_dir = filenames
            base_filename = os.path.split(base_dir)[1]
            base = os.path.join(base_dir, base_filename)
        for suffix in dataframe_names:
            filename = f"{base}_{suffix}.tsv"
            result_filenames[suffix] = filename
        filenames = result_filenames
    elif isinstance(filenames, list):
        for filename in filenames:
            # Normalize separators, then split off the final suffix chunk, e.g. "..._Tag.tsv" -> "Tag.tsv".
            # maxsplit=1 keeps names containing extra underscores from breaking the unpack.
            remainder, suffix = filename.replace("_", "-").rsplit("-", 1)
            for needed_suffix in dataframe_names:
                if needed_suffix in suffix:
                    result_filenames[needed_suffix] = filename
        filenames = result_filenames

    return filenames
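

# Editor's sketch: a directory-style input expands to one path per expected suffix
# ("Tag" is shown as an assumed member of DF_SUFFIXES; separators shown for POSIX).
def _demo_convert_filenames_to_dict():
    paths = convert_filenames_to_dict("schemas/HED8.3.0")
    # e.g. paths["Tag"] == "schemas/HED8.3.0/HED8.3.0_Tag.tsv"
    return paths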


def create_empty_dataframes():
    """ Return the default empty dataframes. """
    base_dfs = {constants.STRUCT_KEY: pd.DataFrame(columns=constants.struct_columns, dtype=str),
                constants.TAG_KEY: pd.DataFrame(columns=constants.tag_columns, dtype=str),
                constants.UNIT_KEY: pd.DataFrame(columns=constants.unit_columns, dtype=str),
                constants.UNIT_CLASS_KEY: pd.DataFrame(columns=constants.other_columns, dtype=str),
                constants.UNIT_MODIFIER_KEY: pd.DataFrame(columns=constants.other_columns, dtype=str),
                constants.VALUE_CLASS_KEY: pd.DataFrame(columns=constants.other_columns, dtype=str),
                constants.ANNOTATION_KEY: pd.DataFrame(columns=constants.property_columns, dtype=str),
                constants.DATA_KEY: pd.DataFrame(columns=constants.property_columns, dtype=str),
                constants.OBJECT_KEY: pd.DataFrame(columns=constants.property_columns, dtype=str),
                constants.ATTRIBUTE_PROPERTY_KEY: pd.DataFrame(columns=constants.property_columns_reduced, dtype=str),
                }
    return base_dfs


def load_dataframes(filenames, include_prefix_dfs=False):
    """ Load the dataframes from the source folder or series of files.

    Parameters:
        filenames (str or None or list or dict): The input filenames.
            If a string with a .tsv suffix: load from that location, adding the suffix for each .tsv file.
            If a string with no .tsv suffix: load from that folder, with the contents being the separate .tsv files.
        include_prefix_dfs (bool): If True, include the prefixes and external annotation dataframes.

    Returns:
        dataframes_dict (str: dataframes): The suffix: dataframe dict.
    """
    dict_filenames = convert_filenames_to_dict(filenames, include_prefix_dfs=include_prefix_dfs)
    dataframes = create_empty_dataframes()
    for key, filename in dict_filenames.items():
        try:
            loaded_dataframe = pd.read_csv(filename, sep="\t", dtype=str, na_filter=False)
            if key in dataframes:
                columns_not_in_loaded = dataframes[key].columns[~dataframes[key].columns.isin(loaded_dataframe.columns)]
                if columns_not_in_loaded.any():
                    raise HedFileError(HedExceptions.SCHEMA_LOAD_FAILED,
                                       f"Required column(s) {list(columns_not_in_loaded)} missing from {filename}.  "
                                       f"The required columns are {list(dataframes[key].columns)}", filename=filename)
            dataframes[key] = loaded_dataframe
        except OSError:
            # todo: consider if we want to report this error (we probably do)
            pass  # We will use a blank one for this
    return dataframes
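

# Editor's sketch of a round trip: load whatever .tsv files exist under a schema
# directory; suffixes with no file on disk keep their empty default dataframes.
def _demo_load_dataframes():
    dataframes = load_dataframes("schemas/HED8.3.0")
    tag_df = dataframes[constants.TAG_KEY]  # empty unless ..._Tag.tsv was found
    return tag_df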


def get_library_name_and_id(schema):
    """ Get the library name ("Standard" for the standard schema) and the first id for a schema range.

    Parameters:
        schema (HedSchema): The schema to check.

    Returns:
        library_name (str): The capitalized library name.
        first_id (int): The first id for a given library.
    """
    name = schema.library
    library_data = get_library_data(name)
    starting_id, _ = library_data.get("id_range", (UNKNOWN_LIBRARY_VALUE, UNKNOWN_LIBRARY_VALUE))
    if not name:
        name = "standard"
    return name.capitalize(), starting_id
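

# Editor's sketch.  load_schema_version is the standard hed.schema loader; the id
# returned depends on the library's registered id_range, so no value is assumed here.
def _demo_get_library_name_and_id():
    from hed.schema import load_schema_version
    schema = load_schema_version("8.3.0")
    name, first_id = get_library_name_and_id(schema)
    # name == "Standard" for the standard schema
    return name, first_id

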
# todo: Replace this once we no longer support < python 3.9
def remove_prefix(text, prefix):
    """ Remove prefix from text if present. """
    if text and text.startswith(prefix):
        return text[len(prefix):]
    return text
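

# Editor's note: on Python 3.9+ this duplicates the built-in str.removeprefix, e.g.
#     remove_prefix("HedKey.TagRange", "HedKey.")  -> "TagRange"
#     "HedKey.TagRange".removeprefix("HedKey.")    -> "TagRange"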


def calculate_attribute_type(attribute_entry):
    """ Return the type of this attribute (annotation, object, or data).

    Returns:
        attribute_type (str): "annotation", "object", or "data".
    """
    attributes = attribute_entry.attributes
    object_ranges = {HedKey.TagRange, HedKey.UnitRange, HedKey.UnitClassRange, HedKey.ValueClassRange}
    if HedKey.AnnotationProperty in attributes:
        return "annotation"
    elif any(attribute in object_ranges for attribute in attributes):
        return "object"
    return "data"
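

# Editor's sketch: any object with an `attributes` mapping satisfies this function, so
# a SimpleNamespace stands in for a real schema attribute entry here.
def _demo_calculate_attribute_type():
    from types import SimpleNamespace
    entry = SimpleNamespace(attributes={HedKey.AnnotationProperty: True})
    return calculate_attribute_type(entry)  # -> "annotation"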


def get_attributes_from_row(row):
    """ Get the tag attributes from a line.

    Parameters:
        row (pd.Series): A tag line.

    Returns:
        dict: Dictionary of attributes.
    """
    if constants.properties in row.index:
        attr_string = row[constants.properties]
    elif constants.attributes in row.index:
        attr_string = row[constants.attributes]
    else:
        attr_string = ""

    if constants.subclass_of in row.index and row[constants.subclass_of] == "HedHeader":
        header_attributes, _ = _parse_header_attributes_line(attr_string)
        return header_attributes
    return parse_attribute_string(attr_string)
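

# Editor's sketch: build a row the way a loaded .tsv would present it.  The attribute
# string "takesValue" and the boolean-style result are assumptions about what
# parse_attribute_string accepts and returns.
def _demo_get_attributes_from_row():
    row = pd.Series({constants.attributes: "takesValue"})
    return get_attributes_from_row(row)  # e.g. {"takesValue": True}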