Source code for hed.models.column_metadata

from enum import Enum
from hed.errors.error_types import SidecarErrors
import pandas as pd
import copy


class ColumnType(Enum):
    """How the column mapper should treat a single column (e.g. as HED tags).

    Mostly used internally by column-mapper related code.
    """

    Unknown = None

    # The column is not returned at all.
    Ignore = "ignore"

    # The column holds categories; each possible value is replaced with a hed string.
    Categorical = "categorical"

    # The column holds a value (e.g. a filename) that is substituted for the
    # "#" sign inside a hed tag.
    Value = "value"

    # The column already contains HED tags and is returned exactly as given.
    HEDTags = "hed_tags"
class ColumnMetadata:
    """ Column in a ColumnMapper.

    The column is backed by ``_source``, which is either an entire loaded json
    sidecar (a dict indexed by column name), a single HED string, or None.
    """

    def __init__(self, column_type=None, name=None, source=None):
        """ A single column entry in the column mapper.

        Parameters:
            column_type (ColumnType or None): How to treat this column when reading data.
                If None, the type is detected from the source.
            name (str, int, or None): The column_name or column number identifying this column.
                If name is a string, you'll need to use a column map to set the number later.
            source (dict or str or None): Either the entire loaded json sidecar or a single HED string.

        """
        self.column_name = name
        self._source = source
        if column_type is None:
            column_type = self._detect_column_type(self.source_dict)
        self.column_type = column_type

    @property
    def hed_dict(self):
        """ The hed strings for any given entry.

        Returns:
            dict or str: A string or dict of strings for this column.

        """
        if self._source is None or isinstance(self._source, str):
            return self._source
        return self._source[self.column_name].get("HED", {})

    @property
    def source_dict(self):
        """ The raw dict for this entry (if it exists).

        Returns:
            dict: The entry for this column. A string or None source is wrapped as {"HED": source}.

        """
        if self._source is None or isinstance(self._source, str):
            return {"HED": self._source}
        return self._source[self.column_name]

    def get_hed_strings(self):
        """ Return the hed strings for this entry as a series.

        Returns:
            pd.Series: The hed strings for this column (potentially empty).

        """
        if not self.column_type:
            return pd.Series(dtype=str)

        return pd.Series(self.hed_dict, dtype=str)

    def set_hed_strings(self, new_strings):
        """ Set the hed strings for this entry.

        Parameters:
            new_strings (pd.Series, dict, or str): The hed strings to set.
                This should generally be the return value from get_hed_strings.

        Returns:
            bool: True if the strings were stored, False if nothing was changed
                (no data given, no column type, or an empty series for a non-categorical column).

        """
        if new_strings is None:
            return False

        if not self.column_type:
            return False

        if isinstance(new_strings, pd.Series):
            if self.column_type == ColumnType.Categorical:
                new_strings = new_strings.to_dict()
            elif new_strings.empty:
                return False
            else:
                new_strings = new_strings.iloc[0]

        if self._source is None or isinstance(self._source, str):
            # There is no sidecar dict to write into.  Indexing a str/None source
            # would raise TypeError, so replace the source instead, in a shape that
            # hed_dict and source_dict know how to read back.
            if isinstance(new_strings, str):
                self._source = new_strings
            else:
                self._source = {self.column_name: {"HED": new_strings}}
            return True

        self._source[self.column_name]["HED"] = new_strings
        return True

    @staticmethod
    def _detect_column_type(dict_for_entry, basic_validation=True):
        """ Determine the ColumnType of a given json entry.

        Parameters:
            dict_for_entry (dict): The loaded json entry a specific column.
                Generally has a "HED" entry among other optional ones.
            basic_validation (bool): If False, does not verify past "HED" exists and the type.
                This is used to issue more precise errors that are normally just silently ignored,
                but also not crash.

        Returns:
            ColumnType: The determined type of given column.  Returns None if unknown.

        """
        if not dict_for_entry or not isinstance(dict_for_entry, dict):
            return ColumnType.Ignore

        minimum_required_keys = ("HED",)
        if not set(minimum_required_keys).issubset(dict_for_entry.keys()):
            return ColumnType.Ignore

        hed_entry = dict_for_entry["HED"]
        if isinstance(hed_entry, dict):
            # A dict of category -> hed string; every value must be a string.
            if basic_validation and not all(isinstance(entry, str) for entry in hed_entry.values()):
                return None
            return ColumnType.Categorical

        if not isinstance(hed_entry, str):
            return None

        # A value column needs a "#" placeholder to substitute the value into.
        if basic_validation and "#" not in hed_entry:
            return None

        return ColumnType.Value

    @staticmethod
    def expected_pound_sign_count(column_type):
        """ Return how many pound signs a column string should have.

        Parameters:
            column_type (ColumnType): The type of the column.

        Returns:
            tuple:
                expected_count (int): The expected count. 0 or 1.
                error_type (str or None): The type of the error we should issue,
                    or None for column types other than Value, HEDTags and Categorical.

        """
        if column_type == ColumnType.Value:
            expected_count = 1
            error_type = SidecarErrors.INVALID_POUND_SIGNS_VALUE
        elif column_type == ColumnType.HEDTags or column_type == ColumnType.Categorical:
            expected_count = 0
            error_type = SidecarErrors.INVALID_POUND_SIGNS_CATEGORY
        else:
            return 0, None
        return expected_count, error_type

    def _get_unvalidated_data(self):
        """ Return a deep copy with less preliminary validation done (such as verifying all data types).

        Returns:
            ColumnMetadata: A copy whose column_type was re-detected with basic_validation=False.

        """
        return_copy = copy.deepcopy(self)
        return_copy.column_type = ColumnMetadata._detect_column_type(dict_for_entry=return_copy.source_dict,
                                                                     basic_validation=False)
        return return_copy