Source code for hed.tools.util.io_util

"""Utilities for generating and handling file names."""

import os
import re
from datetime import datetime
from hed.errors.exceptions import HedFileError

TIME_FORMAT = '%Y_%m_%d_T_%H_%M_%S_%f'


[docs]def check_filename(test_file, name_prefix=None, name_suffix=None, extensions=None):
    """ Return True if correct extension, suffix, and prefix.

    Parameters:
        test_file (str) :           Path of filename to test.
        name_prefix (list, str, None):  An optional name_prefix or list of prefixes to accept for the base filename.
        name_suffix (list, str, None):  An optional name_suffix or list of suffixes to accept for the base file name.
        extensions (list, str, None):   An optional extension or list of extensions to accept for the extensions.

    Returns:
        bool: True if file has the appropriate format.

    Notes:
        - Everything is converted to lower case prior to testing so this test should be case-insensitive.
        - None indicates that all are accepted.


    """

    basename = os.path.basename(test_file.lower())
    if name_prefix and not get_allowed(basename, allowed_values=name_prefix, starts_with=True):
        return False
    if extensions:
        ext = get_allowed(basename, allowed_values=extensions, starts_with=False)
        if not ext:
            return False
        basename = basename[:-len(ext)]
    else:
        basename = os.path.splitext(basename)[0]
    if name_suffix and not get_allowed(basename, allowed_values=name_suffix, starts_with=False):
        return False
    return True


[docs]def get_allowed(value, allowed_values=None, starts_with=True):
    """ Return the portion of the value that matches a value in allowed_values or None if no match.

    Parameters:
        value (str): value to be matched.
        allowed_values (list, str, or None):  Values to match.
        starts_with (bool):  If true match is done at beginning of string, otherwise the end.

    Notes:
        - match is done in lower case.

    """
    if not allowed_values:
        return value
    elif not isinstance(allowed_values, list):
        allowed_values = [allowed_values]
    allowed_values = [item.lower() for item in allowed_values]
    lower_value = value.lower()
    if starts_with:
        result = list(filter(lower_value.startswith, allowed_values))
    else:
        result = list(filter(lower_value.endswith, allowed_values))
    if result:
        result = result[0]
    return result


[docs]def extract_suffix_path(path, prefix_path):
    """ Return the suffix of path after prefix path has been removed.

    Parameters:
        path (str)           path of the root directory.
        prefix_path (str)    sub-path relative to the root directory.

    Returns:
        str:   Suffix path.

    Notes:
        - This function is useful for creating files within BIDS datasets

    """

    real_prefix = os.path.normpath(os.path.realpath(prefix_path).lower())
    suffix_path = os.path.normpath(os.path.realpath(path).lower())
    return_path = os.path.normpath(os.path.realpath(path))
    if suffix_path.startswith(real_prefix):
        return_path = return_path[len(real_prefix):]
    return return_path


[docs]def clean_filename(filename):
    """ Replaces invalid characters with under-bars

    Parameters:
        filename (str):   source filename

    Returns:
        str:  The filename with anything but alphanumeric, period, hyphens, and under-bars removed.
    """
    if not filename:
        return ""
    out_name = re.sub(r'[^a-zA-Z0-9._-]+', '_', filename)
    return out_name


[docs]def get_dir_dictionary(dir_path, name_prefix=None, name_suffix=None, extensions=None, skip_empty=True,
                       exclude_dirs=None):

    """ Create dictionary directory paths keys.

    Parameters:
        dir_path (str):               Full path of the directory tree to be traversed (no ending slash).
        name_prefix (str, None):      An optional name_prefix for the base filename.
        name_suffix (str, None):      An optional name_suffix for the base file name.
        extensions (list, None):      An optional list of file extensions.
        skip_empty (bool):            Do not put entry for directories that have no files.
        exclude_dirs (list):          List of directories to skip

    Returns:
        dict:  Dictionary with directories as keys and file lists values.

    """

    if not exclude_dirs:
        exclude_dirs = []
    dir_dict = {}
    for root, dirs, files in os.walk(dir_path, topdown=True):
        dirs[:] = [d for d in dirs if d not in exclude_dirs]
        file_list = []
        for r_file in files:
            if check_filename(r_file, name_prefix, name_suffix, extensions):
                file_list.append(os.path.join(os.path.realpath(root), r_file))
        if skip_empty and not file_list:
            continue
        dir_dict[os.path.realpath(root)] = file_list
    return dir_dict


[docs]def get_filtered_by_element(file_list, elements):
    """ Filter a file list by whether the base names have a substring matching any of the members of elements.

    Parameters:
        file_list (list):  List of file paths to be filtered.
        elements (list):  List of strings to use as filename filters.

    Returns:
        list:  The list only containing file paths whose filenames match a filter.

    """
    new_list = [file for file in file_list if any(substring in os.path.basename(file) for substring in elements)]
    return new_list


[docs]def get_filtered_list(file_list, name_prefix=None, name_suffix=None, extensions=None):
    """ Get list of filenames satisfying the criteria.

    Everything is converted to lower case prior to testing so this test should be case-insensitive.

    Parameters:
        file_list (list):      List of files to test.
        name_prefix (str):     Optional name_prefix for the base filename.
        name_suffix (str):     Optional name_suffix for the base filename.
        extensions (list):     Optional list of file extensions (allows two periods (.tsv.gz))

     Returns:
         list:  The filtered file names.

     """
    filtered_files = [file for file in file_list if
                      check_filename(file, name_prefix=name_prefix, name_suffix=name_suffix, extensions=extensions)]
    return filtered_files


[docs]def get_file_list(root_path, name_prefix=None, name_suffix=None, extensions=None, exclude_dirs=None):
    """ Return paths satisfying various conditions.

    Parameters:
        root_path (str):              Full path of the directory tree to be traversed (no ending slash).
        name_prefix (str, None):      An optional name_prefix for the base filename.
        name_suffix (str, None):      The name_suffix of the paths to be extracted.
        extensions (list, None):      A list of extensions to be selected.
        exclude_dirs (list, None):    A list of paths to be excluded.

    Returns:
        list:   The full paths.
    """
    file_list = []
    if not exclude_dirs:
        exclude_dirs = []
    for root, dirs, files in os.walk(root_path, topdown=True):
        dirs[:] = [d for d in dirs if d not in exclude_dirs]
        for r_file in files:
            if check_filename(r_file, name_prefix, name_suffix, extensions):
                file_list.append(os.path.realpath(os.path.join(root, r_file)))
    return file_list


[docs]def get_path_components(root_path, this_path):
    """ Get a list of the remaining components after root path.

    Parameters:
        root_path (str):      A path (no trailing separator)
        this_path (str):      The path of a file or directory descendant of root_path

    Returns:
        list or None:   A list with the remaining elements directory components to the file.

    Notes: this_path must be a descendant of root_path.

    """

    base_path = os.path.normpath(os.path.realpath(root_path))
    cur_path = os.path.normpath(os.path.realpath(this_path))
    common_prefix = os.path.commonprefix([base_path, cur_path])
    if not common_prefix:
        raise ValueError("NoPathInCommon", f"Paths {base_path} and {cur_path} must have items in common")
    common_path = os.path.commonpath([base_path, cur_path])
    if common_path != base_path:
        return None
    rel_path = os.path.relpath(cur_path, base_path)
    the_dir = os.path.dirname(rel_path)
    if the_dir:
        return os.path.normpath(the_dir).split(os.sep)
    else:
        return []


[docs]def get_timestamp():
    now = datetime.now()
    return now.strftime(TIME_FORMAT)[:-3]


[docs]def make_path(root_path, sub_path, filename):
    """ Get path for a file, verifying all components exist.

    Parameters:
        root_path (str):   path of the root directory.
        sub_path (str):    sub-path relative to the root directory.
        filename (str):    filename of the file.

    Returns:
        str: A valid realpath for the specified file.

    Notes: This function is useful for creating files within BIDS datasets

    """

    dir_path = os.path.realpath(os.path.join(root_path, sub_path))
    os.makedirs(dir_path, exist_ok=True)
    return os.path.realpath(os.path.join(dir_path, filename))


[docs]def parse_bids_filename(file_path):
    """ Split a filename into BIDS-relevant components.

    Parameters:
        file_path (str):     Path to be parsed.

    Returns:
        str:   BIDS suffix name.
        str:   File extension (including the .).
        dict:  Dictionary with key-value pair being (entity type, entity value).

    :raises HedFileError:
        - If filename does not conform to name-value_suffix format.

    Notes:  
        - splits into BIDS suffix, extension, and a dictionary of entity name-value pairs.

    """

    filename = os.path.splitext(os.path.basename(file_path))
    ext = filename[1].lower()
    basename = filename[0].strip()
    entity_dict = {}

    if len(basename) == 0:
        raise HedFileError("BlankFileName", f"The basename for {file_path} is blank", "")
    entity_pieces = basename.split('_')
    split_dict = _split_entity(entity_pieces[-1])
    if "bad" in split_dict:
        raise HedFileError("BadSuffixPiece",
                           f"The basename for {entity_pieces[-1]} has bad {split_dict['bad']}", "")
    if "suffix" in split_dict:
        suffix = split_dict["suffix"]
    else:
        suffix = None
        entity_dict[split_dict["key"]] = split_dict["value"]
    for pos, entity in reversed(list(enumerate(entity_pieces[:-1]))):
        split_dict = _split_entity(entity)
        if "key" not in split_dict:
            raise HedFileError("BadKeyValue", f"The piece {entity} is not in key-value form", "")
        entity_dict[split_dict["key"]] = split_dict["value"]
    return suffix, ext, entity_dict


def _split_entity(piece):
    """Splits a piece into an entity or suffix.

    Parameters:
        piece (str):   A string to be parsed.

    Returns:
        dict:  with entities as keys as well as the key "bad" and the key "suffix".

    """
    piece = piece.strip()
    if not piece:
        return {"bad": ""}
    split_piece = piece.split('-')
    if len(split_piece) == 1:
        return {"suffix": piece}
    if len(split_piece) == 2:
        return {"key": split_piece[0].strip(), "value": split_piece[1].strip()}
    else:
        return {"bad": piece}


[docs]def get_task_from_file(file_path):
    filename = os.path.splitext(os.path.basename(file_path))
    basename = filename[0].strip()
    position = basename.lower().find("task-")
    if position == -1:
        return ""
    splits = re.split(r'[_.]', basename[position+5:])
    return splits[0]