"""Utilities for generating and handling file names."""
import os
import re
from datetime import datetime
from hed.errors.exceptions import HedFileError
# strftime pattern used by get_timestamp: year, month, day, a literal 'T', hour, minute, second,
# and microsecond fields, all separated by under-bars.
TIME_FORMAT = '%Y_%m_%d_T_%H_%M_%S_%f'
def check_filename(test_file, name_prefix=None, name_suffix=None, extensions=None):
    """ Return True if correct extension, suffix, and prefix.

    Parameters:
        test_file (str):  Path of filename to test.
        name_prefix (list, str, None):  An optional name_prefix or list of prefixes to accept for the base filename.
        name_suffix (list, str, None):  An optional name_suffix or list of suffixes to accept for the base file name.
        extensions (list, str, None):  An optional extension or list of extensions to accept for the extensions.

    Returns:
        bool: True if file has the appropriate format.

    Notes:
        - Everything is converted to lower case prior to testing so this test should be case-insensitive.
        - None indicates that all are accepted.
        - When several extensions match, the longest one is stripped before the suffix is tested, so a
          compound extension such as '.tsv.gz' takes precedence over '.gz' regardless of list order.

    """
    basename = os.path.basename(test_file.lower())
    if name_prefix and not get_allowed(basename, allowed_values=name_prefix, starts_with=True):
        return False
    if extensions:
        # get_allowed returns the first candidate that matches, so offer the longest candidates first;
        # otherwise '.gz' listed before '.tsv.gz' would leave '.tsv' glued to the name and break the suffix test.
        if isinstance(extensions, str):
            candidates = [extensions]
        else:
            candidates = sorted(extensions, key=len, reverse=True)
        ext = get_allowed(basename, allowed_values=candidates, starts_with=False)
        if not ext:
            return False
        basename = basename[:-len(ext)]
    else:
        # No extension filter: drop only the final extension before testing the suffix.
        basename = os.path.splitext(basename)[0]
    if name_suffix and not get_allowed(basename, allowed_values=name_suffix, starts_with=False):
        return False
    return True
def get_allowed(value, allowed_values=None, starts_with=True):
    """ Return the portion of the value that matches a value in allowed_values or None if no match.

    Parameters:
        value (str): value to be matched.
        allowed_values (list, str, or None):  Values to match. Any other iterable of strings
            (for example a tuple or set) is also accepted.
        starts_with (bool):  If true match is done at beginning of string, otherwise the end.

    Returns:
        str or None: The unchanged value if allowed_values is empty or None; otherwise the lower-cased
            first entry of allowed_values that matches; None if nothing matches.

    Notes:
        - match is done in lower case.

    """
    if not allowed_values:
        return value
    # A bare string is a single candidate; anything else is treated as an iterable of candidates.
    if isinstance(allowed_values, str):
        allowed_values = [allowed_values]
    lower_value = value.lower()
    is_match = lower_value.startswith if starts_with else lower_value.endswith
    for item in allowed_values:
        candidate = item.lower()
        if is_match(candidate):
            return candidate
    return None
def clean_filename(filename):
    """ Replace each run of disallowed characters in a filename with a single under-bar.

    Parameters:
        filename (str): The filename to sanitize.

    Returns:
        str: The filename in which every run of characters other than ASCII letters, digits, periods,
             hyphens, and under-bars has been replaced by one under-bar. An empty string is returned
             when filename is empty or None.

    """
    return re.sub(r'[^a-zA-Z0-9._-]+', '_', filename) if filename else ""
def get_dir_dictionary(dir_path, name_prefix=None, name_suffix=None, extensions=None, skip_empty=True,
                       exclude_dirs=None):
    """ Map each directory in a tree to the list of its files that pass the filename filters.

    Parameters:
        dir_path (str):               Full path of the directory tree to be traversed (no ending slash).
        name_prefix (str, None):      An optional name_prefix for the base filename.
        name_suffix (str, None):      An optional name_suffix for the base file name.
        extensions (list, None):      An optional list of file extensions.
        skip_empty (bool):            If True, directories with no accepted files get no entry.
        exclude_dirs (list):          Directory names that are not descended into.

    Returns:
        dict:  Keys are the real paths of directories; values are lists of full paths of accepted files.

    """
    skipped = exclude_dirs or []
    listing = {}
    for current, subdirs, filenames in os.walk(dir_path, topdown=True):
        # Pruning in place (top-down walk) keeps os.walk from entering excluded directories.
        subdirs[:] = [name for name in subdirs if name not in skipped]
        real_dir = os.path.realpath(current)
        accepted = [os.path.join(real_dir, name) for name in filenames
                    if check_filename(name, name_prefix, name_suffix, extensions)]
        if accepted or not skip_empty:
            listing[real_dir] = accepted
    return listing
def get_filtered_by_element(file_list, elements):
    """ Keep the paths whose base name contains at least one of the given substrings.

    Parameters:
        file_list (list): List of file paths to be filtered.
        elements (list):  List of strings, any of which may appear in the base name (case-sensitive).

    Returns:
        list: The paths from file_list, in their original order, whose base names contain a member of elements.

    """
    selected = []
    for path in file_list:
        for item in elements:
            if item in os.path.basename(path):
                selected.append(path)
                break
    return selected
def get_filtered_list(file_list, name_prefix=None, name_suffix=None, extensions=None):
    """ Return the entries of a file list that pass the prefix, suffix, and extension filters.

    The filtering is delegated to check_filename, which compares in lower case.

    Parameters:
        file_list (list):      List of files to test.
        name_prefix (str):     Optional name_prefix for the base filename.
        name_suffix (str):     Optional name_suffix for the base filename.
        extensions (list):     Optional list of file extensions (allows two periods (.tsv.gz))

    Returns:
        list:  The entries of file_list, in their original order, that satisfy the criteria.

    """
    accepted = []
    for candidate in file_list:
        if check_filename(candidate, name_prefix=name_prefix, name_suffix=name_suffix, extensions=extensions):
            accepted.append(candidate)
    return accepted
def get_file_list(root_path, name_prefix=None, name_suffix=None, extensions=None, exclude_dirs=None):
    """ Walk a directory tree and collect the real paths of files that pass the filename filters.

    Parameters:
        root_path (str):              Full path of the directory tree to be traversed (no ending slash).
        name_prefix (str, None):      An optional name_prefix for the base filename.
        name_suffix (str, None):      The name_suffix of the paths to be extracted.
        extensions (list, None):      A list of extensions to be selected.
        exclude_dirs (list, None):    Directory names that are not descended into.

    Returns:
        list:   The real paths of the accepted files, in the order os.walk visits them.

    """
    skipped = exclude_dirs if exclude_dirs else []
    found = []
    for current, subdirs, filenames in os.walk(root_path, topdown=True):
        # Pruning in place (top-down walk) keeps os.walk from entering excluded directories.
        subdirs[:] = [name for name in subdirs if name not in skipped]
        found.extend(os.path.realpath(os.path.join(current, name)) for name in filenames
                     if check_filename(name, name_prefix, name_suffix, extensions))
    return found
def get_path_components(root_path, this_path):
    """ List the directory components that lie between a root path and a path beneath it.

    Parameters:
        root_path (str):      A path (no trailing separator)
        this_path (str):      The path of a file or directory descendant of root_path

    Returns:
        list or None:   The components of the relative path from root_path to this_path, excluding the
                        final component. An empty list if there are none; None if this_path is not
                        located under root_path.

    :raises ValueError:
        - If the two resolved paths share no leading characters at all.

    Notes: Both paths are resolved with realpath and normalized before they are compared.

    """
    base_path = os.path.normpath(os.path.realpath(root_path))
    cur_path = os.path.normpath(os.path.realpath(this_path))
    if not os.path.commonprefix([base_path, cur_path]):
        raise ValueError("NoPathInCommon", f"Paths {base_path} and {cur_path} must have items in common")
    if os.path.commonpath([base_path, cur_path]) != base_path:
        return None
    # Drop the last component (the file or final directory itself) and keep what precedes it.
    parent = os.path.dirname(os.path.relpath(cur_path, base_path))
    return os.path.normpath(parent).split(os.sep) if parent else []
def get_timestamp():
    """ Return the current time formatted with TIME_FORMAT, truncated to milliseconds.

    Returns:
        str:  The value of datetime.now() formatted with TIME_FORMAT, with the last three digits
              of the microsecond field removed.

    """
    stamp = datetime.now().strftime(TIME_FORMAT)
    return stamp[:-3]
def make_path(root_path, sub_path, filename):
    """ Build the real path of a file under root_path/sub_path, creating the directories if needed.

    Parameters:
        root_path (str):   path of the root directory.
        sub_path (str):    sub-path relative to the root directory.
        filename (str):    filename of the file.

    Returns:
        str: The real path of filename inside the (now existing) directory.

    Notes: Only the directory is created; the file itself is not. The original docstring notes that
           this function is useful for creating files within BIDS datasets.

    """
    target_dir = os.path.realpath(os.path.join(root_path, sub_path))
    os.makedirs(target_dir, exist_ok=True)
    full_path = os.path.join(target_dir, filename)
    return os.path.realpath(full_path)
def parse_bids_filename(file_path):
    """ Break a file name into its suffix, extension, and entity name-value pairs.

    Parameters:
        file_path (str):     Path to be parsed.

    Returns:
        str:   BIDS suffix name (None if the last under-bar separated piece is a name-value pair).
        str:   File extension (including the .), in lower case.
        dict:  Dictionary with key-value pair being (entity type, entity value).

    :raises HedFileError:
        - If the base name is blank.
        - If the last piece is blank or has more than one hyphen.
        - If any earlier piece is not in name-value form.

    Notes:
        - Only the final extension (as determined by os.path.splitext) is separated from the name.
        - Pieces before the last are processed from right to left, so when an entity name repeats,
          the left-most occurrence provides the value.

    """
    stem, raw_ext = os.path.splitext(os.path.basename(file_path))
    ext = raw_ext.lower()
    basename = stem.strip()
    if not basename:
        raise HedFileError("BlankFileName", f"The basename for {file_path} is blank", "")

    entity_pieces = basename.split('_')
    entity_dict = {}

    # The final piece is either the suffix or, when it contains one hyphen, one more entity.
    split_dict = _split_entity(entity_pieces[-1])
    if "bad" in split_dict:
        raise HedFileError("BadSuffixPiece",
                           f"The basename for {entity_pieces[-1]} has bad {split_dict['bad']}", "")
    if "suffix" not in split_dict:
        suffix = None
        entity_dict[split_dict["key"]] = split_dict["value"]
    else:
        suffix = split_dict["suffix"]

    # Every remaining piece must be a name-value pair.
    for entity in reversed(entity_pieces[:-1]):
        split_dict = _split_entity(entity)
        if "key" not in split_dict:
            raise HedFileError("BadKeyValue", f"The piece {entity} is not in key-value form", "")
        entity_dict[split_dict["key"]] = split_dict["value"]
    return suffix, ext, entity_dict
def _split_entity(piece):
    """ Classify one under-bar separated piece of a file name.

    Parameters:
        piece (str): A string to be parsed.

    Returns:
        dict: One of the following:
            - {"bad": ""} if the piece is blank after stripping whitespace.
            - {"suffix": piece} if the stripped piece contains no hyphen.
            - {"key": name, "value": value} if it contains exactly one hyphen (both parts stripped).
            - {"bad": piece} if it contains more than one hyphen.

    """
    text = piece.strip()
    if not text:
        return {"bad": ""}
    parts = text.split('-')
    if len(parts) > 2:
        return {"bad": text}
    if len(parts) == 2:
        name, value = parts
        return {"key": name.strip(), "value": value.strip()}
    return {"suffix": text}
def get_task_from_file(file_path):
    """ Return the value of the task entity in a file name.

    Parameters:
        file_path (str):  Path of the file whose base name is examined.

    Returns:
        str:  The text following the first 'task-' key (matched case-insensitively) up to the next
              under-bar or period, or an empty string if the base name has no task entity.

    Notes:
        - The key must start the base name or follow a non-alphanumeric character, so a longer key
          such as 'subtask-' is not mistaken for the task entity.
        - Only the final extension (as determined by os.path.splitext) is removed before matching.

    """
    basename = os.path.splitext(os.path.basename(file_path))[0].strip()
    # Matching on the original string (rather than indexing it with a position found in a lower-cased
    # copy) keeps the captured value aligned with the name even when lower-casing changes its length.
    match = re.search(r'(?<![a-z0-9])task-([^_.]*)', basename, flags=re.IGNORECASE)
    return match.group(1) if match else ""