# Source code for hed.models.basic_search

"""
Utilities to support HED searches based on strings.
"""
import re
from itertools import combinations, product
from collections import defaultdict
import pandas as pd


def find_matching(series, search_string, regex=False):
    """ Find lines in the series that match the search string and return a mask.

    Syntax Rules:
        - '@': Prefixing a term in the search string means the term must appear anywhere within a line.
        - '~': Prefixing a term in the search string means the term must NOT appear within a line.
        - Parentheses: Elements within parentheses must appear in the line with the same level of nesting.
                e.g.: Search string: "(A), (B)" will match "(A), (B, C)", but not "(A, B)", since they don't
                      start in the same group.
        - "LongFormTag*": A * will match any remaining word (anything but a comma or parenthesis).
        - An individual term can be arbitrary regex, but it is limited to single continuous words.

    Notes:
        - Specific words only care about their level relative to other specific words, not overall.
              e.g. "(A, B)" will find: "A, B", "(A, B)", "(A, (C), B)", or "((A, B))"
        - If you have no grouping or anywhere words in the search, it assumes all terms are anywhere words.
        - The format of the series should match the format of the search string, whether it's in short or long form.
        - To enable support for matching parent tags, ensure that both the series and search string are in long form.

    Parameters:
        series (pd.Series): A Pandas Series object containing the lines to be searched.
        search_string (str): The string to search for in each line of the series.
        regex (bool): By default (False), translate any * wildcard characters to .*? regex.
                      If True, do no translation and pass the words as is. Due to how it's set up, you must not
                      include the following characters: (),

    Returns:
        mask (pd.Series): A Boolean mask Series of the same length as the input series.
                          The mask has `True` for lines that match the search string and `False` otherwise.
    """
    if not regex:
        # Replace *'s with a reasonable value for people who don't know regex.
        # A '*' that directly follows a '.' is left untouched.
        search_string = re.sub(r'(?<!\.)\*', '.*?', search_string)
    anywhere_words, negative_words, specific_words = find_words(search_string)
    # If we have no nesting or anywhere words, assume they don't care about level
    if "(" not in search_string and "@" not in search_string:
        anywhere_words += specific_words
        specific_words = []
    delimiter_map = construct_delimiter_map(search_string, specific_words)
    candidate_indexes = _verify_basic_words(series, anywhere_words, negative_words)

    # Cheap prefilter for all specific words (this doesn't verify word delimiters).
    for word in specific_words:
        if not candidate_indexes:
            break
        # na=False: a missing value can never contain the word; without it the
        # result holds NaN and cannot be used as a boolean mask below.
        matches = series.str.contains(word, regex=True, na=False)
        candidate_indexes &= set(matches[matches].index.tolist())

    full_mask = pd.Series(False, index=series.index, dtype=bool)
    if not candidate_indexes:
        return full_mask

    candidate_indexes = sorted(candidate_indexes)
    if specific_words:
        # Only the surviving candidates need the (expensive) per-line delimiter check.
        candidate_series = series[candidate_indexes]
        mask = candidate_series.apply(verify_search_delimiters, args=(specific_words, delimiter_map))
        full_mask.loc[candidate_indexes] = mask
    else:
        full_mask.loc[candidate_indexes] = True

    return full_mask


def _get_word_indexes(series, word):
    """ Return the index labels of the lines that contain word as a delimited term.

    Parameters:
        series (pd.Series): The lines to search.
        word (str): A regex for a single term.

    Returns:
        set: Index labels of the lines where word appears bounded on each side by a space,
             comma, parenthesis, or the start/end of the line.
    """
    # Group the word so an alternation inside it (e.g. "A|B") cannot swallow the
    # boundary anchors; non-capturing so pandas does not warn about match groups.
    pattern = r'(?:[ ,()]|^)(?:' + word + r')(?:[ ,()]|$)'
    # na=False: missing values count as "no match" instead of producing NaN,
    # which could not be used as a boolean mask.
    matches = series.str.contains(pattern, regex=True, na=False)
    return set(matches[matches].index.tolist())


def _verify_basic_words(series, anywhere_words, negative_words):
    """ Narrow the series index down to lines satisfying the anywhere/negative words.

    Parameters:
        series (pd.Series): The lines to search.
        anywhere_words (list): Terms that must each be present in a line.
        negative_words (list): Terms that must each be absent from a line.

    Returns:
        set: Index labels of the lines that contain every anywhere word and no negative word.
    """
    surviving = set(series.index)

    # Keep only lines that contain every required term.
    for required in anywhere_words:
        surviving.intersection_update(_get_word_indexes(series, required))

    # Drop lines that contain any forbidden term.
    for forbidden in negative_words:
        surviving.difference_update(_get_word_indexes(series, forbidden))

    return surviving


def find_words(search_string):
    """ Extract words in the search string based on their prefixes.

    Parameters:
        search_string (str): The search query string to parse.
                             Words can be prefixed with '@' or '~'.

    Returns:
        list: A list containing three lists:
            - Words prefixed with '@' (prefix removed)
            - Words prefixed with '~' (prefix removed)
            - Words with no prefix

    Notes:
        - Whitespace around a word, including between a prefix and its word, is removed.
        - A bare prefix with no word after it is ignored.
    """
    at_words, tilde_words, no_prefix_words = [], [], []

    # Words are the runs of characters that are not commas or parentheses.
    for raw_word in re.findall(r'[^,()]+', search_string):
        word = raw_word.strip()
        if word.startswith("@"):
            bucket, word = at_words, word[1:].strip()
        elif word.startswith("~"):
            bucket, word = tilde_words, word[1:].strip()
        else:
            bucket = no_prefix_words
        # Skip empty terms: spliced into a pattern they would match nearly any line.
        if word:
            bucket.append(word)

    return [at_words, tilde_words, no_prefix_words]


def check_parentheses(text):
    """ Check for balanced parentheses in the given text and return the unbalanced ones.

    Parameters:
        text (str): The text to be checked for balanced parentheses.

    Returns:
        str: A string containing the unbalanced parentheses in their original order.

    Notes:
        - The function only considers the characters '(' and ')' for balancing.
        - Balanced pairs of parentheses are removed, leaving behind only the unbalanced ones.
        - An unmatched ')' can only occur while no '(' is pending, so every unmatched ')'
          precedes every unmatched '(' in the result.

    """
    pending_open = 0      # '(' seen so far that have not been closed
    unmatched_close = 0   # ')' seen with no pending '(' to pair with

    for char in text:
        if char == '(':
            pending_open += 1
        elif char == ')':
            if pending_open:
                pending_open -= 1
            else:
                unmatched_close += 1

    return ')' * unmatched_close + '(' * pending_open


def reverse_and_flip_parentheses(s):
    """ Reverse a string and flip its parentheses.

    Parameters:
        s (str): The string to be reversed and have its parentheses flipped.

    Returns:
        str: The reversed string with flipped parentheses.

    Notes:
        - The function takes into account only the '(' and ')' characters for flipping.
    """
    # Reversing turns every '(' into what should read as ')' and vice versa,
    # so swap the two characters after reversing.
    return s[::-1].translate(str.maketrans("()", ")("))


def construct_delimiter_map(text, words):
    """ Based on an input search query and list of words, return the parenthetical delimiters between them.

    Parameters:
        text (str): The search query.
        words (list): A list of words we want to map between from the query.

    Returns:
        dict: The two-way delimiter map. Keys are (word1, word2) tuples; each value is the
              string of unbalanced parentheses found between the two words in the query.
              Entries exist for both orders of every pair.
    """
    # Map the start offset of every delimited occurrence to (word, end offset).
    locations = {}
    for word in words:
        for match in re.finditer(r'(?:[ ,()]|^)(' + word + r')(?:[ ,()]|$)', text):
            start_index, end_index = match.span(1)
            locations[start_index] = (word, end_index)

    # Keys are unique offsets, so this orders the occurrences by position in the query.
    sorted_locations = sorted(locations.items())

    delimiter_map = {}
    # Use combinations to get every combination of two words in order
    for (_, (word1, end1)), (start2, (word2, _)) in combinations(sorted_locations, 2):
        delimiter_map[(word1, word2)] = check_parentheses(text[end1:start2])

    # Add the reversed version of the above, for lines where the two words
    # appear in the opposite order from the query.
    reverse_map = {(word2, word1): reverse_and_flip_parentheses(delimiter_text)
                   for (word1, word2), delimiter_text in delimiter_map.items()}
    delimiter_map.update(reverse_map)

    return delimiter_map


def verify_search_delimiters(text, specific_words, delimiter_map):
    """ Verify that the text contains specific words with expected delimiters between them.

    Parameters:
        text (str): The text to search in.
        specific_words (list of str): Words that must appear relative to other words in the text.
        delimiter_map (dict): A dictionary specifying expected delimiters between pairs of specific words.

    Returns:
        bool: True if all conditions are met, otherwise False.

    Notes:
        - Locations are keyed by word, so a word listed more than once in specific_words
          collapses to a single key and the function returns False.
    """
    locations = defaultdict(list)

    # Find all locations for each word in the text
    for word in specific_words:
        for match in re.finditer(r'(?:[ ,()]|^)(' + word + r')(?:[ ,()]|$)', text):
            start_index, end_index = match.span(1)
            locations[word].append((start_index, end_index - start_index, word))

    # Every specific word must occur at least once.
    if len(locations) != len(specific_words):
        return False

    # Generate all possible combinations of word sequences
    # this covers cases where the same tag is found twice, and you need to check both
    for sequence in product(*locations.values()):
        ordered = sorted(sequence)

        # Each neighbouring pair (in text order) must be separated by exactly the
        # unbalanced parentheses the query specified for that pair; a pair missing
        # from the map yields None and therefore never matches.
        if all(
            check_parentheses(text[start1 + len1:start2]) == delimiter_map.get((word1, word2))
            for (start1, len1, word1), (start2, _, word2) in zip(ordered, ordered[1:])
        ):
            return True  # Return True if any sequence is valid

    return False  # Return False if no valid sequence is found