# Source code for hed.models.basic_search

import re
from itertools import combinations, product
from collections import defaultdict
import pandas as pd


def find_matching(series, search_string, regex=False):
    """ Return a boolean mask of lines in `series` that satisfy `search_string`.

        Syntax Rules:
            - '@': Prefixing a term means it must appear anywhere within a line.
            - '~': Prefixing a term means it must NOT appear within a line.
            - Parentheses: Elements within parentheses must appear in the line with
              the same level of nesting. e.g.: "(A), (B)" matches "(A), (B, C)",
              but not "(A, B)", since they don't start in the same group.
            - "LongFormTag*": A * matches any remaining word (anything but a comma
              or parenthesis).
            - An individual term can be arbitrary regex, limited to single
              continuous words.

        Notes:
            - Specific words only care about their level relative to other specific
              words, not overall. e.g. "(A, B)" finds "A, B", "(A, B)",
              "(A, (C), B)", or "((A, B))".
            - With no grouping or anywhere words in the search, all terms are
              treated as anywhere words.
            - The series and search string should share the same form (short or
              long); use long form on both to match parent tags.

        Args:
            series (pd.Series): Lines to be searched.
            search_string (str): The query to look for in each line.
            regex (bool): By default, translate * wildcards to .*? regex.
                If True, pass the words through untranslated. In that case the
                characters (), must not appear in the query.

        Returns:
            mask (pd.Series): Boolean mask, True where a line matches.
    """
    if not regex:
        # Give non-regex users a sensible wildcard: bare * becomes a lazy .*?
        search_string = re.sub(r'(?<!\.)\*', '.*?', search_string)

    anywhere_words, negative_words, specific_words = find_words(search_string)

    # No grouping and no '@' markers: the user doesn't care about nesting,
    # so treat every specific term as an anywhere term.
    if "(" not in search_string and "@" not in search_string:
        anywhere_words = anywhere_words + specific_words
        specific_words = []

    delimiter_map = construct_delimiter_map(search_string, specific_words)
    candidates = _verify_basic_words(series, anywhere_words, negative_words)

    # Cheap pre-filter: every specific word must at least be contained
    # somewhere in the line (word delimiters are checked later).
    for term in specific_words:
        hits = series.str.contains(term, regex=True)
        candidates &= set(hits[hits].index.tolist())
        if not candidates:
            break

    candidates = sorted(candidates)
    full_mask = pd.Series(False, index=series.index, dtype=bool)
    if candidates:
        if specific_words:
            # Full delimiter verification only on the surviving candidates.
            subset = series[candidates]
            full_mask.loc[candidates] = subset.apply(
                verify_search_delimiters, args=(specific_words, delimiter_map))
        else:
            full_mask.loc[candidates] = True
    return full_mask
def _get_word_indexes(series, word): pattern = r'(?:[ ,()]|^)' + word + r'(?:[ ,()]|$)' matches = series.str.contains(pattern, regex=True) return set(matches[matches].index.tolist()) def _verify_basic_words(series, anywhere_words, negative_words): candidate_indexes = set(series.index) for word in anywhere_words: current_word_indexes = _get_word_indexes(series, word) candidate_indexes &= current_word_indexes for word in negative_words: current_word_indexes = _get_word_indexes(series, word) candidate_indexes -= current_word_indexes return candidate_indexes
def find_words(search_string):
    """ Split the search string into terms grouped by their prefix.

    Args:
        search_string (str): The query to parse. Terms may be prefixed with
            '@' (anywhere) or '~' (negated).

    Returns:
        list: Three lists, in order:
            - Terms prefixed with '@' (prefix stripped)
            - Terms prefixed with '~' (prefix stripped)
            - Terms with no prefix
    """
    # Terms are the runs of characters between commas and parentheses.
    raw_terms = (term.strip() for term in re.findall(r'[^,()]+', search_string))
    terms = [term for term in raw_terms if term]

    anywhere = [term[1:] for term in terms if term[0] == "@"]
    negated = [term[1:] for term in terms if term[0] == "~"]
    plain = [term for term in terms if term[0] not in "@~"]
    return [anywhere, negated, plain]
def check_parentheses(text):
    """ Return the unbalanced parentheses of `text`, in their original order.

    Args:
        text (str): The text to scan; only '(' and ')' characters are considered.

    Returns:
        str: The unmatched ')' characters in order, followed by the
             unmatched '(' characters.
    """
    open_count = 0
    unmatched_close = []
    for ch in text:
        if ch == '(':
            open_count += 1
        elif ch == ')':
            if open_count:
                # This ')' closes a pending '(' — both are balanced.
                open_count -= 1
            else:
                unmatched_close.append(ch)
    # Unmatched opens always trail the unmatched closes in the result.
    return ''.join(unmatched_close) + '(' * open_count
def reverse_and_flip_parentheses(s):
    """ Return `s` reversed, with '(' and ')' swapped for each other.

    Args:
        s (str): The string to reverse and flip.

    Returns:
        str: The reversed string with flipped parentheses.

    Notes:
        - Only the '(' and ')' characters are flipped.
    """
    # Swapping parens commutes with reversal, so flip first, then reverse.
    swap_table = str.maketrans({"(": ")", ")": "("})
    return s.translate(swap_table)[::-1]
def construct_delimiter_map(text, words):
    """ Map each ordered pair of words to the unbalanced parentheses between them.

    Args:
        text (str): The search query.
        words (list): The words from the query to map between.

    Returns:
        dict: {(word_a, word_b): delimiter string}, populated in both directions.
    """
    found = {}
    # Record where each word occurs as a delimited token in the query.
    for word in words:
        bounded = r'(?:[ ,()]|^)(' + word + r')(?:[ ,()]|$)'
        for match in re.finditer(bounded, text):
            found[match.start(1)] = (word, match.end(1) - match.start(1))

    ordered = sorted(found.items())

    delimiter_map = {}
    # Every in-order pair of occurrences gets the unbalanced parens between them.
    for (pos_a, (word_a, len_a)), (pos_b, (word_b, _len_b)) in combinations(ordered, 2):
        between = text[pos_a + len_a:pos_b]
        delimiter_map[(word_a, word_b)] = check_parentheses(between)

    # Mirror each entry for the reversed word order.
    delimiter_map.update({
        (word_b, word_a): reverse_and_flip_parentheses(delims)
        for (word_a, word_b), delims in list(delimiter_map.items())
    })
    return delimiter_map
def verify_search_delimiters(text, specific_words, delimiter_map):
    """ Check that `text` contains the specific words with the expected delimiters.

    Args:
        text (str): The text to search in.
        specific_words (list of str): Words that must appear relative to other
            words in the text.
        delimiter_map (dict): Expected delimiters between pairs of specific words.

    Returns:
        bool: True if some arrangement of word occurrences satisfies the map.
    """
    positions = defaultdict(list)
    for word in specific_words:
        bounded = r'(?:[ ,()]|^)(' + word + r')(?:[ ,()]|$)'
        for match in re.finditer(bounded, text):
            positions[word].append((match.start(1), len(match.group(1)), word))

    # Every specific word must occur at least once.
    if len(positions) != len(specific_words):
        return False

    # A word may occur several times; try each combination of occurrences
    # until one satisfies all pairwise delimiter expectations.
    for combo in product(*positions.values()):
        ordered = sorted(combo)
        if all(
            check_parentheses(text[start_a + len_a:start_b])
            == delimiter_map.get((word_a, word_b), None)
            for (start_a, len_a, word_a), (start_b, _len_b, word_b)
            in zip(ordered, ordered[1:])
        ):
            return True
    return False