Source code for hed.tools.analysis.sequence_map

""" A map of containing the number of times a particular sequence of values in a column of a columnar file. """


import pandas as pd
from hed.tools.util import data_util


class SequenceMap:
    # TODO: This class is partially implemented.
    """ A map of the unique sequences of consecutive column values that appear in a columnar file.

    Attributes:
        codes (list or None): If not empty, only these values are included in the map.
        name (str): An optional name of this remap for identification purposes.
        node_counts (dict): Maps each value seen to the number of times it appeared.
        edges (dict): Maps an edge hash key to the two-element list [first, second] it represents.
        edge_counts (dict): Maps an edge hash key to the number of times that edge appeared.

    Notes:
        This mapping converts all values to strings before counting.
        The remapping does not support other types of columns.

    """

    def __init__(self, codes=None, name=''):
        """ Information for setting up the maps.

        Parameters:
            codes (list or None): If None use all codes, otherwise only include listed codes in the map.
            name (str): Name associated with this remap (usually a pathname of the events file).

        """
        self.codes = codes
        self.name = name
        self.node_counts = {}
        self.edges = {}  # map of keys to n-element sequences
        self.edge_counts = {}  # Keeps a running count of the number of times a key appears in the data

    def __str__(self):
        """ Return a version of this sequence map serialized to a string.

        Returns:
            str: Space-separated "value(count)" entries, one for each node.

        """
        # This must be a plain method: wrapping __str__ in a property makes str(obj) fail,
        # because the interpreter would try to call the string produced by the property.
        return " ".join(f"{value}({str(count)})" for value, count in self.node_counts.items())

    def dot_str(self, group_spec=None):
        """ Produce a DOT string representing this sequence map.

        Parameters:
            group_spec (dict or None): Optional map of group name to a dict with the keys
                "nodes" (container of node values in the group) and "color" (background color).

        Returns:
            str: A DOT digraph with optional clusters followed by the edges sorted by count.

        """
        parts = ['digraph g { \n']
        if self.codes:
            # Codes that were requested but never observed go in their own gray cluster.
            unused = [f"{node};" for node in self.codes if node not in self.node_counts]
            if unused:
                parts.append('subgraph cluster_unused {\n bgcolor="#cAcAcA";\n' + "\n".join(unused) + "\n}\n")
        for group, spec in (group_spec or {}).items():
            members = [f"{node};" for node in self.node_counts if node in spec["nodes"]]
            if not members:
                continue
            color = spec["color"]
            if color.startswith('#'):
                # Hex colors must be quoted in DOT.
                color = f'"{color}"'
            parts.append(f'subgraph cluster_{group}' + '{\n' + f'bgcolor={color};\n' +
                         '\n'.join(members) + '\n}\n')
        parts.append("\n".join(self.get_edge_list(sort=True)))
        parts.append("}\n")
        return "".join(parts)

    def edge_to_str(self, key):
        """ Convert a graph edge to a DOT string.

        Parameters:
            key (str): Hashcode string representing a graph edge.

        Returns:
            str: "first -> second " for a known edge or an empty string if the key is unknown.

        """
        value = self.edges.get(key, [])
        if not value:
            return ""
        return f"{value[0]} -> {value[1]} "

    def get_edge_list(self, sort=True):
        """ Return a DOT format edge list with the option of sorting by edge counts.

        Parameters:
            sort (bool): If True (the default), the edge list is sorted by decreasing edge counts.

        Returns:
            list: list of DOT strings representing the edges labeled by counts.

        """
        items = list(self.edge_counts.items())
        if sort:
            # sorted() is stable, so edges with equal counts keep their insertion order.
            items = sorted(items, key=lambda item: item[1], reverse=True)
        return [f"{self.edge_to_str(key)} [label={str(count)}];" for key, count in items]

    def filter_edges(self):
        """ Placeholder for edge filtering (not yet implemented). """
        pass

    def update(self, data):
        """ Update the existing map with information from data.

        Parameters:
            data (Series): Column of values whose consecutive pairs are to be counted.

        Notes:
            - Values are converted to strings by prep before counting.
            - If codes were given, values not in codes are dropped before pairs are formed.

        """
        filtered = self.prep(data)
        if self.codes:
            filtered = filtered[filtered.isin(self.codes)]

        # Work on a plain list so that positions, not index labels, determine adjacency.
        # After filtering the index labels are no longer contiguous.
        values = filtered.tolist()
        for value in values:
            self.node_counts[value] = self.node_counts.get(value, 0) + 1

        for pair in zip(values, values[1:]):
            key_list = list(pair)
            key = data_util.get_key_hash(key_list)
            if key in self.edges:
                self.edge_counts[key] = self.edge_counts[key] + 1
            else:
                self.edges[key] = key_list
                self.edge_counts[key] = 1

    @staticmethod
    def prep(data):
        """ Convert the values to strings, replace missing values with n/a, and remove quotes.

        Parameters:
            data (Series): Series to process.

        Returns:
            Series: A new Series of strings with single and double quotes removed.

        Notes:
            - The input is not modified; a new Series is returned.

        """
        # Record missing entries before string conversion, which would turn them into 'nan'.
        missing = data.isna()
        filtered = data.astype(str).mask(missing, 'n/a')
        filtered = filtered.str.replace('"', '', regex=False)
        filtered = filtered.str.replace("'", "", regex=False)
        return filtered