Source code for hed.tools.analysis.column_name_summary

""" Summarize the unique column names in a dataset. """

import json


class ColumnNameSummary:
    """ Summarize the unique column names in a dataset.

    Attributes:
        name (str): Name of this summary (reported as "Summary name").
        file_dict (dict): Maps each file name to the index in unique_headers of its column names.
        unique_headers (list): The distinct column-name lists, in the order they were first seen.

    """

    def __init__(self, name=''):
        """ Create an empty summary.

        Parameters:
            name (str): Name reported for this summary.

        """
        self.name = name
        self.file_dict = {}
        self.unique_headers = []

    def update(self, name, columns):
        """ Update the summary based on columns associated with a file.

        Parameters:
            name (str): File name associated with the columns.
            columns (list):  List of column names.

        Raises:
            ValueError: If name was already recorded with a different list of column names.
                The summary is left unchanged in that case.

        """
        # Work on a snapshot so that later changes to the caller's list cannot affect the comparison.
        columns = list(columns)
        if name in self.file_dict:
            previous = self.unique_headers[self.file_dict[name]]
            # Detect the conflict BEFORE registering anything so a rejected update leaves no orphan pattern.
            if previous != columns:
                raise ValueError("FileHasChangedColumnNames",
                                 f"{name}: Summary has conflicting column names "
                                 f"Current: {columns} Previous: {previous}")
            return
        self.file_dict[name] = self.update_headers(columns)

    def update_headers(self, column_names):
        """ Update the unique combinations of column names.

        Parameters:
            column_names (list):  List of  column names to update.

        Returns:
            int: Position of this combination of column names in unique_headers.

        """
        # Store a private copy: keeping the caller's list would let outside mutation rewrite the summary.
        column_names = list(column_names)
        try:
            return self.unique_headers.index(column_names)
        except ValueError:
            self.unique_headers.append(column_names)
            return len(self.unique_headers) - 1

    def get_summary(self, as_json=False):
        """ Return summary as an object or in JSON.

        Parameters:
            as_json (bool):  If False (the default), return the underlying summary object, otherwise transform to JSON.

        Returns:
            Union[dict, str]: Dictionary with keys "Summary name", "Columns" and "Number files",
                or that dictionary serialized as a JSON string.

        """
        # Group the file names by the index of their column-name pattern.
        files_by_pattern = [[] for _ in self.unique_headers]
        for file_name, position in self.file_dict.items():
            files_by_pattern[position].append(file_name)
        column_headers = [{'Column names': header, 'Files': files}
                          for header, files in zip(self.unique_headers, files_by_pattern)]
        summary = {"Summary name": self.name, "Columns": column_headers, "Number files": len(self.file_dict)}
        if as_json:
            return json.dumps(summary, indent=4)
        return summary