Source code for hed.tools.remodeling.operations.summarize_column_values_op

""" Summarize the values in the columns of a columnar file. """

from hed.tools.analysis.tabular_summary import TabularSummary
from hed.tools.remodeling.operations.base_op import BaseOp
from hed.tools.remodeling.operations.base_summary import BaseSummary


class SummarizeColumnValuesOp(BaseOp):
    """ Summarize the values in the columns of a columnar file.

    Required remodeling parameters:
        - **summary_name** (*str*): The name of the summary.
        - **summary_filename** (*str*): Base filename of the summary.

    Optional remodeling parameters:
        - **append_timecode** (*bool*): If True, append timecodes to the summary filename (default False).
        - **max_categorical** (*int*): Maximum number of unique values to include in the summary
          for a categorical column (no limit is applied if omitted).
        - **skip_columns** (*list*): Names of columns to skip in the summary.
        - **value_columns** (*list*): Names of columns to treat as value columns rather than categorical columns.
        - **values_per_line** (*int*): The number of values output per line in the summary
          (defaults to VALUES_PER_LINE).

    The purpose is to produce a summary of the values in a tabular file.

    """
    NAME = "summarize_column_values"

    # JSON schema describing the parameters accepted by this operation.
    PARAMS = {
        "type": "object",
        "properties": {
            "summary_name": {"type": "string",
                             "description": "Name to use for the summary in titles."},
            "summary_filename": {"type": "string",
                                 "description": "Name to use for the summary file name base."},
            "append_timecode": {
                "type": "boolean",
                "description": "If true, the timecode is appended to the base filename so each run has a unique name."
            },
            "max_categorical": {
                "type": "integer",
                "description": "Maximum number of unique column values to show in text description."
            },
            "skip_columns": {
                "type": "array",
                "description": "List of columns to skip when creating the summary.",
                "items": {"type": "string"},
                "minItems": 1,
                "uniqueItems": True
            },
            "value_columns": {
                "type": "array",
                "description": "Columns to be annotated with a single HED annotation and placeholder.",
                "items": {"type": "string"},
                "minItems": 1,
                "uniqueItems": True
            },
            "values_per_line": {
                "type": "integer",
                "description": "Number of items per line to display in the text file."
            }
        },
        "required": ["summary_name", "summary_filename"],
        "additionalProperties": False
    }

    SUMMARY_TYPE = 'column_values'
    VALUES_PER_LINE = 5
    # NOTE(review): MAX_CATEGORICAL is not referenced anywhere in this class; the constructor
    # falls back to "no limit" when max_categorical is omitted -- confirm which is intended.
    MAX_CATEGORICAL = 50

    def __init__(self, parameters):
        """ Constructor for the summarize column values operation.

        Parameters:
            parameters (dict): Dictionary with the parameter values for required and optional parameters.

        """
        super().__init__(parameters)

        # Required parameters: a missing key raises KeyError.
        self.summary_name = parameters['summary_name']
        self.summary_filename = parameters['summary_filename']

        # Optional parameters with their fallbacks.
        fetch = parameters.get
        self.append_timecode = fetch('append_timecode', False)
        self.max_categorical = fetch('max_categorical', float('inf'))  # inf means no truncation
        self.skip_columns = fetch('skip_columns', [])
        self.value_columns = fetch('value_columns', [])
        self.values_per_line = fetch('values_per_line', self.VALUES_PER_LINE)

    def do_op(self, dispatcher, df, name, sidecar=None):
        """ Create a summary of the column values in df.

        Parameters:
            dispatcher (Dispatcher): Manages the operation I/O.
            df (DataFrame): The DataFrame to be remodeled.
            name (str):  Unique identifier for the dataframe -- often the original file path.
            sidecar (Sidecar or file-like): Not needed for this operation.

        Returns:
            DataFrame: A copy of df.

        Side effect:
            Updates the relevant summary, creating and registering it with the dispatcher on first use.

        """
        result = df.copy()

        # Reuse the summary manager registered under this summary name, or register a new one.
        manager = dispatcher.summary_dicts.get(self.summary_name)
        if not manager:
            manager = ColumnValueSummary(self)
            dispatcher.summary_dicts[self.summary_name] = manager

        update_info = {'df': dispatcher.post_proc_data(result), 'name': name}
        manager.update_summary(update_info)
        return result

    @staticmethod
    def validate_input_data(parameters):
        """ Additional validation required of operation parameters not performed by JSON schema validator.

        Parameters:
            parameters (dict): The operation parameters (not examined by this operation).

        Returns:
            list: Always empty, as this operation performs no additional checks.

        """
        return []


class ColumnValueSummary(BaseSummary):
    """ Manager for summaries of column contents for columnar files.

    Notes:
        - One TabularSummary is kept per input file, indexed by the file's name in self.summary_dict.
        - self.op and self.summary_dict are not set in this class -- presumably they are provided
          by BaseSummary (confirm against the base class).

    """

    def __init__(self, sum_op):
        """ Constructor for column value summary manager.

        Parameters:
            sum_op (SummarizeColumnValuesOp): Operation associated with this summary.

        """
        super().__init__(sum_op)

    def update_summary(self, new_info):
        """ Update the summary for a given tabular input file.

        Parameters:
            new_info (dict):  A dictionary with the parameters needed to update a summary.

        Notes:
            - The summary information is kept in separate TabularSummary objects for each file.
            - The summary needs a "name" str and a "df" .

        """
        name = new_info['name']
        if name not in self.summary_dict:
            # First time this file is seen: create its per-file summary.
            self.summary_dict[name] = TabularSummary(value_cols=self.op.value_columns,
                                                     skip_cols=self.op.skip_columns, name=name)
        self.summary_dict[name].update(new_info['df'])

    def get_details_dict(self, summary):
        """ Return a dictionary with the summary contained in a TabularSummary.

        Parameters:
            summary (TabularSummary): Dictionary of merged summary information.

        Returns:
            dict: Dictionary with the information suitable for extracting printout.

        Notes:
            - The number of unique values of each categorical column is recorded before truncation.
            - Each categorical column is sorted in descending order of the first element of its
              count entry and truncated to at most max_categorical values.

        """
        this_summary = summary.get_summary(as_json=False)
        cat_columns = this_summary['Categorical columns']

        # Record the full number of unique values before the per-column dictionaries are truncated.
        this_summary['Categorical counts'] = {key: len(count_dict) for key, count_dict in cat_columns.items()}

        # Replacing values of existing keys does not change the dict size, but iterate over a
        # snapshot anyway so the loop does not depend on that detail.
        for key, dict_entry in list(cat_columns.items()):
            num_disp, sorted_tuples = ColumnValueSummary.sort_dict(dict_entry, reverse=True)
            cat_columns[key] = dict(sorted_tuples[:min(num_disp, self.op.max_categorical)])

        return {"Name": this_summary['Name'], "Total events": this_summary["Total events"],
                "Total files": this_summary['Total files'],
                "Files": list(this_summary['Files'].keys()),
                "Specifics": {"Value columns": list(this_summary['Value columns']),
                              "Skip columns": this_summary['Skip columns'],
                              "Value column summaries": this_summary['Value columns'],
                              "Categorical column summaries": this_summary['Categorical columns'],
                              "Categorical counts": this_summary['Categorical counts']}}

    def merge_all_info(self):
        """ Create a TabularSummary containing the overall dataset summary.

        Returns:
            TabularSummary - the summary object for column values.

        """
        all_sum = TabularSummary(value_cols=self.op.value_columns,
                                 skip_cols=self.op.skip_columns, name='Dataset')
        for counts in self.summary_dict.values():
            all_sum.update_summary(counts)
        return all_sum

    def _get_result_string(self, name, result, indent=BaseSummary.DISPLAY_INDENT):
        """ Return a formatted string with the summary for the indicated name.

        Parameters:
            name (str):  Identifier (usually the filename) of the individual file.
            result (dict): The dictionary of the summary results indexed by name.
            indent (str): A string containing spaces used for indentation (usually 3 spaces).

        Returns:
            str - The results in a printable format ready to be saved to a text file.

        Notes:
            The first line gives the totals (the overall "Dataset" summary also reports the number of files);
            the remaining lines come from _get_detail_list.

        """
        if name == "Dataset":
            sum_list = [f"Dataset: Total events={result.get('Total events', 0)} "
                        f"Total files={result.get('Total files', 0)}"]
        else:
            sum_list = [f"Total events={result.get('Total events', 0)}"]
        sum_list = sum_list + self._get_detail_list(result, indent=indent)
        return "\n".join(sum_list)

    def _get_categorical_string(self, result, offset="", indent="   "):
        """ Return  a string with the summary for a particular categorical dictionary.

        Parameters:
            result (dict): Dictionary of summary information for a particular tabular file.
            offset (str): String of blanks used as offset for every item
            indent (str):  String of blanks used as the additional amount to indent an item's for readability.

        Returns:
            str: Formatted string suitable for saving in a file or printing (empty if no categorical columns).

        """
        cat_dict = result.get('Categorical column summaries', {})
        if not cat_dict:
            return ""
        count_dict = result['Categorical counts']
        sum_list = [f"{offset}{indent}Categorical column values[Events, Files]:"]
        # Columns are reported in alphabetical order of column name.
        for entry in sorted(cat_dict.items(), key=lambda x: x[0]):
            # Forward the caller's offset and indent so that the column lines nest consistently
            # under the header line above (these were previously hard-coded and hence ignored).
            sum_list.extend(self._get_categorical_col(entry, count_dict, offset=offset, indent=indent))
        return "\n".join(sum_list)

    def _get_detail_list(self, result, indent=BaseSummary.DISPLAY_INDENT):
        """ Return  a list of strings with the details

        Parameters:
            result (dict): Dictionary of merged summary information.
            indent (str):  String of blanks used as the amount to indent for readability.

        Returns:
            list: list of formatted strings suitable for saving in a file or printing.

        """
        sum_list = []
        specifics = result["Specifics"]
        cat_string = self._get_categorical_string(specifics, offset="", indent=indent)
        if cat_string:
            sum_list.append(cat_string)
        val_dict = specifics.get("Value column summaries", {})
        if val_dict:
            sum_list.append(ColumnValueSummary._get_value_string(val_dict, offset="", indent=indent))
        return sum_list

    def _get_categorical_col(self, entry, count_dict, offset="", indent="   "):
        """ Return  a string with the summary for a particular categorical column.

        Parameters:
            entry(tuple): (Name of the column, summary dict for that column)
            count_dict (dict): Count of the total number of unique values indexed by the name
            offset(str): String of blanks used as offset for all items
            indent (str):  String of blanks used as the additional amount to indent for this item's readability.

        Returns:
            list: Formatted strings, each corresponding to a line in the output.

        """
        col_name, col_values = entry
        num_unique = count_dict[col_name]
        num_disp = min(self.op.max_categorical, num_unique)
        col_list = [f"{offset}{indent * 2}{col_name}: {num_unique} unique values "
                    f"(displaying top {num_disp} values)"]
        # Create and partition the list of individual entries
        value_list = [f"{value}{counts!s}" for value, counts in col_values.items()]
        value_list = value_list[:num_disp]
        part_list = ColumnValueSummary.partition_list(value_list, self.op.values_per_line)
        return col_list + [f"{offset}{indent * 3}{ColumnValueSummary.get_list_str(item)}" for item in part_list]

    @staticmethod
    def get_list_str(lst):
        """ Return a str version of a list with items separated by a blank.

        Parameters:
            lst (list): The items to be joined.

        Returns:
            str:  String version of list.

        """
        return ' '.join(str(item) for item in lst)

    @staticmethod
    def partition_list(lst, n):
        """ Partition a list into lists of n items.

        Parameters:
            lst (list): List to be partitioned.
            n (int):  Number of items in each sublist.

        Returns:
            list:  list of lists of n elements, the last might have fewer.

        """
        return [lst[i:i + n] for i in range(0, len(lst), n)]

    @staticmethod
    def _get_value_string(val_dict, offset="", indent=""):
        """ Return a string with the summary of the value columns.

        Parameters:
            val_dict (dict): Dictionary of value column information indexed by column name.
            offset (str): String of blanks used as offset for every item.
            indent (str): String of blanks used as the additional amount to indent for readability.

        Returns:
            str: A header line followed by one line per value column.

        """
        sum_list = [f"{offset}{indent}Value columns[Events, Files]:"]
        for col_name, val_counts in val_dict.items():
            sum_list.append(f"{offset}{indent * 2}{col_name}{val_counts!s}")
        return "\n".join(sum_list)

    @staticmethod
    def sort_dict(count_dict, reverse=False):
        """ Return the items of a dictionary sorted by the first element of each value.

        Parameters:
            count_dict (dict): Dictionary whose values are indexable; the sort uses element 0 of each value.
            reverse (bool): If True, sort in descending order.

        Returns:
            tuple: (number of items, list of (key, value) tuples in sorted order).

        """
        sorted_tuples = sorted(count_dict.items(), key=lambda x: x[1][0], reverse=reverse)
        return len(sorted_tuples), sorted_tuples