""" Summarize the values in the columns of a columnar file. """
from hed.tools.analysis.tabular_summary import TabularSummary
from hed.tools.remodeling.operations.base_op import BaseOp
from hed.tools.remodeling.operations.base_summary import BaseSummary
class SummarizeColumnValuesOp(BaseOp):
    """ Summarize the values in the columns of a columnar file.

    Required remodeling parameters:
        - **summary_name** (*str*): The name of the summary.
        - **summary_filename** (*str*): Base filename of the summary.

    Optional remodeling parameters:
        - **append_timecode** (*bool*): (**Optional**: Default False) If True append timecodes to the summary filename.
        - **max_categorical** (*int*): Maximum number of unique values to include in summary for a categorical column.
          When omitted, no limit is applied.
        - **skip_columns** (*list*): Names of columns to skip in the summary. Defaults to an empty list.
        - **value_columns** (*list*): Names of columns to treat as value columns rather than categorical columns.
          Defaults to an empty list.
        - **values_per_line** (*int*): The number of values output per line in the summary.
          Defaults to VALUES_PER_LINE.

    The purpose is to produce a summary of the values in a tabular file.

    """
    NAME = "summarize_column_values"

    # JSON-schema style description of the parameters accepted by this operation.
    PARAMS = {
        "type": "object",
        "properties": {
            "summary_name": {
                "type": "string",
                "description": "Name to use for the summary in titles."
            },
            "summary_filename": {
                "type": "string",
                "description": "Name to use for the summary file name base."
            },
            "append_timecode": {
                "type": "boolean",
                "description": "If true, the timecode is appended to the base filename so each run has a unique name."
            },
            "max_categorical": {
                "type": "integer",
                "description": "Maximum number of unique column values to show in text description."
            },
            "skip_columns": {
                "type": "array",
                "description": "List of columns to skip when creating the summary.",
                "items": {
                    "type": "string"
                },
                "minItems": 1,
                "uniqueItems": True
            },
            "value_columns": {
                "type": "array",
                "description": "Columns to be annotated with a single HED annotation and placeholder.",
                "items": {
                    "type": "string"
                },
                "minItems": 1,
                "uniqueItems": True
            },
            "values_per_line": {
                "type": "integer",
                "description": "Number of items per line to display in the text file."
            }
        },
        "required": [
            "summary_name",
            "summary_filename"
        ],
        "additionalProperties": False
    }

    SUMMARY_TYPE = 'column_values'
    VALUES_PER_LINE = 5  # Default for the values_per_line parameter.
    # NOTE(review): not referenced by the code in this class -- when max_categorical is
    # omitted, __init__ falls back to "no limit" rather than to this constant.
    MAX_CATEGORICAL = 50

    def __init__(self, parameters):
        """ Constructor for the summarize column values operation.

        Parameters:
            parameters (dict): Dictionary with the parameter values for required and optional parameters.

        """
        super().__init__(parameters)
        self.summary_name = parameters['summary_name']
        self.summary_filename = parameters['summary_filename']
        self.append_timecode = parameters.get('append_timecode', False)
        # float('inf') acts as "no limit": min(count, inf) always yields the integer count.
        self.max_categorical = parameters.get('max_categorical', float('inf'))
        self.skip_columns = parameters.get('skip_columns', [])
        self.value_columns = parameters.get('value_columns', [])
        self.values_per_line = parameters.get('values_per_line', self.VALUES_PER_LINE)

    def do_op(self, dispatcher, df, name, sidecar=None):
        """ Create a summary of the column values in df.

        Parameters:
            dispatcher (Dispatcher): Manages the operation I/O.
            df (DataFrame): The DataFrame to be remodeled.
            name (str): Unique identifier for the dataframe -- often the original file path.
            sidecar (Sidecar or file-like): Not needed for this operation.

        Returns:
            DataFrame: A copy of df.

        Side effect:
            Updates the relevant summary, creating and registering it in
            dispatcher.summary_dicts on first use.

        """
        df_new = df.copy()
        summary = dispatcher.summary_dicts.get(self.summary_name)
        # Test for absence explicitly so an existing summary is never replaced
        # merely because the object happens to evaluate as falsy.
        if summary is None:
            summary = ColumnValueSummary(self)
            dispatcher.summary_dicts[self.summary_name] = summary
        summary.update_summary({'df': dispatcher.post_proc_data(df_new), 'name': name})
        return df_new
class ColumnValueSummary(BaseSummary):
    """ Manager for summaries of column contents for columnar files. """

    def __init__(self, sum_op):
        """ Constructor for column value summary manager.

        Parameters:
            sum_op (BaseOp): Operation associated with this summary.

        """
        super().__init__(sum_op)

    def update_summary(self, new_info):
        """ Update the summary for a given tabular input file.

        Parameters:
            new_info (dict): A dictionary with the parameters needed to update a summary.

        Notes:
            - The summary information is kept in separate TabularSummary objects for each file.
            - The summary needs a "name" str and a "df" .

        """
        name = new_info['name']
        if name not in self.summary_dict:
            self.summary_dict[name] = TabularSummary(value_cols=self.op.value_columns,
                                                     skip_cols=self.op.skip_columns, name=name)
        self.summary_dict[name].update(new_info['df'])

    def get_details_dict(self, summary):
        """ Return a dictionary with the summary contained in a TabularSummary.

        Parameters:
            summary (TabularSummary): Dictionary of merged summary information.

        Returns:
            dict: Dictionary with the information suitable for extracting printout.

        Notes:
            - Each categorical column is sorted in descending order (see sort_dict) and
              truncated to at most the operation's max_categorical entries.
            - "Categorical counts" holds the number of unique values before truncation.
            - The dictionary returned by summary.get_summary is not modified.

        """
        this_summary = summary.get_summary(as_json=False)
        categorical = this_summary['Categorical columns']
        unique_counts = {key: len(count_dict) for key, count_dict in categorical.items()}
        # Build new dictionaries rather than rewriting entries of the dictionary being iterated.
        displayed = {}
        for key, count_dict in categorical.items():
            num_disp, sorted_tuples = ColumnValueSummary.sort_dict(count_dict, reverse=True)
            displayed[key] = dict(sorted_tuples[:min(num_disp, self.op.max_categorical)])
        return {"Name": this_summary['Name'], "Total events": this_summary["Total events"],
                "Total files": this_summary['Total files'],
                "Files": list(this_summary['Files'].keys()),
                "Specifics": {"Value columns": list(this_summary['Value columns']),
                              "Skip columns": this_summary['Skip columns'],
                              "Value column summaries": this_summary['Value columns'],
                              "Categorical column summaries": displayed,
                              "Categorical counts": unique_counts}}

    def merge_all_info(self):
        """ Create a TabularSummary containing the overall dataset summary.

        Returns:
            TabularSummary - the summary object for column values.

        """
        all_sum = TabularSummary(value_cols=self.op.value_columns, skip_cols=self.op.skip_columns, name='Dataset')
        for counts in self.summary_dict.values():
            all_sum.update_summary(counts)
        return all_sum

    def _get_result_string(self, name, result, indent=BaseSummary.DISPLAY_INDENT):
        """ Return a formatted string with the summary for the indicated name.

        Parameters:
            name (str): Identifier (usually the filename) of the individual file.
            result (dict): The dictionary of the summary results indexed by name.
            indent (str): A string containing spaces used for indentation (usually 3 spaces).

        Returns:
            str - The results in a printable format ready to be saved to a text file.

        Notes:
            The name "Dataset" selects the overall header (events and files); any other name
            gets the per-file header (events only). The detail lines come from _get_detail_list.

        """
        if name == "Dataset":
            sum_list = [f"Dataset: Total events={result.get('Total events', 0)} "
                        f"Total files={result.get('Total files', 0)}"]
        else:
            sum_list = [f"Total events={result.get('Total events', 0)}"]
        sum_list = sum_list + self._get_detail_list(result, indent=indent)
        return "\n".join(sum_list)

    def _get_categorical_string(self, result, offset="", indent=" "):
        """ Return a string with the summary for a particular categorical dictionary.

        Parameters:
            result (dict): Dictionary of summary information for a particular tabular file.
            offset (str): String of blanks used as offset for every item
            indent (str): String of blanks used as the additional amount to indent an item's for readability.

        Returns:
            str: Formatted string suitable for saving in a file or printing.

        Notes:
            Returns an empty string when there are no categorical column summaries.
            Columns are listed in alphabetical order of column name.

        """
        cat_dict = result.get('Categorical column summaries', {})
        if not cat_dict:
            return ""
        count_dict = result['Categorical counts']
        sum_list = [f"{offset}{indent}Categorical column values[Events, Files]:"]
        for entry in sorted(cat_dict.items(), key=lambda x: x[0]):
            # Propagate the caller's offset and indent so the column lines stay aligned
            # relative to the header line above instead of using fixed values.
            sum_list.extend(self._get_categorical_col(entry, count_dict, offset=offset, indent=indent))
        return "\n".join(sum_list)

    def _get_detail_list(self, result, indent=BaseSummary.DISPLAY_INDENT):
        """ Return a list of strings with the details

        Parameters:
            result (dict): Dictionary of merged summary information.
            indent (str): String of blanks used as the amount to indent for readability.

        Returns:
            list: list of formatted strings suitable for saving in a file or printing.

        """
        sum_list = []
        specifics = result["Specifics"]
        cat_string = self._get_categorical_string(specifics, offset="", indent=indent)
        if cat_string:
            sum_list.append(cat_string)
        val_dict = specifics.get("Value column summaries", {})
        if val_dict:
            sum_list.append(ColumnValueSummary._get_value_string(val_dict, offset="", indent=indent))
        return sum_list

    def _get_categorical_col(self, entry, count_dict, offset="", indent=" "):
        """ Return a string with the summary for a particular categorical column.

        Parameters:
            entry(tuple): (Name of the column, summary dict for that column)
            count_dict (dict): Count of the total number of unique values indexed by the name
            offset(str): String of blanks used as offset for all items
            indent (str): String of blanks used as the additional amount to indent for this item's readability.

        Returns:
            list: Formatted strings, each corresponding to a line in the output.

        """
        col_name, col_values = entry
        num_unique = count_dict[col_name]
        num_disp = min(self.op.max_categorical, num_unique)
        col_list = [f"{offset}{indent * 2}{col_name}: {num_unique} unique values "
                    f"(displaying top {num_disp} values)"]
        # Create and partition the list of individual entries
        value_list = [f"{value}{counts!s}" for value, counts in col_values.items()]
        value_list = value_list[:num_disp]
        part_list = ColumnValueSummary.partition_list(value_list, self.op.values_per_line)
        return col_list + [f"{offset}{indent * 3}{ColumnValueSummary.get_list_str(item)}" for item in part_list]

    @staticmethod
    def get_list_str(lst):
        """ Return a str version of a list with items separated by a blank.

        Parameters:
            lst (list): Items to be joined; each is converted with str.

        Returns:
            str: String version of list.

        """
        return f"{' '.join(str(item) for item in lst)}"

    @staticmethod
    def partition_list(lst, n):
        """ Partition a list into lists of n items.

        Parameters:
            lst (list): List to be partitioned.
            n (int): Number of items in each sublist (must be at least 1).

        Returns:
            list: list of lists of n elements, the last might have fewer.

        Raises:
            ValueError: If n is less than 1.

        """
        # A zero step makes range() fail with an unrelated message and a negative step
        # yields no chunks at all, which would silently drop every item.
        if n < 1:
            raise ValueError(f"partition_list requires n >= 1 but received {n}")
        return [lst[i:i + n] for i in range(0, len(lst), n)]

    @staticmethod
    def _get_value_string(val_dict, offset="", indent=""):
        """ Return a string with the summary for the value columns.

        Parameters:
            val_dict (dict): Dictionary of value column summaries indexed by column name.
            offset (str): String of blanks used as offset for every line.
            indent (str): String of blanks used as the additional amount to indent for readability.

        Returns:
            str: A header line followed by one line per value column.

        """
        sum_list = [f"{offset}{indent}Value columns[Events, Files]:"]
        for col_name, val_counts in val_dict.items():
            sum_list.append(f"{offset}{indent*2}{col_name}{str(val_counts)}")
        return "\n".join(sum_list)

    @staticmethod
    def sort_dict(count_dict, reverse=False):
        """ Return the items of a count dictionary sorted by the first element of each value.

        Parameters:
            count_dict (dict): Dictionary whose values are indexable; element 0 is the sort key
                (presumably the event count, given the "[Events, Files]" headings -- confirm).
            reverse (bool): If True, sort in descending order.

        Returns:
            tuple: (number of items, list of (key, value) tuples in sorted order).

        """
        sorted_tuples = sorted(count_dict.items(), key=lambda x: x[1][0], reverse=reverse)
        return len(sorted_tuples), sorted_tuples