""" Map values in m columns into a new combinations in n columns. """
import pandas as pd
import numpy as np
from hed.tools.remodeling.operations.base_op import BaseOp
from hed.tools.analysis.key_map import KeyMap
[docs]class RemapColumnsOp(BaseOp):
""" Map values in m columns into a new combinations in n columns.
Required remodeling parameters:
- **source_columns** (*list*): The key columns to map (m key columns).
- **destination_columns** (*list*): The destination columns to have the mapped values (n destination columns).
- **map_list** (*list*): A list of lists with the mapping.
- **ignore_missing** (*bool*): If True, entries whose key column values are not in map_list are ignored.
Optional remodeling parameters:
**integer_sources** (*list*): Source columns that should be treated as integers rather than strings.
Notes:
Each list element list is of length m + n with the key columns followed by mapped columns.
TODO: Allow wildcards
"""
PARAMS = {
"operation": "remap_columns",
"required_parameters": {
"source_columns": list,
"destination_columns": list,
"map_list": list,
"ignore_missing": bool
},
"optional_parameters": {
"integer_sources": list
}
}
[docs] def __init__(self, parameters):
""" Constructor for the remap columns operation.
Parameters:
parameters (dict): Parameter values for required and optional parameters.
:raises KeyError:
- If a required parameter is missing.
- If an unexpected parameter is provided.
:raises TypeError:
- If a parameter has the wrong type.
:raises ValueError:
- If an integer column is not a key column.
- If a column designated as an integer source does not have valid integers.
- If no source columns are specified.
- If no destination columns are specified.
- If a map_list entry has the wrong number of items (source columns + destination columns).
"""
super().__init__(self.PARAMS, parameters)
self.source_columns = parameters['source_columns']
self.integer_sources = []
self.string_sources = self.source_columns
if "integer_sources" in parameters:
self.integer_sources = parameters['integer_sources']
if not set(self.integer_sources).issubset(set(self.source_columns)):
raise ValueError("IntegerSourceColumnsInvalid",
f"Integer courses {str(self.integer_sources)} must be in {str(self.source_columns)}")
self.string_sources = list(set(self.source_columns).difference(set(self.integer_sources)))
self.destination_columns = parameters['destination_columns']
self.map_list = parameters['map_list']
self.ignore_missing = parameters['ignore_missing']
if len(self.source_columns) < 1:
raise ValueError("EmptySourceColumns",
f"The source column list {str(self.source_columns)} must be non-empty")
if len(self.destination_columns) < 1:
raise ValueError("EmptyDestinationColumns",
f"The destination column list {str(self.destination_columns)} must be non-empty")
entry_len = len(self.source_columns) + len(self.destination_columns)
for index, item in enumerate(self.map_list):
if len(item) != entry_len:
raise ValueError("BadColumnMapEntry",
f"Map list entry {index} has {len(item)} elements, but must have {entry_len} elements")
self.key_map = self._make_key_map()
def _make_key_map(self):
key_df = pd.DataFrame(self.map_list, columns=self.source_columns+self.destination_columns)
key_map = KeyMap(self.source_columns, target_cols=self.destination_columns, name="remap")
key_map.update(key_df)
return key_map
[docs] def do_op(self, dispatcher, df, name, sidecar=None):
""" Remap new columns from combinations of others.
Parameters:
dispatcher (Dispatcher): Manages the operation I/O.
df (DataFrame): The DataFrame to be remodeled.
name (str): Unique identifier for the dataframe -- often the original file path.
sidecar (Sidecar or file-like): Not needed for this operation.
Returns:
Dataframe: A new dataframe after processing.
:raises ValueError:
- If ignore_missing is false and source values from the data are not in the map.
"""
df1 = df.copy()
df1[self.source_columns] = df1[self.source_columns].replace(np.NaN, 'n/a')
for column in self.integer_sources:
int_mask = df1[column] != 'n/a'
df1.loc[int_mask, column] = df1.loc[int_mask, column].astype(int)
df1[self.source_columns] = df1[self.source_columns].astype(str)
df_new, missing = self.key_map.remap(df1)
if missing and not self.ignore_missing:
raise ValueError("MapSourceValueMissing",
f"{name}: Ignore missing is false, but source values [{missing}] are in data but not map")
return df_new