Source code for gordo.machine.model.transformers.imputer

# -*- coding: utf-8 -*-

from typing import Union

import pandas as pd
import numpy as np
from numpy.ma import masked_invalid

from sklearn.base import TransformerMixin


class InfImputer(TransformerMixin):
    """
    Transformer replacing ``inf`` / ``-inf`` entries of a 2d array or dataframe
    with either user supplied values or values derived from the data.
    """

    def __init__(
        self,
        inf_fill_value=None,
        neg_inf_fill_value=None,
        strategy="minmax",
        delta: float = 2.0,
    ):
        """
        Fill inf/-inf values of a 2d array/dataframe with imputed or provided values
        By default it will find the min and max of each feature/column and fill
        -infs/infs with those values +/- ``delta``

        Parameters
        ----------
        inf_fill_value: numeric
            Value to fill 'inf' values
        neg_inf_fill_value: numeric
            Value to fill '-inf' values
        strategy: str
            How to fill values, irrelevant if fill value is provided.
            choices: 'extremes', 'minmax'
            - 'extremes' will use the min and max values for the current datatype.
              such that 'inf' in a float32 dataset will have float32's largest
              value inserted.
            - 'minmax' will look at the min and max values in the feature where
              the -inf / inf appears and fill with the max/min found in that
              feature.
        delta: float
            Only applicable if ``strategy='minmax'``
            Added to the per-feature max (for inf) and subtracted from the
            per-feature min (for -inf). If the max value in a feature was 10
            and ``delta=2`` any inf value will be filled with 12. Likewise, if
            the min feature was -10 any -inf will be filled with -12.
        """
        self.inf_fill_value = inf_fill_value
        self.neg_inf_fill_value = neg_inf_fill_value
        self.strategy = strategy
        self.delta = delta

    def get_params(self, deep=True):
        """
        Return the constructor parameters of this imputer as a dict.

        ``deep`` is accepted for signature compatibility and is not used.
        """
        return {
            "inf_fill_value": self.inf_fill_value,
            "neg_inf_fill_value": self.neg_inf_fill_value,
            "strategy": self.strategy,
            "delta": self.delta,
        }

    def fit(self, X: Union[pd.DataFrame, np.ndarray], y=None):
        """
        Learn the per-feature fill values used by the 'minmax' strategy.

        For any other strategy nothing is computed. ``y`` is ignored.

        Returns
        -------
        InfImputer
            ``self``
        """
        # Store the min/max for features in training.
        if self.strategy == "minmax":
            data = pd.DataFrame(X)  # ensure a dataframe

            # Largest / smallest values representable by the data's dtype
            dtype_limits = np.finfo(data.values.dtype)
            max_allowable_value = dtype_limits.max
            min_allowable_value = dtype_limits.min

            # Max/min of each feature with invalid entries (inf/-inf/nan) masked
            _posinf_fill_values = data.apply(lambda col: masked_invalid(col).max())
            _neginf_fill_values = data.apply(lambda col: masked_invalid(col).min())

            # One fill value per feature: observed extreme +/- delta, falling
            # back to the dtype limit when the shifted value would pass it.
            self._posinf_fill_values = _posinf_fill_values.apply(
                lambda val: val + self.delta
                if max_allowable_value - self.delta > val
                else max_allowable_value
            )
            self._neginf_fill_values = _neginf_fill_values.apply(
                lambda val: val - self.delta
                if min_allowable_value + self.delta < val
                else min_allowable_value
            )
        return self

    def transform(self, X: Union[pd.DataFrame, np.ndarray], y=None):
        """
        Return a numpy array equal to ``X`` with inf/-inf values replaced.

        Explicit fill values (if given) are applied first, then the configured
        strategy handles whatever infs remain. The input is left untouched: the
        work is done on a copy. ``y`` is ignored.
        """
        # Work on a writable copy of the underlying array. Filling in place
        # would silently modify the caller's data, and fails outright when
        # ``.values`` hands back a read-only view (pandas Copy-on-Write).
        X = np.array(X.values if hasattr(X, "values") else X, copy=True, subok=True)

        # Apply specific fill values if provided.
        if self.inf_fill_value is not None:
            X[np.isposinf(X)] = self.inf_fill_value
        if self.neg_inf_fill_value is not None:
            X[np.isneginf(X)] = self.neg_inf_fill_value

        # May still be left over infs, if only one fill value was supplied for example
        if self.strategy is not None:
            return getattr(self, f"_fill_{self.strategy}")(X)
        return X

    def _fill_extremes(self, X: np.ndarray):
        """
        Fill negative and positive infs with their dtype's min/max values.

        Operates on ``X`` in place and returns it.
        """
        dtype_limits = np.finfo(X.dtype)
        X[np.isposinf(X)] = dtype_limits.max
        X[np.isneginf(X)] = dtype_limits.min
        return X

    def _fill_minmax(self, X: np.ndarray):
        """
        Fill inf/-inf values in features of the array with the per-feature
        values computed in ``fit``: the feature's max/min shifted by ``delta``,
        or the dtype's max/min where the shifted value would exceed it.

        Operates on ``X`` in place and returns it.
        """
        # For each feature fill inf/-inf with pre-calculated fill values
        for feature_idx, (posinf_fill, neginf_fill) in enumerate(
            zip(self._posinf_fill_values, self._neginf_fill_values)
        ):
            column = X[:, feature_idx]  # basic slice: a view into X
            column[np.isposinf(column)] = posinf_fill
            column[np.isneginf(column)] = neginf_fill
        return X