|
1 | 1 | """Transformers for categorical data.""" |
2 | 2 |
|
| 3 | +import logging |
3 | 4 | import warnings |
4 | 5 |
|
5 | 6 | import numpy as np |
|
9 | 10 |
|
10 | 11 | from rdt.errors import TransformerInputError |
11 | 12 | from rdt.transformers.base import BaseTransformer |
| 13 | +from rdt.transformers.utils import fill_nan_with_none |
| 14 | + |
| 15 | +LOGGER = logging.getLogger(__name__) |
| 16 | + |
| 17 | + |
class UniformEncoder(BaseTransformer):
    """Transformer for categorical data.

    This transformer computes a float representative for each one of the categories
    found in the fit data, and then replaces the instances of these categories with
    the corresponding representative.

    The representatives are decided by computing the frequency of each label and
    then dividing the ``[0, 1]`` interval according to these frequencies.

    When the transformation is reverted, each value is assigned the category that
    corresponds to the interval it falls in.

    Null values are considered just another category.

    Args:
        order_by (str or None):
            String defining how to order the data before applying the labels. Options are
            'alphabetical', 'numerical_value' and ``None``. Defaults to ``None``.
    """

    INPUT_SDTYPE = 'categorical'
    SUPPORTED_SDTYPES = ['categorical', 'boolean']
    frequencies = None
    intervals = None
    dtype = None

    def __init__(self, order_by=None):
        """Validate and store the ordering option.

        Raises:
            TransformerInputError:
                If ``order_by`` is not ``None``, 'alphabetical' or 'numerical_value'.
        """
        super().__init__()
        if order_by not in [None, 'alphabetical', 'numerical_value']:
            raise TransformerInputError(
                "order_by must be one of the following values: None, 'numerical_value' or "
                "'alphabetical'"
            )

        self.order_by = order_by

    def _order_categories(self, unique_data):
        """Order the unique categories according to ``order_by``.

        Args:
            unique_data (array-like):
                Unique category values, possibly including nulls.

        Returns:
            array-like:
                The input unchanged when ``order_by`` is ``None``; otherwise the sorted
                non-null values, followed by a single ``None`` if any null was present.

        Raises:
            TransformerInputError:
                If the non-null values are not all strings for 'alphabetical', or the
                array dtype is not numerical for 'numerical_value'.
        """
        nans = pd.isna(unique_data)
        if self.order_by == 'alphabetical':
            # pylint: disable=invalid-unary-operand-type
            if any(not isinstance(item, str) for item in unique_data[~nans]):
                raise TransformerInputError(
                    "The data must be of type string if order_by is 'alphabetical'."
                )
        elif self.order_by == 'numerical_value':
            if not np.issubdtype(unique_data.dtype.type, np.number):
                raise TransformerInputError(
                    "The data must be numerical if order_by is 'numerical_value'."
                )

        if self.order_by is not None:
            # Nulls are excluded from the sort and re-added at the end as ``None``.
            unique_data = np.sort(unique_data[~nans])  # pylint: disable=invalid-unary-operand-type
            if nans.any():
                unique_data = np.append(unique_data, [None])

        return unique_data

    @classmethod
    def _get_message_unseen_categories(cls, unseen_categories):
        """Build the text that lists the unseen categories.

        Only the first three categories are spelled out; any remaining ones are
        summarized as ``+N more``.

        Args:
            unseen_categories (list):
                List of unseen categories.

        Returns:
            str:
                The categories formatted for inclusion in a message.
        """
        categories_to_print = ', '.join(str(x) for x in unseen_categories[:3])
        if len(unseen_categories) > 3:
            categories_to_print = f'{categories_to_print}, +{len(unseen_categories) - 3} more'

        return categories_to_print

    @staticmethod
    def _compute_frequencies_intervals(categories, freq):
        """Compute the frequencies and intervals of the categories.

        Args:
            categories (list):
                List of categories.
            freq (list):
                List of frequencies.

        Returns:
            tuple[dict, dict]:
                First dict maps categories to their frequency and the
                second dict maps the categories to their intervals.
        """
        frequencies = dict(zip(categories, freq))
        # Cumulative edges: [0, f0, f0 + f1, ...].
        shift = np.cumsum(np.hstack([0, freq]))
        # Pin the last edge to exactly 1 (presumably to absorb rounding in the cumulative sum).
        shift[-1] = 1
        list_int = [[shift[i], shift[i + 1]] for i in range(len(shift) - 1)]
        intervals = dict(zip(categories, list_int))

        return frequencies, intervals

    def _fit(self, data):
        """Fit the transformer to the data.

        Compute the frequencies of each category and use them
        to map the column to a numerical one.

        Args:
            data (pandas.Series):
                Data to fit the transformer to.
        """
        self.dtype = data.dtypes
        data = fill_nan_with_none(data)
        labels = pd.unique(data)
        labels = self._order_categories(labels)
        freq = data.value_counts(normalize=True, dropna=False)
        nan_value = freq[np.nan] if np.nan in freq.index else None
        freq = freq.reindex(labels, fill_value=nan_value).array

        self.frequencies, self.intervals = self._compute_frequencies_intervals(labels, freq)

    def _transform(self, data):
        """Map the category to a continuous value.

        This value is sampled from a uniform distribution
        with boundaries defined by the frequencies.

        Categories that were not seen during ``fit`` trigger a ``UserWarning`` and are
        replaced by randomly chosen known categories before the mapping.

        Args:
            data (pandas.Series):
                Data to transform.

        Returns:
            pandas.Series
        """
        data_with_none = fill_nan_with_none(data)
        unseen_indexes = ~(data_with_none.isin(self.frequencies))
        if unseen_indexes.any():
            # Keep the 3 first unseen categories
            unseen_categories = list(data.loc[unseen_indexes].unique())
            categories_to_print = self._get_message_unseen_categories(unseen_categories)
            warnings.warn(
                f"The data in column '{self.get_input_column()}' contains new categories "
                f"that did not appear during 'fit' ({categories_to_print}). Assigning "
                'them random values. If you want to model new categories, '
                "please fit the data again using 'fit'.",
                category=UserWarning
            )

            choices = list(self.frequencies.keys())
            # NOTE(review): one replacement is drawn per row (``unseen_indexes.size``), not
            # per unseen row; the masked assignment below is expected to keep only the draws
            # at the unseen positions -- confirm against the pandas version in use.
            size = unseen_indexes.size
            data_with_none[unseen_indexes] = np.random.choice(choices, size=size)

        def map_labels(label):
            return np.random.uniform(self.intervals[label][0], self.intervals[label][1])

        return data_with_none.map(map_labels).astype(float)

    def _reverse_transform(self, data):
        """Convert float values back to the original categorical values.

        Values are clipped to ``[0, 1]`` and then assigned the category whose
        interval they fall in.

        Args:
            data (pandas.Series):
                Data to revert.

        Returns:
            pandas.Series
        """
        data = data.clip(0, 1)
        bins = [0]
        labels = []
        # Placeholder label for the null category; extended until it cannot collide
        # with a real category.
        nan_name = 'NaN'
        while nan_name in self.intervals:
            nan_name += '_'

        for key, interval in self.intervals.items():
            bins.append(interval[1])
            if pd.isna(key):
                labels.append(nan_name)
            else:
                labels.append(key)

        # ``include_lowest=True`` closes the first bin on the left so that a value of
        # exactly 0 (including anything clipped up to 0) maps to the first category
        # instead of falling outside every bin and becoming NaN.
        result = pd.cut(data, bins=bins, labels=labels, include_lowest=True)
        return result.replace(nan_name, np.nan).astype(self.dtype)
| 197 | + |
| 198 | + |
class OrderedUniformEncoder(UniformEncoder):
    """Uniform encoder whose category order is supplied by the caller.

    It behaves like the ``UniformEncoder`` but, instead of deriving the categories from
    the fit data, it requires the complete ordered list of categories up front.
    Null values are considered just another category.

    Args:
        order (list):
            A list of all the unique categories for the data. The order of the list determines the
            label that each category will get.
    """

    def __init__(self, order):
        ordered_categories = pd.Series(order)
        self.order = fill_nan_with_none(ordered_categories)
        super().__init__()

    def __repr__(self):
        """Represent initialization of transformer as text.

        Returns:
            str:
                The transformer name followed by a placeholder for the ``order`` argument.
        """
        arguments = ', '.join(['order=<CUSTOM>'])
        name = self.__class__.get_name()
        return f'{name}({arguments})'

    def _check_unknown_categories(self, data):
        """Raise if ``data`` holds any value that is not listed in ``order``.

        Args:
            data (pandas.Series):
                Data to validate.

        Raises:
            TransformerInputError:
                If at least one value of ``data`` is absent from ``order``.
        """
        is_known = data.isin(self.order)
        missing = list(data[~is_known].unique())
        if not missing:
            return

        raise TransformerInputError(
            f"Unknown categories '{missing}'. All possible categories must be defined in the "
            "'order' parameter."
        )

    def _fit(self, data):
        """Fit the transformer to the data.

        Build the frequencies and intervals following the specified order of the
        categories. When some categories listed in ``order`` do not appear in the data,
        the observed frequencies are scaled by 0.9 and the remaining 0.1 is shared
        equally among the categories that were not seen.

        Args:
            data (pandas.Series):
                Data to fit the transformer to.
        """
        self.dtype = data.dtypes
        data = fill_nan_with_none(data)
        self._check_unknown_categories(data)

        expected = set(self.order.dropna())
        observed = set(data.dropna())
        category_not_seen = expected != observed
        nans_not_seen = pd.isna(self.order).any() and not pd.isna(data).any()
        pad_unseen = category_not_seen or nans_not_seen

        unseen_categories = []
        if pad_unseen:
            observed_values = data.array
            unseen_categories = [
                category for category in self.order if category not in observed_values
            ]
            categories_to_print = self._get_message_unseen_categories(unseen_categories)
            LOGGER.info(
                "For column '%s', some of the provided category values were not present in the"
                ' data during fit: (%s).',
                self.get_input_column(),
                categories_to_print
            )

        freq = data.value_counts(normalize=True, dropna=False)
        if pad_unseen:
            # Reserve 10% of the probability mass for the categories missing from the data.
            freq = 0.9 * freq
            for category in unseen_categories:
                freq[category] = 0.1 / len(unseen_categories)

        if np.nan in freq.index:
            nan_value = freq[np.nan]
        else:
            nan_value = None

        freq = freq.reindex(self.order, fill_value=nan_value).array

        self.frequencies, self.intervals = self._compute_frequencies_intervals(self.order, freq)

    def _transform(self, data):
        """Map the category to a continuous value.

        Unlike the parent implementation, values that are not listed in ``order``
        raise an error instead of being replaced.
        """
        normalized = fill_nan_with_none(data)
        self._check_unknown_categories(normalized)
        return super()._transform(normalized)
12 | 280 |
|
13 | 281 |
|
14 | 282 | class FrequencyEncoder(BaseTransformer): |
|
0 commit comments