Skip to content

Commit 19d53f9

Browse files
committed
make release-tag: Merge branch 'master' into stable
2 parents c2d3ac7 + 68f77db commit 19d53f9

23 files changed

+2622
-1471
lines changed

HISTORY.md

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,39 @@
11
# History
22

3+
## 1.7.0 - 2023-08-22
4+
5+
This release adds 3 new transformers:
6+
7+
1. `UniformEncoder` - A categorical and boolean transformer that converts the column into a uniform distribution.
8+
2. `OrderedUniformEncoder` - The same as above, but the order for the categories can be specified, changing which range in the uniform distribution each category belongs to.
9+
3. `IDGenerator` - A text transformer that drops the input column during transform and returns IDs during reverse transform. The IDs all take the form \<prefix>\<number>\<suffix> and can be configured with a custom prefix, suffix and starting point.
10+
11+
Additionally, the `AnonymizedFaker` is enhanced to support the text sdtype.
12+
13+
### Deprecations
14+
15+
* The `get_input_sdtype` method is being deprecated in favor of `get_supported_sdtypes`.
16+
17+
### New Features
18+
19+
* Create IDGenerator transformer - Issue [#675](https://github.com/sdv-dev/RDT/issues/675) by @R-Palazzo
20+
* Add UniformEncoder (and its ordered version) - Issue [#678](https://github.com/sdv-dev/RDT/issues/678) by @R-Palazzo
21+
* Allow me to use AnonymizedFaker with sdtype text columns - Issue [#688](https://github.com/sdv-dev/RDT/issues/688) by @amontanez24
22+
23+
### Maintenance
24+
25+
* Deprecate get_input_sdtype - Issue [#682](https://github.com/sdv-dev/RDT/issues/682) by @R-Palazzo
26+
327
## 1.6.1 - 2023-08-02
428

529
This release updates the default transformers used for certain sdtypes. It also enables the `AnonymizedFaker` and `PseudoAnonymizedFaker` to work with any sdtype besides boolean, categorical, datetime, numerical or text.
630

731
### Bugs
32+
833
* [Enterprise Usage] Unable to assign generic PII transformers (eg. AnonymizedFaker) - Issue [#674](https://github.com/sdv-dev/RDT/issues/674) by @amontanez24
934

1035
### New Features
36+
1137
* Update the default transformers that HyperTransformer assigns to each sdtype - Issue [#664](https://github.com/sdv-dev/RDT/issues/664) by @amontanez24
1238

1339
## 1.6.0 - 2023-07-12

rdt/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
__author__ = 'DataCebo, Inc.'
77
__email__ = '[email protected]'
8-
__version__ = '1.6.1'
8+
__version__ = '1.7.0.dev3'
99

1010

1111
import sys

rdt/transformers/__init__.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,13 @@
1212
from rdt.transformers.base import BaseTransformer
1313
from rdt.transformers.boolean import BinaryEncoder
1414
from rdt.transformers.categorical import (
15-
CustomLabelEncoder, FrequencyEncoder, LabelEncoder, OneHotEncoder, OrderedLabelEncoder)
15+
CustomLabelEncoder, FrequencyEncoder, LabelEncoder, OneHotEncoder, OrderedLabelEncoder,
16+
OrderedUniformEncoder, UniformEncoder)
1617
from rdt.transformers.datetime import OptimizedTimestampEncoder, UnixTimestampEncoder
1718
from rdt.transformers.null import NullTransformer
1819
from rdt.transformers.numerical import ClusterBasedNormalizer, FloatFormatter, GaussianNormalizer
1920
from rdt.transformers.pii.anonymizer import AnonymizedFaker, PseudoAnonymizedFaker
20-
from rdt.transformers.text import RegexGenerator
21+
from rdt.transformers.text import IDGenerator, RegexGenerator
2122

2223
__all__ = [
2324
'BaseTransformer',
@@ -36,11 +37,14 @@
3637
'RegexGenerator',
3738
'AnonymizedFaker',
3839
'PseudoAnonymizedFaker',
40+
'IDGenerator',
3941
'get_transformer_name',
4042
'get_transformer_class',
4143
'get_transformers_by_type',
4244
'get_default_transformers',
4345
'get_default_transformer',
46+
'UniformEncoder',
47+
'OrderedUniformEncoder',
4448
]
4549

4650

@@ -88,8 +92,8 @@ def get_transformer_name(transformer):
8892

8993
DEFAULT_TRANSFORMERS = {
9094
'numerical': FloatFormatter(),
91-
'categorical': LabelEncoder(add_noise=True),
92-
'boolean': LabelEncoder(add_noise=True),
95+
'categorical': UniformEncoder(),
96+
'boolean': UniformEncoder(),
9397
'datetime': UnixTimestampEncoder(),
9498
'text': RegexGenerator(),
9599
'pii': AnonymizedFaker(),
@@ -141,8 +145,8 @@ def get_transformers_by_type():
141145
sdtype_transformers = defaultdict(list)
142146
transformer_classes = BaseTransformer.get_subclasses()
143147
for transformer in transformer_classes:
144-
input_sdtype = transformer.get_input_sdtype()
145-
sdtype_transformers[input_sdtype].append(transformer)
148+
for sdtype in transformer.get_supported_sdtypes():
149+
sdtype_transformers[sdtype].append(transformer)
146150

147151
return sdtype_transformers
148152

rdt/transformers/base.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,11 @@ def get_input_sdtype(cls):
184184
string:
185185
Accepted input sdtype of the transformer.
186186
"""
187-
return cls.INPUT_SDTYPE
187+
warnings.warn(
188+
'`get_input_sdtype` is deprecated. Please use `get_supported_sdtypes` instead.',
189+
FutureWarning
190+
)
191+
return cls.get_supported_sdtypes()[0]
188192

189193
@classmethod
190194
def get_supported_sdtypes(cls):

rdt/transformers/categorical.py

Lines changed: 268 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Transformers for categorical data."""
22

3+
import logging
34
import warnings
45

56
import numpy as np
@@ -9,6 +10,273 @@
910

1011
from rdt.errors import TransformerInputError
1112
from rdt.transformers.base import BaseTransformer
13+
from rdt.transformers.utils import fill_nan_with_none
14+
15+
LOGGER = logging.getLogger(__name__)
16+
17+
18+
class UniformEncoder(BaseTransformer):
    """Transformer for categorical data.

    This transformer computes a float representative for each one of the categories
    found in the fit data, and then replaces the instances of these categories with
    the corresponding representative.

    The representatives are decided by computing the frequency of each label and
    then dividing the ``[0, 1]`` interval according to these frequencies.

    When the transformation is reverted, each value is assigned the category that
    corresponds to the interval it falls in.

    Null values are considered just another category.

    Args:
        order_by (str or None):
            String defining how to order the data before applying the labels. Options are
            ``'alphabetical'``, ``'numerical_value'`` and ``None``. Defaults to ``None``.

    Raises:
        TransformerInputError:
            If ``order_by`` is not one of the accepted options.
    """

    INPUT_SDTYPE = 'categorical'
    SUPPORTED_SDTYPES = ['categorical', 'boolean']
    # Mapping category -> relative frequency, set by ``_fit``.
    frequencies = None
    # Mapping category -> ``[lower, upper]`` bounds inside ``[0, 1]``, set by ``_fit``.
    intervals = None
    # dtype of the fitted column, used to cast the reverse transformed data.
    dtype = None

    def __init__(self, order_by=None):
        super().__init__()
        if order_by not in [None, 'alphabetical', 'numerical_value']:
            raise TransformerInputError(
                "order_by must be one of the following values: None, 'numerical_value' or "
                "'alphabetical'"
            )

        self.order_by = order_by

    def _order_categories(self, unique_data):
        """Order the unique categories according to ``order_by``.

        Args:
            unique_data (array-like):
                Unique values of the fit data, as returned by ``pandas.unique``.

        Returns:
            array-like:
                ``unique_data`` untouched if ``order_by`` is ``None``. Otherwise the
                non-null values sorted, with a single ``None`` appended at the end if
                any null value was present.

        Raises:
            TransformerInputError:
                If ``order_by`` is ``'alphabetical'`` and a non-null value is not a
                string, or if ``order_by`` is ``'numerical_value'`` and the dtype of
                ``unique_data`` is not numerical.
        """
        nans = pd.isna(unique_data)
        if self.order_by == 'alphabetical':
            # pylint: disable=invalid-unary-operand-type
            if any(not isinstance(item, str) for item in unique_data[~nans]):
                raise TransformerInputError(
                    "The data must be of type string if order_by is 'alphabetical'."
                )
        elif self.order_by == 'numerical_value':
            if not np.issubdtype(unique_data.dtype.type, np.number):
                raise TransformerInputError(
                    "The data must be numerical if order_by is 'numerical_value'."
                )

        if self.order_by is not None:
            unique_data = np.sort(unique_data[~nans])  # pylint: disable=invalid-unary-operand-type
            if nans.any():
                # Nulls are excluded from the sort and always placed last.
                unique_data = np.append(unique_data, [None])

        return unique_data

    @classmethod
    def _get_message_unseen_categories(cls, unseen_categories):
        """Build the text listing the unseen categories for warnings and logs.

        Only the first three categories are spelled out; the remaining ones are
        summarized as ``+N more``.

        Args:
            unseen_categories (list):
                List of unseen categories.

        Returns:
            str:
                Comma separated representation of the unseen categories.
        """
        categories_to_print = ', '.join(str(x) for x in unseen_categories[:3])
        if len(unseen_categories) > 3:
            categories_to_print = f'{categories_to_print}, +{len(unseen_categories) - 3} more'

        return categories_to_print

    @staticmethod
    def _compute_frequencies_intervals(categories, freq):
        """Compute the frequencies and intervals of the categories.

        Args:
            categories (list):
                List of categories.
            freq (list):
                List of frequencies, in the same order as ``categories``.

        Returns:
            tuple[dict, dict]:
                First dict maps categories to their frequency and the
                second dict maps the categories to their intervals.
        """
        frequencies = dict(zip(categories, freq))
        shift = np.cumsum(np.hstack([0, freq]))
        # Force the last boundary to exactly 1 so rounding errors in the cumulative
        # sum cannot leave a gap at the top of the ``[0, 1]`` range.
        shift[-1] = 1
        list_int = [[lower, upper] for lower, upper in zip(shift[:-1], shift[1:])]
        intervals = dict(zip(categories, list_int))

        return frequencies, intervals

    def _fit(self, data):
        """Fit the transformer to the data.

        Compute the frequencies of each category and use them
        to map the column to a numerical one.

        Args:
            data (pandas.Series):
                Data to fit the transformer to.
        """
        self.dtype = data.dtypes
        data = fill_nan_with_none(data)
        labels = pd.unique(data)
        labels = self._order_categories(labels)
        freq = data.value_counts(normalize=True, dropna=False)
        # ``labels`` holds nulls as ``None`` while ``value_counts`` may index them as NaN,
        # so the null frequency is carried over through ``fill_value``.
        nan_value = freq[np.nan] if np.nan in freq.index else None
        freq = freq.reindex(labels, fill_value=nan_value).array

        self.frequencies, self.intervals = self._compute_frequencies_intervals(labels, freq)

    def _transform(self, data):
        """Map the category to a continuous value.

        This value is sampled from a uniform distribution
        with boundaries defined by the frequencies.

        Categories that were not seen during ``fit`` trigger a ``UserWarning`` and are
        replaced by a randomly chosen known category before being mapped.

        Args:
            data (pandas.Series):
                Data to transform.

        Returns:
            pandas.Series
        """
        data_with_none = fill_nan_with_none(data)
        unseen_indexes = ~(data_with_none.isin(self.frequencies))
        if unseen_indexes.any():
            # Only the 3 first unseen categories are spelled out in the warning
            unseen_categories = list(data.loc[unseen_indexes].unique())
            categories_to_print = self._get_message_unseen_categories(unseen_categories)
            warnings.warn(
                f"The data in column '{self.get_input_column()}' contains new categories "
                f"that did not appear during 'fit' ({categories_to_print}). Assigning "
                'them random values. If you want to model new categories, '
                "please fit the data again using 'fit'.",
                category=UserWarning
            )

            choices = list(self.frequencies.keys())
            # NOTE(review): one choice is drawn per row (not per unseen row) and the
            # boolean-mask assignment is relied upon to keep only the masked positions;
            # confirm against the pandas version in use before changing ``size``.
            size = unseen_indexes.size
            data_with_none[unseen_indexes] = np.random.choice(choices, size=size)

        def map_labels(label):
            return np.random.uniform(self.intervals[label][0], self.intervals[label][1])

        return data_with_none.map(map_labels).astype(float)

    def _reverse_transform(self, data):
        """Convert float values back to the original categorical values.

        Values are clipped to ``[0, 1]`` and then assigned the category whose interval
        they fall in. A value of exactly ``0`` belongs to the first category.

        Args:
            data (pandas.Series):
                Data to revert.

        Returns:
            pandas.Series
        """
        data = data.clip(0, 1)
        bins = [0]
        labels = []
        # ``pd.cut`` cannot take a null label, so nulls get a placeholder name that is
        # guaranteed not to collide with any real category.
        nan_name = 'NaN'
        while nan_name in self.intervals:
            nan_name += '_'

        for key, interval in self.intervals.items():
            bins.append(interval[1])
            if pd.isna(key):
                labels.append(nan_name)
            else:
                labels.append(key)

        # ``include_lowest=True`` closes the first bin on the left; without it a value
        # of exactly 0 falls outside ``(0, x]`` and is reverted to NaN.
        result = pd.cut(data, bins=bins, labels=labels, include_lowest=True)
        return result.replace(nan_name, np.nan).astype(self.dtype)
197+
198+
199+
class OrderedUniformEncoder(UniformEncoder):
    """Uniform encoder whose category order is supplied by the user.

    Behaves like the ``UniformEncoder``, but instead of deriving the order of the
    categories from the data, the caller provides it explicitly.

    Null values are considered just another category.

    Args:
        order (list):
            A list of all the unique categories for the data. The position of each
            category in the list determines the position of its interval.
    """

    def __init__(self, order):
        # Nulls in the provided order are normalized the same way as the data.
        self.order = fill_nan_with_none(pd.Series(order))
        super().__init__()

    def __repr__(self):
        """Represent initialization of transformer as text.

        The actual ``order`` is never printed; it is shown as a placeholder.

        Returns:
            str:
                The name of the transformer followed by the ``order`` placeholder.
        """
        arguments = ', '.join(['order=<CUSTOM>'])
        name = self.__class__.get_name()
        return f'{name}({arguments})'

    def _check_unknown_categories(self, data):
        """Raise an error if ``data`` holds values that are not part of ``order``.

        Args:
            data (pandas.Series):
                Data to validate.

        Raises:
            TransformerInputError:
                If any value of ``data`` is not listed in ``order``.
        """
        is_known = data.isin(self.order)
        missing = list(data[~is_known].unique())
        if not missing:
            return

        raise TransformerInputError(
            f"Unknown categories '{missing}'. All possible categories must be defined in the "
            "'order' parameter."
        )

    def _fit(self, data):
        """Fit the transformer to the data.

        Compute the frequencies and intervals following the order given by the user.
        Categories listed in ``order`` that do not show up in the data share 10% of
        the ``[0, 1]`` range, and the observed frequencies are scaled to the other 90%.

        Args:
            data (pandas.Series):
                Data to fit the transformer to.
        """
        self.dtype = data.dtypes
        data = fill_nan_with_none(data)
        self._check_unknown_categories(data)

        category_not_seen = set(self.order.dropna()) != set(data.dropna())
        nans_not_seen = pd.isna(self.order).any() and not pd.isna(data).any()
        has_unseen = category_not_seen or nans_not_seen

        unseen_categories = []
        if has_unseen:
            unseen_categories = [
                category for category in self.order if category not in data.array
            ]
            LOGGER.info(
                "For column '%s', some of the provided category values were not present in the"
                ' data during fit: (%s).',
                self.get_input_column(),
                self._get_message_unseen_categories(unseen_categories)
            )

        freq = data.value_counts(normalize=True, dropna=False)
        if has_unseen:
            # Shrink the observed frequencies to make room for the unseen categories.
            freq = 0.9 * freq
            for category in unseen_categories:
                freq[category] = 0.1 / len(unseen_categories)

        if np.nan in freq.index:
            nan_value = freq[np.nan]
        else:
            nan_value = None

        ordered_freq = freq.reindex(self.order, fill_value=nan_value).array
        frequencies, intervals = self._compute_frequencies_intervals(self.order, ordered_freq)
        self.frequencies = frequencies
        self.intervals = intervals

    def _transform(self, data):
        """Map the category to a continuous value.

        Unlike the parent class, values that are not part of ``order`` raise an error
        instead of being replaced.

        Args:
            data (pandas.Series):
                Data to transform.

        Returns:
            pandas.Series
        """
        filled = fill_nan_with_none(data)
        self._check_unknown_categories(filled)
        return super()._transform(filled)
12280

13281

14282
class FrequencyEncoder(BaseTransformer):

0 commit comments

Comments
 (0)