Skip to content

Commit 6985ec8

Browse files
committed
make release-tag: Merge branch 'master' into stable
2 parents eb1ad13 + 83d24a7 commit 6985ec8

File tree

14 files changed

+170
-46
lines changed

14 files changed

+170
-46
lines changed

HISTORY.md

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,16 @@
11
# History
22

3-
## 1.6.0 -2023-07-12
3+
## 1.6.1 - 2023-08-02
4+
5+
This release updates the default transformers used for certain sdtypes. It also enables the `AnonymizedFaker` and `PseudoAnonymizedFaker` to work with any sdtype besides boolean, categorical, datetime, numerical or text.
6+
7+
### Bugs
8+
* [Enterprise Usage] Unable to assign generic PII transformers (e.g. AnonymizedFaker) - Issue [#674](https://github.com/sdv-dev/RDT/issues/674) by @amontanez24
9+
10+
### New Features
11+
* Update the default transformers that HyperTransformer assigns to each sdtype - Issue [#664](https://github.com/sdv-dev/RDT/issues/664) by @amontanez24
12+
13+
## 1.6.0 - 2023-07-12
414

515
This release adds the ability for the `AnonymizedFaker` to generate missing values. Users can now provide the `missing_value_generation` parameter during initialization. They can set it to `None` to not generate any missing values, or `'random'` to generate random missing values in the same proportion as the fitted data.
616

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
<i>This repository is part of <a href="https://sdv.dev">The Synthetic Data Vault Project</a>, a project from <a href="https://datacebo.com">DataCebo</a>.</i>
55
</p>
66

7-
[![Development Status](https://img.shields.io/badge/Development%20Status-3%20--%20Alpha-yellow)](https://pypi.org/search/?q=&o=&c=Development+Status+%3A%3A+3+-+Alpha)
7+
[![Development Status](https://img.shields.io/badge/Development%20Status-5%20--%20Production/Stable-green)](https://pypi.org/search/?q=&o=&c=Development+Status+%3A%3A+5+-+Production%2FStable)
88
[![PyPi Shield](https://img.shields.io/pypi/v/RDT.svg)](https://pypi.python.org/pypi/RDT)
99
[![Unit Tests](https://github.com/sdv-dev/RDT/actions/workflows/unit.yml/badge.svg)](https://github.com/sdv-dev/RDT/actions/workflows/unit.yml)
1010
[![Downloads](https://pepy.tech/badge/rdt)](https://pepy.tech/project/rdt)

rdt/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
__author__ = 'DataCebo, Inc.'
77
__email__ = 'info@sdv.dev'
8-
__version__ = '1.6.0'
8+
__version__ = '1.6.1.dev2'
99

1010

1111
import sys

rdt/hyper_transformer.py

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -110,10 +110,8 @@ def _validate_field_transformers(self):
110110
self._add_field_to_set(field, self._specified_fields)
111111

112112
def __init__(self):
113-
self._default_sdtype_transformers = {}
114113
self.field_sdtypes = {}
115114
self.field_transformers = {}
116-
117115
self._specified_fields = set()
118116
self._validate_field_transformers()
119117
self._valid_output_sdtypes = self._DEFAULT_OUTPUT_SDTYPES
@@ -489,11 +487,7 @@ def _learn_config(self, data):
489487
self._set_field_sdtype(data, field)
490488
if field not in self.field_transformers:
491489
sdtype = self.field_sdtypes[field]
492-
if sdtype in self._default_sdtype_transformers:
493-
self.field_transformers[field] = deepcopy(
494-
self._default_sdtype_transformers[sdtype])
495-
else:
496-
self.field_transformers[field] = deepcopy(get_default_transformer(sdtype))
490+
self.field_transformers[field] = deepcopy(get_default_transformer(sdtype))
497491

498492
def detect_initial_config(self, data):
499493
"""Print the configuration of the data.
@@ -508,7 +502,6 @@ def detect_initial_config(self, data):
508502
Data which will have its configuration detected.
509503
"""
510504
# Reset the state of the HyperTransformer
511-
self._default_sdtype_transformers = {}
512505
self.field_sdtypes = {}
513506
self.field_transformers = {}
514507

rdt/transformers/__init__.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,9 +88,11 @@ def get_transformer_name(transformer):
8888

8989
DEFAULT_TRANSFORMERS = {
9090
'numerical': FloatFormatter(),
91-
'categorical': FrequencyEncoder(),
92-
'boolean': BinaryEncoder(),
91+
'categorical': LabelEncoder(add_noise=True),
92+
'boolean': LabelEncoder(add_noise=True),
9393
'datetime': UnixTimestampEncoder(),
94+
'text': RegexGenerator(),
95+
'pii': AnonymizedFaker(),
9496
}
9597

9698

rdt/transformers/categorical.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -439,6 +439,7 @@ class LabelEncoder(BaseTransformer):
439439
SUPPORTED_SDTYPES = ['categorical', 'boolean']
440440
values_to_categories = None
441441
categories_to_values = None
442+
dtype = 'O'
442443

443444
def __init__(self, add_noise=False, order_by=None):
444445
super().__init__()
@@ -483,6 +484,7 @@ def _fit(self, data):
483484
data (pandas.Series):
484485
Data to fit the transformer to.
485486
"""
487+
self.dtype = data.dtype
486488
unique_data = pd.unique(data.fillna(np.nan))
487489
unique_data = self._order_categories(unique_data)
488490
self.values_to_categories = dict(enumerate(unique_data))
@@ -524,6 +526,7 @@ def _transform(self, data):
524526
)
525527

526528
if self.add_noise:
529+
mapped = mapped.astype(float)
527530
mapped = np.random.uniform(mapped, mapped + 1)
528531

529532
return mapped
@@ -542,7 +545,9 @@ def _reverse_transform(self, data):
542545
data = np.floor(data)
543546

544547
data = data.clip(min(self.values_to_categories), max(self.values_to_categories))
545-
return data.round().map(self.values_to_categories)
548+
data = data.round().map(self.values_to_categories)
549+
550+
return data.astype(self.dtype)
546551

547552

548553
class OrderedLabelEncoder(LabelEncoder):

rdt/transformers/pii/anonymizer.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,23 @@ def __init__(self, provider_name=None, function_name=None, function_kwargs=None,
124124
self.missing_value_generation = missing_value_generation
125125
self._nan_frequency = 0.0
126126

127+
@classmethod
128+
def get_supported_sdtypes(cls):
129+
"""Return the supported sdtypes by the transformer.
130+
131+
Returns:
132+
list:
133+
Accepted input sdtypes of the transformer.
134+
"""
135+
unsupported_sdtypes = {'numerical', 'datetime', 'categorical', 'boolean', 'text', None}
136+
all_sdtypes = {cls.INPUT_SDTYPE}
137+
for transformer in BaseTransformer.get_subclasses():
138+
if not issubclass(transformer, cls):
139+
all_sdtypes.update(transformer.get_supported_sdtypes())
140+
141+
supported_sdtypes = all_sdtypes - unsupported_sdtypes
142+
return list(supported_sdtypes)
143+
127144
def reset_randomization(self):
128145
"""Create a new ``Faker`` instance."""
129146
super().reset_randomization()

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[bumpversion]
2-
current_version = 1.6.0
2+
current_version = 1.6.1.dev2
33
commit = True
44
tag = True
55
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?

setup.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@
106106
author='DataCebo, Inc.',
107107
author_email='info@sdv.dev',
108108
classifiers=[
109-
'Development Status :: 2 - Pre-Alpha',
109+
'Development Status :: 5 - Production/Stable',
110110
'Intended Audience :: Developers',
111111
'License :: Free for non-commercial use',
112112
'Natural Language :: English',
@@ -139,6 +139,6 @@
139139
test_suite='tests',
140140
tests_require=tests_require,
141141
url='https://github.com/sdv-dev/RDT',
142-
version='1.6.0',
142+
version='1.6.1.dev2',
143143
zip_safe=False,
144144
)

tests/integration/test_hyper_transformer.py

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -92,16 +92,25 @@ def get_transformed_data():
9292
return pd.DataFrame({
9393
'integer': [1., 2., 1., 3., 1., 4., 2., 3.],
9494
'float': [0.1, 0.2, 0.1, 0.2, 0.1, 0.4, 0.2, 0.3],
95-
'categorical': [0.3125, 0.3125, .8125, 0.8125, 0.3125, 0.8125, 0.3125, 0.3125],
96-
'bool': [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0],
95+
'categorical': [
96+
0.9690758764963199, 0.8816575994729887, 1.1326495454234662, 1.7988488918189502,
97+
0.9265972159030215, 1.885454600378942, 0.9280858691537548, 0.5093227924068265
98+
],
99+
'bool': [
100+
0.26161253184788935, 0.5735484647493089, 0.026673806296574787, 1.197229599974477,
101+
0.8860641570557322, 0.33432787358513416, 1.1089412122841389, 0.6182653878449814
102+
],
97103
'datetime': datetimes,
98-
'names': [0.3125, 0.75, 0.75, 0.3125, 0.3125, 0.9375, 0.3125, 0.3125]
104+
'names': [
105+
0.24180193241041126, 1.9297787196579723, 1.5617500744772101, 0.6811042561384157,
106+
0.48017218468846856, 2.2867787591284823, 0.25476586891248476, 0.620052082101593
107+
]
99108
}, index=TEST_DATA_INDEX)
100109

101110

102111
def get_reversed_data():
103112
data = get_input_data()
104-
data['bool'] = data['bool'].astype('object')
113+
105114
return data
106115

107116

@@ -161,10 +170,19 @@ def test_hypertransformer_default_inputs():
161170
expected_transformed = pd.DataFrame({
162171
'integer': [1., 2., 1., 3., 1., 4., 2., 3.],
163172
'float': [0.1, 0.2, 0.1, 0.2, 0.1, 0.4, 0.2, 0.3],
164-
'categorical': [0.3125, 0.3125, 0.9375, 0.75, 0.3125, 0.75, 0.3125, 0.3125],
165-
'bool': [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0],
173+
'categorical': [
174+
0.9690758764963199, 0.8816575994729887, 1.1326495454234662, 2.7988488918189502,
175+
0.9265972159030215, 2.8854546003789423, 0.9280858691537548, 0.5093227924068265
176+
],
177+
'bool': [
178+
0.26161253184788935, 1.573548464749309, 0.026673806296574787, 2.1972295999744773,
179+
0.8860641570557322, 1.334327873585134, 2.108941212284139, 0.6182653878449814
180+
],
166181
'datetime': expected_datetimes,
167-
'names': [0.3125, 0.75, 0.75, 0.3125, 0.3125, 0.9375, 0.3125, 0.3125]
182+
'names': [
183+
0.24180193241041126, 1.9297787196579723, 1.5617500744772101, 0.6811042561384157,
184+
0.48017218468846856, 2.2867787591284823, 0.25476586891248476, 0.620052082101593
185+
]
168186
}, index=TEST_DATA_INDEX)
169187
pd.testing.assert_frame_equal(transformed, expected_transformed)
170188

@@ -194,10 +212,10 @@ def test_hypertransformer_default_inputs():
194212

195213
assert isinstance(ht.field_transformers['integer'], FloatFormatter)
196214
assert isinstance(ht.field_transformers['float'], FloatFormatter)
197-
assert isinstance(ht.field_transformers['categorical'], FrequencyEncoder)
198-
assert isinstance(ht.field_transformers['bool'], BinaryEncoder)
215+
assert isinstance(ht.field_transformers['categorical'], LabelEncoder)
216+
assert isinstance(ht.field_transformers['bool'], LabelEncoder)
199217
assert isinstance(ht.field_transformers['datetime'], UnixTimestampEncoder)
200-
assert isinstance(ht.field_transformers['names'], FrequencyEncoder)
218+
assert isinstance(ht.field_transformers['names'], LabelEncoder)
201219

202220
get_default_transformers.cache_clear()
203221
get_default_transformer.cache_clear()
@@ -236,10 +254,10 @@ def test_hypertransformer_field_transformers():
236254
'transformers': {
237255
'integer': FloatFormatter(missing_value_replacement='mean'),
238256
'float': FloatFormatter(missing_value_replacement='mean'),
239-
'categorical': FrequencyEncoder(),
240-
'bool': BinaryEncoder(missing_value_replacement='mode'),
257+
'categorical': LabelEncoder(add_noise=True),
258+
'bool': LabelEncoder(add_noise=True),
241259
'datetime': DummyTransformerNotMLReady(),
242-
'names': FrequencyEncoder()
260+
'names': LabelEncoder(add_noise=True)
243261
}
244262
}
245263

0 commit comments

Comments
 (0)