Skip to content

Commit 7ef5539

Browse files
committed
make release-tag: Merge branch 'main' into stable
2 parents cd40c84 + 22e8cf8 commit 7ef5539

File tree

13 files changed

+168
-142
lines changed

13 files changed

+168
-142
lines changed

HISTORY.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,17 @@
 # History
 
+## v1.15.1 - 2025-04-02
+
+### Bugs Fixed
+
+* `learn_rounding_digits` should work for numerical data of `object` dtype - Issue [#972](https://github.com/sdv-dev/RDT/issues/972) by @fealho
+* `BaseTransformer.__repr__` errors if transformer has required arguments - Issue [#961](https://github.com/sdv-dev/RDT/issues/961) by @fealho
+* The `AnonymizedFaker` does not match cardinality across multiple applications of `reverse_transform` - Issue [#951](https://github.com/sdv-dev/RDT/issues/951) by @pvk-developer
+
+### Maintenance
+
+* All bump-version commands are failing - Issue [#974](https://github.com/sdv-dev/RDT/issues/974) by @amontanez24
+
 ## v1.15.0 - 2025-03-14
 
 ### New Features

latest_requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
-Faker==37.0.0
+Faker==37.1.0
 copulas==0.12.1
 numpy==2.0.2
 pandas==2.2.3

pyproject.toml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -54,15 +54,14 @@ rdt = { main = 'rdt.cli.__main__:main' }
5454

5555
[project.optional-dependencies]
5656
copulas = ['copulas>=0.12.1',]
57-
pyarrow = ['pyarrow>=17.0.0']
57+
pyarrow = ['pyarrow>=17.0.0',]
5858
test = [
5959
'rdt[pyarrow]',
6060
'rdt[copulas]',
6161

6262
'pytest>=3.4.2',
6363
'pytest-cov>=2.6.0',
6464
'jupyter>=1.0.0,<2',
65-
'rundoc>=0.4.3,<0.5',
6665
'pytest-subtests>=0.5,<1.0',
6766
'pytest-runner >= 2.11.1',
6867
'tomli>=2.0.0,<3',
@@ -72,15 +71,15 @@ dev = [
7271

7372
# general
7473
'build>=1.0.0,<2',
75-
'bump-my-version>=0.18.3,<1',
74+
'bump-my-version>=0.18.3',
7675
'pip>=9.0.1',
7776
'watchdog>=1.0.1,<5',
7877

7978
# style check
8079
'ruff>=0.3.2,<1',
8180

8281
# distribute on PyPI
83-
'twine>=1.10.0,<6',
82+
'twine>=1.10.0',
8483
'wheel>=0.30.0',
8584

8685
# Advanced testing
@@ -89,8 +88,9 @@ dev = [
8988
'tabulate>=0.8.9,<1',
9089

9190
# Invoking test commands
92-
'invoke'
91+
'invoke',
9392
]
93+
readme = ['rundoc>=0.4.3,<0.5',]
9494

9595
[tool.setuptools]
9696
include-package-data = true
@@ -144,7 +144,7 @@ collect_ignore = ['pyproject.toml']
144144
exclude_lines = ['NotImplementedError()']
145145

146146
[tool.bumpversion]
147-
current_version = "1.15.0"
147+
current_version = "1.15.1.dev1"
148148
parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
149149
serialize = [
150150
'{major}.{minor}.{patch}.{release}{candidate}',

rdt/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
 
 __author__ = 'DataCebo, Inc.'
 __email__ = 'info@sdv.dev'
-__version__ = '1.15.0'
+__version__ = '1.15.1.dev1'
 
 
 import sys

rdt/transformers/base.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -327,19 +327,19 @@ def __repr__(self):
327327
custom_args = []
328328
args = inspect.getfullargspec(self.__init__)
329329
keys = args.args[1:]
330-
defaults = args.defaults or []
331-
defaults = dict(zip(keys, defaults))
332330
instanced = {
333331
key: getattr(self, key)
334332
for key in keys
335333
if key != 'model_missing_values' and hasattr(self, key) # Remove after deprecation
336334
}
337335

336+
defaults = args.defaults or []
337+
defaults = dict(zip(keys, defaults))
338338
if defaults == instanced:
339339
return f'{class_name}()'
340340

341341
for arg, value in instanced.items():
342-
if defaults[arg] != value:
342+
if arg not in defaults or defaults[arg] != value:
343343
custom_args.append(f'{arg}={repr(value)}')
344344

345345
args_string = ', '.join(custom_args)

rdt/transformers/pii/anonymizer.py

Lines changed: 35 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,7 @@ def __init__(
155155

156156
self.missing_value_generation = missing_value_generation
157157
self._nan_frequency = 0.0
158+
self._unique_categories = None
158159

159160
@classmethod
160161
def get_supported_sdtypes(cls):
@@ -192,11 +193,11 @@ def _function(self):
192193
faker_attr = self.faker.unique
193194
else:
194195
faker_attr = self.faker
196+
195197
except AttributeError:
196198
faker_attr = self.faker.unique if self.enforce_uniqueness else self.faker
197199

198200
result = getattr(faker_attr, self.function_name)(**self.function_kwargs)
199-
200201
if isinstance(result, Iterable) and not isinstance(result, str):
201202
result = ', '.join(map(str, result))
202203

@@ -236,39 +237,47 @@ def _get_unique_categories(self, samples):
236237

237238
def _reverse_transform_cardinality_rule_match(self, sample_size):
238239
"""Reverse transform the data when the cardinality rule is 'match'."""
239-
reverse_transformed = np.array([], dtype=object)
240-
if self.missing_value_generation == 'random':
241-
num_nans = int(self._nan_frequency * sample_size)
242-
reverse_transformed = np.concatenate([
243-
reverse_transformed,
244-
np.full(num_nans, np.nan),
245-
])
246-
else:
247-
num_nans = 0
240+
num_nans = self._calculate_num_nans(sample_size)
241+
reverse_transformed = self._generate_nans(num_nans)
248242

249243
if sample_size <= num_nans:
250244
return reverse_transformed
251245

252-
if sample_size < num_nans + self._data_cardinality:
253-
unique_categories = self._get_unique_categories(sample_size - num_nans)
254-
reverse_transformed = np.concatenate([
255-
reverse_transformed,
256-
unique_categories,
257-
])
258-
else:
259-
unique_categories = self._get_unique_categories(self._data_cardinality)
260-
num_copies = sample_size - self._data_cardinality - num_nans
261-
copies = np.random.choice(unique_categories, num_copies)
262-
reverse_transformed = np.concatenate([
263-
reverse_transformed,
264-
unique_categories,
265-
copies,
266-
])
246+
remaining_samples = sample_size - num_nans
247+
sampled_values = self._generate_cardinality_match_values(remaining_samples)
267248

249+
reverse_transformed = np.concatenate([reverse_transformed, sampled_values])
268250
np.random.shuffle(reverse_transformed)
269251

270252
return reverse_transformed
271253

254+
def _calculate_num_nans(self, sample_size):
255+
"""Calculate the number of NaN values to generate."""
256+
if self.missing_value_generation == 'random':
257+
return int(self._nan_frequency * sample_size)
258+
259+
return 0
260+
261+
def _generate_nans(self, num_nans):
262+
"""Generate an array of NaN values."""
263+
return np.full(num_nans, np.nan, dtype=object)
264+
265+
def _generate_cardinality_match_values(self, remaining_samples):
266+
"""Generate sampled values while ensuring each unique category appears at least once."""
267+
# Backwards compatibility requires us to generate the values at this point
268+
if self._unique_categories is None:
269+
self._unique_categories = self._get_unique_categories(self._data_cardinality)
270+
271+
unique_categories = np.array(self._unique_categories)
272+
if remaining_samples <= len(unique_categories):
273+
return np.random.choice(unique_categories, remaining_samples, replace=False)
274+
275+
# Ensure all unique categories appear at least once
276+
extra_samples_needed = remaining_samples - len(unique_categories)
277+
extra_samples = np.random.choice(unique_categories, extra_samples_needed, replace=True)
278+
279+
return np.concatenate((unique_categories, extra_samples))
280+
272281
def _reverse_transform(self, data):
273282
"""Generate new anonymized data using a ``faker.provider.function``.
274283
@@ -328,6 +337,7 @@ def _set_fitted_parameters(self, column_name, nan_frequency=0.0, cardinality=Non
328337
raise TransformerInputError(
329338
'Cardinality "match" rule must specify a cardinality value.'
330339
)
340+
331341
self._data_cardinality = cardinality
332342
self._nan_frequency = nan_frequency
333343

rdt/transformers/utils.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
LOGGER = logging.getLogger(__name__)
1717

18-
MAX_DECIMALS = sys.float_info.dig - 1
18+
MAX_DECIMALS = sys.float_info.dig
1919
DEPRECATED_SDTYPES_MAPPING = {'text': 'id'}
2020

2121

@@ -270,17 +270,19 @@ def learn_rounding_digits(data):
270270
data = data.to_numpy()
271271
roundable_data = data[~(np.isinf(data.astype(float)) | pd.isna(data))]
272272

273-
# Doesn't contain numbers
273+
# Empty dataset
274274
if len(roundable_data) == 0:
275275
return None
276276

277-
# Doesn't contain decimal digits
278-
if (roundable_data == roundable_data.astype(int)).all():
279-
return 0
277+
if roundable_data.dtype == 'object':
278+
roundable_data = roundable_data.astype(float)
280279

281280
# Try to round to fewer digits
282-
if (roundable_data == roundable_data.round(MAX_DECIMALS)).all():
283-
for decimal in range(MAX_DECIMALS + 1):
281+
highest_int = int(np.max(np.abs(roundable_data)))
282+
most_digits = len(str(highest_int)) if highest_int != 0 else 0
283+
max_decimals = max(0, MAX_DECIMALS - most_digits)
284+
if (roundable_data == roundable_data.round(max_decimals)).all():
285+
for decimal in range(max_decimals + 1):
284286
if (roundable_data == roundable_data.round(decimal)).all():
285287
return decimal
286288

static_code_analysis.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
1-
Run started:2025-03-13 20:46:37.629049
1+
Run started:2025-04-02 20:29:51.424416
22

33
Test results:
44
No issues identified.
55

66
Code scanned:
7-
Total lines of code: 5864
7+
Total lines of code: 5867
88
Total lines skipped (#nosec): 0
99
Total potential issues skipped due to specifically being disabled (e.g., #nosec BXXX): 0
1010

tests/integration/transformers/pii/test_anonymizer.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -297,6 +297,30 @@ def test__reverse_transform_from_manually_set_parameters(self):
297297
# Assert
298298
assert missing_values / output.size == freq
299299

300+
def test_anonymized_faker_produces_only_n_values_for_each_reverse_transform_cardinality_match(
301+
self,
302+
):
303+
"""Test `AnonymizedFaker` when `cardinality_rule` is set to `match`.
304+
305+
Ensure that the AnonymizedFaker transformer with `cardinality_rule='match'`
306+
maintains the correct number of unique values across multiple `reverse_transform` calls.
307+
"""
308+
# Setup
309+
data = pd.DataFrame(data={'name': ['Amy'] * 10 + ['Bob'] * 20 + ['Carla'] * 50})
310+
transformer = AnonymizedFaker(
311+
provider_name='person', function_name='name', cardinality_rule='match'
312+
)
313+
314+
# Run
315+
transformed_data = transformer.fit_transform(data, 'name')
316+
first_reverse_transformed = transformer.reverse_transform(transformed_data)
317+
318+
transformed_again = transformer.transform(first_reverse_transformed)
319+
second_reverse_transformed = transformer.reverse_transform(transformed_again)
320+
321+
# Assert
322+
assert set(first_reverse_transformed['name']) == set(second_reverse_transformed['name'])
323+
300324

301325
class TestPsuedoAnonymizedFaker:
302326
def test_default_settings(self):

tests/unit/transformers/pii/test_anonymizer.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -603,6 +603,7 @@ def test__reverse_transform_cardinality_rule_match_no_missing_value(self):
603603
instance = AnonymizedFaker(missing_value_generation=None)
604604
instance._data_cardinality = 2
605605
instance._nan_frequency = 0
606+
instance._unique_categories = ['a', 'b', 'c']
606607
function = Mock()
607608
function.side_effect = ['a', 'b', 'c']
608609

@@ -612,25 +613,22 @@ def test__reverse_transform_cardinality_rule_match_no_missing_value(self):
612613
result = instance._reverse_transform_cardinality_rule_match(3)
613614

614615
# Assert
615-
assert function.call_args_list == [call(), call()]
616-
assert set(result).issubset({'a', 'b'})
616+
assert set(result) == set(['a', 'b', 'c'])
617617

618618
def test__reverse_transform_cardinality_rule_match_not_enough_unique(self):
619619
"""Test it when there are not enough unique values."""
620620
# Setup
621621
instance = AnonymizedFaker()
622-
instance._data_cardinality = 4
622+
instance._data_cardinality = 3
623623
instance._nan_frequency = 0
624624
function = Mock()
625625
function.side_effect = ['a', 'b', 'c', 'd']
626-
627626
instance._function = function
628627

629628
# Run
630-
result = instance._reverse_transform_cardinality_rule_match(3)
629+
result = instance._reverse_transform_cardinality_rule_match(6)
631630

632631
# Assert
633-
assert function.call_args_list == [call(), call(), call()]
634632
assert set(result) == {'a', 'b', 'c'}
635633

636634
def test__reverse_transform_cardinality_rule_missing_attribute(self):

0 commit comments

Comments
 (0)