Commit 5a624e6: Add tests

Parent: 677a5af

3 files changed (+306, -23)

sdmetrics/single_table/privacy/disclosure_protection.py (+14, -11)

@@ -339,15 +339,18 @@ def _compute_estimated_cap_metric(

         cap_metric = CAP_METHODS.get(computation_method)
         estimated_score_sum = 0
-        for iter in estimation_iterator:
+        for i in estimation_iterator:
+            real_data_samp = real_data.sample(min(num_rows_subsample, len(real_data)))
+            synth_data_samp = synthetic_data.sample(min(num_rows_subsample, len(synthetic_data)))
+
             estimated_cap_protection = cap_metric.compute(
-                real_data.sample(min(num_rows_subsample, len(real_data))),
-                synthetic_data.sample(min(num_rows_subsample, len(synthetic_data))),
+                real_data_samp,
+                synth_data_samp,
                 key_fields=known_column_names,
                 sensitive_fields=sensitive_column_names,
             )
             estimated_score_sum += estimated_cap_protection
-            average_computed_score = estimated_score_sum / (iter + 1.0)
+            average_computed_score = estimated_score_sum / (i + 1.0)
         if baseline_protection == 0:
             average_score = 0 if average_computed_score == 0 else 1
         else:

@@ -438,13 +441,13 @@ def compute_breakdown(
         average_score, average_computed_score = cls._compute_estimated_cap_metric(
             real_data,
             synthetic_data,
-            baseline_protection,
-            known_column_names,
-            sensitive_column_names,
-            computation_method,
-            num_rows_subsample,
-            num_iterations,
-            verbose,
+            baseline_protection=baseline_protection,
+            known_column_names=known_column_names,
+            sensitive_column_names=sensitive_column_names,
+            computation_method=computation_method,
+            num_rows_subsample=num_rows_subsample,
+            num_iterations=num_iterations,
+            verbose=verbose,
         )

         return {
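Two readability fixes here: the loop variable `iter` no longer shadows Python's built-in `iter`, and the per-iteration subsamples are named before use; the second hunk switches the `compute_breakdown` call over to keyword arguments. A minimal, self-contained sketch of the subsample-and-average pattern these lines implement (the `compute_cap` callable is a stand-in for illustration, not the real SDMetrics CAP metric):

    import pandas as pd

    def estimate_average_score(real_data, synthetic_data, compute_cap,
                               num_rows_subsample=10, num_iterations=5):
        """Sketch of the subsample-and-average estimation loop above."""
        score_sum = 0.0
        for i in range(num_iterations):
            # Fresh subsample of each table per iteration; min() guards
            # against requesting more rows than a table has.
            real_samp = real_data.sample(min(num_rows_subsample, len(real_data)))
            synth_samp = synthetic_data.sample(min(num_rows_subsample, len(synthetic_data)))
            score_sum += compute_cap(real_samp, synth_samp)
            average_so_far = score_sum / (i + 1.0)  # running average, as in the diff
        return average_so_far

    real = pd.DataFrame({'a': range(100)})
    synth = pd.DataFrame({'a': range(100)})
    print(estimate_average_score(real, synth, lambda r, s: 1.0))  # -> 1.0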

tests/integration/single_table/privacy/test_disclosure_protection.py (+55, -2)

@@ -2,7 +2,10 @@
 import pandas as pd
 import pytest

-from sdmetrics.single_table.privacy.disclosure_protection import DisclosureProtection
+from sdmetrics.single_table.privacy.disclosure_protection import (
+    DisclosureProtection,
+    DisclosureProtectionEstimate,
+)


 @pytest.fixture
@@ -23,7 +26,7 @@ def perfect_synthetic_data():
         'key1': random_state.choice(['a', 'b', 'c', 'd', 'e'], 20),
         'key2': range(20),
         'sensitive1': random_state.choice(['f', 'g', 'h', 'i', 'j'], 20),
-        'sensitive2': random_state.randint(5, 10, size=20),
+        'sensitive2': random_state.randint(100, 200, size=20),
     })

@@ -142,3 +145,53 @@ def test_all_cap_methods(self, cap_method, real_data, perfect_synthetic_data):
             'cap_protection': 1.0,
             'baseline_protection': 0.98,
         }
+
+
+class TestDisclosureProtectionEstimate:
+    def test_end_to_end_perfect(self, real_data, perfect_synthetic_data):
+        """Test DisclosureProtectionEstimate metric end to end with perfect synthetic data."""
+        # Setup
+        sensitive_columns = ['sensitive1', 'sensitive2']
+        known_columns = ['key1', 'key2']
+        continuous_columns = ['key2', 'sensitive2']
+
+        # Run
+        score_breakdown = DisclosureProtectionEstimate.compute_breakdown(
+            real_data,
+            perfect_synthetic_data,
+            sensitive_column_names=sensitive_columns,
+            known_column_names=known_columns,
+            continuous_column_names=continuous_columns,
+            num_discrete_bins=10,
+            num_rows_subsample=20,
+        )
+
+        # Assert
+        assert score_breakdown == {'score': 1, 'cap_protection': 1, 'baseline_protection': 0.98}
+
+    @pytest.mark.parametrize('cap_method', ['cap', 'zero_cap', 'generalized_cap'])
+    def test_all_cap_methods(self, cap_method, real_data, perfect_synthetic_data):
+        """Test DisclosureProtectionEstimate metric with all possible CAP methods."""
+        # Setup
+        sensitive_columns = ['sensitive1', 'sensitive2']
+        known_columns = ['key1', 'key2']
+        continuous_columns = ['key2', 'sensitive2']
+
+        # Run
+        score_breakdown = DisclosureProtectionEstimate.compute_breakdown(
+            real_data,
+            perfect_synthetic_data,
+            sensitive_column_names=sensitive_columns,
+            known_column_names=known_columns,
+            continuous_column_names=continuous_columns,
+            computation_method=cap_method,
+            num_discrete_bins=10,
+            num_rows_subsample=20,
+        )
+
+        # Assert
+        assert score_breakdown == {
+            'score': 1.0,
+            'cap_protection': 1.0,
+            'baseline_protection': 0.98,
+        }
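The expected `baseline_protection` of 0.98 is consistent with a random-guess baseline over the sensitive columns: `sensitive1` has 5 categories and `sensitive2` is discretized into `num_discrete_bins=10` bins, so a random guess is right with probability 1 in 50. A sanity check of that arithmetic (the formula is inferred from the expected values in these tests, including the 0.875 in the unit tests below, not quoted from the implementation):

    # Inferred baseline: 1 - 1 / (number of unique sensitive combinations).
    # 5 categories for 'sensitive1' times 10 bins for 'sensitive2' -> 50.
    baseline = 1 - 1 / (5 * 10)
    assert baseline == 0.98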

tests/unit/single_table/privacy/test_disclosure_protection.py (+237, -10)

@@ -1,13 +1,17 @@
 """Test for the disclosure metrics."""

 import re
-from unittest.mock import Mock, patch
+from unittest.mock import MagicMock, Mock, call, patch

 import numpy as np
 import pandas as pd
 import pytest

-from sdmetrics.single_table.privacy.disclosure_protection import DisclosureProtection
+from sdmetrics.single_table.privacy.disclosure_protection import (
+    DisclosureProtection,
+    DisclosureProtectionEstimate,
+)
+from tests.utils import DataFrameMatcher


 class TestDisclosureProtection:
@@ -151,21 +155,51 @@ def test__discretize_column_float_dtypes(self, dtype):
         expected_synthetic = np.array(['0', '0', '1', np.nan, '3', np.nan, '4'], dtype='object')
         assert list(binned_synthetic) == list(expected_synthetic)

-    def test__compute_baseline(self):
-        """Test computing the baseline score for random data."""
+    def test__discretize_and_fillna(self):
+        """Test helper method to discretize continuous columns and fill nan values."""
         # Setup
         real_data = pd.DataFrame({
-            'col1': ['A', 'A', 'A', 'A', 'A'],
-            'col2': ['A', 'B', 'A', 'B', 'A'],
-            'col3': range(5),
+            'known': ['A', 'A', pd.NA, 'B', 'B'],
+            'continuous': [0, 1, 3, 8, 10],
+            'continuous_nan': [0, 7, 2, np.nan, 10],
+            'extra': [None, pd.NA, 0, 10, 100],
         })
-        sensitive_column_names = ['col1', 'col2']
+        synthetic_data = pd.DataFrame({
+            'known': ['A', 'A', 'B', 'B', None],
+            'continuous': [-1, 0, 3, 5, 11],
+            'continuous_nan': [0, 1, 2, np.nan, 100],
+            'extra': [None, pd.NA, 0, 10, 100],
+        })
+        known_column_names = ['known']
+        sensitive_column_names = ['continuous', 'continuous_nan']
+        continuous_column_names = ['continuous', 'continuous_nan']
+        num_discrete_bins = 5

         # Run
-        baseline_score = DisclosureProtection._compute_baseline(real_data, sensitive_column_names)
+        processed_real, processed_synthetic = DisclosureProtection._discretize_and_fillna(
+            real_data,
+            synthetic_data,
+            known_column_names,
+            sensitive_column_names,
+            continuous_column_names,
+            num_discrete_bins,
+        )

         # Assert
-        assert baseline_score == 0.5
+        expected_real = pd.DataFrame({
+            'known': ['A', 'A', '__NULL_VALUE__', 'B', 'B'],
+            'continuous': ['0', '0', '1', '3', '4'],
+            'continuous_nan': ['0', '3', '0', '__NULL_VALUE__', '4'],
+            'extra': real_data['extra'],
+        })
+        expected_synthetic = pd.DataFrame({
+            'known': ['A', 'A', 'B', 'B', '__NULL_VALUE__'],
+            'continuous': ['0', '0', '1', '2', '4'],
+            'continuous_nan': ['0', '0', '0', '__NULL_VALUE__', '4'],
+            'extra': synthetic_data['extra'],
+        })
+        pd.testing.assert_frame_equal(expected_real, processed_real)
+        pd.testing.assert_frame_equal(expected_synthetic, processed_synthetic)

     def test__compute_baseline(self):
         """Test computing the baseline score for random data."""
@@ -287,3 +321,196 @@ def test_compute(self, compute_breakdown_mock):

         # Assert
         assert score == 0.8
+
+
+class TestDisclosureProtectionEstimate:
+    def test__validate_inputs(self):
+        """Test input validation."""
+        # Setup
+        default_kwargs = {
+            'real_data': pd.DataFrame({'col1': range(5), 'col2': range(5)}),
+            'synthetic_data': pd.DataFrame({'col1': range(10), 'col2': range(10)}),
+            'known_column_names': ['col1'],
+            'sensitive_column_names': ['col2'],
+            'computation_method': 'cap',
+            'continuous_column_names': ['col2'],
+            'num_discrete_bins': 10,
+            'num_rows_subsample': 1000,
+            'num_iterations': 10,
+        }
+        bad_rows_subsample = 0
+        bad_num_iterations = 0
+
+        # Run and Assert
+        DisclosureProtectionEstimate._validate_inputs(**default_kwargs)
+
+        bad_rows_subsample_error = re.escape(
+            '`num_rows_subsample` must be an integer greater than zero.'
+        )
+        with pytest.raises(ValueError, match=bad_rows_subsample_error):
+            DisclosureProtectionEstimate._validate_inputs(**{
+                **default_kwargs,
+                'num_rows_subsample': bad_rows_subsample,
+            })
+
+        bad_num_iterations_error = re.escape(
+            '`num_iterations` must be an integer greater than zero.'
+        )
+        with pytest.raises(ValueError, match=bad_num_iterations_error):
+            DisclosureProtectionEstimate._validate_inputs(**{
+                **default_kwargs,
+                'num_iterations': bad_num_iterations,
+            })
+
+    @patch('sdmetrics.single_table.privacy.disclosure_protection.tqdm')
+    @patch('sdmetrics.single_table.privacy.disclosure_protection.CAP_METHODS')
+    def test__compute_estimated_cap_metric(self, CAPMethodsMock, mock_tqdm):
+        """Test the ``_compute_estimated_cap_metric`` method."""
+        # Setup
+        real_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=5),
+            'col2': np.random.choice(['X', 'Y'], size=5),
+        })
+        synthetic_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=100),
+            'col2': np.random.choice(['X', 'Y'], size=100),
+        })
+        CAPMock = Mock()
+        CAPMock.compute.side_effect = [0.4, 0.5, 0.2, 0.6, 0.2]
+        CAPMethodsMock.keys.return_value = ['CAP', 'ZERO_CAP', 'GENERALIZED_CAP']
+        CAPMethodsMock.get.return_value = CAPMock
+        progress_bar = MagicMock()
+        progress_bar.__iter__.return_value = range(5)
+        mock_tqdm.tqdm.return_value = progress_bar
+
+        # Run
+        avg_score, avg_computed_score = DisclosureProtectionEstimate._compute_estimated_cap_metric(
+            real_data,
+            synthetic_data,
+            baseline_protection=0.5,
+            known_column_names=['col1'],
+            sensitive_column_names=['col2'],
+            computation_method='CAP',
+            num_rows_subsample=10,
+            num_iterations=5,
+            verbose=True,
+        )
+
+        # Assert
+        assert avg_score == 0.76
+        assert avg_computed_score == 0.38
+        progress_bar.set_description.assert_has_calls([
+            call('Estimating Disclosure Protection (Score=0.000)'),
+            call('Estimating Disclosure Protection (Score=0.800)'),
+            call('Estimating Disclosure Protection (Score=0.900)'),
+            call('Estimating Disclosure Protection (Score=0.733)'),
+            call('Estimating Disclosure Protection (Score=0.850)'),
+            call('Estimating Disclosure Protection (Score=0.760)'),
+        ])
+
+    @patch('sdmetrics.single_table.privacy.disclosure_protection.CAP_METHODS')
+    def test__compute_estimated_cap_metric_zero_baseline(self, CAPMethodsMock):
+        """Test the ``_compute_estimated_cap_metric`` method with a zero baseline."""
+        # Setup
+        real_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=5),
+            'col2': ['A'] * 5,
+        })
+        synthetic_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=100),
+            'col2': ['A'] * 100,
+        })
+        CAPMock = Mock()
+        CAPMock.compute.side_effect = [0.4, 0.5, 0.2, 0.6, 0.2]
+        CAPMethodsMock.keys.return_value = ['CAP', 'ZERO_CAP', 'GENERALIZED_CAP']
+        CAPMethodsMock.get.return_value = CAPMock
+
+        # Run
+        avg_score, avg_computed_score = DisclosureProtectionEstimate._compute_estimated_cap_metric(
+            real_data,
+            synthetic_data,
+            baseline_protection=0,
+            known_column_names=['col1'],
+            sensitive_column_names=['col2'],
+            computation_method='CAP',
+            num_rows_subsample=10,
+            num_iterations=5,
+            verbose=False,
+        )
+
+        # Assert
+        assert avg_score == 1
+        assert avg_computed_score == 0.38
+
+    @patch(
+        'sdmetrics.single_table.privacy.disclosure_protection.DisclosureProtectionEstimate._compute_estimated_cap_metric'
+    )
+    def test_compute_breakdown(self, mock__compute_estimated_cap_metric):
+        """Test computing the breakdown."""
+        # Setup
+        real_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=10),
+            'col2': ['X', 'Y', 'Z', 'Y', 'X', 'X', 'Y', 'Z', 'X', 'A'],
+            'col3': ['A', 'B'] * 5,
+        })
+        synthetic_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=10),
+            'col2': np.random.choice(['X', 'Y', 'Z', 'X', 'X'], size=10),
+            'col3': ['A'] * 10,
+        })
+        mock__compute_estimated_cap_metric.return_value = (0.8, 0.6)
+
+        # Run
+        score_breakdown = DisclosureProtectionEstimate.compute_breakdown(
+            real_data=real_data,
+            synthetic_data=synthetic_data,
+            known_column_names=['col1'],
+            sensitive_column_names=['col2', 'col3'],
+            num_discrete_bins=2,
+        )
+
+        # Assert
+        assert score_breakdown == {
+            'score': 0.8,
+            'baseline_protection': 0.875,
+            'cap_protection': 0.6,
+        }
+        mock__compute_estimated_cap_metric.assert_called_once_with(
+            DataFrameMatcher(real_data),
+            DataFrameMatcher(synthetic_data),
+            baseline_protection=0.875,
+            known_column_names=['col1'],
+            sensitive_column_names=['col2', 'col3'],
+            computation_method='CAP',
+            num_rows_subsample=1000,
+            num_iterations=10,
+            verbose=True,
+        )
+
+    @patch(
+        'sdmetrics.single_table.privacy.disclosure_protection.DisclosureProtectionEstimate.compute_breakdown'
+    )
+    def test_compute(self, compute_breakdown_mock):
+        """Test the ``compute`` method."""
+        # Setup
+        real_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=10),
+            'col2': ['A'] * 10,
+        })
+        synthetic_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=10),
+            'col2': ['A'] * 10,
+        })
+        compute_breakdown_mock.return_value = {
+            'score': 0.8,
+            'baseline_protection': 0.6,
+            'cap_protection': 0.64,
+        }
+
+        # Run
+        score = DisclosureProtectionEstimate.compute(
+            real_data, synthetic_data, known_column_names=['col1'], sensitive_column_names=['col2']
+        )
+
+        # Assert
+        assert score == 0.8
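A note on the expected progress-bar descriptions in `test__compute_estimated_cap_metric`: they track the running average of the mocked CAP scores divided by the baseline of 0.5, with the initial 0.000 emitted before any iteration completes. The arithmetic, spelled out:

    mocked_scores = [0.4, 0.5, 0.2, 0.6, 0.2]
    baseline = 0.5
    running = [sum(mocked_scores[:n]) / n / baseline for n in range(1, 6)]
    # -> [0.8, 0.9, 0.733..., 0.85, 0.76], matching the asserted descriptions.
    assert round(running[-1], 2) == 0.76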
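`DataFrameMatcher` is imported from `tests.utils`, which sits outside this diff. Helpers like it are typically small equality wrappers so a DataFrame can appear in `assert_called_once_with`; a minimal sketch of that shape (an assumption for readers without the repo handy, not the repository's actual helper):

    import pandas as pd

    class DataFrameMatcher:
        """Compare equal to any DataFrame with the same contents (sketch)."""

        def __init__(self, df):
            self.df = df

        def __eq__(self, other):
            try:
                pd.testing.assert_frame_equal(self.df, other)
            except AssertionError:
                return False
            return True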
