|
1 | 1 | """Test for the disclosure metrics.""" |
2 | 2 |
|
3 | 3 | import re |
4 | | -from unittest.mock import Mock, patch |
| 4 | +from unittest.mock import MagicMock, Mock, call, patch |
5 | 5 |
|
6 | 6 | import numpy as np |
7 | 7 | import pandas as pd |
8 | 8 | import pytest |
9 | 9 |
|
10 | | -from sdmetrics.single_table.privacy.disclosure_protection import DisclosureProtection |
| 10 | +from sdmetrics.single_table.privacy.disclosure_protection import ( |
| 11 | + DisclosureProtection, |
| 12 | + DisclosureProtectionEstimate, |
| 13 | +) |
| 14 | +from tests.utils import DataFrameMatcher |
11 | 15 |
|
12 | 16 |
|
13 | 17 | class TestDisclosureProtection: |
@@ -151,21 +155,51 @@ def test__discretize_column_float_dtypes(self, dtype): |
151 | 155 | expected_synthetic = np.array(['0', '0', '1', np.nan, '3', np.nan, '4'], dtype='object') |
152 | 156 | assert list(binned_synthetic) == list(expected_synthetic) |
153 | 157 |
|
    def test__discretize_and_fillna(self):
        """Test helper method to discretize continuous columns and fill nan values.

        Continuous columns listed in ``continuous_column_names`` are binned into
        ``num_discrete_bins`` string-labeled bins, and missing values in the known
        and sensitive columns are replaced with the ``'__NULL_VALUE__'`` placeholder.
        Columns outside those sets (``'extra'`` here) must pass through untouched.
        """
        # Setup
        # NOTE: the column names 'continous'/'continous_nan' are intentionally kept
        # as-is; they are runtime keys this test matches against below.
        real_data = pd.DataFrame({
            'known': ['A', 'A', pd.NA, 'B', 'B'],
            'continous': [0, 1, 3, 8, 10],
            'continous_nan': [0, 7, 2, np.nan, 10],
            'extra': [None, pd.NA, 0, 10, 100],
        })
        synthetic_data = pd.DataFrame({
            'known': ['A', 'A', 'B', 'B', None],
            'continous': [-1, 0, 3, 5, 11],
            'continous_nan': [0, 1, 2, np.nan, 100],
            'extra': [None, pd.NA, 0, 10, 100],
        })
        known_column_names = ['known']
        sensitive_column_names = ['continous', 'continous_nan']
        continuous_column_names = ['continous', 'continous_nan']
        num_discrete_bins = 5

        # Run
        processed_real, processed_synthetic = DisclosureProtection._discretize_and_fillna(
            real_data,
            synthetic_data,
            known_column_names,
            sensitive_column_names,
            continuous_column_names,
            num_discrete_bins,
        )

        # Assert
        # NOTE(review): the expected labels suggest bins are derived from the real
        # data's range, with out-of-range synthetic values (-1, 11, 100) falling into
        # the first/last bin — confirm against the implementation if bins change.
        expected_real = pd.DataFrame({
            'known': ['A', 'A', '__NULL_VALUE__', 'B', 'B'],
            'continous': ['0', '0', '1', '3', '4'],
            'continous_nan': ['0', '3', '0', '__NULL_VALUE__', '4'],
            'extra': real_data['extra'],
        })
        expected_synthetic = pd.DataFrame({
            'known': ['A', 'A', 'B', 'B', '__NULL_VALUE__'],
            'continous': ['0', '0', '1', '2', '4'],
            'continous_nan': ['0', '0', '0', '__NULL_VALUE__', '4'],
            'extra': synthetic_data['extra'],
        })
        pd.testing.assert_frame_equal(expected_real, processed_real)
        pd.testing.assert_frame_equal(expected_synthetic, processed_synthetic)
169 | 203 |
|
170 | 204 | def test__compute_baseline(self): |
171 | 205 | """Test computing the baseline score for random data.""" |
@@ -287,3 +321,196 @@ def test_compute(self, compute_breakdown_mock): |
287 | 321 |
|
288 | 322 | # Assert |
289 | 323 | assert score == 0.8 |
| 324 | + |
| 325 | + |
class TestDisclosureProtectionEstimate:
    """Unit tests for the ``DisclosureProtectionEstimate`` metric."""

    def test__validate_inputs(self):
        """Test input validation.

        Valid kwargs pass silently; non-positive ``num_rows_subsample`` and
        ``num_iterations`` each raise a ``ValueError`` with a specific message.
        """
        # Setup
        default_kwargs = {
            'real_data': pd.DataFrame({'col1': range(5), 'col2': range(5)}),
            'synthetic_data': pd.DataFrame({'col1': range(10), 'col2': range(10)}),
            'known_column_names': ['col1'],
            'sensitive_column_names': ['col2'],
            'computation_method': 'cap',
            'continuous_column_names': ['col2'],
            'num_discrete_bins': 10,
            'num_rows_subsample': 1000,
            'num_iterations': 10,
        }
        bad_rows_subsample = 0
        bad_num_iterations = 0

        # Run and Assert
        # Valid inputs should not raise.
        DisclosureProtectionEstimate._validate_inputs(**default_kwargs)

        bad_rows_subsample_error = re.escape(
            '`num_rows_subsample` must be an integer greater than zero.'
        )
        with pytest.raises(ValueError, match=bad_rows_subsample_error):
            DisclosureProtectionEstimate._validate_inputs(**{
                **default_kwargs,
                'num_rows_subsample': bad_rows_subsample,
            })

        bad_num_iterations_error = re.escape(
            '`num_iterations` must be an integer greater than zero.'
        )
        with pytest.raises(ValueError, match=bad_num_iterations_error):
            DisclosureProtectionEstimate._validate_inputs(**{
                **default_kwargs,
                'num_iterations': bad_num_iterations,
            })

    @patch('sdmetrics.single_table.privacy.disclosure_protection.tqdm')
    @patch('sdmetrics.single_table.privacy.disclosure_protection.CAP_METHODS')
    def test__compute_estimated_cap_metric(self, CAPMethodsMock, mock_tqdm):
        """Test the ``_compute_estimated_cap_metric`` method."""
        # Setup
        real_data = pd.DataFrame({
            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=5),
            'col2': np.random.choice(['X', 'Y'], size=5),
        })
        synthetic_data = pd.DataFrame({
            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=100),
            'col2': np.random.choice(['X', 'Y'], size=100),
        })
        # One per-iteration CAP score for each of the 5 iterations; their mean is 0.38.
        CAPMock = Mock()
        CAPMock.compute.side_effect = [0.4, 0.5, 0.2, 0.6, 0.2]
        CAPMethodsMock.keys.return_value = ['CAP', 'ZERO_CAP', 'GENERALIZED_CAP']
        CAPMethodsMock.get.return_value = CAPMock
        # Fake tqdm progress bar so iteration order and descriptions can be asserted.
        progress_bar = MagicMock()
        progress_bar.__iter__.return_value = range(5)
        mock_tqdm.tqdm.return_value = progress_bar

        # Run
        avg_score, avg_computed_score = DisclosureProtectionEstimate._compute_estimated_cap_metric(
            real_data,
            synthetic_data,
            baseline_protection=0.5,
            known_column_names=['col1'],
            sensitive_column_names=['col2'],
            computation_method='CAP',
            num_rows_subsample=10,
            num_iterations=5,
            verbose=True,
        )

        # Assert
        assert avg_score == 0.76
        assert avg_computed_score == 0.38
        # Description starts at 0.000, then shows the running average after each
        # iteration (final value matches avg_score).
        progress_bar.set_description.assert_has_calls([
            call('Estimating Disclosure Protection (Score=0.000)'),
            call('Estimating Disclosure Protection (Score=0.800)'),
            call('Estimating Disclosure Protection (Score=0.900)'),
            call('Estimating Disclosure Protection (Score=0.733)'),
            call('Estimating Disclosure Protection (Score=0.850)'),
            call('Estimating Disclosure Protection (Score=0.760)'),
        ])

    @patch('sdmetrics.single_table.privacy.disclosure_protection.CAP_METHODS')
    def test__compute_estimated_cap_metric_zero_baseline(self, CAPMethodsMock):
        """Test the ``_compute_estimated_cap_metric`` method with a zero baseline.

        With ``baseline_protection=0`` the score cannot be normalized against the
        baseline and the method is expected to return a perfect score of 1.
        """
        # Setup
        real_data = pd.DataFrame({
            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=5),
            'col2': ['A'] * 5,
        })
        synthetic_data = pd.DataFrame({
            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=100),
            'col2': ['A'] * 100,
        })
        CAPMock = Mock()
        CAPMock.compute.side_effect = [0.4, 0.5, 0.2, 0.6, 0.2]
        CAPMethodsMock.keys.return_value = ['CAP', 'ZERO_CAP', 'GENERALIZED_CAP']
        CAPMethodsMock.get.return_value = CAPMock
        # Run
        avg_score, avg_computed_score = DisclosureProtectionEstimate._compute_estimated_cap_metric(
            real_data,
            synthetic_data,
            baseline_protection=0,
            known_column_names=['col1'],
            sensitive_column_names=['col2'],
            computation_method='CAP',
            num_rows_subsample=10,
            num_iterations=5,
            verbose=False,
        )

        # Assert
        assert avg_score == 1
        assert avg_computed_score == 0.38

    @patch(
        'sdmetrics.single_table.privacy.disclosure_protection.DisclosureProtectionEstimate._compute_estimated_cap_metric'
    )
    def test_compute_breakdown(self, mock__compute_estimated_cap_metric):
        """Test computing the breakdown."""
        # Setup
        real_data = pd.DataFrame({
            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=10),
            'col2': ['X', 'Y', 'Z', 'Y', 'X', 'X', 'Y', 'Z', 'X', 'A'],
            'col3': ['A', 'B'] * 5,
        })
        synthetic_data = pd.DataFrame({
            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=10),
            'col2': np.random.choice(['X', 'Y', 'Z', 'X', 'X'], size=10),
            'col3': ['A'] * 10,
        })
        mock__compute_estimated_cap_metric.return_value = (0.8, 0.6)

        # Run
        score_breakdown = DisclosureProtectionEstimate.compute_breakdown(
            real_data=real_data,
            synthetic_data=synthetic_data,
            known_column_names=['col1'],
            sensitive_column_names=['col2', 'col3'],
            num_discrete_bins=2,
        )

        # Assert
        assert score_breakdown == {
            'score': 0.8,
            'baseline_protection': 0.875,
            'cap_protection': 0.6,
        }
        # DataFrameMatcher compares by frame equality since DataFrames don't support
        # `==` inside mock call matching.
        mock__compute_estimated_cap_metric.assert_called_once_with(
            DataFrameMatcher(real_data),
            DataFrameMatcher(synthetic_data),
            baseline_protection=0.875,
            known_column_names=['col1'],
            sensitive_column_names=['col2', 'col3'],
            computation_method='CAP',
            num_rows_subsample=1000,
            num_iterations=10,
            verbose=True,
        )

    @patch(
        'sdmetrics.single_table.privacy.disclosure_protection.DisclosureProtectionEstimate.compute_breakdown'
    )
    def test_compute(self, compute_breakdown_mock):
        """Test the ``compute`` method."""
        # Setup
        real_data = pd.DataFrame({
            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=10),
            'col2': ['A'] * 10,
        })
        synthetic_data = pd.DataFrame({
            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=10),
            'col2': ['A'] * 10,
        })
        compute_breakdown_mock.return_value = {
            'score': 0.8,
            'baseline_protection': 0.6,
            'cap_protection': 0.64,
        }

        # Run
        score = DisclosureProtectionEstimate.compute(
            real_data, synthetic_data, known_column_names=['col1'], sensitive_column_names=['col2']
        )

        # Assert
        # ``compute`` simply surfaces the 'score' entry of the breakdown.
        assert score == 0.8
0 commit comments