|
1 | 1 | """Test for the disclosure metrics.""" |
2 | 2 |
|
3 | 3 | import re |
4 | | -from unittest.mock import Mock, patch |
| 4 | +from unittest.mock import MagicMock, Mock, call, patch |
5 | 5 |
|
6 | 6 | import numpy as np |
7 | 7 | import pandas as pd |
8 | 8 | import pytest |
9 | 9 |
|
10 | | -from sdmetrics.single_table.privacy.disclosure_protection import DisclosureProtection |
| 10 | +from sdmetrics.single_table.privacy.disclosure_protection import ( |
| 11 | + DisclosureProtection, |
| 12 | + DisclosureProtectionEstimate, |
| 13 | +) |
| 14 | +from tests.utils import DataFrameMatcher |
11 | 15 |
|
12 | 16 |
|
13 | 17 | class TestDisclosureProtection: |
@@ -151,21 +155,51 @@ def test__discretize_column_float_dtypes(self, dtype): |
151 | 155 | expected_synthetic = np.array(['0', '0', '1', np.nan, '3', np.nan, '4'], dtype='object') |
152 | 156 | assert list(binned_synthetic) == list(expected_synthetic) |
153 | 157 |
|
    def test__discretize_and_fillna(self):
        """Test helper method to discretize continuous columns and fill nan values.

        Continuous columns listed in ``continuous_column_names`` are binned into
        ``num_discrete_bins`` string-labeled bins, and missing values in the known
        and sensitive columns are replaced with the ``'__NULL_VALUE__'`` placeholder.
        Columns outside those sets (``'extra'`` here) must pass through untouched.
        """
        # Setup
        # NOTE: the column names 'continous'/'continous_nan' are intentionally kept
        # as-is; they are runtime keys this test matches against below.
        real_data = pd.DataFrame({
            'known': ['A', 'A', pd.NA, 'B', 'B'],
            'continous': [0, 1, 3, 8, 10],
            'continous_nan': [0, 7, 2, np.nan, 10],
            'extra': [None, pd.NA, 0, 10, 100],
        })
        synthetic_data = pd.DataFrame({
            'known': ['A', 'A', 'B', 'B', None],
            'continous': [-1, 0, 3, 5, 11],
            'continous_nan': [0, 1, 2, np.nan, 100],
            'extra': [None, pd.NA, 0, 10, 100],
        })
        known_column_names = ['known']
        sensitive_column_names = ['continous', 'continous_nan']
        continuous_column_names = ['continous', 'continous_nan']
        num_discrete_bins = 5

        # Run
        processed_real, processed_synthetic = DisclosureProtection._discretize_and_fillna(
            real_data,
            synthetic_data,
            known_column_names,
            sensitive_column_names,
            continuous_column_names,
            num_discrete_bins,
        )

        # Assert
        # NOTE(review): the expected labels suggest bins are derived from the real
        # data's range, with out-of-range synthetic values (-1, 11, 100) falling into
        # the first/last bin — confirm against the implementation if bins change.
        expected_real = pd.DataFrame({
            'known': ['A', 'A', '__NULL_VALUE__', 'B', 'B'],
            'continous': ['0', '0', '1', '3', '4'],
            'continous_nan': ['0', '3', '0', '__NULL_VALUE__', '4'],
            'extra': real_data['extra'],
        })
        expected_synthetic = pd.DataFrame({
            'known': ['A', 'A', 'B', 'B', '__NULL_VALUE__'],
            'continous': ['0', '0', '1', '2', '4'],
            'continous_nan': ['0', '0', '0', '__NULL_VALUE__', '4'],
            'extra': synthetic_data['extra'],
        })
        pd.testing.assert_frame_equal(expected_real, processed_real)
        pd.testing.assert_frame_equal(expected_synthetic, processed_synthetic)
169 | 203 |
|
170 | 204 | def test__compute_baseline(self): |
171 | 205 | """Test computing the baseline score for random data.""" |
@@ -287,3 +321,196 @@ def test_compute(self, compute_breakdown_mock): |
287 | 321 |
|
288 | 322 | # Assert |
289 | 323 | assert score == 0.8 |
| 324 | + |
| 325 | + |
class TestDisclosureProtectionEstimate:
    """Unit tests for the ``DisclosureProtectionEstimate`` metric."""

    def test__validate_inputs(self):
        """Test input validation.

        Valid kwargs pass silently; non-positive ``num_rows_subsample`` and
        ``num_iterations`` each raise a ``ValueError`` with a specific message.
        """
        # Setup
        default_kwargs = {
            'real_data': pd.DataFrame({'col1': range(5), 'col2': range(5)}),
            'synthetic_data': pd.DataFrame({'col1': range(10), 'col2': range(10)}),
            'known_column_names': ['col1'],
            'sensitive_column_names': ['col2'],
            'computation_method': 'cap',
            'continuous_column_names': ['col2'],
            'num_discrete_bins': 10,
            'num_rows_subsample': 1000,
            'num_iterations': 10,
        }
        bad_rows_subsample = 0
        bad_num_iterations = 0

        # Run and Assert
        # Valid inputs should not raise.
        DisclosureProtectionEstimate._validate_inputs(**default_kwargs)

        bad_rows_subsample_error = re.escape(
            '`num_rows_subsample` must be an integer greater than zero.'
        )
        with pytest.raises(ValueError, match=bad_rows_subsample_error):
            DisclosureProtectionEstimate._validate_inputs(**{
                **default_kwargs,
                'num_rows_subsample': bad_rows_subsample,
            })

        bad_num_iterations_error = re.escape(
            '`num_iterations` must be an integer greater than zero.'
        )
        with pytest.raises(ValueError, match=bad_num_iterations_error):
            DisclosureProtectionEstimate._validate_inputs(**{
                **default_kwargs,
                'num_iterations': bad_num_iterations,
            })

    @patch('sdmetrics.single_table.privacy.disclosure_protection.tqdm')
    @patch('sdmetrics.single_table.privacy.disclosure_protection.CAP_METHODS')
    def test__compute_estimated_cap_metric(self, CAPMethodsMock, mock_tqdm):
        """Test the ``_compute_estimated_cap_metric`` method."""
        # Setup
        real_data = pd.DataFrame({
            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=5),
            'col2': np.random.choice(['X', 'Y'], size=5),
        })
        synthetic_data = pd.DataFrame({
            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=100),
            'col2': np.random.choice(['X', 'Y'], size=100),
        })
        # One per-iteration CAP score for each of the 5 iterations; their mean is 0.38.
        CAPMock = Mock()
        CAPMock.compute.side_effect = [0.4, 0.5, 0.2, 0.6, 0.2]
        CAPMethodsMock.keys.return_value = ['CAP', 'ZERO_CAP', 'GENERALIZED_CAP']
        CAPMethodsMock.get.return_value = CAPMock
        # Fake tqdm progress bar so iteration order and descriptions can be asserted.
        progress_bar = MagicMock()
        progress_bar.__iter__.return_value = range(5)
        mock_tqdm.tqdm.return_value = progress_bar

        # Run
        avg_score, avg_computed_score = DisclosureProtectionEstimate._compute_estimated_cap_metric(
            real_data,
            synthetic_data,
            baseline_protection=0.5,
            known_column_names=['col1'],
            sensitive_column_names=['col2'],
            computation_method='CAP',
            num_rows_subsample=10,
            num_iterations=5,
            verbose=True,
        )

        # Assert
        assert avg_score == 0.76
        assert avg_computed_score == 0.38
        # Description starts at 0.000, then shows the running average after each
        # iteration (final value matches avg_score).
        progress_bar.set_description.assert_has_calls([
            call('Estimating Disclosure Protection (Score=0.000)'),
            call('Estimating Disclosure Protection (Score=0.800)'),
            call('Estimating Disclosure Protection (Score=0.900)'),
            call('Estimating Disclosure Protection (Score=0.733)'),
            call('Estimating Disclosure Protection (Score=0.850)'),
            call('Estimating Disclosure Protection (Score=0.760)'),
        ])

    @patch('sdmetrics.single_table.privacy.disclosure_protection.CAP_METHODS')
    def test__compute_estimated_cap_metric_zero_baseline(self, CAPMethodsMock):
        """Test the ``_compute_estimated_cap_metric`` method with a zero baseline.

        With ``baseline_protection=0`` the score cannot be normalized against the
        baseline and the method is expected to return a perfect score of 1.
        """
        # Setup
        real_data = pd.DataFrame({
            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=5),
            'col2': ['A'] * 5,
        })
        synthetic_data = pd.DataFrame({
            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=100),
            'col2': ['A'] * 100,
        })
        CAPMock = Mock()
        CAPMock.compute.side_effect = [0.4, 0.5, 0.2, 0.6, 0.2]
        CAPMethodsMock.keys.return_value = ['CAP', 'ZERO_CAP', 'GENERALIZED_CAP']
        CAPMethodsMock.get.return_value = CAPMock
        # Run
        avg_score, avg_computed_score = DisclosureProtectionEstimate._compute_estimated_cap_metric(
            real_data,
            synthetic_data,
            baseline_protection=0,
            known_column_names=['col1'],
            sensitive_column_names=['col2'],
            computation_method='CAP',
            num_rows_subsample=10,
            num_iterations=5,
            verbose=False,
        )

        # Assert
        assert avg_score == 1
        assert avg_computed_score == 0.38

    @patch(
        'sdmetrics.single_table.privacy.disclosure_protection.DisclosureProtectionEstimate._compute_estimated_cap_metric'
    )
    def test_compute_breakdown(self, mock__compute_estimated_cap_metric):
        """Test computing the breakdown."""
        # Setup
        real_data = pd.DataFrame({
            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=10),
            'col2': ['X', 'Y', 'Z', 'Y', 'X', 'X', 'Y', 'Z', 'X', 'A'],
            'col3': ['A', 'B'] * 5,
        })
        synthetic_data = pd.DataFrame({
            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=10),
            'col2': np.random.choice(['X', 'Y', 'Z', 'X', 'X'], size=10),
            'col3': ['A'] * 10,
        })
        mock__compute_estimated_cap_metric.return_value = (0.8, 0.6)

        # Run
        score_breakdown = DisclosureProtectionEstimate.compute_breakdown(
            real_data=real_data,
            synthetic_data=synthetic_data,
            known_column_names=['col1'],
            sensitive_column_names=['col2', 'col3'],
            num_discrete_bins=2,
        )

        # Assert
        assert score_breakdown == {
            'score': 0.8,
            'baseline_protection': 0.875,
            'cap_protection': 0.6,
        }
        # DataFrameMatcher compares by frame equality since DataFrames don't support
        # `==` inside mock call matching.
        mock__compute_estimated_cap_metric.assert_called_once_with(
            DataFrameMatcher(real_data),
            DataFrameMatcher(synthetic_data),
            baseline_protection=0.875,
            known_column_names=['col1'],
            sensitive_column_names=['col2', 'col3'],
            computation_method='CAP',
            num_rows_subsample=1000,
            num_iterations=10,
            verbose=True,
        )

    @patch(
        'sdmetrics.single_table.privacy.disclosure_protection.DisclosureProtectionEstimate.compute_breakdown'
    )
    def test_compute(self, compute_breakdown_mock):
        """Test the ``compute`` method."""
        # Setup
        real_data = pd.DataFrame({
            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=10),
            'col2': ['A'] * 10,
        })
        synthetic_data = pd.DataFrame({
            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=10),
            'col2': ['A'] * 10,
        })
        compute_breakdown_mock.return_value = {
            'score': 0.8,
            'baseline_protection': 0.6,
            'cap_protection': 0.64,
        }

        # Run
        score = DisclosureProtectionEstimate.compute(
            real_data, synthetic_data, known_column_names=['col1'], sensitive_column_names=['col2']
        )

        # Assert
        # ``compute`` simply surfaces the 'score' entry of the breakdown.
        assert score == 0.8
0 commit comments