Skip to content

Commit 7a067fd

Browse files
committed
Add dtype tests and fix issues with minimum pandas version
1 parent 6d21816 commit 7a067fd

File tree

2 files changed

+44
-5
lines changed

2 files changed

+44
-5
lines changed

sdmetrics/single_table/privacy/disclosure_protection.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,9 +85,13 @@ def _get_null_categories(cls, real_data, synthetic_data, columns):
8585
@classmethod
8686
def _discretize_column(cls, real_column, synthetic_column, num_bins):
8787
bin_labels = [str(x) for x in range(num_bins)]
88-
real_binned, bins = pd.cut(real_column, num_bins, labels=bin_labels, retbins=True)
88+
real_binned, bins = pd.cut(
89+
pd.to_numeric(real_column.to_numpy()), num_bins, labels=bin_labels, retbins=True
90+
)
8991
bins[0], bins[-1] = -np.inf, np.inf
90-
synthetic_binned = pd.cut(synthetic_column, bins, labels=bin_labels)
92+
synthetic_binned = pd.cut(
93+
pd.to_numeric(synthetic_column.to_numpy()), bins, labels=bin_labels
94+
)
9195

9296
return real_binned.to_numpy(), synthetic_binned.to_numpy()
9397

tests/unit/single_table/privacy/test_disclosure_protection.py

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -115,11 +115,12 @@ def test__get_null_categories(self):
115115
# Assert
116116
assert null_category_map == {'col1': '__NULL_VALUE___', 'col2': '__NULL_VALUE___'}
117117

118-
def test__discreteize_column(self):
118+
@pytest.mark.parametrize('dtype', ['int32', 'int64', 'Int32', 'Int64'])
119+
def test__discretize_column_int_dtypes(self, dtype):
119120
"""Test discretizing a continuous column"""
120121
# Setup
121-
real_column = pd.Series([0, 2, 6, 8, 10])
122-
synthetic_column = pd.Series([-10, 1, 3, 5, 7, 9, 20])
122+
real_column = pd.Series([0, 2, 6, 8, 10], dtype=dtype)
123+
synthetic_column = pd.Series([-10, 1, 3, 5, 7, 9, 20], dtype=dtype)
123124

124125
# Run
125126
binned_real, binned_synthetic = DisclosureProtection._discretize_column(
@@ -132,6 +133,40 @@ def test__discreteize_column(self):
132133
expected_synthetic = pd.Series(pd.Categorical(['0', '0', '1', '2', '3', '4', '4']))
133134
np.testing.assert_array_equal(binned_synthetic, expected_synthetic)
134135

136+
@pytest.mark.parametrize('dtype', ['float32', 'float64', 'Float32', 'Float64'])
137+
def test__discretize_column_float_dtypes(self, dtype):
138+
"""Test discretizing a continuous column"""
139+
# Setup
140+
real_column = pd.Series([0, 0.2, 6.99, np.nan, 10.02], dtype=dtype)
141+
synthetic_column = pd.Series([-10.0, 0.1, 3.77, np.nan, 7.89, np.nan, 20.99], dtype=dtype)
142+
143+
# Run
144+
binned_real, binned_synthetic = DisclosureProtection._discretize_column(
145+
real_column, synthetic_column, 5
146+
)
147+
148+
# Assert
149+
expected_real = np.array(['0', '0', '3', np.nan, '4'], dtype='object')
150+
assert list(binned_real) == list(expected_real)
151+
expected_synthetic = np.array(['0', '0', '1', np.nan, '3', np.nan, '4'], dtype='object')
152+
assert list(binned_synthetic) == list(expected_synthetic)
153+
154+
def test__compute_baseline(self):
155+
"""Test computing the baseline score for random data."""
156+
# Setup
157+
real_data = pd.DataFrame({
158+
'col1': ['A', 'A', 'A', 'A', 'A'],
159+
'col2': ['A', 'B', 'A', 'B', 'A'],
160+
'col3': range(5),
161+
})
162+
sensitive_column_names = ['col1', 'col2']
163+
164+
# Run
165+
baseline_score = DisclosureProtection._compute_baseline(real_data, sensitive_column_names)
166+
167+
# Assert
168+
assert baseline_score == 0.5
169+
135170
def test__compute_baseline(self):
136171
"""Test computing the baseline score for random data."""
137172
# Setup

0 commit comments

Comments
 (0)