@@ -115,11 +115,12 @@ def test__get_null_categories(self):
115115 # Assert
116116 assert null_category_map == {'col1' : '__NULL_VALUE___' , 'col2' : '__NULL_VALUE___' }
117117
118- def test__discreteize_column (self ):
118+ @pytest .mark .parametrize ('dtype' , ['int32' , 'int64' , 'Int32' , 'Int64' ])
119+ def test__discretize_column_int_dtypes (self , dtype ):
119120 """Test discretizing a continous column"""
120121 # Setup
121- real_column = pd .Series ([0 , 2 , 6 , 8 , 10 ])
122- synthetic_column = pd .Series ([- 10 , 1 , 3 , 5 , 7 , 9 , 20 ])
122+ real_column = pd .Series ([0 , 2 , 6 , 8 , 10 ], dtype = dtype )
123+ synthetic_column = pd .Series ([- 10 , 1 , 3 , 5 , 7 , 9 , 20 ], dtype = dtype )
123124
124125 # Run
125126 binned_real , binned_synthetic = DisclosureProtection ._discretize_column (
@@ -132,6 +133,40 @@ def test__discreteize_column(self):
132133 expected_synthetic = pd .Series (pd .Categorical (['0' , '0' , '1' , '2' , '3' , '4' , '4' ]))
133134 np .testing .assert_array_equal (binned_synthetic , expected_synthetic )
134135
@pytest.mark.parametrize('dtype', ['float32', 'float64', 'Float32', 'Float64'])
def test__discretize_column_float_dtypes(self, dtype):
    """Test discretizing a continuous float column that contains missing values.

    The same scenario is run once for every float dtype listed in the
    ``parametrize`` decorator; both input columns include ``np.nan`` entries.
    """
    # Setup
    real_values = [0, 0.2, 6.99, np.nan, 10.02]
    synthetic_values = [-10.0, 0.1, 3.77, np.nan, 7.89, np.nan, 20.99]
    real_column = pd.Series(real_values, dtype=dtype)
    synthetic_column = pd.Series(synthetic_values, dtype=dtype)

    # Run
    binned_real, binned_synthetic = DisclosureProtection._discretize_column(
        real_column, synthetic_column, 5
    )

    # Assert
    # Plain list equality is used on purpose: list comparison checks identity
    # before falling back to ``==``, which is what allows the ``np.nan`` entries
    # to match even though NaN != NaN.
    # NOTE(review): this presumably relies on the method returning the
    # ``np.nan`` object itself for missing values -- confirm.
    expected_real = ['0', '0', '3', np.nan, '4']
    expected_synthetic = ['0', '0', '1', np.nan, '3', np.nan, '4']
    assert list(binned_real) == expected_real
    assert list(binned_synthetic) == expected_synthetic
153+
def test__compute_baseline(self):
    """Test the baseline score computed for a small table with two sensitive columns.

    ``col1`` holds a single category and ``col2`` holds two; ``col3`` is not
    listed as sensitive. The expected baseline score for this input is 0.5.
    """
    # NOTE(review): another ``test__compute_baseline`` appears to be defined
    # right after this one. If both live in the same class, the later
    # definition replaces this one and this test is never collected --
    # confirm, then drop or rename one of them.

    # Setup
    sensitive_column_names = ['col1', 'col2']
    real_data = pd.DataFrame({
        'col1': ['A'] * 5,
        'col2': ['A', 'B', 'A', 'B', 'A'],
        'col3': range(5),
    })

    # Run
    baseline_score = DisclosureProtection._compute_baseline(
        real_data,
        sensitive_column_names,
    )

    # Assert
    expected_score = 0.5
    assert baseline_score == expected_score
169+
135170 def test__compute_baseline (self ):
136171 """Test computing the baseline score for random data."""
137172 # Setup
0 commit comments