Fix BaseNEncoder number of output columns (#296)

kerrickstaley · web-flow · commit 17add9f1d449 · 2021-10-07T18:48:08.000-04:00
BaseNEncoder used an incorrect formula for calculating the number of digits (bits, when base is 2) required in the output. If there are `nvals` distinct values and we reserve one encoding to represent "missing or unknown", then the correct number of digits is `ceil(log(nvals + 1, base))`. However, the code was previously using the formula `ceil(log(nvals, base)) + 1`. For example, with `nvals = 3` and `base = 2` the old formula gives 3 digits, while the correct value is 2.

Fixes #264

- Change the formula to `ceil(log(nvals + 1, base))`.
- Switch the formula to use integer math so we don't have to worry about floating point rounding errors.
- Add a test.
- Fix a non-deterministic test.
diff --git a/category_encoders/basen.py b/category_encoders/basen.py
@@ -12,6 +12,27 @@
 __author__ = 'willmcginnis'
 
 
+def _ceillogint(n, base):
+    """
+    Returns ceil(log(n, base)) for integers n and base.
+
+    Uses integer math, so the result is not subject to floating point rounding errors.
+
+    base must be >= 2 and n must be >= 1.
+    """
+    if base < 2:
+        raise ValueError('base must be >= 2')
+    if n < 1:
+        raise ValueError('n must be >= 1')
+
+    n -= 1
+    ret = 0
+    while n > 0:
+        ret += 1
+        n //= base
+    return ret
+
+
 class BaseNEncoder(BaseEstimator, TransformerMixin):
     """Base-N encoder encodes the categories into arrays of their base-N representation.  A base of 1 is equivalent to
     one-hot encoding (not really base-1, but useful), a base of 2 is equivalent to binary encoding. N=number of actual
@@ -296,7 +317,7 @@ def calc_required_digits(self, values):
         if self.base == 1:
             digits = len(values) + 1
         else:
-            digits = int(np.ceil(math.log(len(values), self.base))) + 1
+            digits = _ceillogint(len(values) + 1, self.base)
 
         return digits
 
diff --git a/tests/test_basen.py b/tests/test_basen.py
@@ -82,8 +82,8 @@ def test_HandleUnknown_HaveOnlyKnown_ExpectSecondColumn(self):
         result = encoder.fit_transform(train)
 
         self.assertEqual(2, result.shape[0])
-        self.assertListEqual([0, 0, 1], result.iloc[0, :].tolist())
-        self.assertListEqual([0, 1, 0], result.iloc[1, :].tolist())
+        self.assertListEqual([0, 1], result.iloc[0, :].tolist())
+        self.assertListEqual([1, 0], result.iloc[1, :].tolist())
 
     def test_inverse_transform_HaveNanInTrainAndHandleMissingValue_ExpectReturnedWithNan(self):
         train = pd.DataFrame({'city': ['chicago', np.nan]})
@@ -139,3 +139,39 @@ def test_inverse_transform_HaveHandleMissingValueAndHandleUnknownReturnNan_Expec
         original = enc.inverse_transform(result)
 
         pd.testing.assert_frame_equal(expected, original)
+
+    def test_num_cols(self):
+        """
+        Test that BaseNEncoder produces the correct number of output columns.
+
+        Since the value 0 is reserved for encoding unseen values, there need to be enough digits to
+        represent up to nvals + 1 distinct encodings, where nvals is the number of distinct input
+        values. This is ceil(log(nvals + 1, base)) digits.
+
+        This test specifically checks the case where BaseNEncoder is initialized with
+        handle_unknown='value' and handle_missing='value' (i.e. the defaults).
+        """
+        def num_cols(nvals, base):
+            """Returns the number of columns output for a given number of distinct input values"""
+            vals = [str(i) for i in range(nvals)]
+            df = pd.DataFrame({'vals': vals})
+            encoder = encoders.BaseNEncoder(base=base)
+            encoder.fit(df)
+            return len(list(encoder.transform(df)))
+
+        self.assertEqual(num_cols(1, 2), 1)
+        self.assertEqual(num_cols(2, 2), 2)
+        self.assertEqual(num_cols(3, 2), 2)
+        self.assertEqual(num_cols(4, 2), 3)
+        self.assertEqual(num_cols(7, 2), 3)
+        self.assertEqual(num_cols(8, 2), 4)
+        self.assertEqual(num_cols(62, 2), 6)
+        self.assertEqual(num_cols(63, 2), 6)
+        self.assertEqual(num_cols(64, 2), 7)
+        self.assertEqual(num_cols(65, 2), 7)
+
+        # nvals = 0 returns the original dataframe unchanged, so it still has 1 column even though
+        # logically there should be zero.
+        self.assertEqual(num_cols(0, 2), 1)
+
+        self.assertEqual(num_cols(55, 7), 3)
diff --git a/tests/test_glmm.py b/tests/test_glmm.py
@@ -5,7 +5,7 @@
 
 # data definitions
 X = th.create_dataset(n_rows=100)
-np_y = np.random.randn(100) > 0.5
+np_y = np.random.default_rng(42).standard_normal(100) > 0.5
 
 class TestGLMMEncoder(TestCase):
     def test_continuous(self):