Remove intercept from Contrast Coding Schemes. Fixes #370

PaulWestenthanner · PaulWestenthanner · commit 86d4ce0650fb · 2025-01-07T17:43:10.000+01:00
diff --git a/category_encoders/backward_difference.py b/category_encoders/backward_difference.py
@@ -56,7 +56,6 @@ class BackwardDifferenceEncoder(BaseContrastEncoder):
     Data columns (total 12 columns):
      #   Column        Non-Null Count  Dtype
     ---  ------        --------------  -----
-     0   intercept     1460 non-null   int64
      1   Id            1460 non-null   float64
      2   MSSubClass    1460 non-null   float64
      3   MSZoning      1460 non-null   object
diff --git a/category_encoders/base_contrast_encoder.py b/category_encoders/base_contrast_encoder.py
@@ -1,6 +1,5 @@
 """Base encoder for various contrast coding schemes."""
 
-import warnings
 from abc import abstractmethod
 
 import numpy as np
@@ -182,14 +181,6 @@ def transform_contrast_coding(
         """
         cols = X.columns.tolist()
 
-        # See issue 370 if it is necessary to add an intercept or not.
-        X['intercept'] = pd.Series([1] * X.shape[0], index=X.index)
-        warnings.warn(
-            'Intercept column might not be added anymore in future releases (c.f. issue #370)',
-            category=FutureWarning,
-            stacklevel=4,
-        )
-
         for switch in mapping:
             col = switch.get('col')
             mod = switch.get('mapping')
@@ -202,8 +193,4 @@ def transform_contrast_coding(
             old_column_index = cols.index(col)
             cols[old_column_index : old_column_index + 1] = mod.columns
 
-        # this could lead to problems if an intercept column is already present
-        # (e.g. if another column has been encoded with another contrast coding scheme)
-        cols = ['intercept'] + cols
-
         return X.reindex(columns=cols)
diff --git a/category_encoders/helmert.py b/category_encoders/helmert.py
@@ -58,7 +58,6 @@ class HelmertEncoder(BaseContrastEncoder):
     Data columns (total 12 columns):
      #   Column        Non-Null Count  Dtype
     ---  ------        --------------  -----
-     0   intercept     1460 non-null   int64
      1   Id            1460 non-null   float64
      2   MSSubClass    1460 non-null   float64
      3   MSZoning      1460 non-null   object
diff --git a/category_encoders/polynomial.py b/category_encoders/polynomial.py
@@ -57,7 +57,6 @@ class PolynomialEncoder(BaseContrastEncoder):
     Data columns (total 12 columns):
      #   Column        Non-Null Count  Dtype
     ---  ------        --------------  -----
-     0   intercept     1460 non-null   int64
      1   Id            1460 non-null   float64
      2   MSSubClass    1460 non-null   float64
      3   MSZoning      1460 non-null   object
diff --git a/category_encoders/sum_coding.py b/category_encoders/sum_coding.py
@@ -58,7 +58,6 @@ class SumEncoder(BaseContrastEncoder):
     Data columns (total 12 columns):
      #   Column        Non-Null Count  Dtype
     ---  ------        --------------  -----
-     0   intercept     1460 non-null   int64
      1   Id            1460 non-null   float64
      2   MSSubClass    1460 non-null   float64
      3   MSZoning      1460 non-null   object
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -9,7 +9,7 @@ readme = "README.md"
 [tool.poetry.dependencies]
 python = ">=3.9"
 numpy = ">=1.14.0"
-scikit-learn = ">=1.0.0"
+scikit-learn = ">=1.0.0, <1.6.0"
 scipy = ">=1.0.0"
 statsmodels = ">=0.9.0"
 pandas = ">=1.0.5"
diff --git a/tests/test_encoders.py b/tests/test_encoders.py
@@ -702,9 +702,13 @@ def test_drop_invariant(self):
         )
         y = [0, 0, 1, 1, 1]
 
-        for encoder_name in set(encoders.__all__) - {
-            'CatBoostEncoder'
-        }:  # CatBoost does not generally deliver a constant column when the feature is constant
+        # CatBoost does not generally deliver a constant column when the feature is constant
+        # Contrast coding schemes always ignore invariant columns, even if drop_invariant=False
+        encoders_to_ignore = {
+            'CatBoostEncoder', 'PolynomialEncoder', 'SumEncoder',
+            'BackwardDifferenceEncoder', 'HelmertEncoder'
+        }
+        for encoder_name in set(encoders.__all__) - encoders_to_ignore:
             with self.subTest(encoder_name=encoder_name):
                 enc1 = getattr(encoders, encoder_name)(drop_invariant=False)
                 enc2 = getattr(encoders, encoder_name)(drop_invariant=True)
diff --git a/tests/test_polynomial.py b/tests/test_polynomial.py
@@ -7,9 +7,9 @@
 
 from tests.helpers import deep_round
 
-a_encoding = [1, -0.7071067811865476, 0.40824829046386313]
-b_encoding = [1, -5.551115123125783e-17, -0.8164965809277261]
-c_encoding = [1, 0.7071067811865475, 0.4082482904638631]
+a_encoding = [-0.7071067811865476, 0.40824829046386313]
+b_encoding = [-5.551115123125783e-17, -0.8164965809277261]
+c_encoding = [0.7071067811865475, 0.4082482904638631]
 
 
 class TestPolynomialEncoder(TestCase):
@@ -18,7 +18,7 @@ class TestPolynomialEncoder(TestCase):
     def test_handle_missing_and_unknown(self):
         """Test that missing and unknown values are treated as values."""
         train = ['A', 'B', 'C']
-        expected_encoding_unknown = [1, 0, 0]
+        expected_encoding_unknown = [0, 0]
         expected_1 = [a_encoding, expected_encoding_unknown, expected_encoding_unknown]
         expected_2 = [b_encoding, expected_encoding_unknown, expected_encoding_unknown]
         expected_3 = [a_encoding, b_encoding, c_encoding, expected_encoding_unknown]
@@ -44,9 +44,9 @@ def test_polynomial_encoder_2cols(self):
         obtained = encoder.transform(train)
 
         expected = [
-            [1, a_encoding[1], a_encoding[2], a_encoding[1], a_encoding[2]],
-            [1, b_encoding[1], b_encoding[2], b_encoding[1], b_encoding[2]],
-            [1, c_encoding[1], c_encoding[2], c_encoding[1], c_encoding[2]],
+            a_encoding* 2,
+            b_encoding* 2,
+            c_encoding* 2,
         ]
         self.assertEqual(deep_round(obtained.to_numpy().tolist()), deep_round(expected))
 
@@ -62,7 +62,6 @@ def test_correct_order(self):
             columns=['col1', 'col2', 'col3', 'col4'],
         )
         expected_columns = [
-            'intercept',
             'col1',
             'col2_0',
             'col2_1',
@@ -105,10 +104,10 @@ def test_handle_missing_is_indicator(self):
             expected = [a_encoding, b_encoding, c_encoding]
             self.assertEqual(deep_round(result.to_numpy().tolist()), deep_round(expected))
 
-            # unknown value is encoded as zeros (only one at indicator)
+            # unknown value is encoded as zeros
             test = ['A', 'B', 'C']
             result = encoder.transform(test)
-            expected = [a_encoding, b_encoding, [1, 0, 0]]
+            expected = [a_encoding, b_encoding, [0, 0]]
             self.assertEqual(deep_round(result.to_numpy().tolist()), deep_round(expected))
 
 
diff --git a/tests/test_sum_coding.py b/tests/test_sum_coding.py
@@ -5,9 +5,9 @@
 import numpy as np
 import pandas as pd
 
-a_encoding = [1, 1, 0]
-b_encoding = [1, 0, 1]
-c_encoding = [1, -1, -1]
+a_encoding = [1, 0]
+b_encoding = [0, 1]
+c_encoding = [-1, -1]
 
 
 class TestSumEncoder(TestCase):
@@ -20,14 +20,14 @@ def test_unknown_and_missing(self):
         encoder = encoders.SumEncoder(handle_unknown='value', handle_missing='value')
         encoder.fit(train)
         dim_1_test = ['A', 'D', 'E']
-        dim_1_expected = [a_encoding, [1, 0, 0], [1, 0, 0]]
+        dim_1_expected = [a_encoding, [0, 0], [0, 0]]
         dim_2_test = ['B', 'D', 'E']
-        dim_2_expected = [b_encoding, [1, 0, 0], [1, 0, 0]]
+        dim_2_expected = [b_encoding, [0, 0], [0, 0]]
         dim_3_test = ['A', 'B', 'C', None]
-        dim_3_expected = [a_encoding, b_encoding, c_encoding, [1, 0, 0]]
+        dim_3_expected = [a_encoding, b_encoding, c_encoding, [0, 0]]
 
         dim_4_test = ['D', 'B', 'C', None]
-        dim_4_expected = [[1, 0, 0], b_encoding, c_encoding, [1, 0, 0]]
+        dim_4_expected = [[0, 0], b_encoding, c_encoding, [0, 0]]
         cases = {"should preserve dimension 1": (dim_1_test, dim_1_expected),
                  "should preserve dimension 2": (dim_2_test, dim_2_expected),
                  "should preserve dimension 3": (dim_3_test, dim_3_expected),
@@ -47,9 +47,9 @@ def test_sum_encoder_2cols(self):
         obtained = encoder.transform(train)
 
         expected = [
-            [1, a_encoding[1], a_encoding[2], a_encoding[1], a_encoding[2]],
-            [1, b_encoding[1], b_encoding[2], b_encoding[1], b_encoding[2]],
-            [1, c_encoding[1], c_encoding[2], c_encoding[1], c_encoding[2]],
+            a_encoding*2,
+            b_encoding*2,
+            c_encoding*2,
         ]
         self.assertEqual(obtained.to_numpy().tolist(), expected)
 
@@ -65,7 +65,6 @@ def test_multiple_columns_correct_order(self):
             columns=['col1', 'col2', 'col3', 'col4'],
         )
         expected_columns = [
-            'intercept',
             'col1',
             'col2_0',
             'col2_1',
@@ -108,9 +107,8 @@ def test_handle_missing_is_indicator(self):
             expected = [a_encoding, b_encoding, c_encoding]
             self.assertEqual(result.to_numpy().tolist(), expected)
 
-            # unknown value should be encoded with value strategy,
-            # i.e. indicator 1 and all other columns zeros
+            # unknown value should be encoded with value strategy, i.e. zeros for all columns
             test = ['A', 'B', 'C']
             result = encoder.transform(test)
-            expected = [a_encoding, b_encoding, [1, 0, 0]]
+            expected = [a_encoding, b_encoding, [0, 0]]
             self.assertEqual(result.to_numpy().tolist(), expected)