Skip to content

Commit 86d4ce0

Browse files
author
PaulWestenthanner
committed
Remove intercept from Contrast Coding Schemes. Fixes #370
1 parent 889120f commit 86d4ce0

File tree

10 files changed

+301
-285
lines changed

10 files changed

+301
-285
lines changed

category_encoders/backward_difference.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,6 @@ class BackwardDifferenceEncoder(BaseContrastEncoder):
5656
Data columns (total 12 columns):
5757
# Column Non-Null Count Dtype
5858
--- ------ -------------- -----
59-
0 intercept 1460 non-null int64
6059
1 Id 1460 non-null float64
6160
2 MSSubClass 1460 non-null float64
6261
3 MSZoning 1460 non-null object

category_encoders/base_contrast_encoder.py

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
"""Base encoder for various contrast coding schemes."""
22

3-
import warnings
43
from abc import abstractmethod
54

65
import numpy as np
@@ -182,14 +181,6 @@ def transform_contrast_coding(
182181
"""
183182
cols = X.columns.tolist()
184183

185-
# See issue 370 if it is necessary to add an intercept or not.
186-
X['intercept'] = pd.Series([1] * X.shape[0], index=X.index)
187-
warnings.warn(
188-
'Intercept column might not be added anymore in future releases (c.f. issue #370)',
189-
category=FutureWarning,
190-
stacklevel=4,
191-
)
192-
193184
for switch in mapping:
194185
col = switch.get('col')
195186
mod = switch.get('mapping')
@@ -202,8 +193,4 @@ def transform_contrast_coding(
202193
old_column_index = cols.index(col)
203194
cols[old_column_index : old_column_index + 1] = mod.columns
204195

205-
# this could lead to problems if an intercept column is already present
206-
# (e.g. if another column has been encoded with another contrast coding scheme)
207-
cols = ['intercept'] + cols
208-
209196
return X.reindex(columns=cols)

category_encoders/helmert.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,6 @@ class HelmertEncoder(BaseContrastEncoder):
5858
Data columns (total 12 columns):
5959
# Column Non-Null Count Dtype
6060
--- ------ -------------- -----
61-
0 intercept 1460 non-null int64
6261
1 Id 1460 non-null float64
6362
2 MSSubClass 1460 non-null float64
6463
3 MSZoning 1460 non-null object

category_encoders/polynomial.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,6 @@ class PolynomialEncoder(BaseContrastEncoder):
5757
Data columns (total 12 columns):
5858
# Column Non-Null Count Dtype
5959
--- ------ -------------- -----
60-
0 intercept 1460 non-null int64
6160
1 Id 1460 non-null float64
6261
2 MSSubClass 1460 non-null float64
6362
3 MSZoning 1460 non-null object

category_encoders/sum_coding.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,6 @@ class SumEncoder(BaseContrastEncoder):
5858
Data columns (total 12 columns):
5959
# Column Non-Null Count Dtype
6060
--- ------ -------------- -----
61-
0 intercept 1460 non-null int64
6261
1 Id 1460 non-null float64
6362
2 MSSubClass 1460 non-null float64
6463
3 MSZoning 1460 non-null object

poetry.lock

Lines changed: 272 additions & 240 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ readme = "README.md"
99
[tool.poetry.dependencies]
1010
python = ">=3.9"
1111
numpy = ">=1.14.0"
12-
scikit-learn = ">=1.0.0"
12+
scikit-learn = ">=1.0.0, <1.6.0"
1313
scipy = ">=1.0.0"
1414
statsmodels = ">=0.9.0"
1515
pandas = ">=1.0.5"

tests/test_encoders.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -702,9 +702,13 @@ def test_drop_invariant(self):
702702
)
703703
y = [0, 0, 1, 1, 1]
704704

705-
for encoder_name in set(encoders.__all__) - {
706-
'CatBoostEncoder'
707-
}: # CatBoost does not generally deliver a constant column when the feature is constant
705+
# CatBoost does not generally deliver a constant column when the feature is constant
706+
# Contrast coding schemes will always ignore invariant columns, even if drop_invariant is set to False
707+
encoders_to_ignore = {
708+
'CatBoostEncoder', 'PolynomialEncoder', 'SumEncoder',
709+
'BackwardDifferenceEncoder', 'HelmertEncoder'
710+
}
711+
for encoder_name in set(encoders.__all__) - encoders_to_ignore:
708712
with self.subTest(encoder_name=encoder_name):
709713
enc1 = getattr(encoders, encoder_name)(drop_invariant=False)
710714
enc2 = getattr(encoders, encoder_name)(drop_invariant=True)

tests/test_polynomial.py

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@
77

88
from tests.helpers import deep_round
99

10-
a_encoding = [1, -0.7071067811865476, 0.40824829046386313]
11-
b_encoding = [1, -5.551115123125783e-17, -0.8164965809277261]
12-
c_encoding = [1, 0.7071067811865475, 0.4082482904638631]
10+
a_encoding = [-0.7071067811865476, 0.40824829046386313]
11+
b_encoding = [-5.551115123125783e-17, -0.8164965809277261]
12+
c_encoding = [0.7071067811865475, 0.4082482904638631]
1313

1414

1515
class TestPolynomialEncoder(TestCase):
@@ -18,7 +18,7 @@ class TestPolynomialEncoder(TestCase):
1818
def test_handle_missing_and_unknown(self):
1919
"""Test that missing and unknown values are treated as values."""
2020
train = ['A', 'B', 'C']
21-
expected_encoding_unknown = [1, 0, 0]
21+
expected_encoding_unknown = [0, 0]
2222
expected_1 = [a_encoding, expected_encoding_unknown, expected_encoding_unknown]
2323
expected_2 = [b_encoding, expected_encoding_unknown, expected_encoding_unknown]
2424
expected_3 = [a_encoding, b_encoding, c_encoding, expected_encoding_unknown]
@@ -44,9 +44,9 @@ def test_polynomial_encoder_2cols(self):
4444
obtained = encoder.transform(train)
4545

4646
expected = [
47-
[1, a_encoding[1], a_encoding[2], a_encoding[1], a_encoding[2]],
48-
[1, b_encoding[1], b_encoding[2], b_encoding[1], b_encoding[2]],
49-
[1, c_encoding[1], c_encoding[2], c_encoding[1], c_encoding[2]],
47+
a_encoding* 2,
48+
b_encoding* 2,
49+
c_encoding* 2,
5050
]
5151
self.assertEqual(deep_round(obtained.to_numpy().tolist()), deep_round(expected))
5252

@@ -62,7 +62,6 @@ def test_correct_order(self):
6262
columns=['col1', 'col2', 'col3', 'col4'],
6363
)
6464
expected_columns = [
65-
'intercept',
6665
'col1',
6766
'col2_0',
6867
'col2_1',
@@ -105,10 +104,10 @@ def test_handle_missing_is_indicator(self):
105104
expected = [a_encoding, b_encoding, c_encoding]
106105
self.assertEqual(deep_round(result.to_numpy().tolist()), deep_round(expected))
107106

108-
# unknown value is encoded as zeros (only one at indicator)
107+
# unknown value is encoded as zeros
109108
test = ['A', 'B', 'C']
110109
result = encoder.transform(test)
111-
expected = [a_encoding, b_encoding, [1, 0, 0]]
110+
expected = [a_encoding, b_encoding, [0, 0]]
112111
self.assertEqual(deep_round(result.to_numpy().tolist()), deep_round(expected))
113112

114113

tests/test_sum_coding.py

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@
55
import numpy as np
66
import pandas as pd
77

8-
a_encoding = [1, 1, 0]
9-
b_encoding = [1, 0, 1]
10-
c_encoding = [1, -1, -1]
8+
a_encoding = [1, 0]
9+
b_encoding = [0, 1]
10+
c_encoding = [-1, -1]
1111

1212

1313
class TestSumEncoder(TestCase):
@@ -20,14 +20,14 @@ def test_unknown_and_missing(self):
2020
encoder = encoders.SumEncoder(handle_unknown='value', handle_missing='value')
2121
encoder.fit(train)
2222
dim_1_test = ['A', 'D', 'E']
23-
dim_1_expected = [a_encoding, [1, 0, 0], [1, 0, 0]]
23+
dim_1_expected = [a_encoding, [0, 0], [0, 0]]
2424
dim_2_test = ['B', 'D', 'E']
25-
dim_2_expected = [b_encoding, [1, 0, 0], [1, 0, 0]]
25+
dim_2_expected = [b_encoding, [0, 0], [0, 0]]
2626
dim_3_test = ['A', 'B', 'C', None]
27-
dim_3_expected = [a_encoding, b_encoding, c_encoding, [1, 0, 0]]
27+
dim_3_expected = [a_encoding, b_encoding, c_encoding, [0, 0]]
2828

2929
dim_4_test = ['D', 'B', 'C', None]
30-
dim_4_expected = [[1, 0, 0], b_encoding, c_encoding, [1, 0, 0]]
30+
dim_4_expected = [[0, 0], b_encoding, c_encoding, [0, 0]]
3131
cases = {"should preserve dimension 1": (dim_1_test, dim_1_expected),
3232
"should preserve dimension 2": (dim_2_test, dim_2_expected),
3333
"should preserve dimension 3": (dim_3_test, dim_3_expected),
@@ -47,9 +47,9 @@ def test_sum_encoder_2cols(self):
4747
obtained = encoder.transform(train)
4848

4949
expected = [
50-
[1, a_encoding[1], a_encoding[2], a_encoding[1], a_encoding[2]],
51-
[1, b_encoding[1], b_encoding[2], b_encoding[1], b_encoding[2]],
52-
[1, c_encoding[1], c_encoding[2], c_encoding[1], c_encoding[2]],
50+
a_encoding*2,
51+
b_encoding*2,
52+
c_encoding*2,
5353
]
5454
self.assertEqual(obtained.to_numpy().tolist(), expected)
5555

@@ -65,7 +65,6 @@ def test_multiple_columns_correct_order(self):
6565
columns=['col1', 'col2', 'col3', 'col4'],
6666
)
6767
expected_columns = [
68-
'intercept',
6968
'col1',
7069
'col2_0',
7170
'col2_1',
@@ -108,9 +107,8 @@ def test_handle_missing_is_indicator(self):
108107
expected = [a_encoding, b_encoding, c_encoding]
109108
self.assertEqual(result.to_numpy().tolist(), expected)
110109

111-
# unknown value should be encoded with value strategy,
112-
# i.e. indicator 1 and all other columns zeros
110+
# unknown value should be encoded with value strategy, i.e. zeros for all columns
113111
test = ['A', 'B', 'C']
114112
result = encoder.transform(test)
115-
expected = [a_encoding, b_encoding, [1, 0, 0]]
113+
expected = [a_encoding, b_encoding, [0, 0]]
116114
self.assertEqual(result.to_numpy().tolist(), expected)

0 commit comments

Comments
 (0)