Skip to content

Commit 1def428

Browse files
Merge pull request #391 from PaulWestenthanner/fix/issue_313
simplified polynomial wrapper, added tests, fixed bugs
2 parents ac2f789 + 6ea9f14 commit 1def428

File tree

2 files changed

+44
-51
lines changed

2 files changed

+44
-51
lines changed

category_encoders/wrapper.py

Lines changed: 20 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from sklearn.model_selection import StratifiedKFold
55
import category_encoders as encoders
66
import pandas as pd
7-
from typing import Dict
7+
from typing import Dict, Optional
88

99

1010
class PolynomialWrapper(BaseEstimator, TransformerMixin):
@@ -70,28 +70,32 @@ class PolynomialWrapper(BaseEstimator, TransformerMixin):
7070
None
7171
"""
7272

73-
def __init__(self, feature_encoder):
74-
self.feature_encoder = feature_encoder
73+
def __init__(self, feature_encoder: utils.BaseEncoder):
74+
self.feature_encoder: utils.BaseEncoder = feature_encoder
7575
self.feature_encoders: Dict[str, utils.BaseEncoder] = {}
76-
self.label_encoder = None
76+
self.label_encoder: Optional[encoders.OneHotEncoder] = None
7777

7878
def fit(self, X, y, **kwargs):
7979
# unite the input into pandas types
8080
X, y = utils.convert_inputs(X, y)
8181
y = pd.DataFrame(y, columns=['target'])
8282

8383
# apply one-hot-encoder on the label
84-
self.label_encoder = encoders.OneHotEncoder(handle_missing='error', handle_unknown='error', cols=['target'], drop_invariant=True,
84+
self.label_encoder = encoders.OneHotEncoder(handle_missing='error',
85+
handle_unknown='error',
86+
cols=['target'],
87+
drop_invariant=True,
8588
use_cat_names=True)
8689
labels = self.label_encoder.fit_transform(y)
8790
labels.columns = [column[7:] for column in labels.columns]
8891
labels = labels.iloc[:, 1:] # drop one label
8992

90-
# train the feature encoders
93+
# train the feature encoders, it is important to reset feature encoders first
94+
self.feature_encoders = {}
9195
for class_name, label in labels.items():
9296
self.feature_encoders[class_name] = copy.deepcopy(self.feature_encoder).fit(X, label)
9397

94-
def transform(self, X):
98+
def transform(self, X, y=None):
9599
# unite the input into pandas types
96100
X = utils.convert_input(X)
97101

@@ -101,8 +105,14 @@ def transform(self, X):
101105
all_new_features = pd.DataFrame()
102106

103107
# transform the features
108+
if y is not None:
109+
y = self.label_encoder.transform(pd.DataFrame({"target": y}))
104110
for class_name, feature_encoder in self.feature_encoders.items():
105-
encoded = feature_encoder.transform(X)
111+
if y is not None:
112+
y_transform = y[f"target_{class_name}"]
113+
else:
114+
y_transform = None
115+
encoded = feature_encoder.transform(X, y_transform)
106116

107117
# decorate the encoded features with the label class suffix
108118
new_features = encoded[feature_encoder.cols]
@@ -117,42 +127,8 @@ def transform(self, X):
117127
return result
118128

119129
def fit_transform(self, X, y=None, **fit_params):
120-
# When we are training the feature encoders, we have to use fit_transform() method on the features.
121-
122-
# unite the input into pandas types
123-
X, y = utils.convert_inputs(X, y)
124-
y = y.to_frame()
125-
y.columns = ["target"]
126-
127-
# apply one-hot-encoder on the label
128-
self.label_encoder = encoders.OneHotEncoder(handle_missing='error', handle_unknown='error', cols=['target'], drop_invariant=True,
129-
use_cat_names=True)
130-
labels = self.label_encoder.fit_transform(y)
131-
labels.columns = [column[7:] for column in labels.columns]
132-
labels = labels.iloc[:, 1:] # drop one label
133-
134-
# initialization of the feature encoders
135-
encoded = None
136-
feature_encoder = None
137-
all_new_features = pd.DataFrame()
138-
139-
# fit_transform the feature encoders
140-
for class_name, label in labels.items():
141-
feature_encoder = copy.deepcopy(self.feature_encoder)
142-
encoded = feature_encoder.fit_transform(X, label)
143-
144-
# decorate the encoded features with the label class suffix
145-
new_features = encoded[feature_encoder.cols]
146-
new_features.columns = [str(column) + '_' + class_name for column in new_features.columns]
147-
148-
all_new_features = pd.concat((all_new_features, new_features), axis=1)
149-
self.feature_encoders[class_name] = feature_encoder
150-
151-
# add features that were not encoded
152-
result = pd.concat((encoded[encoded.columns[~encoded.columns.isin(feature_encoder.cols)]],
153-
all_new_features), axis=1)
154-
155-
return result
130+
self.fit(X, y, **fit_params)
131+
return self.transform(X, y)
156132

157133

158134
class NestedCVWrapper(BaseEstimator, TransformerMixin):

tests/test_wrapper.py

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -51,20 +51,37 @@ def test_transform_only_selected(self):
5151

5252
# combination fit() + transform()
5353
wrapper.fit(x, y)
54-
result = wrapper.transform(x)
55-
print(result)
54+
result = wrapper.transform(x, y)
5655
self.assertEqual(len(result.columns), 4, 'We expect 2 untouched features + f2 target encoded into 2 features')
5756

5857
# directly fit_transform()
5958
wrapper = PolynomialWrapper(encoders.LeaveOneOutEncoder(cols=['f2']))
6059
result2 = wrapper.fit_transform(x, y)
61-
print(result2)
6260
self.assertEqual(len(result2.columns), 4, 'We expect 2 untouched features + f2 target encoded into 2 features')
6361

64-
# in the case of leave-one-out, we expect different results, because leave-one-out principle
65-
# is applied only on the training data (to decrease overfitting) while the testing data
66-
# use the whole statistics (to be as accurate as possible).
67-
self.assertFalse(result.iloc[0, 3] == result2.iloc[0, 3])
62+
pd.testing.assert_frame_equal(result, result2)
63+
64+
def test_refit_stateless(self):
65+
# test that when the encoder is fitted multiple times no old state is carried
66+
x = pd.DataFrame([
67+
['a', 'b', 'c'],
68+
['a', 'b', 'c'],
69+
['b', 'b', 'c'],
70+
['b', 'b', 'b'],
71+
['b', 'b', 'b'],
72+
['a', 'b', 'a'],
73+
], columns=['f1', 'f2', 'f3'])
74+
y1 = ['bee', 'cat', 'dog', 'dog', 'dog', 'dog']
75+
y2 = ['bee', 'cat', 'duck', 'duck', 'duck', 'duck']
76+
wrapper = PolynomialWrapper(encoders.TargetEncoder())
77+
result_first_fit = wrapper.fit_transform(x, y1)
78+
expected_categories_1 = {"cat", "dog"} # 'bee' is dropped since first label is always dropped
79+
expected_categories_2 = {"cat", "duck"}
80+
self.assertEqual(set(wrapper.label_encoder.category_mapping[0]["mapping"].index), {"bee", "cat", "dog"})
81+
self.assertEqual(set(wrapper.feature_encoders.keys()), expected_categories_1)
82+
result_second_fit = wrapper.fit_transform(x, y2)
83+
self.assertEqual(set(wrapper.label_encoder.category_mapping[0]["mapping"].index), {"bee", "cat", "duck"})
84+
self.assertEqual(set(wrapper.feature_encoders.keys()), expected_categories_2)
6885

6986

7087
class TestNestedCVWrapper(TestCase):

0 commit comments

Comments (0)