Skip to content

Commit f732631

Browse files
authored
fixes bug in ohe drop_binary (#304)
* fixes bug in ohe drop_binary * augments version for release
1 parent 3a231e9 commit f732631

File tree

4 files changed

+144
-49
lines changed

4 files changed

+144
-49
lines changed

docs/whats_new/v1.rst

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,22 @@
1+
Version 1.1.2
2+
=============
3+
4+
Deployed: 31st August 2021
5+
6+
Contributors
7+
------------
8+
9+
- Soledad Galli
10+
11+
This small release fixes a bug in how the OneHotEncoder handles binary categorical variables
12+
when the parameter `drop_last_binary` is set to True. It also ensures that the values in the
13+
`OneHotEncoder.encoder_dict_` are lists of categories and not arrays. These bugs were
14+
introduced in v1.1.0.
15+
16+
Bug fix
17+
-------
18+
- **OneHotEncoder**: `drop_last_binary` now outputs 1 dummy variable per binary variable when set to True
19+
120
Version 1.1.1
221
=============
322

feature_engine/VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
1.1.1
1+
1.1.2

feature_engine/encoding/one_hot.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,7 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
226226
# return k dummies
227227
else:
228228
for var in self.variables_:
229-
self.encoder_dict_[var] = X[var].unique()
229+
self.encoder_dict_[var] = [x for x in X[var].unique()]
230230

231231
self.variables_binary_ = [
232232
var for var in self.variables_ if X[var].nunique() == 2
@@ -235,7 +235,8 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
235235
# automatically encode binary variables as 1 dummy
236236
if self.drop_last_binary:
237237
for var in self.variables_binary_:
238-
self.encoder_dict_[var] = X[var].unique()[0]
238+
category = X[var].unique()[0]
239+
self.encoder_dict_[var] = [category]
239240

240241
self._check_encoding_dictionary()
241242

tests/test_encoding/test_onehot_encoder.py

Lines changed: 121 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,13 @@ def test_encode_categories_in_k_binary_plus_select_vars_automatically(df_enc_big
4040
}
4141

4242
assert encoder.variables_ == ["var_A", "var_B", "var_C"]
43+
assert encoder.variables_binary_ == []
4344
assert encoder.n_features_in_ == 3
45+
assert encoder.encoder_dict_ == {
46+
"var_A": ["A", "B", "C", "D", "E", "F", "G"],
47+
"var_B": ["A", "B", "C", "D", "E", "F", "G"],
48+
"var_C": ["A", "B", "C", "D", "E", "F", "G"],
49+
}
4450
# test transform output
4551
assert X.sum().to_dict() == transf
4652
assert "var_A" not in X.columns
@@ -75,7 +81,12 @@ def test_encode_categories_in_k_minus_1_binary_plus_list_of_variables(df_enc_big
7581
}
7682

7783
assert encoder.variables_ == ["var_A", "var_B"]
84+
assert encoder.variables_binary_ == []
7885
assert encoder.n_features_in_ == 3
86+
assert encoder.encoder_dict_ == {
87+
"var_A": ["A", "B", "C", "D", "E", "F"],
88+
"var_B": ["A", "B", "C", "D", "E", "F"],
89+
}
7990
# test transform output
8091
for col in transf.keys():
8192
assert X[col].sum() == transf[col]
@@ -84,30 +95,65 @@ def test_encode_categories_in_k_minus_1_binary_plus_list_of_variables(df_enc_big
8495
assert "var_C" in X.columns
8596

8697

87-
def test_encode_top_categories(df_enc_big):
98+
def test_encode_top_categories():
8899
# test case 3: encode only the most popular categories
100+
101+
df = pd.DataFrame(
102+
{
103+
"var_A": ["A"] * 5
104+
+ ["B"] * 11
105+
+ ["C"] * 4
106+
+ ["D"] * 9
107+
+ ["E"] * 2
108+
+ ["F"] * 2
109+
+ ["G"] * 7,
110+
"var_B": ["A"] * 11
111+
+ ["B"] * 7
112+
+ ["C"] * 4
113+
+ ["D"] * 9
114+
+ ["E"] * 2
115+
+ ["F"] * 2
116+
+ ["G"] * 5,
117+
"var_C": ["A"] * 4
118+
+ ["B"] * 5
119+
+ ["C"] * 11
120+
+ ["D"] * 9
121+
+ ["E"] * 2
122+
+ ["F"] * 2
123+
+ ["G"] * 7,
124+
}
125+
)
126+
89127
encoder = OneHotEncoder(top_categories=4, variables=None, drop_last=False)
90-
X = encoder.fit_transform(df_enc_big)
128+
X = encoder.fit_transform(df)
91129

92130
# test init params
93131
assert encoder.top_categories == 4
94132
# test fit attr
95133
transf = {
96-
"var_A_D": 10,
97-
"var_A_B": 10,
98-
"var_A_A": 6,
99-
"var_A_G": 6,
100-
"var_B_A": 10,
101-
"var_B_D": 10,
102-
"var_B_G": 6,
103-
"var_B_B": 6,
104-
"var_C_D": 10,
105-
"var_C_C": 10,
106-
"var_C_G": 6,
107-
"var_C_B": 6,
134+
"var_A_D": 9,
135+
"var_A_B": 11,
136+
"var_A_A": 5,
137+
"var_A_G": 7,
138+
"var_B_A": 11,
139+
"var_B_D": 9,
140+
"var_B_G": 5,
141+
"var_B_B": 7,
142+
"var_C_D": 9,
143+
"var_C_C": 11,
144+
"var_C_G": 7,
145+
"var_C_B": 5,
108146
}
109147

148+
# test fit attr
149+
assert encoder.variables_ == ["var_A", "var_B", "var_C"]
150+
assert encoder.variables_binary_ == []
110151
assert encoder.n_features_in_ == 3
152+
assert encoder.encoder_dict_ == {
153+
"var_A": ["B", "D", "G", "A"],
154+
"var_B": ["A", "D", "B", "G"],
155+
"var_C": ["C", "D", "G", "B"],
156+
}
111157
# test transform output
112158
for col in transf.keys():
113159
assert X[col].sum() == transf[col]
@@ -125,14 +171,12 @@ def test_error_if_drop_last_not_bool():
125171
OneHotEncoder(drop_last=0.5)
126172

127173

128-
def test_fit_raises_error_if_df_contains_na(df_enc_big_na):
174+
def test_raises_error_if_df_contains_na(df_enc_big, df_enc_big_na):
129175
# test case 4: when dataset contains na, fit method
130176
with pytest.raises(ValueError):
131177
encoder = OneHotEncoder()
132178
encoder.fit(df_enc_big_na)
133179

134-
135-
def test_transform_raises_error_if_df_contains_na(df_enc_big, df_enc_big_na):
136180
# test case 4: when dataset contains na, transform method
137181
with pytest.raises(ValueError):
138182
encoder = OneHotEncoder()
@@ -164,7 +208,9 @@ def test_encode_numerical_variables(df_enc_numeric):
164208
X = pd.DataFrame(X).astype("int32")
165209

166210
assert encoder.variables_ == ["var_A", "var_B"]
211+
assert encoder.variables_binary_ == []
167212
assert encoder.n_features_in_ == 2
213+
assert encoder.encoder_dict_ == {"var_A": [1, 2, 3], "var_B": [1, 2, 3]}
168214
# test transform output
169215
pd.testing.assert_frame_equal(X, transf)
170216

@@ -197,6 +243,7 @@ def test_variables_cast_as_category(df_enc_numeric):
197243

198244
assert encoder.variables_ == ["var_A", "var_B"]
199245
assert encoder.n_features_in_ == 2
246+
assert encoder.encoder_dict_ == {"var_A": [1, 2, 3], "var_B": [1, 2, 3]}
200247
# test transform output
201248
pd.testing.assert_frame_equal(X, transf)
202249

@@ -206,89 +253,110 @@ def df_enc_binary():
206253
df = {
207254
"var_A": ["A"] * 6 + ["B"] * 10 + ["C"] * 4,
208255
"var_B": ["A"] * 10 + ["B"] * 6 + ["C"] * 4,
209-
"var_C": ["A"] * 10 + ["B"] * 10,
210-
"target": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0],
256+
"var_C": ["AHA"] * 12 + ["UHU"] * 8,
257+
"var_D": ["OHO"] * 5 + ["EHE"] * 15,
258+
"var_num": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0],
211259
}
212260
df = pd.DataFrame(df)
213261

214262
return df
215263

216264

217-
def test_encode_into_k_binary_plus_drop_binary(df_enc_binary):
265+
def test_encode_into_k_dummy_plus_drop_binary(df_enc_binary):
218266
encoder = OneHotEncoder(
219267
top_categories=None, variables=None, drop_last=False, drop_last_binary=True
220268
)
221269
X = encoder.fit_transform(df_enc_binary)
270+
X = X.astype("int32")
222271

223272
# test fit attr
224273
transf = {
225-
"target": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0],
274+
"var_num": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0],
226275
"var_A_A": [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
227276
"var_A_B": [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
228277
"var_A_C": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
229278
"var_B_A": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
230279
"var_B_B": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
231280
"var_B_C": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
232-
"var_C_A": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
281+
"var_C_AHA": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
282+
"var_D_OHO": [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
233283
}
234-
235284
transf = pd.DataFrame(transf).astype("int32")
236-
X = pd.DataFrame(X).astype("int32")
237285

238-
assert encoder.variables_ == ["var_A", "var_B", "var_C"]
239-
assert encoder.n_features_in_ == 4
286+
assert encoder.variables_ == ["var_A", "var_B", "var_C", "var_D"]
287+
assert encoder.variables_binary_ == ["var_C", "var_D"]
288+
assert encoder.n_features_in_ == 5
289+
assert encoder.encoder_dict_ == {
290+
"var_A": ["A", "B", "C"],
291+
"var_B": ["A", "B", "C"],
292+
"var_C": ["AHA"],
293+
"var_D": ["OHO"],
294+
}
240295
# test transform output
241296
pd.testing.assert_frame_equal(X, transf)
242297
assert "var_C_B" not in X.columns
243298

244299

245-
def test_encode_into_kminus1_binary_plus_drop_binary(df_enc_binary):
300+
def test_encode_into_kminus1_dummyy_plus_drop_binary(df_enc_binary):
246301
encoder = OneHotEncoder(
247302
top_categories=None, variables=None, drop_last=True, drop_last_binary=True
248303
)
249304
X = encoder.fit_transform(df_enc_binary)
305+
X = X.astype("int32")
250306

251307
# test fit attr
252308
transf = {
253-
"target": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0],
309+
"var_num": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0],
254310
"var_A_A": [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
255311
"var_A_B": [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
256312
"var_B_A": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
257313
"var_B_B": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
258-
"var_C_A": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
314+
"var_C_AHA": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
315+
"var_D_OHO": [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
259316
}
260-
261317
transf = pd.DataFrame(transf).astype("int32")
262-
X = pd.DataFrame(X).astype("int32")
263318

264-
assert encoder.variables_ == ["var_A", "var_B", "var_C"]
265-
assert encoder.n_features_in_ == 4
319+
assert encoder.variables_ == ["var_A", "var_B", "var_C", "var_D"]
320+
assert encoder.variables_binary_ == ["var_C", "var_D"]
321+
assert encoder.n_features_in_ == 5
322+
assert encoder.encoder_dict_ == {
323+
"var_A": ["A", "B"],
324+
"var_B": ["A", "B"],
325+
"var_C": ["AHA"],
326+
"var_D": ["OHO"],
327+
}
266328
# test transform output
267329
pd.testing.assert_frame_equal(X, transf)
268330
assert "var_C_B" not in X.columns
269331

270332

271333
def test_encode_into_top_categories_plus_drop_binary(df_enc_binary):
272-
273334
# top_categories = 1
274335
encoder = OneHotEncoder(
275336
top_categories=1, variables=None, drop_last=False, drop_last_binary=True
276337
)
277338
X = encoder.fit_transform(df_enc_binary)
339+
X = X.astype("int32")
278340

279341
# test fit attr
280342
transf = {
281-
"target": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0],
343+
"var_num": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0],
282344
"var_A_B": [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
283345
"var_B_A": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
284-
"var_C_A": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
346+
"var_C_AHA": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
347+
"var_D_OHO": [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
285348
}
286-
287349
transf = pd.DataFrame(transf).astype("int32")
288-
X = pd.DataFrame(X).astype("int32")
289350

290-
assert encoder.variables_ == ["var_A", "var_B", "var_C"]
291-
assert encoder.n_features_in_ == 4
351+
assert encoder.variables_ == ["var_A", "var_B", "var_C", "var_D"]
352+
assert encoder.variables_binary_ == ["var_C", "var_D"]
353+
assert encoder.n_features_in_ == 5
354+
assert encoder.encoder_dict_ == {
355+
"var_A": ["B"],
356+
"var_B": ["A"],
357+
"var_C": ["AHA"],
358+
"var_D": ["OHO"],
359+
}
292360
# test transform output
293361
pd.testing.assert_frame_equal(X, transf)
294362
assert "var_C_B" not in X.columns
@@ -298,22 +366,29 @@ def test_encode_into_top_categories_plus_drop_binary(df_enc_binary):
298366
top_categories=2, variables=None, drop_last=False, drop_last_binary=True
299367
)
300368
X = encoder.fit_transform(df_enc_binary)
369+
X = X.astype("int32")
301370

302371
# test fit attr
303372
transf = {
304-
"target": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0],
373+
"var_num": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0],
305374
"var_A_B": [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
306375
"var_A_A": [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
307376
"var_B_A": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
308377
"var_B_B": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
309-
"var_C_A": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
378+
"var_C_AHA": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
379+
"var_D_OHO": [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
310380
}
311-
312381
transf = pd.DataFrame(transf).astype("int32")
313-
X = pd.DataFrame(X).astype("int32")
314382

315-
assert encoder.variables_ == ["var_A", "var_B", "var_C"]
316-
assert encoder.n_features_in_ == 4
383+
assert encoder.variables_ == ["var_A", "var_B", "var_C", "var_D"]
384+
assert encoder.variables_binary_ == ["var_C", "var_D"]
385+
assert encoder.n_features_in_ == 5
386+
assert encoder.encoder_dict_ == {
387+
"var_A": ["B", "A"],
388+
"var_B": ["A", "B"],
389+
"var_C": ["AHA"],
390+
"var_D": ["OHO"],
391+
}
317392
# test transform output
318393
pd.testing.assert_frame_equal(X, transf)
319394
assert "var_C_B" not in X.columns

0 commit comments

Comments
 (0)