Skip to content

Commit b274c95

Browse files
authored
fix various bugs in RareLabelEncoder (#665)
* fixed a few typos in rare categories * list when fit and nan
1 parent 4d55ed7 commit b274c95

File tree

2 files changed

+197
-13
lines changed

2 files changed

+197
-13
lines changed

feature_engine/encoding/rare_label.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,7 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
218218
"indicated in n_categories. Thus, all categories will be "
219219
"considered frequent".format(var)
220220
)
221-
self.encoder_dict_[var] = X[var].unique()
221+
self.encoder_dict_[var] = list(X[var].unique())
222222

223223
self.variables_ = variables_
224224
self._get_feature_names_in(X)
@@ -247,19 +247,19 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
247247
_check_optional_contains_na(X, self.variables_)
248248

249249
for feature in self.variables_:
250-
X[feature] = np.where(
251-
X[feature].isin(self.encoder_dict_[feature]),
252-
X[feature],
253-
self.replace_with,
254-
)
250+
if X[feature].dtype == "category":
251+
X[feature] = X[feature].cat.add_categories(self.replace_with)
252+
X.loc[
253+
~X[feature].isin(self.encoder_dict_[feature]), feature
254+
] = self.replace_with
255255

256256
else:
257257
for feature in self.variables_:
258-
X[feature] = np.where(
259-
X[feature].isin(self.encoder_dict_[feature] + [np.nan]),
260-
X[feature],
261-
self.replace_with,
262-
)
258+
if X[feature].dtype == "category":
259+
X[feature] = X[feature].cat.add_categories(self.replace_with)
260+
X.loc[
261+
~X[feature].isin(self.encoder_dict_[feature] + [np.nan]), feature
262+
] = self.replace_with
263263

264264
return X
265265

tests/test_encoding/test_rare_label_encoder.py

Lines changed: 186 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,34 @@ def test_defo_params_plus_automatically_find_variables(df_enc_big):
5454
pd.testing.assert_frame_equal(X, df)
5555

5656

57+
def test_when_varnames_are_numbers(df_enc_big):
    """Fit and transform a frame whose column labels are the integers 1, 2, 3.

    Checks the learned variable list, the per-column lists of frequent
    categories, and the transformed frame against hard-coded expectations.
    """

    def expand(counts):
        # Repeat each label by its paired count, keeping the label order.
        labels = ("A", "B", "C", "D", "Rare", "G")
        return [
            label
            for label, times in zip(labels, counts)
            for _ in range(times)
        ]

    # Copy of the fixture with its three columns relabelled as integers.
    numbered = df_enc_big.copy()
    numbered.columns = [1, 2, 3]

    encoder = RareLabelEncoder(
        tol=0.06, n_categories=5, variables=None, replace_with="Rare"
    )
    X = encoder.fit_transform(numbered)

    # Expected transform output, one run of labels per category.
    expected_frame = pd.DataFrame(
        {
            1: expand((6, 10, 4, 10, 4, 6)),
            2: expand((10, 6, 4, 10, 4, 6)),
            3: expand((4, 6, 10, 10, 4, 6)),
        }
    )

    # Expected frequent categories per column, in the order the encoder stores them.
    expected_categories = {
        1: ["B", "D", "A", "G", "C"],
        2: ["A", "D", "B", "G", "C"],
        3: ["C", "D", "B", "G", "A"],
    }

    assert encoder.variables_ == [1, 2, 3]
    assert encoder.encoder_dict_ == expected_categories
    pd.testing.assert_frame_equal(X, expected_frame)
83+
84+
5785
def test_correctly_ignores_nan_in_transform(df_enc_big):
5886
encoder = RareLabelEncoder(
5987
tol=0.06,
@@ -102,7 +130,7 @@ def test_correctly_ignores_nan_in_fit(df_enc_big):
102130
n_categories=3,
103131
missing_values="ignore",
104132
)
105-
X = encoder.fit_transform(df)
133+
encoder.fit(df)
106134

107135
# expected:
108136
frequenc_cat = {
@@ -134,6 +162,90 @@ def test_correctly_ignores_nan_in_fit(df_enc_big):
134162
pd.testing.assert_frame_equal(X, tt)
135163

136164

165+
def test_correctly_ignores_nan_in_fit_when_var_is_numerical(df_enc_big):
    """Fit with NaN in a numerical column, then transform unseen values.

    ``var_C`` is overwritten with numbers followed by six NaN entries and
    the encoder is fitted with ``missing_values="ignore"`` and
    ``ignore_format=True``. The test checks the learned frequent categories
    and that, on transform, NaN stays NaN while unseen values become "Rare".
    """
    train = df_enc_big.copy()
    # 40 values: 1 x4, 2 x6, 3 x10, 4 x10, 5 x2, 6 x2, then NaN x6.
    train["var_C"] = (
        [1] * 4
        + [2] * 6
        + [3] * 10
        + [4] * 10
        + [5] * 2
        + [6] * 2
        + [np.nan] * 6
    )

    encoder = RareLabelEncoder(
        tol=0.06,
        n_categories=3,
        missing_values="ignore",
        ignore_format=True,
    )
    encoder.fit(train)

    # Frequent categories expected after fit; 5 and 6 are absent from var_C.
    expected_categories = {
        "var_A": ["B", "D", "A", "G", "C"],
        "var_B": ["A", "D", "B", "G", "C"],
        "var_C": [3, 4, 2, 1],
    }
    assert encoder.encoder_dict_ == expected_categories

    # Input to transform: one known value, one NaN and unseen values per column.
    raw_labels = ["A", np.nan, "J", "G"]
    to_transform = pd.DataFrame(
        {
            "var_A": list(raw_labels),
            "var_B": list(raw_labels),
            "var_C": [3, np.nan, 9, 10],
        }
    )

    # Expected output: NaN is left in place, unseen values are replaced.
    encoded_labels = ["A", np.nan, "Rare", "G"]
    expected_frame = pd.DataFrame(
        {
            "var_A": list(encoded_labels),
            "var_B": list(encoded_labels),
            "var_C": [3.0, np.nan, "Rare", "Rare"],
        }
    )

    X = encoder.transform(to_transform)
    pd.testing.assert_frame_equal(X, expected_frame, check_dtype=False)
247+
248+
137249
def test_user_provides_grouping_label_name_and_variable_list(df_enc_big):
138250
# test case 2: user provides alternative grouping value and variable list
139251
encoder = RareLabelEncoder(
@@ -316,12 +428,84 @@ def test_variables_cast_as_category(df_enc_big):
316428
+ ["G"] * 6,
317429
}
318430
df = pd.DataFrame(df)
431+
df["var_B"] = pd.Categorical(df["var_B"])
319432

320433
# test fit attr
321434
assert encoder.variables_ == ["var_A", "var_B", "var_C"]
322435
assert encoder.n_features_in_ == 3
323436
# test transform output
324-
pd.testing.assert_frame_equal(X, df)
437+
pd.testing.assert_frame_equal(X, df, check_categorical=False)
438+
439+
440+
def test_variables_cast_as_category_with_na_in_transform(df_enc_big):
    """Transform a frame with NaN when one variable has category dtype.

    The encoder is fitted on a copy of the fixture whose ``var_B`` is cast
    to category. The frame passed to ``transform`` holds a NaN and an unseen
    label in every column; NaN is expected to stay and the unseen label to
    become "Rare".
    """
    columns = ("var_A", "var_B", "var_C")

    encoder = RareLabelEncoder(
        tol=0.06,
        n_categories=5,
        variables=None,
        replace_with="Rare",
        missing_values="ignore",
    )

    # Fit on a copy so the fixture itself is not modified.
    train = df_enc_big.copy()
    train["var_B"] = train["var_B"].astype("category")
    encoder.fit(train)

    # Input: identical content in each column, var_B as a categorical.
    to_transform = pd.DataFrame(
        {name: ["A", np.nan, "J", "G"] for name in columns}
    )
    to_transform["var_B"] = pd.Categorical(to_transform["var_B"])

    # Expected: "J" replaced by "Rare", NaN untouched.
    expected_frame = pd.DataFrame(
        {name: ["A", np.nan, "Rare", "G"] for name in columns}
    )
    expected_frame["var_B"] = pd.Categorical(expected_frame["var_B"])

    result = encoder.transform(to_transform)
    pd.testing.assert_frame_equal(result, expected_frame, check_categorical=False)
473+
474+
475+
def test_variables_cast_as_category_with_na_in_fit(df_enc_big):
    """Fit on a category-dtype variable that contains NaN, then transform.

    Every "G" in ``var_C`` is replaced by NaN before the column is cast to
    category, and the encoder is fitted with ``missing_values="ignore"``.
    On transform, "G" in ``var_C`` is then expected to be encoded as "Rare",
    while NaN stays NaN.
    """
    train = df_enc_big.copy()
    # Blank out the "G" rows of var_C, then make the column categorical.
    is_g = train["var_C"] == "G"
    train["var_C"] = train["var_C"].mask(is_g, np.nan).astype("category")

    encoder = RareLabelEncoder(
        tol=0.06,
        n_categories=3,
        missing_values="ignore",
    )
    encoder.fit(train)

    # Input: var_C is categorical and starts with "C" instead of "A".
    raw_labels = ["A", np.nan, "J", "G"]
    to_transform = pd.DataFrame(
        {
            "var_A": list(raw_labels),
            "var_B": list(raw_labels),
            "var_C": ["C", np.nan, "J", "G"],
        }
    )
    to_transform["var_C"] = pd.Categorical(to_transform["var_C"])

    # Expected: "G" survives in var_A / var_B but not in var_C.
    encoded_labels = ["A", np.nan, "Rare", "G"]
    expected_frame = pd.DataFrame(
        {
            "var_A": list(encoded_labels),
            "var_B": list(encoded_labels),
            "var_C": ["C", np.nan, "Rare", "Rare"],
        }
    )
    expected_frame["var_C"] = pd.Categorical(expected_frame["var_C"])

    result = encoder.transform(to_transform)
    pd.testing.assert_frame_equal(result, expected_frame, check_categorical=False)
325509

326510

327511
def test_inverse_transform_raises_not_implemented_error(df_enc_big):

0 commit comments

Comments
 (0)