Skip to content

Commit f510dc7

Browse files
base_encoder NaN Detection (#338)
* identifies column names that have NaN values. Informs user via a warning. Need to create test. * tell user which columns have null values when applicable * fixed error in a variable * fix code style errors * add BaseCategoricalTransformer import to init file * add test_base_categorical_transformer_detect_nan fcn * fix style error - no newline at end of init file * return __init__ to prior version. remove BaseCategoricalTransformer * revise warning text * add UserWarning to NaN detection warning statement * implement pytest.raise() and assert statement in test_base_categorical_transformer_detect_nan() * resets init * renamed test file * modifies error messages, removes irrelevant files, adds tests * add init param rare_labels=ignore to the CountFrequencyEncoder, WoEEncoder, PRatioEncoder, MeanEncoder, and Ordinal Encoder classes * add error to BaseCategorialTransformer transform() method if rare_labels = 'raise' * update test_error_if_input_df_contains_categories_not_present_in_fit_df fcn to test for error message if encoder.rare_labels = 'raise' * change ValueError text caused by NaN values in a feature(s) * change message string to check ValueError for test_error_if_input_df_contains_categories_not_present_in_fit_df fcn * add init params - variables and ignore_format - and respective docstrings to BaseCategoricalTransformer() * delete init params -'variables' and 'ignore_format' - and correspodning checks/docstring from CountFrequencyEncoder(), DecisionTreeEncoder(), and MeanEncoder() classes * delete init params - 'variables' and 'ignore_format' - and corresponding checks/docstring from OneHotEncoder, OrdinalEncoder, PRatioEncoder, RareLabelEncoder, and WoEEncoder * reverts last 3 commits * creates new class with 3 shared parameters * refactors init in encoders * refactor init in remaining encoders * removes comma from docstrings * adds param to cat encoder in tree discretizer * add rare_labels='raise' to three encodings test for 
test_warn_if_transform_df_contains_categories_not_seen_in_fit * add rare_labels param to test_warn_if_transform_df_contains_categories_not_seen_in_fit for OrdinalEncoder, PRatioEncoder, and WoEEncoder * fix bug in test_warn_if_transform_df_contains_categories_not_seen_in_fit for PRatioEncoder * fix assert for ValueError check on test_warning_if_transform_df_contains_categories_not_present_in_fit_df for five encoders * debug test_error_if_input_df_contains_categories_not_present_in_training_df * fix style test fail * create test_error_if_rare_labels_not_permitted_value for five encoders * change 'rare_labels' to 'errors' * change self.rare_labels to self.errors * Delete Documents/09_freelance/feature_engine/envs/feature-engine directory remove virtual environment directory Co-authored-by: Soledad Galli <[email protected]>
1 parent 2f22a2b commit f510dc7

15 files changed

+288
-81
lines changed

feature_engine/encoding/base_encoder.py

Lines changed: 92 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,41 @@
1414
from feature_engine.variable_manipulation import (
1515
_find_all_variables,
1616
_find_or_check_categorical_variables,
17+
_check_input_parameter_variables,
1718
)
1819

1920

2021
class BaseCategoricalTransformer(BaseEstimator, TransformerMixin):
21-
"""shared set-up checks and methods across categorical transformers"""
22+
"""shared set-up checks and methods across categorical transformers
23+
24+
Parameters
25+
----------
26+
variables: list, default=None
27+
The list of categorical variables that will be encoded. If None, the
28+
encoder will find and transform all variables of type object or categorical by
29+
default. You can also make the transformer accept numerical variables, see the
30+
next parameter.
31+
32+
ignore_format: bool, default=False
33+
Whether the format in which the categorical variables are cast should be
34+
ignored. If False, the encoder will automatically select variables of type
35+
object or categorical, or check that the variables entered by the user are of
36+
type object or categorical. If True, the encoder will select all variables or
37+
accept all variables entered by the user, including those cast as numeric.
38+
"""
39+
40+
def __init__(
41+
self,
42+
variables: Union[None, int, str, List[Union[str, int]]] = None,
43+
ignore_format: bool = False,
44+
) -> None:
45+
46+
if not isinstance(ignore_format, bool):
47+
raise ValueError("ignore_format takes only booleans True and False. "
48+
f"Got {ignore_format} instead.")
49+
50+
self.variables = _check_input_parameter_variables(variables)
51+
self.ignore_format = ignore_format
2252

2353
def _check_fit_input_and_variables(self, X: pd.DataFrame) -> pd.DataFrame:
2454
"""
@@ -144,14 +174,23 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
144174

145175
# check if NaN values were introduced by the encoding
146176
if X[self.encoder_dict_.keys()].isnull().sum().sum() > 0:
147-
warnings.warn(
148-
"NaN values were introduced in the returned dataframe by the encoder."
149-
"This means that some of the categories in the input dataframe were "
150-
"not present in the training set used when the fit method was called. "
151-
"Thus, mappings for those categories do not exist. Try using the "
152-
"RareLabelCategoricalEncoder to remove infrequent categories before "
153-
"calling this encoder."
154-
)
177+
# obtain the name(s) of the columns that have null values
178+
nan_columns = X.columns[X.isnull().any()].tolist()
179+
if len(nan_columns) > 1:
180+
nan_columns_str = ", ".join(nan_columns)
181+
else:
182+
nan_columns_str = nan_columns[0]
183+
184+
if self.errors == "ignore":
185+
warnings.warn(
186+
"During the encoding, NaN values were introduced in the feature(s) "
187+
f"{nan_columns_str}."
188+
)
189+
elif self.errors == "raise":
190+
raise ValueError(
191+
"During the encoding, NaN values were introduced in the feature(s) "
192+
f"{nan_columns_str}."
193+
)
155194

156195
return X
157196

@@ -186,3 +225,47 @@ def _more_tags(self):
186225
# so we need to leave without this test
187226
tags_dict["_xfail_checks"]["check_estimators_nan_inf"] = "transformer allows NA"
188227
return tags_dict
228+
229+
230+
class BaseCategorical(BaseCategoricalTransformer):
231+
"""
232+
BaseCategorical() is the parent class to some of the encoders.
233+
It shares set-up checks of init parameters.
234+
235+
Parameters
236+
----------
237+
variables: list, default=None
238+
The list of categorical variables that will be encoded. If None, the
239+
encoder will find and transform all variables of type object or categorical by
240+
default. You can also make the transformer accept numerical variables, see the
241+
next parameter.
242+
243+
ignore_format: bool, default=False
244+
Whether the format in which the categorical variables are cast should be
245+
ignored. If False, the encoder will automatically select variables of type
246+
object or categorical, or check that the variables entered by the user are of
247+
type object or categorical. If True, the encoder will select all variables or
248+
accept all variables entered by the user, including those cast as numeric.
249+
250+
errors: string, default='ignore'
251+
Indicates what to do when categories not present in the train set are
252+
encountered during transform. If 'raise', then rare categories will raise an
253+
error. If 'ignore', then rare categories will be set as NaN and a warning will
254+
be raised instead.
255+
"""
256+
257+
def __init__(
258+
self,
259+
variables: Union[None, int, str, List[Union[str, int]]] = None,
260+
ignore_format: bool = False,
261+
errors: str = "ignore",
262+
) -> None:
263+
264+
if errors not in ["raise", "ignore"]:
265+
raise ValueError(
266+
"errors takes only values 'raise' and 'ignore ."
267+
f"Got {errors} instead."
268+
)
269+
270+
super().__init__(variables, ignore_format)
271+
self.errors = errors

feature_engine/encoding/count_frequency.py

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,10 @@
55

66
import pandas as pd
77

8-
from feature_engine.encoding.base_encoder import BaseCategoricalTransformer
9-
from feature_engine.variable_manipulation import _check_input_parameter_variables
8+
from feature_engine.encoding.base_encoder import BaseCategorical
109

1110

12-
class CountFrequencyEncoder(BaseCategoricalTransformer):
11+
class CountFrequencyEncoder(BaseCategorical):
1312
"""
1413
The CountFrequencyEncoder() replaces categories by either the count or the
1514
percentage of observations per category.
@@ -55,6 +54,12 @@ class CountFrequencyEncoder(BaseCategoricalTransformer):
5554
type object or categorical. If True, the encoder will select all variables or
5655
accept all variables entered by the user, including those cast as numeric.
5756
57+
errors: string, default='ignore'
58+
Indicates what to do when categories not present in the train set are
59+
encountered during transform. If 'raise', then rare categories will raise an
60+
error. If 'ignore', then rare categories will be set as NaN and a warning will
61+
be raised instead.
62+
5863
Attributes
5964
----------
6065
encoder_dict_:
@@ -97,18 +102,16 @@ def __init__(
97102
encoding_method: str = "count",
98103
variables: Union[None, int, str, List[Union[str, int]]] = None,
99104
ignore_format: bool = False,
105+
errors: str = "ignore"
100106
) -> None:
101107

102108
if encoding_method not in ["count", "frequency"]:
103109
raise ValueError(
104110
"encoding_method takes only values 'count' and 'frequency'"
105111
)
106-
if not isinstance(ignore_format, bool):
107-
raise ValueError("ignore_format takes only booleans True and False")
112+
super().__init__(variables, ignore_format, errors)
108113

109114
self.encoding_method = encoding_method
110-
self.variables = _check_input_parameter_variables(variables)
111-
self.ignore_format = ignore_format
112115

113116
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
114117
"""
@@ -149,11 +152,11 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
149152

150153
return X
151154

152-
transform.__doc__ = BaseCategoricalTransformer.transform.__doc__
155+
transform.__doc__ = BaseCategorical.transform.__doc__
153156

154157
def inverse_transform(self, X: pd.DataFrame) -> pd.DataFrame:
155158
X = super().inverse_transform(X)
156159

157160
return X
158161

159-
inverse_transform.__doc__ = BaseCategoricalTransformer.inverse_transform.__doc__
162+
inverse_transform.__doc__ = BaseCategorical.inverse_transform.__doc__

feature_engine/encoding/decision_tree.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
from feature_engine.discretisation import DecisionTreeDiscretiser
1010
from feature_engine.encoding.base_encoder import BaseCategoricalTransformer
1111
from feature_engine.encoding.ordinal import OrdinalEncoder
12-
from feature_engine.variable_manipulation import _check_input_parameter_variables
1312

1413

1514
class DecisionTreeEncoder(BaseCategoricalTransformer):
@@ -139,14 +138,13 @@ def __init__(
139138
ignore_format: bool = False,
140139
) -> None:
141140

141+
super().__init__(variables, ignore_format)
142142
self.encoding_method = encoding_method
143143
self.cv = cv
144144
self.scoring = scoring
145145
self.regression = regression
146146
self.param_grid = param_grid
147147
self.random_state = random_state
148-
self.variables = _check_input_parameter_variables(variables)
149-
self.ignore_format = ignore_format
150148

151149
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
152150
"""
@@ -176,6 +174,7 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
176174
encoding_method=self.encoding_method,
177175
variables=self.variables_,
178176
ignore_format=self.ignore_format,
177+
errors="raise",
179178
)
180179

181180
# initialize decision tree discretiser

feature_engine/encoding/mean_encoding.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,10 @@
55

66
import pandas as pd
77

8-
from feature_engine.encoding.base_encoder import BaseCategoricalTransformer
9-
from feature_engine.variable_manipulation import _check_input_parameter_variables
8+
from feature_engine.encoding.base_encoder import BaseCategorical
109

1110

12-
class MeanEncoder(BaseCategoricalTransformer):
11+
class MeanEncoder(BaseCategorical):
1312
"""
1413
The MeanEncoder() replaces categories by the mean value of the target for each
1514
category.
@@ -47,6 +46,12 @@ class MeanEncoder(BaseCategoricalTransformer):
4746
type object or categorical. If True, the encoder will select all variables or
4847
accept all variables entered by the user, including those cast as numeric.
4948
49+
errors: string, default='ignore'
50+
Indicates what to do when categories not present in the train set are
51+
encountered during transform. If 'raise', then rare categories will raise an
52+
error. If 'ignore', then rare categories will be set as NaN and a warning will
53+
be raised instead.
54+
5055
Attributes
5156
----------
5257
encoder_dict_:
@@ -95,13 +100,10 @@ def __init__(
95100
self,
96101
variables: Union[None, int, str, List[Union[str, int]]] = None,
97102
ignore_format: bool = False,
103+
errors: str = "ignore"
98104
) -> None:
99105

100-
if not isinstance(ignore_format, bool):
101-
raise ValueError("ignore_format takes only booleans True and False")
102-
103-
self.variables = _check_input_parameter_variables(variables)
104-
self.ignore_format = ignore_format
106+
super().__init__(variables, ignore_format, errors)
105107

106108
def fit(self, X: pd.DataFrame, y: pd.Series):
107109
"""
@@ -142,11 +144,11 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
142144

143145
return X
144146

145-
transform.__doc__ = BaseCategoricalTransformer.transform.__doc__
147+
transform.__doc__ = BaseCategorical.transform.__doc__
146148

147149
def inverse_transform(self, X: pd.DataFrame) -> pd.DataFrame:
148150
X = super().inverse_transform(X)
149151

150152
return X
151153

152-
inverse_transform.__doc__ = BaseCategoricalTransformer.inverse_transform.__doc__
154+
inverse_transform.__doc__ = BaseCategorical.inverse_transform.__doc__

feature_engine/encoding/one_hot.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
import pandas as pd
88

99
from feature_engine.encoding.base_encoder import BaseCategoricalTransformer
10-
from feature_engine.variable_manipulation import _check_input_parameter_variables
1110

1211

1312
class OneHotEncoder(BaseCategoricalTransformer):
@@ -146,14 +145,10 @@ def __init__(
146145
if not isinstance(drop_last_binary, bool):
147146
raise ValueError("drop_last_binary takes only True or False")
148147

149-
if not isinstance(ignore_format, bool):
150-
raise ValueError("ignore_format takes only booleans True and False")
151-
148+
super().__init__(variables, ignore_format)
152149
self.top_categories = top_categories
153150
self.drop_last = drop_last
154151
self.drop_last_binary = drop_last_binary
155-
self.variables = _check_input_parameter_variables(variables)
156-
self.ignore_format = ignore_format
157152

158153
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
159154
"""

feature_engine/encoding/ordinal.py

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,10 @@
55

66
import pandas as pd
77

8-
from feature_engine.encoding.base_encoder import BaseCategoricalTransformer
9-
from feature_engine.variable_manipulation import _check_input_parameter_variables
8+
from feature_engine.encoding.base_encoder import BaseCategorical
109

1110

12-
class OrdinalEncoder(BaseCategoricalTransformer):
11+
class OrdinalEncoder(BaseCategorical):
1312
"""
1413
The OrdinalEncoder() replaces categories by ordinal numbers
1514
(0, 1, 2, 3, etc). The numbers can be ordered based on the mean of the target
@@ -52,6 +51,12 @@ class OrdinalEncoder(BaseCategoricalTransformer):
5251
type object or categorical. If True, the encoder will select all variables or
5352
accept all variables entered by the user, including those cast as numeric.
5453
54+
errors: string, default='ignore'
55+
Indicates what to do when categories not present in the train set are
56+
encountered during transform. If 'raise', then rare categories will raise an
57+
error. If 'ignore', then rare categories will be set as NaN and a warning will
58+
be raised instead.
59+
5560
Attributes
5661
----------
5762
encoder_dict_:
@@ -102,19 +107,17 @@ def __init__(
102107
encoding_method: str = "ordered",
103108
variables: Union[None, int, str, List[Union[str, int]]] = None,
104109
ignore_format: bool = False,
110+
errors: str = "ignore"
105111
) -> None:
106112

107113
if encoding_method not in ["ordered", "arbitrary"]:
108114
raise ValueError(
109115
"encoding_method takes only values 'ordered' and 'arbitrary'"
110116
)
111117

112-
if not isinstance(ignore_format, bool):
113-
raise ValueError("ignore_format takes only booleans True and False")
118+
super().__init__(variables, ignore_format, errors)
114119

115120
self.encoding_method = encoding_method
116-
self.variables = _check_input_parameter_variables(variables)
117-
self.ignore_format = ignore_format
118121

119122
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
120123
"""Learn the numbers to be used to replace the categories in each
@@ -174,11 +177,11 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
174177

175178
return X
176179

177-
transform.__doc__ = BaseCategoricalTransformer.transform.__doc__
180+
transform.__doc__ = BaseCategorical.transform.__doc__
178181

179182
def inverse_transform(self, X: pd.DataFrame) -> pd.DataFrame:
180183
X = super().inverse_transform(X)
181184

182185
return X
183186

184-
inverse_transform.__doc__ = BaseCategoricalTransformer.inverse_transform.__doc__
187+
inverse_transform.__doc__ = BaseCategorical.inverse_transform.__doc__

0 commit comments

Comments
 (0)