|
14 | 14 | from feature_engine.variable_manipulation import ( |
15 | 15 | _find_all_variables, |
16 | 16 | _find_or_check_categorical_variables, |
| 17 | + _check_input_parameter_variables, |
17 | 18 | ) |
18 | 19 |
|
19 | 20 |
|
20 | 21 | class BaseCategoricalTransformer(BaseEstimator, TransformerMixin): |
21 | | - """shared set-up checks and methods across categorical transformers""" |
| 22 | + """shared set-up checks and methods across categorical transformers |
| 23 | +
|
| 24 | + Parameters |
| 25 | + ---------- |
| 26 | + variables: list, default=None |
| 27 | + The list of categorical variables that will be encoded. If None, the |
| 28 | + encoder will find and transform all variables of type object or categorical by |
| 29 | + default. You can also make the transformer accept numerical variables, see the |
| 30 | + next parameter. |
| 31 | +
|
| 32 | + ignore_format: bool, default=False |
| 33 | + Whether the format in which the categorical variables are cast should be |
| 34 | + ignored. If False, the encoder will automatically select variables of type |
| 35 | + object or categorical, or check that the variables entered by the user are of |
| 36 | + type object or categorical. If True, the encoder will select all variables or |
| 37 | + accept all variables entered by the user, including those cast as numeric. |
| 38 | + """ |
| 39 | + |
| 40 | + def __init__( |
| 41 | + self, |
| 42 | + variables: Union[None, int, str, List[Union[str, int]]] = None, |
| 43 | + ignore_format: bool = False, |
| 44 | + ) -> None: |
| 45 | + |
| 46 | + if not isinstance(ignore_format, bool): |
| 47 | + raise ValueError("ignore_format takes only booleans True and False. " |
| 48 | + f"Got {ignore_format} instead.") |
| 49 | + |
| 50 | + self.variables = _check_input_parameter_variables(variables) |
| 51 | + self.ignore_format = ignore_format |
22 | 52 |
|
23 | 53 | def _check_fit_input_and_variables(self, X: pd.DataFrame) -> pd.DataFrame: |
24 | 54 | """ |
@@ -144,14 +174,23 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: |
144 | 174 |
|
145 | 175 | # check if NaN values were introduced by the encoding |
146 | 176 | if X[self.encoder_dict_.keys()].isnull().sum().sum() > 0: |
147 | | - warnings.warn( |
148 | | - "NaN values were introduced in the returned dataframe by the encoder." |
149 | | - "This means that some of the categories in the input dataframe were " |
150 | | - "not present in the training set used when the fit method was called. " |
151 | | - "Thus, mappings for those categories do not exist. Try using the " |
152 | | - "RareLabelCategoricalEncoder to remove infrequent categories before " |
153 | | - "calling this encoder." |
154 | | - ) |
| 177 | + # obtain the name(s) of the columns that have null values |
| 178 | + nan_columns = X.columns[X.isnull().any()].tolist() |
| 179 | + if len(nan_columns) > 1: |
| 180 | + nan_columns_str = ", ".join(nan_columns) |
| 181 | + else: |
| 182 | + nan_columns_str = nan_columns[0] |
| 183 | + |
| 184 | + if self.errors == "ignore": |
| 185 | + warnings.warn( |
| 186 | + "During the encoding, NaN values were introduced in the feature(s) " |
| 187 | + f"{nan_columns_str}." |
| 188 | + ) |
| 189 | + elif self.errors == "raise": |
| 190 | + raise ValueError( |
| 191 | + "During the encoding, NaN values were introduced in the feature(s) " |
| 192 | + f"{nan_columns_str}." |
| 193 | + ) |
155 | 194 |
|
156 | 195 | return X |
157 | 196 |
|
@@ -186,3 +225,47 @@ def _more_tags(self): |
186 | 225 | # so we need to leave without this test |
187 | 226 | tags_dict["_xfail_checks"]["check_estimators_nan_inf"] = "transformer allows NA" |
188 | 227 | return tags_dict |
| 228 | + |
| 229 | + |
| 230 | +class BaseCategorical(BaseCategoricalTransformer): |
| 231 | + """ |
| 232 | + BaseCategorical() is the parent class to some of the encoders. |
| 233 | + It shares set-up checks of init parameters. |
| 234 | +
|
| 235 | + Parameters |
| 236 | + ---------- |
| 237 | + variables: list, default=None |
| 238 | + The list of categorical variables that will be encoded. If None, the |
| 239 | + encoder will find and transform all variables of type object or categorical by |
| 240 | + default. You can also make the transformer accept numerical variables, see the |
| 241 | + next parameter. |
| 242 | +
|
| 243 | + ignore_format: bool, default=False |
| 244 | + Whether the format in which the categorical variables are cast should be |
| 245 | + ignored. If False, the encoder will automatically select variables of type |
| 246 | + object or categorical, or check that the variables entered by the user are of |
| 247 | + type object or categorical. If True, the encoder will select all variables or |
| 248 | + accept all variables entered by the user, including those cast as numeric. |
| 249 | +
|
| 250 | + errors: string, default='ignore' |
| 251 | + Indicates what to do when categories not present in the train set are |
| 252 | + encountered during transform. If 'raise', then unseen categories will raise an |
| 253 | + error. If 'ignore', then unseen categories will be set as NaN and a warning will |
| 254 | + be raised instead. |
| 255 | + """ |
| 256 | + |
| 257 | + def __init__( |
| 258 | + self, |
| 259 | + variables: Union[None, int, str, List[Union[str, int]]] = None, |
| 260 | + ignore_format: bool = False, |
| 261 | + errors: str = "ignore", |
| 262 | + ) -> None: |
| 263 | + |
| 264 | + if errors not in ["raise", "ignore"]: |
| 265 | + raise ValueError( |
| 266 | + "errors takes only values 'raise' and 'ignore ." |
| 267 | + f"Got {errors} instead." |
| 268 | + ) |
| 269 | + |
| 270 | + super().__init__(variables, ignore_format) |
| 271 | + self.errors = errors |
0 commit comments