feature-engine
diff --git a/‎feature_engine/encoding/base_encoder.py‎
Lines changed: 92 additions & 9 deletions b/‎feature_engine/encoding/base_encoder.py‎
Lines changed: 92 additions & 9 deletions
diff --git a/‎feature_engine/encoding/count_frequency.py‎
Lines changed: 12 additions & 9 deletions b/‎feature_engine/encoding/count_frequency.py‎
Lines changed: 12 additions & 9 deletions
diff --git a/‎feature_engine/encoding/decision_tree.py‎
Lines changed: 2 additions & 3 deletions b/‎feature_engine/encoding/decision_tree.py‎
Lines changed: 2 additions & 3 deletions
diff --git a/‎feature_engine/encoding/mean_encoding.py‎
Lines changed: 12 additions & 10 deletions b/‎feature_engine/encoding/mean_encoding.py‎
Lines changed: 12 additions & 10 deletions
diff --git a/‎feature_engine/encoding/one_hot.py‎
Lines changed: 1 addition & 6 deletions b/‎feature_engine/encoding/one_hot.py‎
Lines changed: 1 addition & 6 deletions
diff --git a/‎feature_engine/encoding/ordinal.py‎
Lines changed: 12 additions & 9 deletions b/‎feature_engine/encoding/ordinal.py‎
Lines changed: 12 additions & 9 deletions
@@ -14,11 +14,41 @@
 from feature_engine.variable_manipulation import (
     _find_all_variables,
     _find_or_check_categorical_variables,
+    _check_input_parameter_variables,
 )
 
 
 class BaseCategoricalTransformer(BaseEstimator, TransformerMixin):
-    """shared set-up checks and methods across categorical transformers"""
+    """shared set-up checks and methods across categorical transformers
+
+    Parameters
+    ----------
+    variables: list, default=None
+        The list of categorical variables that will be encoded. If None, the
+        encoder will find and transform all variables of type object or categorical by
+        default. You can also make the transformer accept numerical variables, see the
+        next parameter.
+
+    ignore_format: bool, default=False
+        Whether the format in which the categorical variables are cast should be
+        ignored. If False, the encoder will automatically select variables of type
+        object or categorical, or check that the variables entered by the user are of
+        type object or categorical. If True, the encoder will select all variables or
+        accept all variables entered by the user, including those cast as numeric.
+    """
+
+    def __init__(
+        self,
+        variables: Union[None, int, str, List[Union[str, int]]] = None,
+        ignore_format: bool = False,
+    ) -> None:
+        # Reject a non-boolean ignore_format before any attribute is set.
+        if not isinstance(ignore_format, bool):
+            raise ValueError("ignore_format takes only booleans True and False. "
+                             f"Got {ignore_format} instead.")
+
+        self.variables = _check_input_parameter_variables(variables)
+        self.ignore_format = ignore_format
 
     def _check_fit_input_and_variables(self, X: pd.DataFrame) -> pd.DataFrame:
         """
@@ -144,14 +174,23 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
 
         # check if NaN values were introduced by the encoding
         if X[self.encoder_dict_.keys()].isnull().sum().sum() > 0:
-            warnings.warn(
-                "NaN values were introduced in the returned dataframe by the encoder."
-                "This means that some of the categories in the input dataframe were "
-                "not present in the training set used when the fit method was called. "
-                "Thus, mappings for those categories do not exist. Try using the "
-                "RareLabelCategoricalEncoder to remove infrequent categories before "
-                "calling this encoder."
-            )
+            # obtain the name(s) of the columns that have null values
+            nan_columns = X.columns[X.isnull().any()].tolist()
+            if len(nan_columns) > 1:
+                nan_columns_str = ", ".join(nan_columns)
+            else:
+                nan_columns_str = nan_columns[0]
+
+            if self.errors == "ignore":
+                warnings.warn(
+                    "During the encoding, NaN values were introduced in the feature(s) "
+                    f"{nan_columns_str}."
+                )
+            elif self.errors == "raise":
+                raise ValueError(
+                    "During the encoding, NaN values were introduced in the feature(s) "
+                    f"{nan_columns_str}."
+                )
 
         return X
 
@@ -186,3 +225,47 @@ def _more_tags(self):
         # so we need to leave without this test
         tags_dict["_xfail_checks"]["check_estimators_nan_inf"] = "transformer allows NA"
         return tags_dict
+
+
+class BaseCategorical(BaseCategoricalTransformer):
+    """
+    BaseCategorical() is the parent class to some of the encoders.
+    It shares set-up checks of init parameters.
+
+    Parameters
+    ----------
+    variables: list, default=None
+        The list of categorical variables that will be encoded. If None, the
+        encoder will find and transform all variables of type object or categorical by
+        default. You can also make the transformer accept numerical variables, see the
+        next parameter.
+
+    ignore_format: bool, default=False
+        Whether the format in which the categorical variables are cast should be
+        ignored. If False, the encoder will automatically select variables of type
+        object or categorical, or check that the variables entered by the user are of
+        type object or categorical. If True, the encoder will select all variables or
+        accept all variables entered by the user, including those cast as numeric.
+
+    errors: string, default='ignore'
+        Indicates what to do when categories not present in the train set are
+        encountered during transform. If 'raise', then unseen categories will raise
+        an error. If 'ignore', then unseen categories will be set as NaN and a
+        warning will be issued instead.
+    """
+
+    def __init__(
+        self,
+        variables: Union[None, int, str, List[Union[str, int]]] = None,
+        ignore_format: bool = False,
+        errors: str = "ignore",
+    ) -> None:
+        # Validate errors first so a bad value fails before the parent set-up runs.
+        if errors not in ["raise", "ignore"]:
+            raise ValueError(
+                "errors takes only values 'raise' and 'ignore'. "
+                f"Got {errors} instead."
+            )
+
+        super().__init__(variables, ignore_format)
+        self.errors = errors
@@ -5,11 +5,10 @@
 
 import pandas as pd
 
-from feature_engine.encoding.base_encoder import BaseCategoricalTransformer
-from feature_engine.variable_manipulation import _check_input_parameter_variables
+from feature_engine.encoding.base_encoder import BaseCategorical
 
 
-class CountFrequencyEncoder(BaseCategoricalTransformer):
+class CountFrequencyEncoder(BaseCategorical):
     """
     The CountFrequencyEncoder() replaces categories by either the count or the
     percentage of observations per category.
@@ -55,6 +54,12 @@ class CountFrequencyEncoder(BaseCategoricalTransformer):
         type object or categorical. If True, the encoder will select all variables or
         accept all variables entered by the user, including those cast as numeric.
 
+    errors: string, default='ignore'
+        Indicates what to do when categories not present in the train set are
+        encountered during transform. If 'raise', then rare categories will raise an
+        error. If 'ignore', then rare categories will be set as NaN and a warning will
+        be raised instead.
+
     Attributes
     ----------
     encoder_dict_:
@@ -97,18 +102,16 @@ def __init__(
         encoding_method: str = "count",
         variables: Union[None, int, str, List[Union[str, int]]] = None,
         ignore_format: bool = False,
+        errors: str = "ignore"
     ) -> None:
 
         if encoding_method not in ["count", "frequency"]:
             raise ValueError(
                 "encoding_method takes only values 'count' and 'frequency'"
             )
-        if not isinstance(ignore_format, bool):
-            raise ValueError("ignore_format takes only booleans True and False")
+        super().__init__(variables, ignore_format, errors)
 
         self.encoding_method = encoding_method
-        self.variables = _check_input_parameter_variables(variables)
-        self.ignore_format = ignore_format
 
     def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
         """
@@ -149,11 +152,11 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
 
         return X
 
-    transform.__doc__ = BaseCategoricalTransformer.transform.__doc__
+    transform.__doc__ = BaseCategorical.transform.__doc__
 
     def inverse_transform(self, X: pd.DataFrame) -> pd.DataFrame:
         X = super().inverse_transform(X)
 
         return X
 
-    inverse_transform.__doc__ = BaseCategoricalTransformer.inverse_transform.__doc__
+    inverse_transform.__doc__ = BaseCategorical.inverse_transform.__doc__
@@ -9,7 +9,6 @@
 from feature_engine.discretisation import DecisionTreeDiscretiser
 from feature_engine.encoding.base_encoder import BaseCategoricalTransformer
 from feature_engine.encoding.ordinal import OrdinalEncoder
-from feature_engine.variable_manipulation import _check_input_parameter_variables
 
 
 class DecisionTreeEncoder(BaseCategoricalTransformer):
@@ -139,14 +138,13 @@ def __init__(
         ignore_format: bool = False,
     ) -> None:
 
+        super().__init__(variables, ignore_format)
         self.encoding_method = encoding_method
         self.cv = cv
         self.scoring = scoring
         self.regression = regression
         self.param_grid = param_grid
         self.random_state = random_state
-        self.variables = _check_input_parameter_variables(variables)
-        self.ignore_format = ignore_format
 
     def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
         """
@@ -176,6 +174,7 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
             encoding_method=self.encoding_method,
             variables=self.variables_,
             ignore_format=self.ignore_format,
+            errors="raise",
         )
 
         # initialize decision tree discretiser
 
@@ -5,11 +5,10 @@
 
 import pandas as pd
 
-from feature_engine.encoding.base_encoder import BaseCategoricalTransformer
-from feature_engine.variable_manipulation import _check_input_parameter_variables
+from feature_engine.encoding.base_encoder import BaseCategorical
 
 
-class MeanEncoder(BaseCategoricalTransformer):
+class MeanEncoder(BaseCategorical):
     """
     The MeanEncoder() replaces categories by the mean value of the target for each
     category.
@@ -47,6 +46,12 @@ class MeanEncoder(BaseCategoricalTransformer):
         type object or categorical. If True, the encoder will select all variables or
         accept all variables entered by the user, including those cast as numeric.
 
+    errors: string, default='ignore'
+        Indicates what to do when categories not present in the train set are
+        encountered during transform. If 'raise', then rare categories will raise an
+        error. If 'ignore', then rare categories will be set as NaN and a warning will
+        be raised instead.
+
     Attributes
     ----------
     encoder_dict_:
@@ -95,13 +100,10 @@ def __init__(
         self,
         variables: Union[None, int, str, List[Union[str, int]]] = None,
         ignore_format: bool = False,
+        errors: str = "ignore"
     ) -> None:
 
-        if not isinstance(ignore_format, bool):
-            raise ValueError("ignore_format takes only booleans True and False")
-
-        self.variables = _check_input_parameter_variables(variables)
-        self.ignore_format = ignore_format
+        super().__init__(variables, ignore_format, errors)
 
     def fit(self, X: pd.DataFrame, y: pd.Series):
         """
@@ -142,11 +144,11 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
 
         return X
 
-    transform.__doc__ = BaseCategoricalTransformer.transform.__doc__
+    transform.__doc__ = BaseCategorical.transform.__doc__
 
     def inverse_transform(self, X: pd.DataFrame) -> pd.DataFrame:
         X = super().inverse_transform(X)
 
         return X
 
-    inverse_transform.__doc__ = BaseCategoricalTransformer.inverse_transform.__doc__
+    inverse_transform.__doc__ = BaseCategorical.inverse_transform.__doc__
@@ -7,7 +7,6 @@
 import pandas as pd
 
 from feature_engine.encoding.base_encoder import BaseCategoricalTransformer
-from feature_engine.variable_manipulation import _check_input_parameter_variables
 
 
 class OneHotEncoder(BaseCategoricalTransformer):
@@ -146,14 +145,10 @@ def __init__(
         if not isinstance(drop_last_binary, bool):
             raise ValueError("drop_last_binary takes only True or False")
 
-        if not isinstance(ignore_format, bool):
-            raise ValueError("ignore_format takes only booleans True and False")
-
+        super().__init__(variables, ignore_format)
         self.top_categories = top_categories
         self.drop_last = drop_last
         self.drop_last_binary = drop_last_binary
-        self.variables = _check_input_parameter_variables(variables)
-        self.ignore_format = ignore_format
 
     def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
         """
 
@@ -5,11 +5,10 @@
 
 import pandas as pd
 
-from feature_engine.encoding.base_encoder import BaseCategoricalTransformer
-from feature_engine.variable_manipulation import _check_input_parameter_variables
+from feature_engine.encoding.base_encoder import BaseCategorical
 
 
-class OrdinalEncoder(BaseCategoricalTransformer):
+class OrdinalEncoder(BaseCategorical):
     """
     The OrdinalCategoricalEncoder() replaces categories by ordinal numbers
     (0, 1, 2, 3, etc). The numbers can be ordered based on the mean of the target
@@ -52,6 +51,12 @@ class OrdinalEncoder(BaseCategoricalTransformer):
         type object or categorical. If True, the encoder will select all variables or
         accept all variables entered by the user, including those cast as numeric.
 
+    errors: string, default='ignore'
+        Indicates what to do when categories not present in the train set are
+        encountered during transform. If 'raise', then rare categories will raise an
+        error. If 'ignore', then rare categories will be set as NaN and a warning will
+        be raised instead.
+
     Attributes
     ----------
     encoder_dict_:
@@ -102,19 +107,17 @@ def __init__(
         encoding_method: str = "ordered",
         variables: Union[None, int, str, List[Union[str, int]]] = None,
         ignore_format: bool = False,
+        errors: str = "ignore"
     ) -> None:
 
         if encoding_method not in ["ordered", "arbitrary"]:
             raise ValueError(
                 "encoding_method takes only values 'ordered' and 'arbitrary'"
             )
 
-        if not isinstance(ignore_format, bool):
-            raise ValueError("ignore_format takes only booleans True and False")
+        super().__init__(variables, ignore_format, errors)
 
         self.encoding_method = encoding_method
-        self.variables = _check_input_parameter_variables(variables)
-        self.ignore_format = ignore_format
 
     def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
         """Learn the numbers to be used to replace the categories in each
@@ -174,11 +177,11 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
 
         return X
 
-    transform.__doc__ = BaseCategoricalTransformer.transform.__doc__
+    transform.__doc__ = BaseCategorical.transform.__doc__
 
     def inverse_transform(self, X: pd.DataFrame) -> pd.DataFrame:
         X = super().inverse_transform(X)
 
         return X
 
-    inverse_transform.__doc__ = BaseCategoricalTransformer.inverse_transform.__doc__
+    inverse_transform.__doc__ = BaseCategorical.inverse_transform.__doc__