TargetMeanDiscretiser: sorts variables in bins and replaces bins by target mean value #419
@@ -0,0 +1,5 @@
TargetMeanDiscretiser
=====================

.. autoclass:: feature_engine.discretisation.TargetMeanDiscretiser
    :members:
@@ -0,0 +1,84 @@
.. _target_mean_discretiser:

.. currentmodule:: feature_engine.discretisation

TargetMeanDiscretiser
=====================

The :class:`TargetMeanDiscretiser()` sorts the values of numerical variables into bins
using either :class:`EqualFrequencyDiscretiser()` or :class:`EqualWidthDiscretiser()`. Once
the values are sorted into bins, :class:`MeanEncoder()` replaces each bin with the mean of
the target over the interval. The number of bins is determined by the user.
Let's look at an example using the California Housing Dataset.

First, let's load the data and separate it into train and test sets:

.. code:: python

    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from sklearn.datasets import fetch_california_housing
    from sklearn.model_selection import train_test_split

    from feature_engine.discretisation import TargetMeanDiscretiser

    # Load dataset
    california_dataset = fetch_california_housing()
    data = pd.DataFrame(california_dataset.data, columns=california_dataset.feature_names)

    # Separate into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        data, california_dataset["target"], test_size=0.3,
        random_state=0)
Now, we set up the :class:`TargetMeanDiscretiser()` to sort the values of the 3 indicated
variables into 5 equal-frequency bins and replace each bin with its target mean:

.. code:: python

    # set up the discretisation transformer
    disc = TargetMeanDiscretiser(variables=["HouseAge", "AveRooms", "Population"],
                                 strategy="equal_frequency",
                                 bins=5)

    # fit the transformer
    disc.fit(X_train, y_train)
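After fitting, we can inspect the learned interval boundaries on the discretiser step of
the transformer's internal pipeline. This is a minimal sketch, assuming the fitted `disc`
from above and that the discretiser step stores its limits in `binner_dict_`, as the
feature-engine discretisers do:

.. code:: python

    # interval limits learned by the discretiser step
    disc._pipeline["discretiser"].binner_dict_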
With `fit()`, the transformer learns the boundaries of each interval and sorts the values
into the intervals. The transformer also learns the target mean value for each interval,
which is stored in the `encoder_dict_` attribute:

.. code:: python

    disc._pipeline["encoder"].encoder_dict_

The `encoder_dict_` contains the mean value of the target per bin interval, per variable.
So we can easily use this dictionary to map the original values to the target mean of
their bin, as sketched after the output below.
.. code:: python

    {'HouseAge': {Interval(-inf, 17.0, closed='right'): 2.0806529160739684,
                  Interval(17.0, 25.0, closed='right'): 2.097539197771588,
                  Interval(25.0, 33.0, closed='right'): 2.0686614742967993,
                  Interval(33.0, 40.0, closed='right'): 2.1031412685185185,
                  Interval(40.0, inf, closed='right'): 2.0266248845381525},
     'AveRooms': {Interval(-inf, 4.281, closed='right'): 2.0751556984478934,
                  Interval(4.281, 4.94, closed='right'): 2.0353196247563354,
                  Interval(4.94, 5.524, closed='right'): 2.122038111675127,
                  Interval(5.524, 6.258, closed='right'): 2.0422810965372507,
                  Interval(6.258, inf, closed='right'): 2.103166361757106},
     'Population': {Interval(-inf, 709.0, closed='right'): 2.0853869883779685,
                    Interval(709.0, 1004.0, closed='right'): 2.0658340239808153,
                    Interval(1004.0, 1346.0, closed='right'): 2.0712619255907487,
                    Interval(1346.0, 1905.0, closed='right'): 2.0454417591204397,
                    Interval(1905.0, inf, closed='right'): 2.108366283914729}}
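As a quick illustration, here is a minimal sketch, assuming the fitted `disc` from above,
that looks up the target mean for a raw `HouseAge` value by checking which learned
interval contains it (`pandas.Interval` supports membership tests with `in`):

.. code:: python

    house_age_map = disc._pipeline["encoder"].encoder_dict_["HouseAge"]

    value = 30.0
    for interval, mean in house_age_map.items():
        if value in interval:
            # prints the interval (25.0, 33.0] and its target mean
            print(f"HouseAge={value} falls in {interval} -> target mean {mean:.4f}")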
We can now go ahead and replace the original values with the target means:

.. code:: python

    # transform the data
    train_t = disc.transform(X_train)
    test_t = disc.transform(X_test)
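After the transformation, each discretised variable holds at most as many distinct values
as there are bins, one target mean per interval. A minimal sketch to verify this, assuming
the objects from above:

.. code:: python

    # at most 5 distinct values per transformed variable: one mean per bin
    print(train_t["HouseAge"].nunique())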
@@ -0,0 +1,226 @@
from typing import List, Union

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.utils.validation import check_is_fitted

from feature_engine._docstrings.class_inputs import _variables_numerical_docstring
from feature_engine._docstrings.fit_attributes import (
    _feature_names_in_docstring,
    _n_features_in_docstring,
    _variables_attribute_docstring,
)
from feature_engine._docstrings.methods import (
    _fit_not_learn_docstring,
    _fit_transform_docstring,
)
from feature_engine._docstrings.substitute import Substitution
from feature_engine.dataframe_checks import (
    _check_contains_inf,
    _check_contains_na,
    _check_X_matches_training_df,
    check_X,
    check_X_y,
)
from feature_engine.discretisation import (
    EqualFrequencyDiscretiser,
    EqualWidthDiscretiser,
)
from feature_engine.discretisation.base_discretiser import BaseDiscretiser
from feature_engine.encoding import MeanEncoder
from feature_engine.variable_manipulation import (
    _check_input_parameter_variables,
    _find_or_check_numerical_variables,
)
@Substitution(
    return_objects=BaseDiscretiser._return_object_docstring,
    return_boundaries=BaseDiscretiser._return_boundaries_docstring,
    binner_dict_=BaseDiscretiser._binner_dict_docstring,
    transform=BaseDiscretiser._transform_docstring,
    variables=_variables_numerical_docstring,
    variables_=_variables_attribute_docstring,
    feature_names_in_=_feature_names_in_docstring,
    n_features_in_=_n_features_in_docstring,
    fit=_fit_not_learn_docstring,
    fit_transform=_fit_transform_docstring,
)
class TargetMeanDiscretiser(BaseDiscretiser):
    """
    The TargetMeanDiscretiser() sorts numerical variables into equal-width or
    equal-frequency bins and replaces each bin with the mean value of the
    target over the interval.

    Parameters
    ----------
    strategy: str, default='equal_frequency'
        Whether the bins should be of equal width ('equal_width') or equal
        frequency ('equal_frequency').

    {variables}

    bins: int, default=10
        Desired number of equal-width or equal-frequency intervals / bins.

    errors: string, default='ignore'
        Indicates what to do when a value is outside the limits indicated in the
        'binning_dict'. If 'raise', the transformation will raise an error.
        If 'ignore', values outside the limits are returned as NaN
        and a warning will be raised instead.
    Attributes
    ----------
    {variables_}

    {binner_dict_}

    {feature_names_in_}

    {n_features_in_}

    Methods
    -------
    {fit}

    {fit_transform}

    {transform}

    See Also
    --------
    pandas.cut
    """
    def __init__(
        self,
        variables: Union[None, int, str, List[Union[str, int]]] = None,
        bins: int = 10,
        strategy: str = "equal_frequency",
        errors: str = "ignore",
    ) -> None:

        if not isinstance(bins, int):
            raise ValueError(
                f"bins must be an integer. Got {bins} instead."
            )
        if strategy not in ("equal_frequency", "equal_width"):
            raise ValueError(
                "strategy must equal 'equal_frequency' or 'equal_width'. "
                f"Got {strategy} instead."
            )

        if errors not in ("ignore", "raise"):
            raise ValueError(
                "errors only takes values 'ignore' and 'raise'. "
                f"Got {errors} instead."
            )

        self.variables = _check_input_parameter_variables(variables)
        self.bins = bins
        self.strategy = strategy
        self.errors = errors
    def fit(self, X: pd.DataFrame, y: pd.Series):
        """
        Learn the boundaries of the selected discretiser's intervals / bins
        for the chosen numerical variables, and the target mean value per bin.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training dataset. Can be the entire dataframe, not just the
            variables to be transformed.

        y : pandas series of shape = [n_samples,]
            The target variable. Required to learn the target mean value
            per bin.
        """
        # check if 'X' is a dataframe
        X, y = check_X_y(X, y)

        # identify numerical variables
        self.variables_numerical_ = _find_or_check_numerical_variables(
            X, self.variables
        )

        # check for missing values
        _check_contains_na(X, self.variables_numerical_)

        # check for inf
        _check_contains_inf(X, self.variables_numerical_)

        # instantiate pipeline
        self._pipeline = self._make_pipeline()
        self._pipeline.fit(X, y)

        # store input features
        self.n_features_in_ = X.shape[1]
        self.feature_names_in_ = list(X.columns)

        return self
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Replace the original values with the target mean value of the bin
        they fall into, for each of the selected variables.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The data to transform.

        Returns
        -------
        X_enc: pandas dataframe of shape = [n_samples, n_features]
            The transformed data with the means of the selected numerical variables.

        """
        # check that fit method has been called
        check_is_fitted(self)

        # check that input is a dataframe
        X = check_X(X)

        # check that input data contains the same number of columns as the fitted df
        _check_X_matches_training_df(X, self.n_features_in_)

        # check for missing values
        _check_contains_na(X, self.variables_numerical_)

        # check for infinite values
        _check_contains_inf(X, self.variables_numerical_)

        # discretise and encode
        X_tr = self._pipeline.transform(X)

        return X_tr
    def _make_discretiser(self):
        """
        Instantiate the EqualFrequencyDiscretiser or EqualWidthDiscretiser.
        """
        if self.strategy == "equal_frequency":
            discretiser = EqualFrequencyDiscretiser(
                q=self.bins,
                variables=self.variables_numerical_,
                return_boundaries=True,
            )
        else:
            discretiser = EqualWidthDiscretiser(
                bins=self.bins,
                variables=self.variables_numerical_,
                return_boundaries=True,
            )

        return discretiser
    def _make_pipeline(self):
        """
        Instantiate pipeline comprised of discretiser and encoder.
        """
        pipe = Pipeline([
            ("discretiser", self._make_discretiser()),
            ("encoder", MeanEncoder(
                variables=self.variables_numerical_,
                ignore_format=True)),
        ])

        return pipe
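For reference, here is a minimal sketch of the pipeline that `_make_pipeline()` assembles
when `strategy="equal_frequency"`; the variable list and bin count are illustrative. The
discretiser sorts values into interval bins, and the encoder then replaces each bin with
the target mean:

.. code:: python

    from sklearn.pipeline import Pipeline

    from feature_engine.discretisation import EqualFrequencyDiscretiser
    from feature_engine.encoding import MeanEncoder

    # roughly what TargetMeanDiscretiser(variables=["HouseAge"],
    # strategy="equal_frequency", bins=5) builds internally
    manual_pipe = Pipeline([
        # sort values into 5 equal-frequency bins, returned as interval boundaries
        ("discretiser", EqualFrequencyDiscretiser(
            q=5, variables=["HouseAge"], return_boundaries=True)),
        # treat the interval bins as categories and replace each with the target mean
        ("encoder", MeanEncoder(variables=["HouseAge"], ignore_format=True)),
    ])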