feature-engine
diff --git a/‎README.md‎
Lines changed: 1 addition & 0 deletions b/‎README.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/api_doc/discretisation/GeometricWidthDiscretiser.rst‎
Lines changed: 5 additions & 0 deletions b/‎docs/api_doc/discretisation/GeometricWidthDiscretiser.rst‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎docs/api_doc/discretisation/index.rst‎
Lines changed: 4 additions & 2 deletions b/‎docs/api_doc/discretisation/index.rst‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎docs/images/increasingwidthdisc.png‎
16.9 KB b/‎docs/images/increasingwidthdisc.png‎
16.9 KB
diff --git a/‎docs/index.rst‎
Lines changed: 1 addition & 0 deletions b/‎docs/index.rst‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/user_guide/discretisation/GeometricWidthDiscretiser.rst‎
Lines changed: 157 additions & 0 deletions b/‎docs/user_guide/discretisation/GeometricWidthDiscretiser.rst‎
Lines changed: 157 additions & 0 deletions
diff --git a/‎docs/user_guide/discretisation/index.rst‎
Lines changed: 1 addition & 0 deletions b/‎docs/user_guide/discretisation/index.rst‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎feature_engine/_docstrings/init_parameters/discretisers.py‎
Lines changed: 4 additions & 0 deletions b/‎feature_engine/_docstrings/init_parameters/discretisers.py‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎feature_engine/discretisation/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎feature_engine/discretisation/__init__.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎feature_engine/discretisation/arbitrary.py‎
Lines changed: 12 additions & 5 deletions b/‎feature_engine/discretisation/arbitrary.py‎
Lines changed: 12 additions & 5 deletions
@@ -89,6 +89,7 @@ transforming parameters from the data and then transform it.
 ### Discretisation methods
 * EqualFrequencyDiscretiser
 * EqualWidthDiscretiser
+* GeometricWidthDiscretiser
 * DecisionTreeDiscretiser
 * ArbitraryDiscretiser
 
 
@@ -0,0 +1,5 @@
+GeometricWidthDiscretiser
+=========================
+
+.. autoclass:: feature_engine.discretisation.GeometricWidthDiscretiser
+    :members:
@@ -16,7 +16,8 @@ into continuous intervals.
 :class:`EqualFrequencyDiscretiser()`     Sorts values into intervals with similar number of observations.
 :class:`EqualWidthDiscretiser()`         Sorts values into intervals of equal size.
 :class:`ArbitraryDiscretiser()`          Sorts values into intervals predefined by the user.
-:class:`DecisionTreeDiscretiser()`       Replaces values by predictions of a decision tree, which are discrete
+:class:`DecisionTreeDiscretiser()`       Replaces values by predictions of a decision tree, which are discrete.
+:class:`GeometricWidthDiscretiser()`     Sorts variable into geometrical intervals.
 =====================================  ========================================================================
 
 
@@ -28,9 +29,10 @@ into continuous intervals.
    EqualWidthDiscretiser
    ArbitraryDiscretiser
    DecisionTreeDiscretiser
+   GeometricWidthDiscretiser
 
 Additional transformers for discretisation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 For discretisation using K-means, check Scikit-learn's
-`KBinsDiscretizer <https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html>`_.
+`KBinsDiscretizer <https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html>`_.
@@ -137,6 +137,7 @@ Variable Discretisation: Discretisers
 - :doc:`api_doc/discretisation/EqualFrequencyDiscretiser`: sorts variable into equal frequency intervals
 - :doc:`api_doc/discretisation/EqualWidthDiscretiser`: sorts variable into equal width intervals
 - :doc:`api_doc/discretisation/DecisionTreeDiscretiser`: uses decision trees to create finite variables
+- :doc:`api_doc/discretisation/GeometricWidthDiscretiser`: sorts variable into geometrical intervals
 
 Outlier Capping or Removal
 --------------------------
 
@@ -0,0 +1,157 @@
+.. _increasing_width_discretiser:
+
+.. currentmodule:: feature_engine.discretisation
+
+GeometricWidthDiscretiser
+=========================
+
+The :class:`GeometricWidthDiscretiser()` divides continuous numerical variables into
+intervals of increasing width. The width of each succeeding interval is larger than the
+previous interval by a constant factor (cw).
+
+The constant factor is calculated as:
+
+    .. math::
+        cw = (Max - Min)^{1/n}
+
+where Max and Min are the variable's maximum and minimum value, and n is the number of
+intervals.
+
+The sizes of the intervals themselves are calculated with a geometric progression:
+
+    .. math::
+        a_{i+1} = a_i cw
+
+Thus, the first interval's width equals cw, and each subsequent term of the progression
+is the previous one multiplied by cw (cw^2, cw^3, and so on).
+
+Note that the proportion of observations per interval may vary.
+
+This discretisation technique is great when the distribution of the variable is right skewed.
+
+Note: The width of some bins might be very small. Thus, to allow this transformer
+to work properly, it might help to increase the precision value, that is,
+the number of decimal values allowed to define each bin. If the variable has a
+narrow range or you are sorting into several bins, allow greater precision
+(i.e., if precision = 3, then 0.001; if precision = 7, then 0.0000001).
+
+The :class:`GeometricWidthDiscretiser()` works only with numerical variables. A list of
+variables to discretise can be indicated, or the discretiser will automatically select
+all numerical variables in the train set.
+
+**Example**
+
+Let's look at an example using the house prices dataset (more details about the
+dataset :ref:`here <datasets>`).
+
+Let's load the house prices dataset and separate it into train and test sets:
+
+.. code:: python
+
+	import numpy as np
+	import pandas as pd
+	import matplotlib.pyplot as plt
+	from sklearn.model_selection import train_test_split
+
+	from feature_engine.discretisation import GeometricWidthDiscretiser
+
+	# Load dataset
+	data = pd.read_csv('houseprice.csv')
+
+	# Separate into train and test sets
+	X_train, X_test, y_train, y_test =  train_test_split(
+		    data.drop(['Id', 'SalePrice'], axis=1),
+		    data['SalePrice'], test_size=0.3, random_state=0)
+
+
+Now, we want to discretise the 2 variables indicated below into 10 intervals of increasing
+width:
+
+.. code:: python
+
+	# set up the discretisation transformer
+	disc = GeometricWidthDiscretiser(bins=10, variables=['LotArea', 'GrLivArea'])
+
+	# fit the transformer
+	disc.fit(X_train)
+
+With `fit()` the transformer learns the boundaries of each interval. Then, we can go
+ahead and sort the values into the intervals:
+
+.. code:: python
+
+	# transform the data
+	train_t= disc.transform(X_train)
+	test_t= disc.transform(X_test)
+
+The `binner_dict_` stores the interval limits identified for each variable.
+
+.. code:: python
+
+	disc.binner_dict_
+
+.. code:: python
+
+	{'LotArea': [-inf,
+        1303.412,
+        1311.643,
+        1339.727,
+        1435.557,
+        1762.542,
+        2878.27,
+        6685.32,
+        19675.608,
+        64000.633,
+        inf],
+	'GrLivArea': [-inf,
+        336.311,
+        339.34,
+        346.34,
+        362.515,
+        399.894,
+        486.27,
+        685.871,
+        1147.115,
+        2212.974,
+        inf]}
+
+With increasing width discretisation, each bin does not necessarily contain the same number
+of observations. This transformer is suitable for variables with right skewed distributions.
+
+Let's compare the variable distribution before and after the discretisation:
+
+.. code:: python
+
+    fig, ax = plt.subplots(1, 2)
+    X_train['LotArea'].hist(ax=ax[0], bins=10);
+    train_t['LotArea'].hist(ax=ax[1], bins=10);
+
+We can see below that the intervals contain a different number of observations. We can
+also see that the shape of the distribution changed from skewed to a more "bell shaped"
+distribution.
+
+.. image:: ../../images/increasingwidthdisc.png
+
+|
+
+**Discretisation plus encoding**
+
+If we return the interval values as integers, the discretiser has the option to return
+the transformed variable as integer or as object. Why would we want the transformed
+variables as object?
+
+Categorical encoders in Feature-engine are designed to work with variables of type
+object by default. Thus, if you wish to encode the returned bins further, say to try and
+obtain monotonic relationships between the variable and the target, you can do so
+seamlessly by setting `return_object` to True. You can find an example of how to use
+this functionality `here <https://nbviewer.org/github/feature-engine/feature-engine-examples/blob/main/discretisation/GeometricWidthDiscretiser_plus_MeanEncoder.ipynb>`_.
+
+More details
+^^^^^^^^^^^^
+
+For more details on how to use this transformer, check also:
+
+- `Jupyter notebook - Geometric Discretiser <https://nbviewer.org/github/feature-engine/feature-engine-examples/blob/main/discretisation/GeometricWidthDiscretiser.ipynb>`_
+- `Jupyter notebook - Geometric Discretiser plus Mean encoding <https://nbviewer.org/github/feature-engine/feature-engine-examples/blob/main/discretisation/GeometricWidthDiscretiser_plus_MeanEncoder.ipynb>`_
+
+All notebooks can be found in a `dedicated repository <https://github.com/feature-engine/feature-engine-examples>`_.
@@ -34,3 +34,4 @@ Throughout the user guide, we point to jupyter notebooks that showcase this func
    EqualWidthDiscretiser
    ArbitraryDiscretiser
    DecisionTreeDiscretiser
+   GeometricWidthDiscretiser
@@ -8,3 +8,7 @@
         Whether the output should be the interval boundaries. If True, it returns
         the interval boundaries. If False, it returns integers.
     """.rstrip()
+
+_precision_docstring = """precision: int, default=3
+        The precision at which to store and display the bins labels.
+    """.rstrip()
@@ -7,10 +7,12 @@
 from .decision_tree import DecisionTreeDiscretiser
 from .equal_frequency import EqualFrequencyDiscretiser
 from .equal_width import EqualWidthDiscretiser
+from .geometric_width import GeometricWidthDiscretiser
 
 __all__ = [
     "DecisionTreeDiscretiser",
     "EqualFrequencyDiscretiser",
     "EqualWidthDiscretiser",
     "ArbitraryDiscretiser",
+    "GeometricWidthDiscretiser",
 ]
@@ -8,13 +8,15 @@
 
 from feature_engine._base_transformers.mixins import FitFromDictMixin
 from feature_engine._docstrings.fit_attributes import (
+    _binner_dict_docstring,
     _feature_names_in_docstring,
     _n_features_in_docstring,
-    _variables_attribute_docstring, _binner_dict_docstring,
+    _variables_attribute_docstring,
 )
 from feature_engine._docstrings.init_parameters.discretisers import (
-    _return_object_docstring,
+    _precision_docstring,
     _return_boundaries_docstring,
+    _return_object_docstring,
 )
 from feature_engine._docstrings.methods import (
     _fit_not_learn_docstring,
@@ -29,6 +31,7 @@
 @Substitution(
     return_object=_return_object_docstring,
     return_boundaries=_return_boundaries_docstring,
+    precision=_precision_docstring,
     binner_dict_=_binner_dict_docstring,
     transform=_transform_discretiser_docstring,
     variables_=_variables_attribute_docstring,
@@ -59,6 +62,8 @@ class ArbitraryDiscretiser(BaseDiscretiser, FitFromDictMixin):
 
     {return_boundaries}
 
+    {precision}
+
     errors: string, default='ignore'
         Indicates what to do when a value is outside the limits indicated in the
         'binning_dict'. If 'raise', the transformation will raise an error.
@@ -111,6 +116,7 @@ def __init__(
         binning_dict: Dict[Union[str, int], List[Union[str, int]]],
         return_object: bool = False,
         return_boundaries: bool = False,
+        precision: int = 3,
         errors: str = "ignore",
     ) -> None:
 
@@ -126,7 +132,7 @@ def __init__(
                 f"Got {errors} instead."
             )
 
-        super().__init__(return_object, return_boundaries)
+        super().__init__(return_object, return_boundaries, precision)
 
         self.binning_dict = binning_dict
         self.errors = errors
@@ -153,12 +159,13 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
         return self
 
     def transform(self, X: pd.DataFrame) -> pd.DataFrame:
-        """Sort the variable values into the intervals.
+        """
+        Sort the variable values into the intervals.
 
         Parameters
         ----------
         X: pandas dataframe of shape = [n_samples, n_features]
-            The dataframe to be transformed.
+            The data to transform.
 
         Returns
         -------