Add NaN detection to the discretizers' transform method by creating a BaseDiscretizer class (#341)

Morgan-Sell · solegalli · web-flow · commit e6a13f87a901 · 2022-01-04T10:29:39.000-03:00
* create BaseDiscretizer class

* implement _check_transform_input_and_state() method for BaseDiscretizer

* start implementation of transform() for BaseDiscretizer class

* add init params to parent class BaseDiscretizer

* add BaseDiscretiser class to init file

* change ArbitraryDiscretiser, EqualFrequencyDiscretiser, and EqualWidthDiscretiser parent class to BaseDiscretiser and refactor init params

* change warning text

* fix init file and BaseDiscretiser imports. add errors to BaseDiscretiser init params.

* edit ValueError text in init()

* delete _check_transform_input_and_state method

* add inheritance and fix comments in BaseDiscretiser transform()

* update transform() method for base class. add corresponding docstring

* delete transform() code for three discretisers. code base was moved to BaseDiscretiser class

* raise ValueError if  equals

* create error/warning informing user where nan values are located

* fix style error

* create test_transform_raises_error_if_df_contains_na() and test_error_if_errors_not_permitted_value() fcns

* change df used in test_error_if_input_df_contains_na_in_transform() fcn

* revise test_error_if_input_df_contains_na_in_transform() and create test_error_if_not_permitted_value_is_errors() fcns

* edit comments

* revise test_error_if_input_df_contains_na_in_transform() and create  test_error_if_not_permitted_value_is_errors()

* fix style error

* add back prior version of test_error_if_input_df_contains_na_in_transform for the three discretisers

* edit BaseDiscretiser transform() docstring

* remove transform() from the three discretisers

* fix style errors

* add 'return_object' and 'return_boundaries' to init fcn for each of the 3 classes

* add 'errors' to init method of the 3 discretisers

* fix incorrect discretiser class error

* change 'self.encoder_dict_' to 'self.binner_dict_'

* add init params when instantiating EqualFrequencyDiscretiser class in tests

* add binning_dict values when instantiating ArbitraryDiscretiser. Replace Boston housing price dataset code.

* change boston housing dataset to california housing dataset. The sklearn documentation states that 'The Boston housing prices dataset has an ethical problem.' The California housing dataset is suggested by sklearn maintainers.

* change code based on california housing dataset features

* fix style errors

* fix style errors

* fix style errors

* fix style error

* change 'msg' text

* modifies docstrings base class

* adds docstring to discretisers

* moves error check to arbitrary discretizer only

* resets equal width and equal frequency tests to main branch

* adds test to arbitrary discretizer

* blacks files

* fixes style tests

* style fixes

* minor edits to docstring and code for ArbitraryDiscretiser()

* change -0 to 0 in test_arbitrary_discretiser.py

* add back 'X = super().transform(X)' to ArbitraryDiscretiser transform()

* remove 2 print statements from arbitrary test and remove df copy() from ArbitraryDiscretiser transform()

* fix style error

* add back ArbitraryDiscretiser transform() docstring

* fix docstring variables

* revert wording change

* revert wording in docstring

* removes back slash from statement

Co-authored-by: Soledad Galli <solegalli@protonmail.com>
diff --git a/feature_engine/discretisation/arbitrary.py b/feature_engine/discretisation/arbitrary.py
@@ -1,15 +1,16 @@
 # Authors: Soledad Galli <solegalli@protonmail.com>
 # License: BSD 3 clause
 
+import warnings
 from typing import Dict, List, Optional, Union
 
 import pandas as pd
 
-from feature_engine.base_transformers import BaseNumericalTransformer
+from feature_engine.discretisation.base_discretiser import BaseDiscretiser
 from feature_engine.validation import _return_tags
 
 
-class ArbitraryDiscretiser(BaseNumericalTransformer):
+class ArbitraryDiscretiser(BaseDiscretiser):
     """
     The ArbitraryDiscretiser() divides numerical variables into intervals which limits
     are determined by the user. Thus, it works only with numerical variables.
@@ -39,6 +40,12 @@ class ArbitraryDiscretiser(BaseNumericalTransformer):
         Whether the output, that is the bins, should be the interval boundaries. If
         True, it returns the interval boundaries. If False, it returns integers.
 
+    errors: string, default='ignore'
+        Indicates what to do when a value is outside the limits indicated in the
+        'binning_dict'. If 'raise', the transformation will raise an error.
+        If 'ignore', values outside the limits are returned as NaN
+        and a warning will be raised instead.
+
     Attributes
     ----------
     binner_dict_:
@@ -69,19 +76,25 @@ def __init__(
         binning_dict: Dict[Union[str, int], List[Union[str, int]]],
         return_object: bool = False,
         return_boundaries: bool = False,
+        errors: str = "ignore",
     ) -> None:
 
         if not isinstance(binning_dict, dict):
             raise ValueError(
-                "Please provide at a dictionary with the interval limits per variable"
+                "binning_dict must be a dictionary with the interval limits per "
+                f"variable. Got {binning_dict} instead."
+            )
+
+        if errors not in ["ignore", "raise"]:
+            raise ValueError(
+                "errors only takes values 'ignore' and 'raise'. "
+                f"Got {errors} instead."
             )
 
-        if not isinstance(return_object, bool):
-            raise ValueError("return_object must be True or False")
+        super().__init__(return_object, return_boundaries)
 
         self.binning_dict = binning_dict
-        self.return_object = return_object
-        self.return_boundaries = return_boundaries
+        self.errors = errors
 
     def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
         """
@@ -109,34 +122,42 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
     def transform(self, X: pd.DataFrame) -> pd.DataFrame:
         """Sort the variable values into the intervals.
 
-        Parameters
-        ----------
-        X: pandas dataframe of shape = [n_samples, n_features]
-            The dataframe to be transformed.
+       Parameters
+       ----------
+       X: pandas dataframe of shape = [n_samples, n_features]
+           The dataframe to be transformed.
 
-        Returns
-        -------
-        X_new: pandas dataframe of shape = [n_samples, n_features]
-            The transformed data with the discrete variables.
-        """
+       Returns
+       -------
+       X_new: pandas dataframe of shape = [n_samples, n_features]
+           The transformed data with the discrete variables.
+       """
 
-        # check input dataframe and if class was fitted
         X = super().transform(X)
+        # check if NaN values were introduced by the discretisation procedure.
+        if X[self.variables_].isnull().sum().sum() > 0:
 
-        # transform variables
-        if self.return_boundaries:
-            for feature in self.variables_:
-                X[feature] = pd.cut(X[feature], self.binner_dict_[feature])
+            # obtain the name(s) of the columns with null values
+            nan_columns = (
+                X[self.variables_].columns[X[self.variables_].isnull().any()].tolist()
+            )
 
-        else:
-            for feature in self.variables_:
-                X[feature] = pd.cut(
-                    X[feature], self.binner_dict_[feature], labels=False
+            if len(nan_columns) > 1:
+                nan_columns_str = ", ".join(nan_columns)
+            else:
+                nan_columns_str = nan_columns[0]
+
+            if self.errors == "ignore":
+                warnings.warn(
+                    f"During the discretisation, NaN values were introduced in "
+                    f"the feature(s) {nan_columns_str}."
                 )
 
-            # return object
-            if self.return_object:
-                X[self.variables_] = X[self.variables_].astype("O")
+            elif self.errors == "raise":
+                raise ValueError(
+                    "During the discretisation, NaN values were introduced in "
+                    f"the feature(s) {nan_columns_str}."
+                )
 
         return X
 
diff --git a/feature_engine/discretisation/base_discretiser.py b/feature_engine/discretisation/base_discretiser.py
@@ -0,0 +1,82 @@
+# Authors: Morgan Sell <morganpsell@gmail.com>
+# License: BSD 3 clause
+
+import pandas as pd
+
+from feature_engine.base_transformers import BaseNumericalTransformer
+
+
+class BaseDiscretiser(BaseNumericalTransformer):
+    """
+    Shared set-up checks and methods across numerical discretisers.
+
+    Parameters
+    ----------
+    return_object: bool, default=False
+        Whether the the discrete variable should be returned as numeric or as
+        object. If you would like to proceed with the engineering of the variable as if
+        it was categorical, use True. Alternatively, keep the default to False.
+
+    return_boundaries: bool, default=False
+        Whether the output should be the interval boundaries. If True, it returns
+        the interval boundaries. If False, it returns integers.
+
+    Methods
+    -------
+    transform:
+        Sort continuous variable values into the intervals.
+    """
+
+    def __init__(
+        self,
+        return_object: bool = False,
+        return_boundaries: bool = False,
+    ) -> None:
+
+        if not isinstance(return_object, bool):
+            raise ValueError(
+                "return_object must be True or False. " f"Got {return_object} instead."
+            )
+
+        if not isinstance(return_boundaries, bool):
+            raise ValueError(
+                "return_boundaries must be True or False. "
+                f"Got {return_boundaries} instead."
+            )
+
+        self.return_object = return_object
+        self.return_boundaries = return_boundaries
+
+    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
+        """Sort the variable values into the intervals.
+
+        Parameters
+        ----------
+        X: pandas dataframe of shape = [n_samples, n_features]
+            The data to transform.
+
+        Returns
+        -------
+        X_new: pandas dataframe of shape = [n_samples, n_features]
+            The transformed data with the discrete variables.
+        """
+
+        # check input dataframe and if class was fitted
+        X = super().transform(X)
+
+        # transform variables
+        if self.return_boundaries:
+            for feature in self.variables_:
+                X[feature] = pd.cut(X[feature], self.binner_dict_[feature])
+
+        else:
+            for feature in self.variables_:
+                X[feature] = pd.cut(
+                    X[feature], self.binner_dict_[feature], labels=False
+                )
+
+            # return object
+            if self.return_object:
+                X[self.variables_] = X[self.variables_].astype("O")
+
+        return X
diff --git a/feature_engine/discretisation/equal_frequency.py b/feature_engine/discretisation/equal_frequency.py
@@ -5,11 +5,11 @@
 
 import pandas as pd
 
-from feature_engine.base_transformers import BaseNumericalTransformer
+from feature_engine.discretisation.base_discretiser import BaseDiscretiser
 from feature_engine.variable_manipulation import _check_input_parameter_variables
 
 
-class EqualFrequencyDiscretiser(BaseNumericalTransformer):
+class EqualFrequencyDiscretiser(BaseDiscretiser):
     """
     The EqualFrequencyDiscretiser() divides continuous numerical variables
     into contiguous equal frequency intervals, that is, intervals that contain
@@ -86,18 +86,12 @@ def __init__(
     ) -> None:
 
         if not isinstance(q, int):
-            raise ValueError("q must be an integer")
+            raise ValueError(f"q must be an integer. Got {q} instead.")
 
-        if not isinstance(return_object, bool):
-            raise ValueError("return_object must be True or False")
-
-        if not isinstance(return_boundaries, bool):
-            raise ValueError("return_boundaries must be True or False")
+        super().__init__(return_object, return_boundaries)
 
         self.q = q
         self.variables = _check_input_parameter_variables(variables)
-        self.return_object = return_object
-        self.return_boundaries = return_boundaries
 
     def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
         """
@@ -129,37 +123,3 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
         self.n_features_in_ = X.shape[1]
 
         return self
-
-    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
-        """Sort the variable values into the intervals.
-
-        Parameters
-        ----------
-        X: pandas dataframe of shape = [n_samples, n_features]
-            The data to transform.
-
-        Returns
-        -------
-        X_new: pandas dataframe of shape = [n_samples, n_features]
-            The transformed data with the discrete variables.
-        """
-
-        # check input dataframe and if class was fitted
-        X = super().transform(X)
-
-        # transform variables
-        if self.return_boundaries:
-            for feature in self.variables_:
-                X[feature] = pd.cut(X[feature], self.binner_dict_[feature])
-
-        else:
-            for feature in self.variables_:
-                X[feature] = pd.cut(
-                    X[feature], self.binner_dict_[feature], labels=False
-                )
-
-            # return object
-            if self.return_object:
-                X[self.variables_] = X[self.variables_].astype("O")
-
-        return X
diff --git a/feature_engine/discretisation/equal_width.py b/feature_engine/discretisation/equal_width.py
@@ -5,11 +5,11 @@
 
 import pandas as pd
 
-from feature_engine.base_transformers import BaseNumericalTransformer
+from feature_engine.discretisation.base_discretiser import BaseDiscretiser
 from feature_engine.variable_manipulation import _check_input_parameter_variables
 
 
-class EqualWidthDiscretiser(BaseNumericalTransformer):
+class EqualWidthDiscretiser(BaseDiscretiser):
     """
     The EqualWidthDiscretiser() divides continuous numerical variables into
     intervals of the same width, that is, equidistant intervals. Note that the
@@ -95,18 +95,12 @@ def __init__(
     ) -> None:
 
         if not isinstance(bins, int):
-            raise ValueError("q must be an integer")
+            raise ValueError(f"bins must be an integer. Got {bins} instead.")
 
-        if not isinstance(return_object, bool):
-            raise ValueError("return_object must be True or False")
-
-        if not isinstance(return_boundaries, bool):
-            raise ValueError("return_boundaries must be True or False")
+        super().__init__(return_object, return_boundaries)
 
         self.bins = bins
         self.variables = _check_input_parameter_variables(variables)
-        self.return_object = return_object
-        self.return_boundaries = return_boundaries
 
     def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
         """
@@ -142,38 +136,3 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
         self.n_features_in_ = X.shape[1]
 
         return self
-
-    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
-        """
-        Sort the variable values into the intervals.
-
-        Parameters
-        ----------
-        X: pandas dataframe of shape = [n_samples, n_features]
-            The data to transform.
-
-        Returns
-        -------
-        X_new: pandas dataframe of shape = [n_samples, n_features]
-            The transformed data with the discrete variables.
-        """
-
-        # check input dataframe and if class was fitted
-        X = super().transform(X)
-
-        # transform variables
-        if self.return_boundaries:
-            for feature in self.variables_:
-                X[feature] = pd.cut(X[feature], self.binner_dict_[feature])
-
-        else:
-            for feature in self.variables_:
-                X[feature] = pd.cut(
-                    X[feature], self.binner_dict_[feature], labels=False
-                )
-
-            # return object
-            if self.return_object:
-                X[self.variables_] = X[self.variables_].astype("O")
-
-        return X
diff --git a/tests/test_discretisation/test_arbitrary_discretiser.py b/tests/test_discretisation/test_arbitrary_discretiser.py