Split the NA-check function into strict and optional variants (#608)

solegalli · web-flow · commit 6adef90ed99e · 2023-01-29T11:10:23.000-03:00
* split _check_contains_na into two functions

* replace check for optional na

* update match_categories to use _check_optional_contains_na
diff --git a/feature_engine/dataframe_checks.py b/feature_engine/dataframe_checks.py
@@ -243,7 +243,7 @@ def _check_X_matches_training_df(X: pd.DataFrame, reference: int) -> None:
 
 
 def _check_contains_na(
-    X: pd.DataFrame, variables: List[Union[str, int]], switch_param: bool = False
+    X: pd.DataFrame, variables: List[Union[str, int]],
 ) -> None:
     """
     Checks if DataFrame contains null values in the selected columns.
@@ -255,9 +255,31 @@ def _check_contains_na(
     variables : List
         The selected group of variables in which null values will be examined.
 
-    switch_param: bool
-        Whether the transformer has the parameter missing_values in the init to modify
-        its behaviour towards nan.
+    Raises
+    ------
+    ValueError
+        If the variable(s) contain null values.
+    """
+
+    if X[variables].isnull().any().any():
+        raise ValueError(
+            "Some of the variables in the dataset contain NaN. Check and "
+            "remove those before using this transformer."
+        )
+
+
+def _check_optional_contains_na(
+    X: pd.DataFrame, variables: List[Union[str, int]]
+) -> None:
+    """
+    Checks if DataFrame contains null values in the selected columns.
+
+    Parameters
+    ----------
+    X : Pandas DataFrame
+
+    variables : List
+        The selected group of variables in which null values will be examined.
 
     Raises
     ------
@@ -266,17 +288,11 @@ def _check_contains_na(
     """
 
     if X[variables].isnull().any().any():
-        if switch_param is False:
-            raise ValueError(
-                "Some of the variables in the dataset contain NaN. Check and "
-                "remove those before using this transformer."
-            )
-        else:
-            raise ValueError(
-                "Some of the variables in the dataset contain NaN. Check and "
-                "remove those before using this transformer or set the parameter "
-                "`missing_values='ignore'` when initialising this transformer."
-            )
+        raise ValueError(
+            "Some of the variables in the dataset contain NaN. Check and "
+            "remove those before using this transformer or set the parameter "
+            "`missing_values='ignore'` when initialising this transformer."
+        )
 
 
 def _check_contains_inf(X: pd.DataFrame, variables: List[Union[str, int]]) -> None:
diff --git a/feature_engine/encoding/base_encoder.py b/feature_engine/encoding/base_encoder.py
@@ -20,7 +20,7 @@
     _find_or_check_categorical_variables,
 )
 from feature_engine.dataframe_checks import (
-    _check_contains_na,
+    _check_optional_contains_na,
     _check_X_matches_training_df,
     check_X,
 )
@@ -110,7 +110,7 @@ class CategoricalMethodsMixin(BaseEstimator, TransformerMixin, GetFeatureNamesOu
 
     def _check_na(self, X: pd.DataFrame, variables):
         if self.missing_values == "raise":
-            _check_contains_na(X, variables, switch_param=True)
+            _check_optional_contains_na(X, variables)
 
     def _check_or_select_variables(self, X: pd.DataFrame):
         """
@@ -207,7 +207,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
 
         # check if dataset contains na
         if self.missing_values == "raise":
-            _check_contains_na(X, self.variables_, switch_param=True)
+            _check_optional_contains_na(X, self.variables_)
 
         X = self._encode(X)
 
diff --git a/feature_engine/encoding/rare_label.py b/feature_engine/encoding/rare_label.py
@@ -19,7 +19,7 @@
 from feature_engine._docstrings.init_parameters.encoders import _ignore_format_docstring
 from feature_engine._docstrings.methods import _fit_transform_docstring
 from feature_engine._docstrings.substitute import Substitution
-from feature_engine.dataframe_checks import _check_contains_na, check_X
+from feature_engine.dataframe_checks import _check_optional_contains_na, check_X
 from feature_engine.encoding.base_encoder import (
     CategoricalInitMixinNA,
     CategoricalMethodsMixin,
@@ -244,7 +244,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
 
         # check if dataset contains na
         if self.missing_values == "raise":
-            _check_contains_na(X, self.variables_, switch_param=True)
+            _check_optional_contains_na(X, self.variables_)
 
             for feature in self.variables_:
                 X[feature] = np.where(
diff --git a/feature_engine/encoding/similarity_encoder.py b/feature_engine/encoding/similarity_encoder.py
@@ -16,7 +16,7 @@
 from feature_engine._docstrings.init_parameters.encoders import _ignore_format_docstring
 from feature_engine._docstrings.methods import _fit_transform_docstring
 from feature_engine._docstrings.substitute import Substitution
-from feature_engine.dataframe_checks import _check_contains_na, check_X
+from feature_engine.dataframe_checks import _check_optional_contains_na, check_X
 from feature_engine.encoding.base_encoder import (
     CategoricalInitMixin,
     CategoricalMethodsMixin,
@@ -241,7 +241,7 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
 
         # if data contains nan, fail before running any logic
         if self.missing_values == "raise":
-            _check_contains_na(X, variables_, switch_param=True)
+            _check_optional_contains_na(X, variables_)
 
         self.encoder_dict_ = {}
 
@@ -311,7 +311,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
         check_is_fitted(self)
         X = self._check_transform_input_and_state(X)
         if self.missing_values == "raise":
-            _check_contains_na(X, self.variables_, switch_param=True)
+            _check_optional_contains_na(X, self.variables_)
 
         new_values = []
         for var in self.variables_:
diff --git a/feature_engine/preprocessing/match_categories.py b/feature_engine/preprocessing/match_categories.py
@@ -14,7 +14,7 @@
 )
 from feature_engine._docstrings.init_parameters.encoders import _ignore_format_docstring
 from feature_engine._docstrings.substitute import Substitution
-from feature_engine.dataframe_checks import _check_contains_na, check_X
+from feature_engine.dataframe_checks import _check_optional_contains_na, check_X
 from feature_engine.encoding.base_encoder import (
     CategoricalInitMixinNA,
     CategoricalMethodsMixin,
@@ -116,7 +116,7 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
         variables_ = self._check_or_select_variables(X)
 
         if self.missing_values == "raise":
-            _check_contains_na(X, variables_, switch_param=True)
+            _check_optional_contains_na(X, variables_)
 
         self.category_dict_ = dict()
         for var in variables_:
@@ -143,7 +143,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
         X = self._check_transform_input_and_state(X)
 
         if self.missing_values == "raise":
-            _check_contains_na(X, self.variables_, switch_param=True)
+            _check_optional_contains_na(X, self.variables_)
 
         for feature, levels in self.category_dict_.items():
             X[feature] = pd.Categorical(X[feature], levels)
diff --git a/tests/test_dataframe_checks.py b/tests/test_dataframe_checks.py
@@ -7,6 +7,7 @@
 from feature_engine.dataframe_checks import (
     _check_contains_inf,
     _check_contains_na,
+    _check_optional_contains_na,
     _check_X_matches_training_df,
     check_X,
     check_X_y,
@@ -152,14 +153,16 @@ def test_contains_na(df_na):
         assert _check_contains_na(df_na, ["Name", "City"])
     assert str(record.value) == msg
 
+
+def test_optional_contains_na(df_na):
     msg = (
         "Some of the variables in the dataset contain NaN. Check and "
         "remove those before using this transformer or set the parameter "
         "`missing_values='ignore'` when initialising this transformer."
     )
 
     with pytest.raises(ValueError) as record:
-        assert _check_contains_na(df_na, ["Name", "City"], switch_param=True)
+        assert _check_optional_contains_na(df_na, ["Name", "City"])
     assert str(record.value) == msg