Skip to content

Commit cf1ba6f

Browse files
authored
adds support for all types of cross-validation schemes (#267)
1 parent 6cc9866 commit cf1ba6f

10 files changed

+498
-53
lines changed

feature_engine/selection/recursive_feature_addition.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,24 @@ class RecursiveFeatureAddition(BaseSelector):
6262
The threshold must be defined by the user. Bigger thresholds will select less
6363
features.
6464
65-
cv: int, default=3
66-
Cross-validation fold to be used to fit the estimator.
65+
cv: int, cross-validation generator or an iterable, default=3
66+
Determines the cross-validation splitting strategy. Possible inputs for cv are:
67+
68+
- None, to use cross_validate's default 5-fold cross validation
69+
70+
- int, to specify the number of folds in a (Stratified)KFold,
71+
72+
- CV splitter
73+
- (https://scikit-learn.org/stable/glossary.html#term-CV-splitter)
74+
75+
- An iterable yielding (train, test) splits as arrays of indices.
76+
77+
For int/None inputs, if the estimator is a classifier and y is either binary or
78+
multiclass, StratifiedKFold is used. In all other cases, KFold is used. These
79+
splitters are instantiated with shuffle=False so the splits will be the same
80+
across calls.
81+
82+
For more details check Scikit-learn's cross_validate documentation
6783
6884
Attributes
6985
----------
@@ -100,14 +116,11 @@ def __init__(
100116
self,
101117
estimator,
102118
scoring: str = "roc_auc",
103-
cv: int = 3,
119+
cv=3,
104120
threshold: Union[int, float] = 0.01,
105121
variables: Variables = None,
106122
):
107123

108-
if not isinstance(cv, int) or cv < 1:
109-
raise ValueError("cv can only take positive integers bigger than 1")
110-
111124
if not isinstance(threshold, (int, float)):
112125
raise ValueError("threshold can only be integer or float")
113126

feature_engine/selection/recursive_feature_elimination.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,24 @@ class RecursiveFeatureElimination(BaseSelector):
6262
The threshold must be defined by the user. Bigger thresholds will select less
6363
features.
6464
65-
cv: int, default=3
66-
Cross-validation fold to be used to fit the estimator.
65+
cv: int, cross-validation generator or an iterable, default=3
66+
Determines the cross-validation splitting strategy. Possible inputs for cv are:
67+
68+
- None, to use cross_validate's default 5-fold cross validation
69+
70+
- int, to specify the number of folds in a (Stratified)KFold,
71+
72+
- CV splitter
73+
- (https://scikit-learn.org/stable/glossary.html#term-CV-splitter)
74+
75+
- An iterable yielding (train, test) splits as arrays of indices.
76+
77+
For int/None inputs, if the estimator is a classifier and y is either binary or
78+
multiclass, StratifiedKFold is used. In all other cases, KFold is used. These
79+
splitters are instantiated with shuffle=False so the splits will be the same
80+
across calls.
81+
82+
For more details check Scikit-learn's cross_validate documentation
6783
6884
Attributes
6985
----------
@@ -99,14 +115,11 @@ def __init__(
99115
self,
100116
estimator,
101117
scoring: str = "roc_auc",
102-
cv: int = 3,
118+
cv=3,
103119
threshold: Union[int, float] = 0.01,
104120
variables: Variables = None,
105121
):
106122

107-
if not isinstance(cv, int) or cv < 1:
108-
raise ValueError("cv can only take positive integers bigger than 1")
109-
110123
if not isinstance(threshold, (int, float)):
111124
raise ValueError("threshold can only be integer or float")
112125

feature_engine/selection/shuffle_features.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,24 @@ class SelectByShuffling(BaseSelector):
6363
performance drift is smaller than the mean performance drift across all
6464
features.
6565
66-
cv: int, default=3
67-
Desired number of cross-validation fold to be used to fit the estimator.
66+
cv: int, cross-validation generator or an iterable, default=3
67+
Determines the cross-validation splitting strategy. Possible inputs for cv are:
68+
69+
- None, to use cross_validate's default 5-fold cross validation
70+
71+
- int, to specify the number of folds in a (Stratified)KFold,
72+
73+
- CV splitter
74+
- (https://scikit-learn.org/stable/glossary.html#term-CV-splitter)
75+
76+
- An iterable yielding (train, test) splits as arrays of indices.
77+
78+
For int/None inputs, if the estimator is a classifier and y is either binary or
79+
multiclass, StratifiedKFold is used. In all other cases, KFold is used. These
80+
splitters are instantiated with shuffle=False so the splits will be the same
81+
across calls.
82+
83+
For more details check Scikit-learn's cross_validate documentation
6884
6985
random_state: int, default=None
7086
Controls the randomness when shuffling features.
@@ -100,15 +116,12 @@ def __init__(
100116
self,
101117
estimator,
102118
scoring: str = "roc_auc",
103-
cv: int = 3,
119+
cv=3,
104120
threshold: Union[float, int] = None,
105121
variables: Variables = None,
106122
random_state: int = None,
107123
):
108124

109-
if not isinstance(cv, int) or cv < 1:
110-
raise ValueError("cv can only take positive integers bigger than 1")
111-
112125
if threshold and not isinstance(threshold, (int, float)):
113126
raise ValueError("threshold can only be integer or float or None")
114127

feature_engine/selection/single_feature_performance.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,24 @@ class SelectBySingleFeaturePerformance(BaseSelector):
5757
The threshold can be specified by the user. If None, it will be automatically
5858
set to the mean performance value of all features.
5959
60-
cv: int, default=3
61-
Desired number of cross-validation fold to be used to fit the estimator.
60+
cv: int, cross-validation generator or an iterable, default=3
61+
Determines the cross-validation splitting strategy. Possible inputs for cv are:
62+
63+
- None, to use cross_validate's default 5-fold cross validation
64+
65+
- int, to specify the number of folds in a (Stratified)KFold,
66+
67+
- CV splitter
68+
- (https://scikit-learn.org/stable/glossary.html#term-CV-splitter)
69+
70+
- An iterable yielding (train, test) splits as arrays of indices.
71+
72+
For int/None inputs, if the estimator is a classifier and y is either binary or
73+
multiclass, StratifiedKFold is used. In all other cases, KFold is used. These
74+
splitters are instantiated with shuffle=False so the splits will be the same
75+
across calls.
76+
77+
For more details check Scikit-learn's cross_validate documentation
6278
6379
Attributes
6480
----------
@@ -88,14 +104,11 @@ def __init__(
88104
self,
89105
estimator,
90106
scoring: str = "roc_auc",
91-
cv: int = 3,
107+
cv=3,
92108
threshold: Union[int, float] = None,
93109
variables: Variables = None,
94110
):
95111

96-
if not isinstance(cv, int) or cv < 1:
97-
raise ValueError("cv can only take positive integers bigger than 1")
98-
99112
if threshold:
100113
if not isinstance(threshold, (int, float)):
101114
raise ValueError("threshold can only be integer, float or None")

feature_engine/selection/smart_correlation_selection.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -83,8 +83,24 @@ class SmartCorrelatedSelection(BaseSelector):
8383
sklearn.metrics. See the model evaluation documentation for more options:
8484
https://scikit-learn.org/stable/modules/model_evaluation.html
8585
86-
cv: int, default=3
87-
Cross-validation fold to be used to fit the estimator.
86+
cv: int, cross-validation generator or an iterable, default=3
87+
Determines the cross-validation splitting strategy. Possible inputs for cv are:
88+
89+
- None, to use cross_validate's default 5-fold cross validation
90+
91+
- int, to specify the number of folds in a (Stratified)KFold,
92+
93+
- CV splitter
94+
- (https://scikit-learn.org/stable/glossary.html#term-CV-splitter)
95+
96+
- An iterable yielding (train, test) splits as arrays of indices.
97+
98+
For int/None inputs, if the estimator is a classifier and y is either binary or
99+
multiclass, StratifiedKFold is used. In all other cases, KFold is used. These
100+
splitters are instantiated with shuffle=False so the splits will be the same
101+
across calls.
102+
103+
For more details check Scikit-learn's cross_validate documentation
88104
89105
Attributes
90106
----------
@@ -124,7 +140,7 @@ def __init__(
124140
selection_method: str = "missing_values",
125141
estimator=None,
126142
scoring: str = "roc_auc",
127-
cv: int = 3,
143+
cv=3,
128144
):
129145

130146
if method not in ["pearson", "spearman", "kendall"]:
@@ -149,9 +165,6 @@ def __init__(
149165
"'variance' or 'model_performance'."
150166
)
151167

152-
if not isinstance(cv, int) or cv < 1:
153-
raise ValueError("cv can only take positive integers bigger than 1")
154-
155168
if selection_method == "model_performance" and estimator is None:
156169
raise ValueError(
157170
"Please provide an estimator, e.g., "

tests/test_selection/test_recursive_feature_addition.py

Lines changed: 81 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from sklearn.ensemble import RandomForestClassifier
55
from sklearn.exceptions import NotFittedError
66
from sklearn.linear_model import LinearRegression
7+
from sklearn.model_selection import KFold, StratifiedKFold
78
from sklearn.tree import DecisionTreeRegressor
89

910
from feature_engine.selection import RecursiveFeatureAddition
@@ -146,11 +147,6 @@ def test_non_fitted_error(df_test):
146147
sel.transform(df_test)
147148

148149

149-
def test_raises_cv_error():
150-
with pytest.raises(ValueError):
151-
RecursiveFeatureAddition(RandomForestClassifier(random_state=1), cv=0)
152-
153-
154150
def test_raises_threshold_error():
155151
with pytest.raises(ValueError):
156152
RecursiveFeatureAddition(RandomForestClassifier(random_state=1), threshold=None)
@@ -225,3 +221,83 @@ def test_automatic_variable_selection(df_test):
225221
assert list(sel.performance_drifts_.keys()) == ordered_features
226222
# test transform output
227223
pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)
224+
225+
226+
def test_KFold_generators(df_test):
227+
228+
X, y = df_test
229+
230+
# Kfold
231+
sel = RecursiveFeatureAddition(
232+
RandomForestClassifier(random_state=1),
233+
threshold=0.001,
234+
cv=KFold(n_splits=3),
235+
)
236+
sel.fit(X, y)
237+
Xtransformed = sel.transform(X)
238+
239+
# test fit attrs
240+
assert sel.initial_model_performance_ > 0.995
241+
assert isinstance(sel.features_to_drop_, list)
242+
assert all([x for x in sel.features_to_drop_ if x in X.columns])
243+
assert len(sel.features_to_drop_) < X.shape[1]
244+
assert not Xtransformed.empty
245+
assert all([x for x in Xtransformed.columns if x not in sel.features_to_drop_])
246+
assert isinstance(sel.performance_drifts_, dict)
247+
assert all([x for x in X.columns if x in sel.performance_drifts_.keys()])
248+
assert all(
249+
[
250+
isinstance(sel.performance_drifts_[var], (int, float))
251+
for var in sel.performance_drifts_.keys()
252+
]
253+
)
254+
255+
# Stratified
256+
sel = RecursiveFeatureAddition(
257+
RandomForestClassifier(random_state=1),
258+
threshold=0.001,
259+
cv=StratifiedKFold(n_splits=3),
260+
)
261+
sel.fit(X, y)
262+
Xtransformed = sel.transform(X)
263+
264+
# test fit attrs
265+
assert sel.initial_model_performance_ > 0.995
266+
assert isinstance(sel.features_to_drop_, list)
267+
assert all([x for x in sel.features_to_drop_ if x in X.columns])
268+
assert len(sel.features_to_drop_) < X.shape[1]
269+
assert not Xtransformed.empty
270+
assert all([x for x in Xtransformed.columns if x not in sel.features_to_drop_])
271+
assert isinstance(sel.performance_drifts_, dict)
272+
assert all([x for x in X.columns if x in sel.performance_drifts_.keys()])
273+
assert all(
274+
[
275+
isinstance(sel.performance_drifts_[var], (int, float))
276+
for var in sel.performance_drifts_.keys()
277+
]
278+
)
279+
280+
# None
281+
sel = RecursiveFeatureAddition(
282+
RandomForestClassifier(random_state=1),
283+
threshold=0.001,
284+
cv=None,
285+
)
286+
sel.fit(X, y)
287+
Xtransformed = sel.transform(X)
288+
289+
# test fit attrs
290+
assert sel.initial_model_performance_ > 0.995
291+
assert isinstance(sel.features_to_drop_, list)
292+
assert all([x for x in sel.features_to_drop_ if x in X.columns])
293+
assert len(sel.features_to_drop_) < X.shape[1]
294+
assert not Xtransformed.empty
295+
assert all([x for x in Xtransformed.columns if x not in sel.features_to_drop_])
296+
assert isinstance(sel.performance_drifts_, dict)
297+
assert all([x for x in X.columns if x in sel.performance_drifts_.keys()])
298+
assert all(
299+
[
300+
isinstance(sel.performance_drifts_[var], (int, float))
301+
for var in sel.performance_drifts_.keys()
302+
]
303+
)

0 commit comments

Comments
 (0)