allow sample weight in shuffle features (#662)

solegalli · web-flow · commit 4d55ed7bb279 · 2023-04-27T09:44:37.000+02:00
* allow sample weight in shuffle features

* add additional tag with test that fails for sample_weights
diff --git a/feature_engine/selection/shuffle_features.py b/feature_engine/selection/shuffle_features.py
@@ -5,7 +5,7 @@
 from sklearn.base import is_classifier
 from sklearn.metrics import get_scorer
 from sklearn.model_selection import check_cv, cross_validate
-from sklearn.utils.validation import check_random_state
+from sklearn.utils.validation import check_random_state, _check_sample_weight
 
 from feature_engine._docstrings.fit_attributes import (
     _feature_names_in_docstring,
@@ -185,16 +185,25 @@ def __init__(
         self.cv = cv
         self.random_state = random_state
 
-    def fit(self, X: pd.DataFrame, y: pd.Series):
+    def fit(
+        self,
+        X: pd.DataFrame,
+        y: pd.Series,
+        sample_weight: Union[np.array, pd.Series, List] = None,
+    ):
         """
         Find the important features.
 
         Parameters
         ----------
         X: pandas dataframe of shape = [n_samples, n_features]
            The input dataframe.
+
         y: array-like of shape (n_samples)
            Target variable. Required to train the estimator.
+
+        sample_weight : array-like of shape (n_samples,), default=None
+            Sample weights. If None, then samples are equally weighted.
         """
 
         X, y = check_X_y(X, y)
@@ -203,6 +212,9 @@ def fit(self, X: pd.DataFrame, y: pd.Series):
         X = X.reset_index(drop=True)
         y = y.reset_index(drop=True)
 
+        if sample_weight is not None:
+            sample_weight = _check_sample_weight(sample_weight, X)
+
         # If required exclude variables that are not in the input dataframe
         self._confirm_variables(X)
 
@@ -220,6 +232,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series):
             cv=self.cv,
             return_estimator=True,
             scoring=self.scoring,
+            fit_params={"sample_weight": sample_weight},
         )
 
         # store initial model performance
diff --git a/feature_engine/tags.py b/feature_engine/tags.py
@@ -14,6 +14,7 @@ def _return_tags():
             # The test aims to check that the check_X_y function from sklearn is
             # working, but we do not use that check, because we work with dfs.
             "check_transformer_data_not_an_array": "Ok to fail",
+            "check_sample_weights_not_an_array": "Ok to fail",
             # TODO: we probably need the test below!!
             "check_methods_sample_order_invariance": "Test does not work on dataframes",
             # TODO: we probably need the test below!!
diff --git a/tests/test_selection/test_shuffle_features.py b/tests/test_selection/test_shuffle_features.py
@@ -134,3 +134,21 @@ def test_automatic_variable_selection(df_test):
     ]
     # test transform output
     pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)
+
+
+def test_sample_weights():
+    X = pd.DataFrame(
+        dict(
+            x1=[1000, 2000, 1000, 1000, 2000, 3000],
+            x2=[1000, 2000, 1000, 1000, 2000, 3000],
+        )
+    )
+    y = pd.Series([1, 0, 0, 1, 1, 0])
+
+    sbs = SelectByShuffling(
+        RandomForestClassifier(random_state=42), cv=2, random_state=42
+    )
+
+    sample_weight = [1000, 2000, 1000, 1000, 2000, 3000]
+    sbs.fit_transform(X, y, sample_weight=sample_weight)
+    assert sbs.initial_model_performance_ == 0.125