Commit b11205b

fix r2 bug in selectors
* fix r2 bug
* add example of target mean selection
1 parent afe3bdf commit b11205b

File tree

3 files changed: +108 -31 lines changed

docs/selection/SelectByTargetMeanPerformance.rst

Lines changed: 95 additions & 1 deletion
@@ -11,4 +11,98 @@ API Reference
 Example
 -------

-Coming soon
+.. code:: python
+
+    import pandas as pd
+    import numpy as np
+    from sklearn.model_selection import train_test_split
+    from sklearn.metrics import roc_auc_score
+    from feature_engine.selection import SelectByTargetMeanPerformance
+
+    # load data
+    data = pd.read_csv('../titanic.csv')
+
+    # extract cabin letter
+    data['cabin'] = data['cabin'].str[0]
+
+    # replace infrequent cabins by N
+    data['cabin'] = np.where(data['cabin'].isin(['T', 'G']), 'N', data['cabin'])
+
+    # cap maximum values
+    data['parch'] = np.where(data['parch']>3,3,data['parch'])
+    data['sibsp'] = np.where(data['sibsp']>3,3,data['sibsp'])
+
+    # cast variables as object to treat as categorical
+    data[['pclass','sibsp','parch']] = data[['pclass','sibsp','parch']].astype('O')
+
+    # separate train and test sets
+    X_train, X_test, y_train, y_test = train_test_split(
+        data.drop(['survived'], axis=1),
+        data['survived'],
+        test_size=0.3,
+        random_state=0)
+
+
+    # feature engine automates the selection for both categorical and numerical
+    # variables
+    sel = SelectByTargetMeanPerformance(
+        variables=None,
+        scoring="roc_auc_score",
+        threshold=0.6,
+        bins=3,
+        strategy="equal_frequency",
+        cv=2,  # cross validation
+        random_state=1,  # seed for reproducibility
+    )
+
+    # find important features
+    sel.fit(X_train, y_train)
+
+    sel.variables_categorical_
+
+.. code:: python
+
+    ['pclass', 'sex', 'sibsp', 'parch', 'cabin', 'embarked']
+
+.. code:: python
+
+    sel.variables_numerical_
+
+.. code:: python
+
+    ['age', 'fare']
+
+.. code:: python
+
+    sel.feature_performance_
+
+.. code:: python
+
+    {'pclass': 0.6802934787230475,
+     'sex': 0.7491365252482871,
+     'age': 0.5345141148737766,
+     'sibsp': 0.5720480307315783,
+     'parch': 0.5243557188989476,
+     'fare': 0.6600883312700917,
+     'cabin': 0.6379782658154696,
+     'embarked': 0.5672382248783936}
+
+.. code:: python
+
+    sel.features_to_drop_
+
+.. code:: python
+
+    ['age', 'sibsp', 'parch', 'embarked']
+
+.. code:: python
+
+    # remove features
+    X_train = sel.transform(X_train)
+    X_test = sel.transform(X_test)
+
+    X_train.shape, X_test.shape
+
+.. code:: python
+
+    ((914, 4), (392, 4))
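For context on the scores reported in feature_performance_ above: the selector replaces each categorical level (or numerical bin) of a feature with the mean of the target observed for that group, and then scores the encoded feature alone against the target. The toy sketch below illustrates that idea for a single categorical variable; the data, names and numbers are made up, and there is no cross-validation or binning, so it will not reproduce the selector's exact output.

.. code:: python

    import pandas as pd
    from sklearn.metrics import roc_auc_score

    # toy data: one categorical feature and a binary target
    X = pd.DataFrame({"sex": ["male", "female", "female", "male", "female", "male"]})
    y = pd.Series([0, 1, 1, 0, 1, 1])

    # replace each category by the mean of the target observed for it
    encoding = y.groupby(X["sex"]).mean()   # female -> 1.0, male -> 0.33
    pred = X["sex"].map(encoding)

    # score the encoded feature as if it were a prediction for the target
    roc_auc_score(y, pred)   # 0.875 for this toy data

A feature whose score stays below the threshold (0.6 in the example above) ends up in features_to_drop_.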

feature_engine/selection/single_feature_performance.py

Lines changed: 6 additions & 15 deletions
@@ -1,7 +1,6 @@
 from typing import List, Union
 import warnings

-import numpy as np
 import pandas as pd
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.model_selection import cross_validate
@@ -46,8 +45,8 @@ class SelectBySingleFeaturePerformance(BaseSelector):
     threshold : float, int, default = None
         The value that defines if a feature will be kept or removed.

-        For r2, the transformer will consider absolute values to select features. So,
-        for a threshold of 0.5, features with r2 > 0.5 or r2 < -0.5 will be selected.
+        The r2 varies between 0 and 1. So a threshold needs to be set up within
+        these boundaries.

         The roc-auc varies between 0.5 and 1. So a threshold needs to be set-up within
         these boundaries.
@@ -158,18 +157,10 @@ def fit(self, X: pd.DataFrame, y: pd.Series):
         else:
             threshold = self.threshold

-        if self.scoring == "r2":
-            # take the absolute value
-            self.features_to_drop_ = [
-                f
-                for f in self.feature_performance_.keys()
-                if np.abs(self.feature_performance_[f]) < threshold
-            ]
-        else:
-            self.features_to_drop_ = [
-                f
-                for f in self.feature_performance_.keys()
-                if self.feature_performance_[f] < threshold
+        self.features_to_drop_ = [
+            f
+            for f in self.feature_performance_.keys()
+            if self.feature_performance_[f] < threshold
         ]

         # check we are not dropping all the columns in the df
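The threshold described in the docstring above is applied by SelectBySingleFeaturePerformance, which fits one model per feature and drops the features whose cross-validated score falls below it. A minimal usage sketch follows; the estimator, scoring and cv arguments are assumptions based on the released Feature-engine API and the imports visible in this file, not on anything shown in this diff.

.. code:: python

    from sklearn.datasets import load_breast_cancer
    from sklearn.ensemble import RandomForestClassifier
    from feature_engine.selection import SelectBySingleFeaturePerformance

    # a small binary-classification dataset as a pandas DataFrame
    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    # one model per feature; keep only features whose cross-validated
    # roc-auc reaches the threshold
    sel = SelectBySingleFeaturePerformance(
        estimator=RandomForestClassifier(random_state=0),  # assumed parameter name
        scoring="roc_auc",                                 # assumed scorer string
        cv=3,                                              # assumed parameter name
        threshold=0.6,
    )
    sel.fit(X, y)

    sel.feature_performance_   # per-feature cross-validated scores
    sel.features_to_drop_      # features scoring below the threshold

    X_reduced = sel.transform(X)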

feature_engine/selection/target_mean_selection.py

Lines changed: 7 additions & 15 deletions
@@ -1,6 +1,5 @@
 from typing import List, Union

-import numpy as np
 import pandas as pd
 from sklearn.metrics import roc_auc_score, r2_score
 from sklearn.model_selection import StratifiedKFold
@@ -80,8 +79,7 @@ class SelectByTargetMeanPerformance(BaseSelector):
         The current implementation supports 'roc_auc_score' and 'r2_score'.

     threshold : float, default = 0.5
-        The performance threshold above which a feature will be selected.If scoring is
-        'r2_score', the selector evaluates the absolute value.
+        The performance threshold above which a feature will be selected.

     bins : int, default = 5
         If the dataset contains numerical variables, the number of bins into which
@@ -247,18 +245,12 @@ def fit(self, X: pd.DataFrame, y: pd.Series):
             axis=1
         ).to_dict()

-        if self.scoring == "roc_auc_score":
-            self.features_to_drop_ = [
-                f
-                for f in self.variables
-                if self.feature_performance_[f] < self.threshold
-            ]
-        else:
-            self.features_to_drop_ = [
-                f
-                for f in self.variables
-                if np.abs(self.feature_performance_[f]) < self.threshold
-            ]
+        self.features_to_drop_ = [
+            f
+            for f in self.variables
+            if self.feature_performance_[f] < self.threshold
+        ]
+
         self.input_shape_ = X.shape

         return self
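Both selectors now apply the same rule: the raw score is compared with the threshold, and no absolute value is taken for r2. The made-up scores below illustrate why this matters: a feature with a strongly negative r2 is uninformative and is now dropped rather than kept.

.. code:: python

    # hypothetical per-feature r2 scores, for illustration only
    feature_performance_ = {"x1": 0.45, "x2": -0.70, "x3": 0.80}
    threshold = 0.5

    # old behaviour: absolute values were compared, so x2 was kept
    old_drop = [f for f, s in feature_performance_.items() if abs(s) < threshold]
    # ['x1']

    # new behaviour: the raw score is compared, so x2 is dropped as well
    new_drop = [f for f, s in feature_performance_.items() if s < threshold]
    # ['x1', 'x2']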

0 commit comments