Commit b11205b

fix r2 bug in selectors
* fix r2 bug
* add example of target mean selection
1 parent afe3bdf commit b11205b

File tree

3 files changed: +108 -31 lines changed

docs/selection/SelectByTargetMeanPerformance.rst

Lines changed: 95 additions & 1 deletion
@@ -11,4 +11,98 @@ API Reference
 Example
 -------

-Coming soon
+.. code:: python
+
+    import pandas as pd
+    import numpy as np
+    from sklearn.model_selection import train_test_split
+    from sklearn.metrics import roc_auc_score
+    from feature_engine.selection import SelectByTargetMeanPerformance
+
+    # load data
+    data = pd.read_csv('../titanic.csv')
+
+    # extract cabin letter
+    data['cabin'] = data['cabin'].str[0]
+
+    # replace infrequent cabins by N
+    data['cabin'] = np.where(data['cabin'].isin(['T', 'G']), 'N', data['cabin'])
+
+    # cap maximum values
+    data['parch'] = np.where(data['parch']>3,3,data['parch'])
+    data['sibsp'] = np.where(data['sibsp']>3,3,data['sibsp'])
+
+    # cast variables as object to treat as categorical
+    data[['pclass','sibsp','parch']] = data[['pclass','sibsp','parch']].astype('O')
+
+    # separate train and test sets
+    X_train, X_test, y_train, y_test = train_test_split(
+        data.drop(['survived'], axis=1),
+        data['survived'],
+        test_size=0.3,
+        random_state=0)
+
+
+    # feature engine automates the selection for both categorical and numerical
+    # variables
+    sel = SelectByTargetMeanPerformance(
+        variables=None,
+        scoring="roc_auc_score",
+        threshold=0.6,
+        bins=3,
+        strategy="equal_frequency",
+        cv=2,  # cross validation
+        random_state=1,  # seed for reproducibility
+    )
+
+    # find important features
+    sel.fit(X_train, y_train)
+
+    sel.variables_categorical_
+
+.. code:: python
+
+    ['pclass', 'sex', 'sibsp', 'parch', 'cabin', 'embarked']
+
+.. code:: python
+
+    sel.variables_numerical_
+
+.. code:: python
+
+    ['age', 'fare']
+
+.. code:: python
+
+    sel.feature_performance_
+
+.. code:: python
+
+    {'pclass': 0.6802934787230475,
+     'sex': 0.7491365252482871,
+     'age': 0.5345141148737766,
+     'sibsp': 0.5720480307315783,
+     'parch': 0.5243557188989476,
+     'fare': 0.6600883312700917,
+     'cabin': 0.6379782658154696,
+     'embarked': 0.5672382248783936}
+
+.. code:: python
+
+    sel.features_to_drop_
+
+.. code:: python
+
+    ['age', 'sibsp', 'parch', 'embarked']
+
+.. code:: python
+
+    # remove features
+    X_train = sel.transform(X_train)
+    X_test = sel.transform(X_test)
+
+    X_train.shape, X_test.shape
+
+.. code:: python
+
+    ((914, 4), (392, 4))
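For context on the scores reported in feature_performance_ above: the selector replaces each categorical level (or numerical bin) of a feature with the mean of the target observed for that group, and then scores the encoded feature alone against the target. The toy sketch below illustrates that idea for a single categorical variable; the data, names and numbers are made up, and there is no cross-validation or binning, so it will not reproduce the selector's exact output.

.. code:: python

    import pandas as pd
    from sklearn.metrics import roc_auc_score

    # toy data: one categorical feature and a binary target
    X = pd.DataFrame({"sex": ["male", "female", "female", "male", "female", "male"]})
    y = pd.Series([0, 1, 1, 0, 1, 1])

    # replace each category by the mean of the target observed for it
    encoding = y.groupby(X["sex"]).mean()   # female -> 1.0, male -> 0.33
    pred = X["sex"].map(encoding)

    # score the encoded feature as if it were a prediction for the target
    roc_auc_score(y, pred)   # 0.875 for this toy data

A feature whose score stays below the threshold (0.6 in the example above) ends up in features_to_drop_.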

feature_engine/selection/single_feature_performance.py

Lines changed: 6 additions & 15 deletions
@@ -1,7 +1,6 @@
 from typing import List, Union
 import warnings

-import numpy as np
 import pandas as pd
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.model_selection import cross_validate
@@ -46,8 +45,8 @@ class SelectBySingleFeaturePerformance(BaseSelector):
     threshold : float, int, default = None
         The value that defines if a feature will be kept or removed.

-        For r2, the transformer will consider absolute values to select features. So,
-        for a threshold of 0.5, features with r2 > 0.5 or r2 < -0.5 will be selected.
+        The r2 varies between 0 and 1. So a threshold needs to be set up within
+        these boundaries.

         The roc-auc varies between 0.5 and 1. So a threshold needs to be set-up within
         these boundaries.
@@ -158,18 +157,10 @@ def fit(self, X: pd.DataFrame, y: pd.Series):
         else:
             threshold = self.threshold

-        if self.scoring == "r2":
-            # take the absolute value
-            self.features_to_drop_ = [
-                f
-                for f in self.feature_performance_.keys()
-                if np.abs(self.feature_performance_[f]) < threshold
-            ]
-        else:
-            self.features_to_drop_ = [
-                f
-                for f in self.feature_performance_.keys()
-                if self.feature_performance_[f] < threshold
+        self.features_to_drop_ = [
+            f
+            for f in self.feature_performance_.keys()
+            if self.feature_performance_[f] < threshold
         ]

         # check we are not dropping all the columns in the df
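The threshold described in the docstring above is applied by SelectBySingleFeaturePerformance, which fits one model per feature and drops the features whose cross-validated score falls below it. A minimal usage sketch follows; the estimator, scoring and cv arguments are assumptions based on the released Feature-engine API and the imports visible in this file, not on anything shown in this diff.

.. code:: python

    from sklearn.datasets import load_breast_cancer
    from sklearn.ensemble import RandomForestClassifier
    from feature_engine.selection import SelectBySingleFeaturePerformance

    # a small binary-classification dataset as a pandas DataFrame
    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    # one model per feature; keep only features whose cross-validated
    # roc-auc reaches the threshold
    sel = SelectBySingleFeaturePerformance(
        estimator=RandomForestClassifier(random_state=0),  # assumed parameter name
        scoring="roc_auc",                                 # assumed scorer string
        cv=3,                                              # assumed parameter name
        threshold=0.6,
    )
    sel.fit(X, y)

    sel.feature_performance_   # per-feature cross-validated scores
    sel.features_to_drop_      # features scoring below the threshold

    X_reduced = sel.transform(X)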

feature_engine/selection/target_mean_selection.py

Lines changed: 7 additions & 15 deletions
@@ -1,6 +1,5 @@
 from typing import List, Union

-import numpy as np
 import pandas as pd
 from sklearn.metrics import roc_auc_score, r2_score
 from sklearn.model_selection import StratifiedKFold
@@ -80,8 +79,7 @@ class SelectByTargetMeanPerformance(BaseSelector):
         The current implementation supports 'roc_auc_score' and 'r2_score'.

     threshold : float, default = 0.5
-        The performance threshold above which a feature will be selected.If scoring is
-        'r2_score', the selector evaluates the absolute value.
+        The performance threshold above which a feature will be selected.

     bins : int, default = 5
         If the dataset contains numerical variables, the number of bins into which
@@ -247,18 +245,12 @@ def fit(self, X: pd.DataFrame, y: pd.Series):
             axis=1
         ).to_dict()

-        if self.scoring == "roc_auc_score":
-            self.features_to_drop_ = [
-                f
-                for f in self.variables
-                if self.feature_performance_[f] < self.threshold
-            ]
-        else:
-            self.features_to_drop_ = [
-                f
-                for f in self.variables
-                if np.abs(self.feature_performance_[f]) < self.threshold
-            ]
+        self.features_to_drop_ = [
+            f
+            for f in self.variables
+            if self.feature_performance_[f] < self.threshold
+        ]
+
         self.input_shape_ = X.shape

         return self
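Both selectors now apply the same rule: the raw score is compared with the threshold, and no absolute value is taken for r2. The made-up scores below illustrate why this matters: a feature with a strongly negative r2 is uninformative and is now dropped rather than kept.

.. code:: python

    # hypothetical per-feature r2 scores, for illustration only
    feature_performance_ = {"x1": 0.45, "x2": -0.70, "x3": 0.80}
    threshold = 0.5

    # old behaviour: absolute values were compared, so x2 was kept
    old_drop = [f for f, s in feature_performance_.items() if abs(s) < threshold]
    # ['x1']

    # new behaviour: the raw score is compared, so x2 is dropped as well
    new_drop = [f for f, s in feature_performance_.items() if s < threshold]
    # ['x1', 'x2']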

0 commit comments