Skip to content

Commit bbd95e8

Browse files
datacubeR and solegalli
authored
Add code examples to Feature Selection Module API (#582)
* Adding Examples for Feature Selection Module
* Fixing examples after review
* Update recursive_feature_elimination.py
* Update smart_correlation_selection.py
* Update shuffle_features.py

Co-authored-by: datacubeR <[email protected]>
Co-authored-by: Soledad Galli <[email protected]>
1 parent aa78270 commit bbd95e8

12 files changed

+272
-0
lines changed

feature_engine/selection/drop_constant_features.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,35 @@ class DropConstantFeatures(BaseSelector):
101101
See Also
102102
--------
103103
sklearn.feature_selection.VarianceThreshold
104+
105+
Examples
106+
--------
107+
108+
>>> import pandas as pd
109+
>>> from feature_engine.selection import DropConstantFeatures
110+
>>> X = pd.DataFrame(dict(x1 = [1,1,1,1],
111+
... x2 = ["a", "a", "b", "c"],
112+
... x3 = [True, False, False, True]))
113+
>>> dcf = DropConstantFeatures()
114+
>>> dcf.fit_transform(X)
115+
x2 x3
116+
0 a True
117+
1 a False
118+
2 b False
119+
3 c True
120+
121+
Additionally, you can set the Threshold for quasi-constant features:
122+
123+
>>> X = pd.DataFrame(dict(x1 = [1,1,1,1],
124+
... x2 = ["a", "a", "b", "c"],
125+
... x3 = [True, False, False, False]))
126+
>>> dcf = DropConstantFeatures(tol = 0.75)
127+
>>> dcf.fit_transform(X)
128+
x2
129+
0 a
130+
1 a
131+
2 b
132+
3 c
104133
"""
105134

106135
def __init__(

feature_engine/selection/drop_correlated_features.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,20 @@ class DropCorrelatedFeatures(BaseSelector):
112112
--------
113113
pandas.corr
114114
feature_engine.selection.SmartCorrelationSelection
115+
116+
Examples
117+
--------
118+
119+
>>> import pandas as pd
120+
>>> from feature_engine.selection import DropCorrelatedFeatures
121+
>>> X = pd.DataFrame(dict(x1 = [1,2,1,1], x2 = [2,4,3,1], x3 = [1, 0, 0, 1]))
122+
>>> dcf = DropCorrelatedFeatures(threshold=0.7)
123+
>>> dcf.fit_transform(X)
124+
x1 x3
125+
0 1 1
126+
1 2 0
127+
2 1 0
128+
3 1 1
115129
"""
116130

117131
def __init__(

feature_engine/selection/drop_duplicate_features.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,21 @@ class DropDuplicateFeatures(BaseSelector):
9090
transform:
9191
Remove duplicated features.
9292
93+
Examples
94+
--------
95+
96+
>>> import pandas as pd
97+
>>> from feature_engine.selection import DropDuplicateFeatures
98+
>>> X = pd.DataFrame(dict(x1 = [1,1,1,1],
99+
... x2 = [1,1,1,1],
100+
... x3 = [True, False, False, False]))
101+
>>> ddf = DropDuplicateFeatures()
102+
>>> ddf.fit_transform(X)
103+
x1 x3
104+
0 1 True
105+
1 1 False
106+
2 1 False
107+
3 1 False
93108
"""
94109

95110
def __init__(

feature_engine/selection/drop_features.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,22 @@ class DropFeatures(BaseSelector):
5252
5353
transform:
5454
Drops indicated features.
55+
56+
Examples
57+
--------
58+
59+
>>> import pandas as pd
60+
>>> from feature_engine.selection import DropFeatures
61+
>>> X = pd.DataFrame(dict(x1 = [1,2,3,4],
62+
... x2 = ["a", "a", "b", "c"],
63+
... x3 = [True, False, False, True]))
64+
>>> df = DropFeatures(features_to_drop=["x2"])
65+
>>> df.fit_transform(X)
66+
x1 x3
67+
0 1 True
68+
1 2 False
69+
2 3 False
70+
3 4 True
5571
"""
5672

5773
def __init__(self, features_to_drop: List[Union[str, int]]):

feature_engine/selection/drop_psi_features.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,28 @@ class DropHighPSIFeatures(BaseSelector):
213213
.. [1] Yurdakul B. "Statistical properties of population stability index".
214214
Western Michigan University, 2018.
215215
https://scholarworks.wmich.edu/dissertations/3208/
216+
217+
Examples
218+
--------
219+
220+
>>> import pandas as pd
221+
>>> from feature_engine.selection import DropHighPSIFeatures
222+
>>> X = pd.DataFrame(dict(x1 = [1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
223+
... x2 = [32,87,6,32,11,44,8,7,9,0,32,87,6,32,11,44,8,7,9,0]))
224+
>>> psi = DropHighPSIFeatures()
225+
>>> psi.fit_transform(X)
226+
x2
227+
0 32
228+
1 87
229+
2 6
230+
3 32
231+
4 11
232+
5 44
233+
6 8
234+
7 7
235+
8 9
236+
9 0
237+
10 32
216238
"""
217239

218240
def __init__(

feature_engine/selection/information_value.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,25 @@ class SelectByInformationValue(BaseSelector, WoE):
141141
142142
.. [2] WoE and IV for continuous variables
143143
https://www.listendata.com/2019/08/WOE-IV-Continuous-Dependent.html
144+
145+
Examples
146+
--------
147+
148+
>>> import pandas as pd
149+
>>> from feature_engine.selection import SelectByInformationValue
150+
>>> X = pd.DataFrame(dict(x1 = [1,1,1,1,1,1],
151+
... x2 = [3,2,2,3,3,2],
152+
... x3 = ["a","b","c","a","c","b"]))
153+
>>> y = pd.Series([1,1,1,0,0,0])
154+
>>> iv = SelectByInformationValue()
155+
>>> iv.fit_transform(X, y)
156+
x2
157+
0 3
158+
1 2
159+
2 2
160+
3 3
161+
4 3
162+
5 2
144163
"""
145164

146165
def __init__(

feature_engine/selection/recursive_feature_addition.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,27 @@ class RecursiveFeatureAddition(BaseRecursiveSelector):
112112
113113
{transform}
114114
115+
Examples
116+
--------
117+
118+
>>> import pandas as pd
119+
>>> from sklearn.ensemble import RandomForestClassifier
120+
>>> from feature_engine.selection import RecursiveFeatureAddition
121+
>>> X = pd.DataFrame(dict(x1 = [1000,2000,1000,1000,2000,3000],
122+
... x2 = [2,4,3,1,2,2],
123+
... x3 = [1,1,1,0,0,0],
124+
... x4 = [1,2,1,1,0,1],
125+
... x5 = [1,1,1,1,1,1]))
126+
>>> y = pd.Series([1,0,0,1,1,0])
127+
>>> rfa = RecursiveFeatureAddition(RandomForestClassifier(random_state=42), cv=2)
128+
>>> rfa.fit_transform(X, y)
129+
x2 x4
130+
0 2 1
131+
1 4 2
132+
2 3 1
133+
3 1 1
134+
4 2 0
135+
5 2 1
115136
"""
116137

117138
def fit(self, X: pd.DataFrame, y: pd.Series):

feature_engine/selection/recursive_feature_elimination.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,27 @@ class RecursiveFeatureElimination(BaseRecursiveSelector):
112112
113113
{transform}
114114
115+
Examples
116+
--------
117+
118+
>>> import pandas as pd
119+
>>> from sklearn.ensemble import RandomForestClassifier
120+
>>> from feature_engine.selection import RecursiveFeatureElimination
121+
>>> X = pd.DataFrame(dict(x1 = [1000,2000,1000,1000,2000,3000],
122+
... x2 = [2,4,3,1,2,2],
123+
... x3 = [1,1,1,0,0,0],
124+
... x4 = [1,2,1,1,0,1],
125+
... x5 = [1,1,1,1,1,1]))
126+
>>> y = pd.Series([1,0,0,1,1,0])
127+
>>> rfe = RecursiveFeatureElimination(RandomForestClassifier(random_state=2), cv=2)
128+
>>> rfe.fit_transform(X, y)
129+
x2
130+
0 2
131+
1 4
132+
2 3
133+
3 1
134+
4 2
135+
5 2
115136
"""
116137

117138
def fit(self, X: pd.DataFrame, y: pd.Series):

feature_engine/selection/shuffle_features.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,32 @@ class SelectByShuffling(BaseSelector):
134134
See Also
135135
--------
136136
sklearn.inspection.permutation_importance
137+
138+
Examples
139+
--------
140+
141+
>>> import pandas as pd
142+
>>> from sklearn.ensemble import RandomForestClassifier
143+
>>> from feature_engine.selection import SelectByShuffling
144+
>>> X = pd.DataFrame(dict(x1 = [1000,2000,1000,1000,2000,3000],
145+
... x2 = [2,4,3,1,2,2],
145+
... x3 = [1,1,1,0,0,0],
146+
... x4 = [1,2,1,1,0,1],
147+
... x5 = [1,1,1,1,1,1]))
149+
>>> y = pd.Series([1,0,0,1,1,0])
150+
>>> sbs = SelectByShuffling(
151+
... RandomForestClassifier(random_state=42),
151+
... cv=2,
152+
... random_state=42,
153+
... )
155+
>>> sbs.fit_transform(X, y)
156+
x2 x4 x5
157+
0 2 1 1
158+
1 4 2 1
159+
2 3 1 1
160+
3 1 1 1
161+
4 2 0 1
162+
5 2 1 1
137163
"""
138164

139165
def __init__(

feature_engine/selection/single_feature_performance.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,30 @@ class SelectBySingleFeaturePerformance(BaseSelector):
115115
116116
.. [1] Galli S. "Machine Learning in Financial Risk Assessment".
117117
https://www.youtube.com/watch?v=KHGGlozsRtA
118+
119+
Examples
120+
--------
121+
122+
>>> import pandas as pd
123+
>>> from sklearn.ensemble import RandomForestClassifier
124+
>>> from feature_engine.selection import SelectBySingleFeaturePerformance
125+
>>> X = pd.DataFrame(dict(x1 = [1000,2000,1000,1000,2000,3000],
126+
... x2 = [2,4,3,1,2,2],
126+
... x3 = [1,1,1,0,0,0],
127+
... x4 = [1,2,1,1,0,1],
128+
... x5 = [1,1,1,1,1,1]))
130+
>>> y = pd.Series([1,0,0,1,1,0])
131+
>>> sfp = SelectBySingleFeaturePerformance(
132+
... RandomForestClassifier(random_state=42),
132+
... cv=2)
134+
>>> sfp.fit_transform(X, y)
135+
x2 x3
136+
0 2 1
137+
1 4 1
138+
2 3 1
139+
3 1 0
140+
4 2 0
141+
5 2 0
118142
"""
119143

120144
def __init__(

0 commit comments

Comments
 (0)