
Commit dbf990e

unify selector params, fix various bugs, add seed to shuffle; closes #192 (#201)

* unifies param to drop features, fixes automatic variable selection
* fix stylechecks
* creates baseselector and unifies classes
* fix flake error
1 parent 6b99174 · commit dbf990e

18 files changed: +512 additions, -473 deletions

feature_engine/selection/base_selector.py

Lines changed: 39 additions & 0 deletions
@@ -1,4 +1,12 @@
 import numpy as np
+import pandas as pd
+from sklearn.base import TransformerMixin, BaseEstimator
+from sklearn.utils.validation import check_is_fitted
+
+from feature_engine.dataframe_checks import (
+    _is_dataframe,
+    _check_input_matches_training_df,
+)


 def get_feature_importances(estimator):
@@ -17,3 +25,34 @@ def get_feature_importances(estimator):
             importances = np.linalg.norm(coef_, axis=0, ord=len(estimator.coef_))

     return list(importances)
+
+
+class BaseSelector(BaseEstimator, TransformerMixin):
+    """Transformation shared by all selectors"""
+
+    def transform(self, X: pd.DataFrame):
+        """
+        Return dataframe with selected features.
+
+        Parameters
+        ----------
+        X : pandas dataframe of shape = [n_samples, n_features].
+            The input dataframe.
+
+        Returns
+        -------
+        X_transformed: pandas dataframe of shape = [n_samples, n_selected_features]
+            Pandas dataframe with the selected features.
+        """
+
+        # check if fit is performed prior to transform
+        check_is_fitted(self)
+
+        # check if input is a dataframe
+        X = _is_dataframe(X)
+
+        # check if number of columns in test dataset matches to train dataset
+        _check_input_matches_training_df(X, self.input_shape_[1])
+
+        # return the dataframe with the selected features
+        return X.drop(columns=self.features_to_drop_)
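
Note (not part of the commit, for illustration only): a concrete selector built on the new BaseSelector only needs a fit() that records input_shape_ and features_to_drop_; the shared transform() then handles the fitted check, the dataframe check, the column-count check and the drop. The class below is hypothetical and kept deliberately minimal; the inherited TransformerMixin also provides fit_transform() for free.

import pandas as pd

from feature_engine.dataframe_checks import _is_dataframe
from feature_engine.selection.base_selector import BaseSelector


class DropAllNumericFeatures(BaseSelector):
    """Hypothetical selector used only to illustrate the BaseSelector contract."""

    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        # validate the input with the same helper the library uses
        X = _is_dataframe(X)

        # whatever fit() stores here is dropped by the inherited transform()
        self.features_to_drop_ = X.select_dtypes(include="number").columns.tolist()

        # the inherited transform() checks new data against this shape
        self.input_shape_ = X.shape

        return self

    # transform() is inherited from BaseSelector unchanged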

feature_engine/selection/drop_constant_features.py

Lines changed: 12 additions & 41 deletions
@@ -2,23 +2,21 @@

 import numpy as np
 import pandas as pd
-from sklearn.base import BaseEstimator, TransformerMixin
-from sklearn.utils.validation import check_is_fitted

 from feature_engine.dataframe_checks import (
     _is_dataframe,
-    _check_input_matches_training_df,
     _check_contains_na,
 )
 from feature_engine.variable_manipulation import (
     _check_input_parameter_variables,
     _find_all_variables,
 )
+from feature_engine.selection.base_selector import BaseSelector

 Variables = Union[None, int, str, List[Union[str, int]]]


-class DropConstantFeatures(TransformerMixin, BaseEstimator):
+class DropConstantFeatures(BaseSelector):
     """
     Drop constant and quasi-constant variables from a dataframe. Constant variables
     show the same value across all the observations in the dataset. Quasi-constant
@@ -51,7 +49,7 @@ class DropConstantFeatures(TransformerMixin, BaseEstimator):

     Attributes
     ----------
-    constant_features_:
+    features_to_drop_:
         List with constant and quasi-constant features.

     Methods
@@ -90,7 +88,6 @@ def __init__(
         self.missing_values = missing_values

     def fit(self, X: pd.DataFrame, y: pd.Series = None):
-
         """
         Find constant and quasi-constant features.

@@ -121,13 +118,13 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None):

         # find constant features
         if self.tol == 1:
-            self.constant_features_ = [
+            self.features_to_drop_ = [
                 feature for feature in self.variables if X[feature].nunique() == 1
             ]

         # find constant and quasi-constant features
         else:
-            self.constant_features_ = []
+            self.features_to_drop_ = []

             for feature in self.variables:
                 # find most frequent value / category in the variable
@@ -138,10 +135,10 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None):
                 )

                 if predominant >= self.tol:
-                    self.constant_features_.append(feature)
+                    self.features_to_drop_.append(feature)

         # check we are not dropping all the columns in the df
-        if len(self.constant_features_) == len(X.columns):
+        if len(self.features_to_drop_) == len(X.columns):
             raise ValueError(
                 "The resulting dataframe will have no columns after dropping all "
                 "constant or quasi-constant features. Try changing the tol value."
@@ -151,36 +148,10 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None):

         return self

-    def transform(self, X: pd.DataFrame):
-        """
-        Drop the constant and quasi-constant features from a dataframe.
-
-        Parameters
-        ----------
-        X : pandas dataframe of shape = [n_samples, n_features].
-            The input samples.
-
-        Returns
-        -------
-        X_transformed : pandas dataframe,
-            shape = [n_samples, n_features - (constant + quasi_constant features)]
-            The transformed dataframe with the remaining subset of variables.
-        """
-
-        # check if fit is performed prior to transform
-        check_is_fitted(self)
-
-        # check if input is a dataframe
-        X = _is_dataframe(X)
-
-        # check if number of columns in test dataset matches to train dataset
-        _check_input_matches_training_df(X, self.input_shape_[1])
-
-        if self.missing_values == "raise":
-            # check if dataset contains na
-            _check_contains_na(X, self.variables)
-
-        # returned selected features
-        X = X.drop(columns=self.constant_features_)
+    # Ugly work around to import the docstring for Sphinx, otherwise not necessary
+    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
+        X = super().transform(X)

         return X
+
+    transform.__doc__ = BaseSelector.transform.__doc__
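
For context only, a usage sketch of the renamed attribute (not from the commit; the toy data and the feature_engine.selection import path are assumptions): after fit(), the features flagged for removal are exposed as features_to_drop_ instead of constant_features_.

import pandas as pd

from feature_engine.selection import DropConstantFeatures  # assumed import path

df = pd.DataFrame({
    "const": [1, 1, 1, 1],          # constant -> dropped
    "quasi": ["a", "a", "a", "b"],  # one category in 75% of rows
    "varied": [1, 2, 3, 4],
})

selector = DropConstantFeatures(tol=0.7)  # quasi-constant threshold
df_t = selector.fit_transform(df)

print(selector.features_to_drop_)  # expected: ['const', 'quasi']
print(df_t.columns.tolist())       # expected: ['varied']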

feature_engine/selection/drop_correlated_features.py

Lines changed: 16 additions & 48 deletions
@@ -1,23 +1,21 @@
 from typing import List, Union

 import pandas as pd
-from sklearn.base import TransformerMixin, BaseEstimator
-from sklearn.utils.validation import check_is_fitted

 from feature_engine.dataframe_checks import (
     _is_dataframe,
-    _check_input_matches_training_df,
     _check_contains_na,
 )
 from feature_engine.variable_manipulation import (
     _find_or_check_numerical_variables,
     _check_input_parameter_variables,
 )
+from feature_engine.selection.base_selector import BaseSelector

 Variables = Union[None, int, str, List[Union[str, int]]]


-class DropCorrelatedFeatures(BaseEstimator, TransformerMixin):
+class DropCorrelatedFeatures(BaseSelector):
     """
     DropCorrelatedFeatures() finds and removes correlated features. Correlation is
     calculated with `pandas.corr()`.
@@ -52,14 +50,11 @@ class DropCorrelatedFeatures(BaseEstimator, TransformerMixin):

     Attributes
     ----------
-    correlated_features_:
-        Set with the correlated features.
+    features_to_drop_:
+        Set with the correlated features that will be dropped.

     correlated_feature_sets_:
-        Groups of correlated features. Each list is a group of correlated features.
-
-    correlated_matrix_:
-        The correlation matrix.
+        Groups of correlated features. Each list is a group of correlated features.

     Methods
     -------
@@ -128,20 +123,20 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None):
             _check_contains_na(X, self.variables)

         # set to collect features that are correlated
-        self.correlated_features_ = set()
+        self.features_to_drop_ = set()

         # create tuples of correlated feature groups
         self.correlated_feature_sets_ = []

         # the correlation matrix
-        self.correlated_matrix_ = X[self.variables].corr(method=self.method)
+        _correlated_matrix = X[self.variables].corr(method=self.method)

         # create set of examined features, helps to determine feature combinations
         # to evaluate below
         _examined_features = set()

         # for each feature in the dataset (columns of the correlation matrix)
-        for feature in self.correlated_matrix_.columns:
+        for feature in _correlated_matrix.columns:

             if feature not in _examined_features:

@@ -155,20 +150,18 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None):
                 # features that have not been examined, are not currently examined and
                 # were not found correlated
                 _features_to_compare = [
-                    f
-                    for f in self.correlated_matrix_.columns
-                    if f not in _examined_features
+                    f for f in _correlated_matrix.columns if f not in _examined_features
                 ]

                 # create combinations:
                 for f2 in _features_to_compare:

                     # if the correlation is higher than the threshold
                     # we are interested in absolute correlation coefficient value
-                    if abs(self.correlated_matrix_.loc[f2, feature]) > self.threshold:
+                    if abs(_correlated_matrix.loc[f2, feature]) > self.threshold:

                         # add feature (f2) to our correlated set
-                        self.correlated_features_.add(f2)
+                        self.features_to_drop_.add(f2)
                         _temp_set.add(f2)
                         _examined_features.add(f2)

@@ -180,35 +173,10 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None):

         return self

-    def transform(self, X):
-        """
-        Drop the correlated features from a dataframe.
-
-        Parameters
-        ----------
-        X : pandas dataframe of shape = [n_samples, n_features].
-            The input samples.
-
-        Returns
-        -------
-        X_transformed : pandas dataframe
-            shape = [n_samples, n_features - (correlated features)]
-            The transformed dataframe with the remaining subset of variables.
-        """
-        # check if fit is performed prior to transform
-        check_is_fitted(self)
-
-        # check if input is a dataframe
-        X = _is_dataframe(X)
-
-        # check if number of columns in test dataset matches to train dataset
-        _check_input_matches_training_df(X, self.input_shape_[1])
-
-        if self.missing_values == "raise":
-            # check if dataset contains na
-            _check_contains_na(X, self.variables)
-
-        # returned non-correlated features
-        X = X.drop(columns=self.correlated_features_)
+    # Ugly work around to import the docstring for Sphinx, otherwise not necessary
+    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
+        X = super().transform(X)

         return X
+
+    transform.__doc__ = BaseSelector.transform.__doc__
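
Similarly for DropCorrelatedFeatures, a hedged usage sketch (not from the commit; the data and the expected outputs are illustrative): features_to_drop_ replaces correlated_features_, correlated_feature_sets_ is kept, and the correlation matrix is no longer stored on the transformer.

import numpy as np
import pandas as pd

from feature_engine.selection import DropCorrelatedFeatures  # assumed import path

rng = np.random.RandomState(42)
x1 = rng.normal(size=200)
df = pd.DataFrame({
    "x1": x1,
    "x2": 2 * x1 + rng.normal(scale=0.01, size=200),  # near-duplicate of x1
    "x3": rng.normal(size=200),                       # independent
})

selector = DropCorrelatedFeatures(threshold=0.9)
df_t = selector.fit_transform(df)

print(selector.features_to_drop_)         # expected: {'x2'}
print(selector.correlated_feature_sets_)  # expected: [{'x1', 'x2'}]
print(df_t.columns.tolist())              # expected: ['x1', 'x3']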
