Skip to content

Commit 8536994

Browse files
daniel-sch authored and rasbt committed
Use values instead of DataFrame for all modes of SequentialFeatureSelector (#506)
* Use values instead of DataFrame for all modes of SequentialFeatureSelector (fixes #505)
* Add bugfix description to changelog
1 parent 1ca3059 commit 8536994

File tree

3 files changed

+88
-40
lines changed

3 files changed

+88
-40
lines changed

docs/sources/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ The CHANGELOG for the current development version is available at
2828
- The `feature_selection.ColumnSelector` now also supports column names of type `int` (in addition to `str` names) if the input is a pandas DataFrame. ([#500](https://github.com/rasbt/mlxtend/pull/500) via [tetrar124](https://github.com/tetrar124))
2929
- Fix unreadable labels in `plot_confusion_matrix` for imbalanced datasets if `show_absolute=True` and `show_normed=True`. ([#504](https://github.com/rasbt/mlxtend/pull/504))
3030
- Raises a more informative error if a `SparseDataFrame` is passed to `apriori` and the dataframe has integer column names that don't start with `0` due to current limitations of the `SparseDataFrame` implementation in pandas. ([#503](https://github.com/rasbt/mlxtend/pull/503))
31+
- SequentialFeatureSelector now supports DataFrame as input for all operating modes (forward/backward/floating). [#506](https://github.com/rasbt/mlxtend/pull/506)
3132

3233
### Version 0.15.0 (01-19-2019)
3334

mlxtend/feature_selection/sequential_feature_selector.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -352,7 +352,7 @@ def fit(self, X, y, custom_feature_names=None, **fit_params):
352352

353353
k_idx, k_score, cv_scores = self._exclusion(
354354
feature_set=prev_subset,
355-
X=X,
355+
X=X_,
356356
y=y,
357357
**fit_params
358358
)
@@ -378,7 +378,7 @@ def fit(self, X, y, custom_feature_names=None, **fit_params):
378378
k_idx_c, k_score_c, cv_scores_c = self._exclusion(
379379
feature_set=k_idx,
380380
fixed_feature=new_feature,
381-
X=X,
381+
X=X_,
382382
y=y,
383383
**fit_params
384384
)
@@ -387,7 +387,7 @@ def fit(self, X, y, custom_feature_names=None, **fit_params):
387387
k_idx_c, k_score_c, cv_scores_c = self._inclusion(
388388
orig_set=orig_set - {new_feature},
389389
subset=set(k_idx),
390-
X=X,
390+
X=X_,
391391
y=y,
392392
**fit_params
393393
)

mlxtend/feature_selection/tests/test_sequential_feature_selector.py

Lines changed: 84 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -809,43 +809,90 @@ def test_max_feature_subset_parsimonious():
809809

810810

811811
def test_check_pandas_dataframe_fit():
812-
813-
iris = load_iris()
814-
X = iris.data
815-
y = iris.target
816-
lr = SoftmaxRegression(random_seed=1)
817-
sfs1 = SFS(lr,
818-
k_features=2,
819-
forward=True,
820-
floating=False,
821-
scoring='accuracy',
822-
cv=0,
823-
verbose=0,
824-
n_jobs=1)
825-
826-
df = pd.DataFrame(X, columns=['sepal len', 'sepal width',
827-
'petal len', 'petal width'])
828-
829-
sfs1 = sfs1.fit(X, y)
830-
assert sfs1.k_feature_idx_ == (1, 3)
831-
assert sfs1.k_feature_names_ == ('1', '3')
832-
assert sfs1.subsets_[2]['feature_names'] == ('1', '3')
833-
834-
sfs1 = sfs1.fit(df, y)
835-
assert sfs1.subsets_[1]['feature_names'] == ('petal width',)
836-
assert sfs1.subsets_[2]['feature_names'] == ('sepal width', 'petal width')
837-
assert sfs1.subsets_[1]['feature_idx'] == (3,)
838-
assert sfs1.subsets_[2]['feature_idx'] == (1, 3)
839-
assert sfs1.k_feature_idx_ == (1, 3)
840-
assert sfs1.k_feature_names_ == ('sepal width', 'petal width')
841-
842-
sfs1._TESTING_INTERRUPT_MODE = True
843-
out = sfs1.fit(df, y)
844-
assert len(out.subsets_.keys()) > 0
845-
assert sfs1.interrupted_
846-
assert sfs1.subsets_[1]['feature_names'] == ('petal width',)
847-
assert sfs1.k_feature_idx_ == (3,)
848-
assert sfs1.k_feature_names_ == ('petal width',)
812+
for floating in [True, False]:
813+
iris = load_iris()
814+
X = iris.data
815+
y = iris.target
816+
lr = SoftmaxRegression(random_seed=1)
817+
sfs1 = SFS(lr,
818+
k_features=2,
819+
forward=True,
820+
floating=floating,
821+
scoring='accuracy',
822+
cv=0,
823+
verbose=0,
824+
n_jobs=1)
825+
826+
df = pd.DataFrame(X, columns=['sepal len', 'sepal width',
827+
'petal len', 'petal width'])
828+
829+
sfs1 = sfs1.fit(X, y)
830+
assert sfs1.k_feature_idx_ == (1, 3)
831+
assert sfs1.k_feature_names_ == ('1', '3')
832+
assert sfs1.subsets_[2]['feature_names'] == ('1', '3')
833+
834+
sfs1 = sfs1.fit(df, y)
835+
assert sfs1.subsets_[1]['feature_names'] == ('petal width',)
836+
assert sfs1.subsets_[2]['feature_names'] == ('sepal width',
837+
'petal width')
838+
assert sfs1.subsets_[1]['feature_idx'] == (3,)
839+
assert sfs1.subsets_[2]['feature_idx'] == (1, 3)
840+
assert sfs1.k_feature_idx_ == (1, 3)
841+
assert sfs1.k_feature_names_ == ('sepal width', 'petal width')
842+
843+
sfs1._TESTING_INTERRUPT_MODE = True
844+
out = sfs1.fit(df, y)
845+
assert len(out.subsets_.keys()) > 0
846+
assert sfs1.interrupted_
847+
assert sfs1.subsets_[1]['feature_names'] == ('petal width',)
848+
assert sfs1.k_feature_idx_ == (3,)
849+
assert sfs1.k_feature_names_ == ('petal width',)
850+
851+
852+
def test_check_pandas_dataframe_fit_backward():
853+
for floating in [True, False]:
854+
iris = load_iris()
855+
X = iris.data
856+
y = iris.target
857+
lr = SoftmaxRegression(random_seed=1)
858+
sfs1 = SFS(lr,
859+
k_features=2,
860+
forward=False,
861+
floating=floating,
862+
scoring='accuracy',
863+
cv=0,
864+
verbose=0,
865+
n_jobs=1)
866+
867+
df = pd.DataFrame(X, columns=['sepal len', 'sepal width',
868+
'petal len', 'petal width'])
869+
870+
sfs1 = sfs1.fit(X, y)
871+
assert sfs1.k_feature_idx_ == (1, 2)
872+
assert sfs1.k_feature_names_ == ('1', '2')
873+
assert sfs1.subsets_[2]['feature_names'] == ('1', '2')
874+
875+
sfs1 = sfs1.fit(df, y)
876+
assert sfs1.subsets_[3]['feature_names'] == ('sepal len',
877+
'sepal width',
878+
'petal len')
879+
assert sfs1.subsets_[2]['feature_names'] == ('sepal width',
880+
'petal len')
881+
assert sfs1.subsets_[3]['feature_idx'] == (0, 1, 2)
882+
assert sfs1.subsets_[2]['feature_idx'] == (1, 2)
883+
assert sfs1.k_feature_idx_ == (1, 2)
884+
assert sfs1.k_feature_names_ == ('sepal width', 'petal len')
885+
886+
sfs1._TESTING_INTERRUPT_MODE = True
887+
out = sfs1.fit(df, y)
888+
assert len(out.subsets_.keys()) > 0
889+
assert sfs1.interrupted_
890+
assert sfs1.subsets_[3]['feature_names'] == ('sepal len',
891+
'sepal width',
892+
'petal len')
893+
assert sfs1.k_feature_idx_ == (0, 1, 2)
894+
assert sfs1.k_feature_names_ == ('sepal len', 'sepal width',
895+
'petal len')
849896

850897

851898
def test_check_pandas_dataframe_transform():

0 commit comments

Comments (0)