Skip to content

Commit 8536994

Browse files
daniel-sch authored and rasbt committed
Use values instead of DataFrame for all modes of SequentialFeatureSelector (#506)
* Use values instead of DataFrame for all modes of SequentialFeatureSelector (fixes #505)
* Add bugfix description to changelog
1 parent 1ca3059 commit 8536994

File tree

3 files changed

+88
-40
lines changed

3 files changed

+88
-40
lines changed

docs/sources/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ The CHANGELOG for the current development version is available at
2828
- The `feature_selection.ColumnSelector` now also supports column names of type `int` (in addition to `str` names) if the input is a pandas DataFrame. ([#500](https://github.com/rasbt/mlxtend/pull/500) via [tetrar124](https://github.com/tetrar124))
2929
- Fix unreadable labels in `plot_confusion_matrix` for imbalanced datasets if `show_absolute=True` and `show_normed=True`. ([#504](https://github.com/rasbt/mlxtend/pull/504))
3030
- Raises a more informative error if a `SparseDataFrame` is passed to `apriori` and the dataframe has integer column names that don't start with `0` due to current limitations of the `SparseDataFrame` implementation in pandas. ([#503](https://github.com/rasbt/mlxtend/pull/503))
31+
- SequentialFeatureSelector now supports DataFrame as input for all operating modes (forward/backward/floating). [#506](https://github.com/rasbt/mlxtend/pull/506)
3132

3233
### Version 0.15.0 (01-19-2019)
3334

mlxtend/feature_selection/sequential_feature_selector.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -352,7 +352,7 @@ def fit(self, X, y, custom_feature_names=None, **fit_params):
352352

353353
k_idx, k_score, cv_scores = self._exclusion(
354354
feature_set=prev_subset,
355-
X=X,
355+
X=X_,
356356
y=y,
357357
**fit_params
358358
)
@@ -378,7 +378,7 @@ def fit(self, X, y, custom_feature_names=None, **fit_params):
378378
k_idx_c, k_score_c, cv_scores_c = self._exclusion(
379379
feature_set=k_idx,
380380
fixed_feature=new_feature,
381-
X=X,
381+
X=X_,
382382
y=y,
383383
**fit_params
384384
)
@@ -387,7 +387,7 @@ def fit(self, X, y, custom_feature_names=None, **fit_params):
387387
k_idx_c, k_score_c, cv_scores_c = self._inclusion(
388388
orig_set=orig_set - {new_feature},
389389
subset=set(k_idx),
390-
X=X,
390+
X=X_,
391391
y=y,
392392
**fit_params
393393
)

mlxtend/feature_selection/tests/test_sequential_feature_selector.py

Lines changed: 84 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -809,43 +809,90 @@ def test_max_feature_subset_parsimonious():
809809

810810

811811
def test_check_pandas_dataframe_fit():
812-
813-
iris = load_iris()
814-
X = iris.data
815-
y = iris.target
816-
lr = SoftmaxRegression(random_seed=1)
817-
sfs1 = SFS(lr,
818-
k_features=2,
819-
forward=True,
820-
floating=False,
821-
scoring='accuracy',
822-
cv=0,
823-
verbose=0,
824-
n_jobs=1)
825-
826-
df = pd.DataFrame(X, columns=['sepal len', 'sepal width',
827-
'petal len', 'petal width'])
828-
829-
sfs1 = sfs1.fit(X, y)
830-
assert sfs1.k_feature_idx_ == (1, 3)
831-
assert sfs1.k_feature_names_ == ('1', '3')
832-
assert sfs1.subsets_[2]['feature_names'] == ('1', '3')
833-
834-
sfs1 = sfs1.fit(df, y)
835-
assert sfs1.subsets_[1]['feature_names'] == ('petal width',)
836-
assert sfs1.subsets_[2]['feature_names'] == ('sepal width', 'petal width')
837-
assert sfs1.subsets_[1]['feature_idx'] == (3,)
838-
assert sfs1.subsets_[2]['feature_idx'] == (1, 3)
839-
assert sfs1.k_feature_idx_ == (1, 3)
840-
assert sfs1.k_feature_names_ == ('sepal width', 'petal width')
841-
842-
sfs1._TESTING_INTERRUPT_MODE = True
843-
out = sfs1.fit(df, y)
844-
assert len(out.subsets_.keys()) > 0
845-
assert sfs1.interrupted_
846-
assert sfs1.subsets_[1]['feature_names'] == ('petal width',)
847-
assert sfs1.k_feature_idx_ == (3,)
848-
assert sfs1.k_feature_names_ == ('petal width',)
812+
for floating in [True, False]:
813+
iris = load_iris()
814+
X = iris.data
815+
y = iris.target
816+
lr = SoftmaxRegression(random_seed=1)
817+
sfs1 = SFS(lr,
818+
k_features=2,
819+
forward=True,
820+
floating=floating,
821+
scoring='accuracy',
822+
cv=0,
823+
verbose=0,
824+
n_jobs=1)
825+
826+
df = pd.DataFrame(X, columns=['sepal len', 'sepal width',
827+
'petal len', 'petal width'])
828+
829+
sfs1 = sfs1.fit(X, y)
830+
assert sfs1.k_feature_idx_ == (1, 3)
831+
assert sfs1.k_feature_names_ == ('1', '3')
832+
assert sfs1.subsets_[2]['feature_names'] == ('1', '3')
833+
834+
sfs1 = sfs1.fit(df, y)
835+
assert sfs1.subsets_[1]['feature_names'] == ('petal width',)
836+
assert sfs1.subsets_[2]['feature_names'] == ('sepal width',
837+
'petal width')
838+
assert sfs1.subsets_[1]['feature_idx'] == (3,)
839+
assert sfs1.subsets_[2]['feature_idx'] == (1, 3)
840+
assert sfs1.k_feature_idx_ == (1, 3)
841+
assert sfs1.k_feature_names_ == ('sepal width', 'petal width')
842+
843+
sfs1._TESTING_INTERRUPT_MODE = True
844+
out = sfs1.fit(df, y)
845+
assert len(out.subsets_.keys()) > 0
846+
assert sfs1.interrupted_
847+
assert sfs1.subsets_[1]['feature_names'] == ('petal width',)
848+
assert sfs1.k_feature_idx_ == (3,)
849+
assert sfs1.k_feature_names_ == ('petal width',)
850+
851+
852+
def test_check_pandas_dataframe_fit_backward():
853+
for floating in [True, False]:
854+
iris = load_iris()
855+
X = iris.data
856+
y = iris.target
857+
lr = SoftmaxRegression(random_seed=1)
858+
sfs1 = SFS(lr,
859+
k_features=2,
860+
forward=False,
861+
floating=floating,
862+
scoring='accuracy',
863+
cv=0,
864+
verbose=0,
865+
n_jobs=1)
866+
867+
df = pd.DataFrame(X, columns=['sepal len', 'sepal width',
868+
'petal len', 'petal width'])
869+
870+
sfs1 = sfs1.fit(X, y)
871+
assert sfs1.k_feature_idx_ == (1, 2)
872+
assert sfs1.k_feature_names_ == ('1', '2')
873+
assert sfs1.subsets_[2]['feature_names'] == ('1', '2')
874+
875+
sfs1 = sfs1.fit(df, y)
876+
assert sfs1.subsets_[3]['feature_names'] == ('sepal len',
877+
'sepal width',
878+
'petal len')
879+
assert sfs1.subsets_[2]['feature_names'] == ('sepal width',
880+
'petal len')
881+
assert sfs1.subsets_[3]['feature_idx'] == (0, 1, 2)
882+
assert sfs1.subsets_[2]['feature_idx'] == (1, 2)
883+
assert sfs1.k_feature_idx_ == (1, 2)
884+
assert sfs1.k_feature_names_ == ('sepal width', 'petal len')
885+
886+
sfs1._TESTING_INTERRUPT_MODE = True
887+
out = sfs1.fit(df, y)
888+
assert len(out.subsets_.keys()) > 0
889+
assert sfs1.interrupted_
890+
assert sfs1.subsets_[3]['feature_names'] == ('sepal len',
891+
'sepal width',
892+
'petal len')
893+
assert sfs1.k_feature_idx_ == (0, 1, 2)
894+
assert sfs1.k_feature_names_ == ('sepal len', 'sepal width',
895+
'petal len')
849896

850897

851898
def test_check_pandas_dataframe_transform():

0 commit comments

Comments (0)