Update dfs transformer features arg to be empty list for fast mode (#3875)

tamargrey · chukarsten · web-flow · commit eb02c4423eb2 · 2022-12-07T14:08:35.000-05:00
* Handle features with dataframe names that aren't 'X'

* Explain how this change relates to TODO

* Add release note

* Explain assumption around dataframe name

* Fix punctuation

* Clean up

* Update dfs transformer features arg to be empty list for fast mode

* Add release note

* Remove comment

* Make behavior of empty list of features more explicit through tests and docs

* Address PR comments

Co-authored-by: chukarsten <64713315+chukarsten@users.noreply.github.com>
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -4,6 +4,7 @@ Release Notes
     * Enhancements
     * Fixes
         * Allowed the DFS Transformer to calculate feature values for Features with a ``dataframe_name`` that is not ``"X"`` :pr:`3873`
+        * Stopped passing full list of DFS Transformer features into cloned pipeline in partial dependence fast mode :pr:`3875`
     * Changes
         * Remove Int64Index after Pandas 1.5 Upgrade :pr:`3825`
         * Reduced the threshold for setting ``use_covariates`` to False for ARIMA models in AutoMLSearch :pr:`3868`
diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py
@@ -353,7 +353,9 @@ class AutoMLSearch:
             to `multiclass` or `regression` depending on the problem type. Note that if allowed_pipelines is provided,
             this parameter will be ignored.
 
-        features (list)[FeatureBase]: List of features to run DFS on AutoML pipelines. Defaults to None. Features will only be computed if the columns used by the feature exist in the search input and if the feature itself is not in search input.
+        features (list)[FeatureBase]: List of features to run DFS on AutoML pipelines. Defaults to None.
+            Features will only be computed if the columns used by the feature exist in the search input
+            and if the feature itself is not in search input. If features is an empty list, the DFS Transformer will not be included in pipelines.
 
         data_splitter (sklearn.model_selection.BaseCrossValidator): Data splitting method to use. Defaults to StratifiedKFold.
 
diff --git a/evalml/pipelines/components/transformers/preprocessing/featuretools.py b/evalml/pipelines/components/transformers/preprocessing/featuretools.py
@@ -13,7 +13,7 @@ class DFSTransformer(Transformer):
         index (string): The name of the column that contains the indices. If no column with this name exists,
             then featuretools.EntitySet() creates a column with this name to serve as the index column. Defaults to 'index'.
         random_seed (int): Seed for the random number generator. Defaults to 0.
-        features (list)[FeatureBase]: List of features to run DFS on. Defaults to None. Features will only be computed if the columns used by the feature exist in the input and if the feature itself is not in input.
+        features (list)[FeatureBase]: List of features to run DFS on. Defaults to None. Features will only be computed if the columns used by the feature exist in the input and if the feature itself is not in input. If features is an empty list, no transformation will occur to inputted data.
     """
 
     name = "DFS Transformer"
@@ -27,7 +27,7 @@ def __init__(self, index="index", features=None, random_seed=0, **kwargs):
 
         self.index = index
         self.features = features
-        self._passed_in_features = True if features else None
+        self._passed_in_features = True if features is not None else None
         # If features are passed in, they'll have a dataframe_name we should utilize.
         # Assumes all features were created from the same dataframe, which may not be true
         # if the EntitySet used to create them had multiple dataframes.
@@ -166,4 +166,7 @@ def _handle_partial_dependence_fast_mode(self, pipeline_parameters, X, target):
                 raise ValueError(
                     "Cannot use fast mode with DFS Transformer when features are unspecified or not all present in X.",
                 )
+            # Pass in empty list of features so we don't run calculate feature matrix
+            # which would happen with the full set of features for a single column at refit
+            pipeline_parameters["DFS Transformer"]["features"] = []
         return pipeline_parameters
diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py
@@ -4831,6 +4831,39 @@ def test_automl_accepts_features(
         )
 
 
+@pytest.mark.parametrize(
+    "automl_algorithm",
+    ["iterative", "default"],
+)
+def test_automl_with_empty_features_list(
+    automl_algorithm,
+    X_y_binary,
+    AutoMLTestEnv,
+):
+    X, y = X_y_binary
+    X = pd.DataFrame(X)  # Drop ww information since setting column types fails
+    X.columns = X.columns.astype(str)
+
+    automl = AutoMLSearch(
+        X_train=X,
+        y_train=y,
+        problem_type="binary",
+        optimize_thresholds=False,
+        max_batches=3,
+        features=[],
+        automl_algorithm=automl_algorithm,
+    )
+
+    assert automl.automl_algorithm.features == []
+    env = AutoMLTestEnv("binary")
+    with env.test_context(score_return_value={automl.objective.name: 1.0}):
+        automl.search()
+
+    assert all(
+        ["DFS Transformer" not in p for p in automl.full_rankings["parameters"][1:]],
+    )
+
+
 @pytest.mark.skip_during_conda
 def test_automl_with_iterative_algorithm_puts_ts_estimators_first(
     ts_data,
diff --git a/evalml/tests/component_tests/test_featuretools.py b/evalml/tests/component_tests/test_featuretools.py
@@ -241,6 +241,41 @@ def test_dfs_with_serialized_features_dataframe_name(
     assert es_df_name == expected_dataframe_name
 
 
+@patch("evalml.pipelines.components.transformers.preprocessing.featuretools.dfs")
+@patch(
+    "evalml.pipelines.components.transformers.preprocessing.featuretools.calculate_feature_matrix",
+)
+def test_dfs_with_empty_input_features(
+    mock_calculate_feature_matrix,
+    mock_dfs,
+    X_y_binary,
+):
+    """Confirms that the features arg being an empty list is not treated the same as
+    it being unspecified."""
+    X, y = X_y_binary
+    X_pd = pd.DataFrame(X)
+    X_pd.columns = X_pd.columns.astype(str)
+
+    # Check DFS Transformer with empty features list
+    dfs_empty_features = DFSTransformer(features=[])
+    dfs_empty_features.fit(X_pd)  # no-op
+    assert not mock_dfs.called
+
+    X_t_empty_features = dfs_empty_features.transform(X_pd)
+    assert not mock_calculate_feature_matrix.called
+    assert_frame_equal(X_pd, X_t_empty_features)
+    assert not dfs_empty_features.features
+
+    # Check DFS Transformer with features list set to None
+    dfs_unspecified_features = DFSTransformer(features=None)
+    dfs_unspecified_features.fit(X_pd)
+    assert mock_dfs.called
+
+    dfs_unspecified_features.transform(X_pd)
+    assert mock_calculate_feature_matrix.called
+    assert dfs_unspecified_features.features
+
+
 @patch("evalml.pipelines.components.transformers.preprocessing.featuretools.dfs")
 @patch(
     "evalml.pipelines.components.transformers.preprocessing.featuretools.calculate_feature_matrix",
diff --git a/evalml/tests/model_understanding_tests/test_partial_dependence.py b/evalml/tests/model_understanding_tests/test_partial_dependence.py
@@ -7,6 +7,7 @@
 import pytest
 import woodwork as ww
 
+from evalml.automl.automl_search import AutoMLSearch
 from evalml.exceptions import (
     NullsInColumnWarning,
     PartialDependenceError,
@@ -2934,3 +2935,69 @@ def test_partial_dependence_dfs_transformer_target_in_features(fast_mode, X_y_bi
 
     assert part_dep.feature_values.notnull().all()
     assert part_dep.partial_dependence.notnull().all()
+
+
+@patch(
+    "evalml.pipelines.components.transformers.preprocessing.featuretools.calculate_feature_matrix",
+)
+def test_partial_dependence_dfs_transformer_does_not_calculate_feature_matrix(
+    mock_calculate_feature_matrix,
+    X_y_binary,
+    AutoMLTestEnv,
+):
+    """Tests that the DFS Transformer doesn't ever have to call calculate feature matrix
+    in partial dependence fast mode. This is important, because it ensures that we are doing
+    the exact same calculations as slow mode."""
+    X, y = X_y_binary
+    X = pd.DataFrame(X)
+    X.columns = X.columns.astype(str)
+
+    es = ft.EntitySet()
+    es = es.add_dataframe(
+        dataframe_name="data",
+        dataframe=X,
+        index="index",
+        make_index=True,
+    )
+    X_fm, features = ft.dfs(
+        entityset=es,
+        target_dataframe_name="data",
+        trans_primitives=["absolute", "add_numeric"],
+    )
+    env = AutoMLTestEnv("binary")
+    automl = AutoMLSearch(
+        X_train=X_fm,
+        y_train=y,
+        problem_type="binary",
+        optimize_thresholds=False,
+        max_iterations=2,
+        features=features,
+        automl_algorithm="default",
+    )
+
+    with env.test_context(score_return_value={automl.objective.name: 1.0}):
+        automl.search()
+
+    assert not mock_calculate_feature_matrix.called
+    pipeline = automl.get_pipeline(1)
+    pipeline.fit(X_fm, y)
+    part_dep = partial_dependence(
+        pipeline,
+        X_fm,
+        features=0,
+        grid_resolution=2,
+    )
+    fast_part_dep = partial_dependence(
+        pipeline,
+        X_fm,
+        features=0,
+        grid_resolution=2,
+        fast_mode=True,
+        X_train=X_fm,
+        y_train=y,
+    )
+    assert not mock_calculate_feature_matrix.called
+
+    assert part_dep.feature_values.notnull().all()
+    assert part_dep.partial_dependence.notnull().all()
+    pd.testing.assert_frame_equal(part_dep, fast_part_dep)