Skip to content

Commit eb02c44

Browse files
Update dfs transformer features arg to be empty list for fast mode (#3875)
* Handle features with dataframe names that aren't 'X'
* Explain how this change relates to TODO
* Add release note
* Explain assumption around dataframe name
* Fix punctuation
* cleanup
* Update dfs transformer features arg to be empty list for fast mode
* Add release note
* remove comment
* make behavior of empty list of features more explicit through tests and docs
* PR comments

Co-authored-by: chukarsten <64713315+chukarsten@users.noreply.github.com>
1 parent 9fedf3b commit eb02c44

File tree

6 files changed

+144
-3
lines changed

6 files changed

+144
-3
lines changed

docs/source/release_notes.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ Release Notes
44
* Enhancements
55
* Fixes
66
* Allowed the DFS Transformer to calculate feature values for Features with a ``dataframe_name`` that is not ``"X"`` :pr:`3873`
7+
* Stopped passing full list of DFS Transformer features into cloned pipeline in partial dependence fast mode :pr:`3875`
78
* Changes
89
* Remove Int64Index after Pandas 1.5 Upgrade :pr:`3825`
910
* Reduced the threshold for setting ``use_covariates`` to False for ARIMA models in AutoMLSearch :pr:`3868`

evalml/automl/automl_search.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -353,7 +353,9 @@ class AutoMLSearch:
353353
to `multiclass` or `regression` depending on the problem type. Note that if allowed_pipelines is provided,
354354
this parameter will be ignored.
355355
356-
features (list)[FeatureBase]: List of features to run DFS on AutoML pipelines. Defaults to None. Features will only be computed if the columns used by the feature exist in the search input and if the feature itself is not in search input.
356+
features (list)[FeatureBase]: List of features to run DFS on AutoML pipelines. Defaults to None.
357+
Features will only be computed if the columns used by the feature exist in the search input
358+
and if the feature itself is not in the search input. If features is an empty list, the DFS Transformer will not be included in pipelines.
357359
358360
data_splitter (sklearn.model_selection.BaseCrossValidator): Data splitting method to use. Defaults to StratifiedKFold.
359361

evalml/pipelines/components/transformers/preprocessing/featuretools.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ class DFSTransformer(Transformer):
1313
index (string): The name of the column that contains the indices. If no column with this name exists,
1414
then featuretools.EntitySet() creates a column with this name to serve as the index column. Defaults to 'index'.
1515
random_seed (int): Seed for the random number generator. Defaults to 0.
16-
features (list)[FeatureBase]: List of features to run DFS on. Defaults to None. Features will only be computed if the columns used by the feature exist in the input and if the feature itself is not in input.
16+
features (list)[FeatureBase]: List of features to run DFS on. Defaults to None. Features will only be computed if the columns used by the feature exist in the input and if the feature itself is not in the input. If features is an empty list, the input data is returned untransformed.
1717
"""
1818

1919
name = "DFS Transformer"
@@ -27,7 +27,7 @@ def __init__(self, index="index", features=None, random_seed=0, **kwargs):
2727

2828
self.index = index
2929
self.features = features
30-
self._passed_in_features = True if features else None
30+
self._passed_in_features = True if features is not None else None
3131
# If features are passed in, they'll have a dataframe_name we should utilize.
3232
# Assumes all features were created from the same dataframe, which may not be true
3333
# if the EntitySet used to create them had multiple dataframes.
@@ -166,4 +166,7 @@ def _handle_partial_dependence_fast_mode(self, pipeline_parameters, X, target):
166166
raise ValueError(
167167
"Cannot use fast mode with DFS Transformer when features are unspecified or not all present in X.",
168168
)
169+
# Pass in an empty list of features so that we don't run calculate_feature_matrix,
170+
# which would otherwise happen with the full set of features for a single column at refit.
171+
pipeline_parameters["DFS Transformer"]["features"] = []
169172
return pipeline_parameters

evalml/tests/automl_tests/test_automl.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4831,6 +4831,39 @@ def test_automl_accepts_features(
48314831
)
48324832

48334833

4834+
@pytest.mark.parametrize(
4835+
"automl_algorithm",
4836+
["iterative", "default"],
4837+
)
4838+
def test_automl_with_empty_features_list(
4839+
automl_algorithm,
4840+
X_y_binary,
4841+
AutoMLTestEnv,
4842+
):
4843+
X, y = X_y_binary
4844+
X = pd.DataFrame(X) # Drop ww information since setting column types fails
4845+
X.columns = X.columns.astype(str)
4846+
4847+
automl = AutoMLSearch(
4848+
X_train=X,
4849+
y_train=y,
4850+
problem_type="binary",
4851+
optimize_thresholds=False,
4852+
max_batches=3,
4853+
features=[],
4854+
automl_algorithm=automl_algorithm,
4855+
)
4856+
4857+
assert automl.automl_algorithm.features == []
4858+
env = AutoMLTestEnv("binary")
4859+
with env.test_context(score_return_value={automl.objective.name: 1.0}):
4860+
automl.search()
4861+
4862+
assert all(
4863+
["DFS Transformer" not in p for p in automl.full_rankings["parameters"][1:]],
4864+
)
4865+
4866+
48344867
@pytest.mark.skip_during_conda
48354868
def test_automl_with_iterative_algorithm_puts_ts_estimators_first(
48364869
ts_data,

evalml/tests/component_tests/test_featuretools.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,41 @@ def test_dfs_with_serialized_features_dataframe_name(
241241
assert es_df_name == expected_dataframe_name
242242

243243

244+
@patch("evalml.pipelines.components.transformers.preprocessing.featuretools.dfs")
245+
@patch(
246+
"evalml.pipelines.components.transformers.preprocessing.featuretools.calculate_feature_matrix",
247+
)
248+
def test_dfs_with_empty_input_features(
249+
mock_calculate_feature_matrix,
250+
mock_dfs,
251+
X_y_binary,
252+
):
253+
"""Confirms that the features arg being an empty list is not treated the same as
254+
it being unspecified."""
255+
X, y = X_y_binary
256+
X_pd = pd.DataFrame(X)
257+
X_pd.columns = X_pd.columns.astype(str)
258+
259+
# Check DFS Transformer with empty features list
260+
dfs_empty_features = DFSTransformer(features=[])
261+
dfs_empty_features.fit(X_pd) # no-op
262+
assert not mock_dfs.called
263+
264+
X_t_empty_features = dfs_empty_features.transform(X_pd)
265+
assert not mock_calculate_feature_matrix.called
266+
assert_frame_equal(X_pd, X_t_empty_features)
267+
assert not dfs_empty_features.features
268+
269+
# Check DFS Transformer with features list set to None
270+
dfs_unspecified_features = DFSTransformer(features=None)
271+
dfs_unspecified_features.fit(X_pd)
272+
assert mock_dfs.called
273+
274+
dfs_unspecified_features.transform(X_pd)
275+
assert mock_calculate_feature_matrix.called
276+
assert dfs_unspecified_features.features
277+
278+
244279
@patch("evalml.pipelines.components.transformers.preprocessing.featuretools.dfs")
245280
@patch(
246281
"evalml.pipelines.components.transformers.preprocessing.featuretools.calculate_feature_matrix",

evalml/tests/model_understanding_tests/test_partial_dependence.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import pytest
88
import woodwork as ww
99

10+
from evalml.automl.automl_search import AutoMLSearch
1011
from evalml.exceptions import (
1112
NullsInColumnWarning,
1213
PartialDependenceError,
@@ -2934,3 +2935,69 @@ def test_partial_dependence_dfs_transformer_target_in_features(fast_mode, X_y_bi
29342935

29352936
assert part_dep.feature_values.notnull().all()
29362937
assert part_dep.partial_dependence.notnull().all()
2938+
2939+
2940+
@patch(
2941+
"evalml.pipelines.components.transformers.preprocessing.featuretools.calculate_feature_matrix",
2942+
)
2943+
def test_partial_dependence_dfs_transformer_does_not_calculate_feature_matrix(
2944+
mock_calculate_feature_matrix,
2945+
X_y_binary,
2946+
AutoMLTestEnv,
2947+
):
2948+
"""Tests that the DFS Transformer doesn't ever have to call calculate feature matrix
2949+
in partial dependence fast mode. This is important, because it ensures that we are doing
2950+
the exact same calculations as slow mode."""
2951+
X, y = X_y_binary
2952+
X = pd.DataFrame(X)
2953+
X.columns = X.columns.astype(str)
2954+
2955+
es = ft.EntitySet()
2956+
es = es.add_dataframe(
2957+
dataframe_name="data",
2958+
dataframe=X,
2959+
index="index",
2960+
make_index=True,
2961+
)
2962+
X_fm, features = ft.dfs(
2963+
entityset=es,
2964+
target_dataframe_name="data",
2965+
trans_primitives=["absolute", "add_numeric"],
2966+
)
2967+
env = AutoMLTestEnv("binary")
2968+
automl = AutoMLSearch(
2969+
X_train=X_fm,
2970+
y_train=y,
2971+
problem_type="binary",
2972+
optimize_thresholds=False,
2973+
max_iterations=2,
2974+
features=features,
2975+
automl_algorithm="default",
2976+
)
2977+
2978+
with env.test_context(score_return_value={automl.objective.name: 1.0}):
2979+
automl.search()
2980+
2981+
assert not mock_calculate_feature_matrix.called
2982+
pipeline = automl.get_pipeline(1)
2983+
pipeline.fit(X_fm, y)
2984+
part_dep = partial_dependence(
2985+
pipeline,
2986+
X_fm,
2987+
features=0,
2988+
grid_resolution=2,
2989+
)
2990+
fast_part_dep = partial_dependence(
2991+
pipeline,
2992+
X_fm,
2993+
features=0,
2994+
grid_resolution=2,
2995+
fast_mode=True,
2996+
X_train=X_fm,
2997+
y_train=y,
2998+
)
2999+
assert not mock_calculate_feature_matrix.called
3000+
3001+
assert part_dep.feature_values.notnull().all()
3002+
assert part_dep.partial_dependence.notnull().all()
3003+
pd.testing.assert_frame_equal(part_dep, fast_part_dep)

0 commit comments

Comments
 (0)