Skip to content

Commit 20d5b2c

Browse files
Fix pipeline example erroring out on DFS (#4059)
* Fix example not working with DFS * RL * Add separate test case for generate_pipeline_code * Remove extra print * Add warnings * Fix release notes * Address comments * Fix tests * Fix docs build * Add validation for features path * docstring * Fix test case with another primitive
1 parent 1412fc3 commit 20d5b2c

File tree

5 files changed

+167
-41
lines changed

5 files changed

+167
-41
lines changed

docs/source/release_notes.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ Release Notes
1010
* Pipelines with DFS Transformers will run fast permutation importance if DFS features pre-exist :pr:`4037`
1111
* Add get_prediction_intervals() at the pipeline level :pr:`4052`
1212
* Fixes
13+
* Fixed ``generate_pipeline_example`` erroring out for pipelines with a ``DFSTransformer`` :pr:`4059`
1314
* Remove nullable types handling for ``OverSampler`` :pr:`4064`
1415
* Changes
1516
* Uncapped ``pmdarima`` and updated minimum version :pr:`4027`

evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,6 @@ def transform(
208208
return X, y
209209
original_index = y.index
210210
X, y = self._check_target(X, y)
211-
212211
self._check_oos_past(y)
213212

214213
y_in_sample = pd.Series([])

evalml/pipelines/pipeline_base.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -709,6 +709,8 @@ def repr_component(parameters):
709709
parameters_repr = ", ".join(
710710
[
711711
f"'{component}':{{{repr_component(parameters)}}}"
712+
if component != DFSTransformer.name
713+
else f"'{component}':{{}}"
712714
for component, parameters in self.parameters.items()
713715
],
714716
)

evalml/pipelines/utils.py

Lines changed: 53 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
"""Utility methods for EvalML pipelines."""
22
import copy
33
import os
4+
import warnings
45

56
import black
7+
import featuretools as ft
68
from woodwork import logical_types
79

810
from evalml.data_checks import DataCheckActionCode, DataCheckActionOption
@@ -610,18 +612,20 @@ def make_pipeline(
610612
return pipeline
611613

612614

613-
def generate_pipeline_code(element):
615+
def generate_pipeline_code(element, features_path=None):
614616
"""Creates and returns a string that contains the Python imports and code required for running the EvalML pipeline.
615617
616618
Args:
617619
element (pipeline instance): The instance of the pipeline to generate string Python code.
620+
features_path (str): path to features json created from featuretools.save_features(). Defaults to None.
618621
619622
Returns:
620623
str: String representation of Python code that can be run separately in order to recreate the pipeline instance.
621624
Does not include code for custom component implementation.
622625
623626
Raises:
624627
ValueError: If element is not a pipeline, or if the pipeline is nonlinear.
628+
ValueError: If features in `features_path` do not match the features on the pipeline.
625629
"""
626630
# hold the imports needed and add code to end
627631
code_strings = []
@@ -637,8 +641,44 @@ def generate_pipeline_code(element):
637641
element.__class__.__name__,
638642
),
639643
)
644+
try:
645+
has_dfs = element.component_graph.get_component(DFSTransformer.name)
646+
except ValueError:
647+
has_dfs = False
648+
649+
if has_dfs and not features_path:
650+
warnings.warn(
651+
"This pipeline contains a DFS Transformer but no `features_path` has been specified. Please add a `features_path` or the pipeline code will generate a pipeline that does not run DFS.",
652+
)
653+
654+
has_dfs_and_features = has_dfs and features_path
655+
if has_dfs_and_features:
656+
features = ft.load_features(features_path)
657+
if len(features) != len(element.parameters[DFSTransformer.name]["features"]):
658+
raise ValueError(
659+
"Provided features in `features_path` do not match pipeline features. There is a different amount of features in the loaded features.",
660+
)
661+
662+
for pipeline_feature, serialized_feature in zip(
663+
element.parameters[DFSTransformer.name]["features"],
664+
features,
665+
):
666+
if (
667+
pipeline_feature.get_feature_names()
668+
!= serialized_feature.get_feature_names()
669+
):
670+
raise ValueError(
671+
"Provided features in `features_path` do not match pipeline features.",
672+
)
673+
code_strings.append("from featuretools import load_features")
674+
code_strings.append(f'features=load_features("{features_path}")')
640675
code_strings.append(repr(element))
641676
pipeline_code = "\n".join(code_strings)
677+
if has_dfs_and_features:
678+
pipeline_code = pipeline_code.replace(
679+
"'DFS Transformer':{},",
680+
"'DFS Transformer':{'features':features},",
681+
)
642682
current_dir = os.path.dirname(os.path.abspath(__file__))
643683
evalml_path = os.path.abspath(os.path.join(current_dir, "..", ".."))
644684
black_config = get_evalml_black_config(evalml_path)
@@ -651,6 +691,7 @@ def generate_pipeline_example(
651691
path_to_train,
652692
path_to_holdout,
653693
target,
694+
path_to_features=None,
654695
path_to_mapping="",
655696
output_file_path=None,
656697
):
@@ -661,8 +702,9 @@ def generate_pipeline_example(
661702
path_to_train (str): path to training data.
662703
path_to_holdout (str): path to holdout data.
663704
target (str): target variable.
664-
path_to_mapping (str): path to mapping json
665-
output_file_path (str): path to output python file.
705+
path_to_features (str): path to features json. Defaults to None.
706+
path_to_mapping (str): path to mapping json. Defaults to "".
707+
output_file_path (str): path to output python file. Defaults to None.
666708
667709
Returns:
668710
str: String representation of Python code that can be run separately in order to recreate the pipeline instance.
@@ -671,7 +713,7 @@ def generate_pipeline_example(
671713
"""
672714
output_str = f"""
673715
import evalml
674-
import woodwork
716+
import woodwork as ww
675717
import pandas as pd
676718
677719
PATH_TO_TRAIN = "{path_to_train}"
@@ -682,22 +724,22 @@ def generate_pipeline_example(
682724
# This is the machine learning pipeline you have exported.
683725
# By running this code you will fit the pipeline on the files provided
684726
# and you can then use this pipeline for prediction and model understanding.
685-
{generate_pipeline_code(pipeline)}
727+
{generate_pipeline_code(pipeline, path_to_features)}
686728
687729
print(pipeline.name)
688730
print(pipeline.parameters)
689731
pipeline.describe()
690732
691-
df = pd.read_csv(PATH_TO_TRAIN)
692-
y_train = df[TARGET]
693-
X_train = df.drop(TARGET, axis=1)
733+
df = ww.deserialize.from_disk(PATH_TO_TRAIN)
734+
y_train = df.ww[TARGET]
735+
X_train = df.ww.drop(TARGET)
694736
695737
pipeline.fit(X_train, y_train)
696738
697739
# You can now generate predictions as well as run model understanding.
698-
df = pd.read_csv(PATH_TO_HOLDOUT)
699-
y_holdout = df[TARGET]
700-
X_holdout= df.drop(TARGET, axis=1)
740+
df = ww.deserialize.from_disk(PATH_TO_HOLDOUT)
741+
y_holdout = df.ww[TARGET]
742+
X_holdout= df.ww.drop(TARGET)
701743
"""
702744
if not is_time_series(pipeline.problem_type):
703745
output_str += """

evalml/tests/pipeline_tests/test_pipeline_utils.py

Lines changed: 111 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -732,17 +732,18 @@ def test_generate_code_pipeline(get_black_config):
732732
assert pipeline == expected_code
733733

734734
regression_pipeline_with_params = RegressionPipeline(
735-
["Imputer", "Random Forest Regressor"],
735+
["DFS Transformer", "Imputer", "Random Forest Regressor"],
736736
custom_name="Mock Regression Pipeline",
737737
parameters={
738+
"DFS Transformer": {"features": None},
738739
"Imputer": {"numeric_impute_strategy": "most_frequent"},
739740
"Random Forest Regressor": {"n_estimators": 50},
740741
},
741742
)
742743
expected_code_params = black.format_str(
743744
"from evalml.pipelines.regression_pipeline import RegressionPipeline\n"
744-
"pipeline = RegressionPipeline(component_graph={'Imputer': ['Imputer', 'X', 'y'], 'Random Forest Regressor': ['Random Forest Regressor', 'Imputer.x', 'y']}, "
745-
"parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'most_frequent', 'boolean_impute_strategy': 'most_frequent', 'categorical_fill_value': None, 'numeric_fill_value': None, 'boolean_fill_value': None}, "
745+
"pipeline = RegressionPipeline(component_graph={'DFS Transformer': ['DFS Transformer', 'X', 'y'],'Imputer': ['Imputer', 'DFS Transformer.x', 'y'], 'Random Forest Regressor': ['Random Forest Regressor', 'Imputer.x', 'y']}, "
746+
"parameters={'DFS Transformer':{}, 'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'most_frequent', 'boolean_impute_strategy': 'most_frequent', 'categorical_fill_value': None, 'numeric_fill_value': None, 'boolean_fill_value': None}, "
746747
"'Random Forest Regressor':{'n_estimators': 50, 'max_depth': 6, 'n_jobs': -1}}, custom_name='Mock Regression Pipeline', random_seed=0)",
747748
mode=black.Mode(**get_black_config),
748749
)
@@ -856,59 +857,140 @@ def test_generate_pipeline_example(
856857
X_y_regression,
857858
ts_data,
858859
):
859-
path = os.path.join(str(tmpdir), "train.csv")
860860
if automl_type == ProblemTypes.BINARY:
861861
X, y = X_y_binary
862862
elif automl_type == ProblemTypes.MULTICLASS:
863863
X, y = X_y_multi
864864
elif automl_type == ProblemTypes.REGRESSION:
865865
X, y = X_y_regression
866-
elif (
867-
automl_type == ProblemTypes.TIME_SERIES_MULTICLASS
868-
or automl_type == ProblemTypes.TIME_SERIES_BINARY
869-
):
870-
X, _, y = ts_data(problem_type=automl_type)
871-
else:
866+
elif is_time_series(automl_type):
872867
X, _, y = ts_data(problem_type=automl_type)
873868

874-
from evalml import AutoMLSearch
875-
876-
aml = AutoMLSearch(
877-
X_train=X,
878-
y_train=y,
879-
problem_type=automl_type,
880-
optimize_thresholds=False,
881-
max_time=1,
882-
max_iterations=5,
883-
problem_configuration={
869+
problem_configuration = (
870+
{
884871
"time_index": "date",
885872
"gap": 1,
886873
"max_delay": 1,
887874
"forecast_horizon": 3,
888875
}
889876
if is_time_series(automl_type)
890-
else None,
877+
else None
878+
)
879+
880+
import featuretools as ft
881+
882+
from evalml import AutoMLSearch
883+
from evalml.preprocessing import split_data
884+
885+
X_train, X_test, y_train, y_test = split_data(
886+
X,
887+
y,
888+
problem_type=automl_type,
889+
test_size=0.2,
890+
)
891+
892+
X_train = pd.DataFrame(X_train)
893+
X_train.columns = X_train.columns.astype(str)
894+
es = ft.EntitySet()
895+
es = es.add_dataframe(
896+
dataframe_name="X",
897+
dataframe=X_train,
898+
index="index",
899+
make_index=False,
900+
)
901+
X_train_t, features = ft.dfs(
902+
entityset=es,
903+
target_dataframe_name="X",
904+
trans_primitives=["absolute"],
905+
return_types="all",
906+
)
907+
features_path = os.path.join(str(tmpdir), "features.json")
908+
ft.save_features(features, features_path)
909+
910+
aml = AutoMLSearch(
911+
X_train=X_train_t,
912+
y_train=y_train,
913+
problem_type=automl_type,
914+
optimize_thresholds=False,
915+
max_iterations=5,
916+
problem_configuration=problem_configuration,
917+
features=features,
891918
)
892919
env = AutoMLTestEnv(automl_type)
893920
with env.test_context(score_return_value={aml.objective.name: 1.0}):
894921
aml.search()
895-
pipeline = aml.best_pipeline
896922

897-
X["target"] = y
898-
X.to_csv(path)
923+
pipeline = aml.get_pipeline(2)
924+
925+
y_train.index = X_train_t.index
926+
y_test.index = X_test.index
927+
X_train_t.ww["target"] = y_train
928+
X_test.ww["target"] = y_test.reindex(X_test.index)
929+
930+
train_path = os.path.join(str(tmpdir), "train")
931+
holdout_path = os.path.join(str(tmpdir), "holdout")
932+
933+
X_train_t.ww.to_disk(train_path)
934+
X_test.ww.to_disk(holdout_path)
899935
output_path = os.path.join(str(tmpdir), "example.py")
936+
937+
# extra features provided to example
938+
with pytest.raises(
939+
ValueError,
940+
match="Provided features in `features_path` do not match pipeline features. There is a different amount of features in the loaded features.",
941+
):
942+
_, false_features = ft.dfs(
943+
entityset=es,
944+
target_dataframe_name="X",
945+
trans_primitives=["absolute", "is_null"],
946+
return_types="all",
947+
)
948+
false_features_path = os.path.join(str(tmpdir), "false_features.json")
949+
ft.save_features(false_features, false_features_path)
950+
_ = generate_pipeline_example(
951+
pipeline=pipeline,
952+
path_to_train=train_path,
953+
path_to_holdout=holdout_path,
954+
path_to_features=false_features_path,
955+
target="target",
956+
output_file_path=output_path,
957+
)
958+
959+
# different features provided to example
960+
with pytest.raises(
961+
ValueError,
962+
match="Provided features in `features_path` do not match pipeline features.",
963+
):
964+
_, false_features = ft.dfs(
965+
entityset=es,
966+
target_dataframe_name="X",
967+
trans_primitives=["sine"],
968+
return_types="all",
969+
)
970+
false_features_path = os.path.join(str(tmpdir), "false_features.json")
971+
ft.save_features(false_features, false_features_path)
972+
_ = generate_pipeline_example(
973+
pipeline=pipeline,
974+
path_to_train=train_path,
975+
path_to_holdout=holdout_path,
976+
path_to_features=false_features_path,
977+
target="target",
978+
output_file_path=output_path,
979+
)
980+
900981
pipeline_example = generate_pipeline_example(
901982
pipeline=pipeline,
902-
path_to_train=path,
903-
path_to_holdout=path,
983+
path_to_train=train_path,
984+
path_to_holdout=holdout_path,
985+
path_to_features=features_path,
904986
target="target",
905987
output_file_path=output_path,
906988
)
907-
assert f'PATH_TO_TRAIN = "{path}"' in pipeline_example
908-
assert f'PATH_TO_HOLDOUT = "{path}"' in pipeline_example
989+
assert f'PATH_TO_TRAIN = "{train_path}"' in pipeline_example
990+
assert f'PATH_TO_HOLDOUT = "{holdout_path}"' in pipeline_example
909991
assert 'TARGET = "target"' in pipeline_example
910992
assert 'column_mapping = ""' in pipeline_example
911-
assert generate_pipeline_code(pipeline) in pipeline_example
993+
assert generate_pipeline_code(pipeline, features_path) in pipeline_example
912994

913995
if is_time_series(automl_type):
914996
assert "predict(X_test, X_train=X_train, y_train=y_train)" in pipeline_example

0 commit comments

Comments
 (0)