Fix pipeline example erroring out on DFS (#4059)

jeremyliweishih · web-flow · commit 20d5b2cbde16 · 2023-03-14T23:19:25.000Z
* Fix example not working with DFS

* RL

* Add separate test case for generate_pipeline_code

* Remove extra print

* Add warnings

* Fix release notes

* Address comments

* Fix tests

* Fix docs build

* Add validation for features path

* docstring

* Fix test case with another primitive
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -10,6 +10,7 @@ Release Notes
         * Pipelines with DFS Transformers will run fast permutation importance if DFS features pre-exist :pr:`4037`
         * Add get_prediction_intervals() at the pipeline level :pr:`4052`
     * Fixes
+        * Fixed ``generate_pipeline_example`` erroring out for pipelines with a ``DFSTransformer`` :pr:`4059`
         * Remove nullable types handling for ``OverSampler`` :pr:`4064`
     * Changes
         * Uncapped ``pmdarima`` and updated minimum version :pr:`4027`
diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py
@@ -208,7 +208,6 @@ def transform(
             return X, y
         original_index = y.index
         X, y = self._check_target(X, y)
-
         self._check_oos_past(y)
 
         y_in_sample = pd.Series([])
diff --git a/evalml/pipelines/pipeline_base.py b/evalml/pipelines/pipeline_base.py
@@ -709,6 +709,8 @@ def repr_component(parameters):
         parameters_repr = ", ".join(
             [
                 f"'{component}':{{{repr_component(parameters)}}}"
+                if component != DFSTransformer.name
+                else f"'{component}':{{}}"
                 for component, parameters in self.parameters.items()
             ],
         )
diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py
@@ -1,8 +1,10 @@
 """Utility methods for EvalML pipelines."""
 import copy
 import os
+import warnings
 
 import black
+import featuretools as ft
 from woodwork import logical_types
 
 from evalml.data_checks import DataCheckActionCode, DataCheckActionOption
@@ -610,18 +612,20 @@ def make_pipeline(
     return pipeline
 
 
-def generate_pipeline_code(element):
+def generate_pipeline_code(element, features_path=None):
     """Creates and returns a string that contains the Python imports and code required for running the EvalML pipeline.
 
     Args:
         element (pipeline instance): The instance of the pipeline to generate string Python code.
+        features_path (str): path to features json created from featuretools.save_features(). Defaults to None.
 
     Returns:
         str: String representation of Python code that can be run separately in order to recreate the pipeline instance.
         Does not include code for custom component implementation.
 
     Raises:
         ValueError: If element is not a pipeline, or if the pipeline is nonlinear.
+        ValueError: If features in `features_path` do not match the features on the pipeline.
     """
     # hold the imports needed and add code to end
     code_strings = []
@@ -637,8 +641,44 @@ def generate_pipeline_code(element):
             element.__class__.__name__,
         ),
     )
+    try:
+        has_dfs = element.component_graph.get_component(DFSTransformer.name)
+    except ValueError:
+        has_dfs = False
+
+    if has_dfs and not features_path:
+        warnings.warn(
+            "This pipeline contains a DFS Transformer but no `features_path` has been specified. Please add a `features_path` or the pipeline code will generate a pipeline that does not run DFS.",
+        )
+
+    has_dfs_and_features = has_dfs and features_path
+    if has_dfs_and_features:
+        features = ft.load_features(features_path)
+        if len(features) != len(element.parameters[DFSTransformer.name]["features"]):
+            raise ValueError(
+                "Provided features in `features_path` do not match pipeline features. There is a different amount of features in the loaded features.",
+            )
+
+        for pipeline_feature, serialized_feature in zip(
+            element.parameters[DFSTransformer.name]["features"],
+            features,
+        ):
+            if (
+                pipeline_feature.get_feature_names()
+                != serialized_feature.get_feature_names()
+            ):
+                raise ValueError(
+                    "Provided features in `features_path` do not match pipeline features.",
+                )
+        code_strings.append("from featuretools import load_features")
+        code_strings.append(f'features=load_features("{features_path}")')
     code_strings.append(repr(element))
     pipeline_code = "\n".join(code_strings)
+    if has_dfs_and_features:
+        pipeline_code = pipeline_code.replace(
+            "'DFS Transformer':{},",
+            "'DFS Transformer':{'features':features},",
+        )
     current_dir = os.path.dirname(os.path.abspath(__file__))
     evalml_path = os.path.abspath(os.path.join(current_dir, "..", ".."))
     black_config = get_evalml_black_config(evalml_path)
@@ -651,6 +691,7 @@ def generate_pipeline_example(
     path_to_train,
     path_to_holdout,
     target,
+    path_to_features=None,
     path_to_mapping="",
     output_file_path=None,
 ):
@@ -661,8 +702,9 @@ def generate_pipeline_example(
         path_to_train (str): path to training data.
         path_to_holdout (str): path to holdout data.
         target (str): target variable.
-        path_to_mapping (str): path to mapping json
-        output_file_path (str): path to output python file.
+        path_to_features (str): path to features json. Defaults to None.
+        path_to_mapping (str): path to mapping json. Defaults to "".
+        output_file_path (str): path to output python file. Defaults to None.
 
     Returns:
         str: String representation of Python code that can be run separately in order to recreate the pipeline instance.
@@ -671,7 +713,7 @@ def generate_pipeline_example(
     """
     output_str = f"""
 import evalml
-import woodwork
+import woodwork as ww
 import pandas as pd
 
 PATH_TO_TRAIN = "{path_to_train}"
@@ -682,22 +724,22 @@ def generate_pipeline_example(
 # This is the machine learning pipeline you have exported.
 # By running this code you will fit the pipeline on the files provided
 # and you can then use this pipeline for prediction and model understanding.
-{generate_pipeline_code(pipeline)}
+{generate_pipeline_code(pipeline, path_to_features)}
 
 print(pipeline.name)
 print(pipeline.parameters)
 pipeline.describe()
 
-df = pd.read_csv(PATH_TO_TRAIN)
-y_train = df[TARGET]
-X_train = df.drop(TARGET, axis=1)
+df = ww.deserialize.from_disk(PATH_TO_TRAIN)
+y_train = df.ww[TARGET]
+X_train = df.ww.drop(TARGET)
 
 pipeline.fit(X_train, y_train)
 
 # You can now generate predictions as well as run model understanding.
-df = pd.read_csv(PATH_TO_HOLDOUT)
-y_holdout = df[TARGET]
-X_holdout= df.drop(TARGET, axis=1)
+df = ww.deserialize.from_disk(PATH_TO_HOLDOUT)
+y_holdout = df.ww[TARGET]
+X_holdout= df.ww.drop(TARGET)
 """
     if not is_time_series(pipeline.problem_type):
         output_str += """
diff --git a/evalml/tests/pipeline_tests/test_pipeline_utils.py b/evalml/tests/pipeline_tests/test_pipeline_utils.py
@@ -732,17 +732,18 @@ def test_generate_code_pipeline(get_black_config):
     assert pipeline == expected_code
 
     regression_pipeline_with_params = RegressionPipeline(
-        ["Imputer", "Random Forest Regressor"],
+        ["DFS Transformer", "Imputer", "Random Forest Regressor"],
         custom_name="Mock Regression Pipeline",
         parameters={
+            "DFS Transformer": {"features": None},
             "Imputer": {"numeric_impute_strategy": "most_frequent"},
             "Random Forest Regressor": {"n_estimators": 50},
         },
     )
     expected_code_params = black.format_str(
         "from evalml.pipelines.regression_pipeline import RegressionPipeline\n"
-        "pipeline = RegressionPipeline(component_graph={'Imputer': ['Imputer', 'X', 'y'], 'Random Forest Regressor': ['Random Forest Regressor', 'Imputer.x', 'y']}, "
-        "parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'most_frequent', 'boolean_impute_strategy': 'most_frequent', 'categorical_fill_value': None, 'numeric_fill_value': None, 'boolean_fill_value': None}, "
+        "pipeline = RegressionPipeline(component_graph={'DFS Transformer': ['DFS Transformer', 'X', 'y'],'Imputer': ['Imputer', 'DFS Transformer.x', 'y'], 'Random Forest Regressor': ['Random Forest Regressor', 'Imputer.x', 'y']}, "
+        "parameters={'DFS Transformer':{}, 'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'most_frequent', 'boolean_impute_strategy': 'most_frequent', 'categorical_fill_value': None, 'numeric_fill_value': None, 'boolean_fill_value': None}, "
         "'Random Forest Regressor':{'n_estimators': 50, 'max_depth': 6, 'n_jobs': -1}}, custom_name='Mock Regression Pipeline', random_seed=0)",
         mode=black.Mode(**get_black_config),
     )
@@ -856,59 +857,140 @@ def test_generate_pipeline_example(
     X_y_regression,
     ts_data,
 ):
-    path = os.path.join(str(tmpdir), "train.csv")
     if automl_type == ProblemTypes.BINARY:
         X, y = X_y_binary
     elif automl_type == ProblemTypes.MULTICLASS:
         X, y = X_y_multi
     elif automl_type == ProblemTypes.REGRESSION:
         X, y = X_y_regression
-    elif (
-        automl_type == ProblemTypes.TIME_SERIES_MULTICLASS
-        or automl_type == ProblemTypes.TIME_SERIES_BINARY
-    ):
-        X, _, y = ts_data(problem_type=automl_type)
-    else:
+    elif is_time_series(automl_type):
         X, _, y = ts_data(problem_type=automl_type)
 
-    from evalml import AutoMLSearch
-
-    aml = AutoMLSearch(
-        X_train=X,
-        y_train=y,
-        problem_type=automl_type,
-        optimize_thresholds=False,
-        max_time=1,
-        max_iterations=5,
-        problem_configuration={
+    problem_configuration = (
+        {
             "time_index": "date",
             "gap": 1,
             "max_delay": 1,
             "forecast_horizon": 3,
         }
         if is_time_series(automl_type)
-        else None,
+        else None
+    )
+
+    import featuretools as ft
+
+    from evalml import AutoMLSearch
+    from evalml.preprocessing import split_data
+
+    X_train, X_test, y_train, y_test = split_data(
+        X,
+        y,
+        problem_type=automl_type,
+        test_size=0.2,
+    )
+
+    X_train = pd.DataFrame(X_train)
+    X_train.columns = X_train.columns.astype(str)
+    es = ft.EntitySet()
+    es = es.add_dataframe(
+        dataframe_name="X",
+        dataframe=X_train,
+        index="index",
+        make_index=False,
+    )
+    X_train_t, features = ft.dfs(
+        entityset=es,
+        target_dataframe_name="X",
+        trans_primitives=["absolute"],
+        return_types="all",
+    )
+    features_path = os.path.join(str(tmpdir), "features.json")
+    ft.save_features(features, features_path)
+
+    aml = AutoMLSearch(
+        X_train=X_train_t,
+        y_train=y_train,
+        problem_type=automl_type,
+        optimize_thresholds=False,
+        max_iterations=5,
+        problem_configuration=problem_configuration,
+        features=features,
     )
     env = AutoMLTestEnv(automl_type)
     with env.test_context(score_return_value={aml.objective.name: 1.0}):
         aml.search()
-    pipeline = aml.best_pipeline
 
-    X["target"] = y
-    X.to_csv(path)
+    pipeline = aml.get_pipeline(2)
+
+    y_train.index = X_train_t.index
+    y_test.index = X_test.index
+    X_train_t.ww["target"] = y_train
+    X_test.ww["target"] = y_test.reindex(X_test.index)
+
+    train_path = os.path.join(str(tmpdir), "train")
+    holdout_path = os.path.join(str(tmpdir), "holdout")
+
+    X_train_t.ww.to_disk(train_path)
+    X_test.ww.to_disk(holdout_path)
     output_path = os.path.join(str(tmpdir), "example.py")
+
+    # extra features provided to example
+    with pytest.raises(
+        ValueError,
+        match="Provided features in `features_path` do not match pipeline features. There is a different amount of features in the loaded features.",
+    ):
+        _, false_features = ft.dfs(
+            entityset=es,
+            target_dataframe_name="X",
+            trans_primitives=["absolute", "is_null"],
+            return_types="all",
+        )
+        false_features_path = os.path.join(str(tmpdir), "false_features.json")
+        ft.save_features(false_features, false_features_path)
+        _ = generate_pipeline_example(
+            pipeline=pipeline,
+            path_to_train=train_path,
+            path_to_holdout=holdout_path,
+            path_to_features=false_features_path,
+            target="target",
+            output_file_path=output_path,
+        )
+
+    # different features provided to example
+    with pytest.raises(
+        ValueError,
+        match="Provided features in `features_path` do not match pipeline features.",
+    ):
+        _, false_features = ft.dfs(
+            entityset=es,
+            target_dataframe_name="X",
+            trans_primitives=["sine"],
+            return_types="all",
+        )
+        false_features_path = os.path.join(str(tmpdir), "false_features.json")
+        ft.save_features(false_features, false_features_path)
+        _ = generate_pipeline_example(
+            pipeline=pipeline,
+            path_to_train=train_path,
+            path_to_holdout=holdout_path,
+            path_to_features=false_features_path,
+            target="target",
+            output_file_path=output_path,
+        )
+
     pipeline_example = generate_pipeline_example(
         pipeline=pipeline,
-        path_to_train=path,
-        path_to_holdout=path,
+        path_to_train=train_path,
+        path_to_holdout=holdout_path,
+        path_to_features=features_path,
         target="target",
         output_file_path=output_path,
     )
-    assert f'PATH_TO_TRAIN = "{path}"' in pipeline_example
-    assert f'PATH_TO_HOLDOUT = "{path}"' in pipeline_example
+    assert f'PATH_TO_TRAIN = "{train_path}"' in pipeline_example
+    assert f'PATH_TO_HOLDOUT = "{holdout_path}"' in pipeline_example
     assert 'TARGET = "target"' in pipeline_example
     assert 'column_mapping = ""' in pipeline_example
-    assert generate_pipeline_code(pipeline) in pipeline_example
+    assert generate_pipeline_code(pipeline, features_path) in pipeline_example
 
     if is_time_series(automl_type):
         assert "predict(X_test, X_train=X_train, y_train=y_train)" in pipeline_example

Original file line number	Diff line number	Diff line change
`@@ -709,6 +709,8 @@ def repr_component(parameters):`
`709`	`709`	`parameters_repr = ", ".join(`
`710`	`710`	`[`
`711`	`711`	`f"'{component}':{{{repr_component(parameters)}}}"`
	`712`	`+ if component != DFSTransformer.name`
	`713`	`+ else f"'{component}':{{}}"`
`712`	`714`	`for component, parameters in self.parameters.items()`
`713`	`715`	`],`
`714`	`716`	`)`