@@ -732,17 +732,18 @@ def test_generate_code_pipeline(get_black_config):
732732 assert pipeline == expected_code
733733
734734 regression_pipeline_with_params = RegressionPipeline (
735- ["Imputer" , "Random Forest Regressor" ],
735+ ["DFS Transformer" , " Imputer" , "Random Forest Regressor" ],
736736 custom_name = "Mock Regression Pipeline" ,
737737 parameters = {
738+ "DFS Transformer" : {"features" : None },
738739 "Imputer" : {"numeric_impute_strategy" : "most_frequent" },
739740 "Random Forest Regressor" : {"n_estimators" : 50 },
740741 },
741742 )
742743 expected_code_params = black .format_str (
743744 "from evalml.pipelines.regression_pipeline import RegressionPipeline\n "
744- "pipeline = RegressionPipeline(component_graph={'Imputer': ['Imputer', 'X ', 'y'], 'Random Forest Regressor': ['Random Forest Regressor', 'Imputer.x', 'y']}, "
745- "parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'most_frequent', 'boolean_impute_strategy': 'most_frequent', 'categorical_fill_value': None, 'numeric_fill_value': None, 'boolean_fill_value': None}, "
745+ "pipeline = RegressionPipeline(component_graph={'DFS Transformer': ['DFS Transformer', 'X', 'y'],' Imputer': ['Imputer', 'DFS Transformer.x ', 'y'], 'Random Forest Regressor': ['Random Forest Regressor', 'Imputer.x', 'y']}, "
746+ "parameters={'DFS Transformer':{}, ' Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'most_frequent', 'boolean_impute_strategy': 'most_frequent', 'categorical_fill_value': None, 'numeric_fill_value': None, 'boolean_fill_value': None}, "
746747 "'Random Forest Regressor':{'n_estimators': 50, 'max_depth': 6, 'n_jobs': -1}}, custom_name='Mock Regression Pipeline', random_seed=0)" ,
747748 mode = black .Mode (** get_black_config ),
748749 )
@@ -856,59 +857,140 @@ def test_generate_pipeline_example(
856857 X_y_regression ,
857858 ts_data ,
858859):
859- path = os .path .join (str (tmpdir ), "train.csv" )
860860 if automl_type == ProblemTypes .BINARY :
861861 X , y = X_y_binary
862862 elif automl_type == ProblemTypes .MULTICLASS :
863863 X , y = X_y_multi
864864 elif automl_type == ProblemTypes .REGRESSION :
865865 X , y = X_y_regression
866- elif (
867- automl_type == ProblemTypes .TIME_SERIES_MULTICLASS
868- or automl_type == ProblemTypes .TIME_SERIES_BINARY
869- ):
870- X , _ , y = ts_data (problem_type = automl_type )
871- else :
866+ elif is_time_series (automl_type ):
872867 X , _ , y = ts_data (problem_type = automl_type )
873868
874- from evalml import AutoMLSearch
875-
876- aml = AutoMLSearch (
877- X_train = X ,
878- y_train = y ,
879- problem_type = automl_type ,
880- optimize_thresholds = False ,
881- max_time = 1 ,
882- max_iterations = 5 ,
883- problem_configuration = {
869+ problem_configuration = (
870+ {
884871 "time_index" : "date" ,
885872 "gap" : 1 ,
886873 "max_delay" : 1 ,
887874 "forecast_horizon" : 3 ,
888875 }
889876 if is_time_series (automl_type )
890- else None ,
877+ else None
878+ )
879+
880+ import featuretools as ft
881+
882+ from evalml import AutoMLSearch
883+ from evalml .preprocessing import split_data
884+
885+ X_train , X_test , y_train , y_test = split_data (
886+ X ,
887+ y ,
888+ problem_type = automl_type ,
889+ test_size = 0.2 ,
890+ )
891+
892+ X_train = pd .DataFrame (X_train )
893+ X_train .columns = X_train .columns .astype (str )
894+ es = ft .EntitySet ()
895+ es = es .add_dataframe (
896+ dataframe_name = "X" ,
897+ dataframe = X_train ,
898+ index = "index" ,
899+ make_index = False ,
900+ )
901+ X_train_t , features = ft .dfs (
902+ entityset = es ,
903+ target_dataframe_name = "X" ,
904+ trans_primitives = ["absolute" ],
905+ return_types = "all" ,
906+ )
907+ features_path = os .path .join (str (tmpdir ), "features.json" )
908+ ft .save_features (features , features_path )
909+
910+ aml = AutoMLSearch (
911+ X_train = X_train_t ,
912+ y_train = y_train ,
913+ problem_type = automl_type ,
914+ optimize_thresholds = False ,
915+ max_iterations = 5 ,
916+ problem_configuration = problem_configuration ,
917+ features = features ,
891918 )
892919 env = AutoMLTestEnv (automl_type )
893920 with env .test_context (score_return_value = {aml .objective .name : 1.0 }):
894921 aml .search ()
895- pipeline = aml .best_pipeline
896922
897- X ["target" ] = y
898- X .to_csv (path )
923+ pipeline = aml .get_pipeline (2 )
924+
925+ y_train .index = X_train_t .index
926+ y_test .index = X_test .index
927+ X_train_t .ww ["target" ] = y_train
928+ X_test .ww ["target" ] = y_test .reindex (X_test .index )
929+
930+ train_path = os .path .join (str (tmpdir ), "train" )
931+ holdout_path = os .path .join (str (tmpdir ), "holdout" )
932+
933+ X_train_t .ww .to_disk (train_path )
934+ X_test .ww .to_disk (holdout_path )
899935 output_path = os .path .join (str (tmpdir ), "example.py" )
936+
937+ # extra features provided to example
938+ with pytest .raises (
939+ ValueError ,
940+ match = "Provided features in `features_path` do not match pipeline features. There is a different amount of features in the loaded features." ,
941+ ):
942+ _ , false_features = ft .dfs (
943+ entityset = es ,
944+ target_dataframe_name = "X" ,
945+ trans_primitives = ["absolute" , "is_null" ],
946+ return_types = "all" ,
947+ )
948+ false_features_path = os .path .join (str (tmpdir ), "false_features.json" )
949+ ft .save_features (false_features , false_features_path )
950+ _ = generate_pipeline_example (
951+ pipeline = pipeline ,
952+ path_to_train = train_path ,
953+ path_to_holdout = holdout_path ,
954+ path_to_features = false_features_path ,
955+ target = "target" ,
956+ output_file_path = output_path ,
957+ )
958+
959+ # different features provided to example
960+ with pytest .raises (
961+ ValueError ,
962+ match = "Provided features in `features_path` do not match pipeline features." ,
963+ ):
964+ _ , false_features = ft .dfs (
965+ entityset = es ,
966+ target_dataframe_name = "X" ,
967+ trans_primitives = ["sine" ],
968+ return_types = "all" ,
969+ )
970+ false_features_path = os .path .join (str (tmpdir ), "false_features.json" )
971+ ft .save_features (false_features , false_features_path )
972+ _ = generate_pipeline_example (
973+ pipeline = pipeline ,
974+ path_to_train = train_path ,
975+ path_to_holdout = holdout_path ,
976+ path_to_features = false_features_path ,
977+ target = "target" ,
978+ output_file_path = output_path ,
979+ )
980+
900981 pipeline_example = generate_pipeline_example (
901982 pipeline = pipeline ,
902- path_to_train = path ,
903- path_to_holdout = path ,
983+ path_to_train = train_path ,
984+ path_to_holdout = holdout_path ,
985+ path_to_features = features_path ,
904986 target = "target" ,
905987 output_file_path = output_path ,
906988 )
907- assert f'PATH_TO_TRAIN = "{ path } "' in pipeline_example
908- assert f'PATH_TO_HOLDOUT = "{ path } "' in pipeline_example
989+ assert f'PATH_TO_TRAIN = "{ train_path } "' in pipeline_example
990+ assert f'PATH_TO_HOLDOUT = "{ holdout_path } "' in pipeline_example
909991 assert 'TARGET = "target"' in pipeline_example
910992 assert 'column_mapping = ""' in pipeline_example
911- assert generate_pipeline_code (pipeline ) in pipeline_example
993+ assert generate_pipeline_code (pipeline , features_path ) in pipeline_example
912994
913995 if is_time_series (automl_type ):
914996 assert "predict(X_test, X_train=X_train, y_train=y_train)" in pipeline_example
0 commit comments