Removed first batch from default algorithm (#4215)

christopherbunn · web-flow · commit aa172958db55 · 2023-07-11T18:59:48.000Z
* Initial commit

* Fixed failing tests

* Updated test case

* Updated release notes

* Reduced processing time

* Test updates
diff --git a/docs/source/demos/lead_scoring.ipynb b/docs/source/demos/lead_scoring.ipynb
@@ -171,7 +171,7 @@
     "    objective=lead_scoring_objective,\n",
     "    additional_objectives=[\"auc\"],\n",
     "    allowed_model_families=[\"extra_trees\", \"linear_model\"],\n",
-    "    max_batches=3,\n",
+    "    max_batches=2,\n",
     "    verbose=True,\n",
     ")\n",
     "\n",
@@ -273,7 +273,7 @@
     "    objective=\"auc\",\n",
     "    additional_objectives=[lead_scoring_objective],\n",
     "    allowed_model_families=[\"extra_trees\", \"linear_model\"],\n",
-    "    max_batches=3,\n",
+    "    max_batches=2,\n",
     "    verbose=True,\n",
     ")\n",
     "\n",
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -28,6 +28,7 @@ Release Notes
 
     **Breaking Changes**
         * Removed Decision Tree and CatBoost Estimators from AutoML search :pr:`4205`
+        * Removed first batch from default algorithm :pr:`4215`
 
 
 **v0.77.0 Jun. 07, 2023**
diff --git a/docs/source/start.ipynb b/docs/source/start.ipynb
@@ -127,7 +127,7 @@
     "    y_train=y_train,\n",
     "    problem_type=\"binary\",\n",
     "    objective=\"f1\",\n",
-    "    max_batches=3,\n",
+    "    max_batches=2,\n",
     "    verbose=True,\n",
     ")"
    ]
@@ -170,7 +170,7 @@
     "    y_train=y_train,\n",
     "    problem_type=\"binary\",\n",
     "    objective=\"f1\",\n",
-    "    max_batches=3,\n",
+    "    max_batches=2,\n",
     "    verbose=False,\n",
     ")\n",
     "automl.search()"
diff --git a/evalml/automl/automl_algorithm/default_algorithm.py b/evalml/automl/automl_algorithm/default_algorithm.py
@@ -169,11 +169,9 @@ def __init__(
     def default_max_batches(self):
         """Returns the number of max batches AutoMLSearch should run by default."""
         if self.ensembling:
-            return 4
-        elif is_time_series(self.problem_type):
-            return 2  # we do not run feature selection for time series
-        else:
             return 3
+        else:
+            return 2
 
     def num_pipelines_per_batch(self, batch_number):
         """Return the number of pipelines in the nth batch.
@@ -184,17 +182,17 @@ def num_pipelines_per_batch(self, batch_number):
         Returns:
             int: number of pipelines in the given batch.
         """
-        if batch_number == 0 or batch_number == 1:
+        if batch_number == 0:
             return len(self._naive_estimators())
-        elif batch_number == 2:
+        elif batch_number == 1:
             return len(self._non_naive_estimators())
         if self.ensembling:
-            if batch_number % 2 != 0:
+            if batch_number % 2 == 0:
                 return 1
-            elif batch_number == 4:
+            elif batch_number == 3:
                 return self.num_long_explore_pipelines * self.top_n
         else:
-            if batch_number == 3:
+            if batch_number == 2:
                 return self.num_long_explore_pipelines * self.top_n
         return self.num_long_pipelines_per_batch * self.top_n
 
@@ -452,20 +450,18 @@ def next_batch(self):
         """
         if self.ensembling:
             if self._batch_number == 0:
-                next_batch = self._create_naive_pipelines()
-            elif self._batch_number == 1:
                 next_batch = self._create_naive_pipelines(
                     use_features=self.run_feature_selection,
                 )
-            elif self._batch_number == 2:
+            elif self._batch_number == 1:
                 next_batch = self._create_fast_final()
-            elif self.batch_number == 3:
+            elif self.batch_number == 2:
                 next_batch = self._create_ensemble(
                     self._pipeline_parameters.get("Label Encoder", {}),
                 )
-            elif self.batch_number == 4:
+            elif self.batch_number == 3:
                 next_batch = self._create_long_exploration(n=self.top_n)
-            elif self.batch_number % 2 != 0:
+            elif self.batch_number % 2 == 0:
                 next_batch = self._create_ensemble(
                     self._pipeline_parameters.get("Label Encoder", {}),
                 )
@@ -489,14 +485,12 @@ def next_batch(self):
                 )
         else:
             if self._batch_number == 0:
-                next_batch = self._create_naive_pipelines()
-            elif self._batch_number == 1:
                 next_batch = self._create_naive_pipelines(
                     use_features=self.run_feature_selection,
                 )
-            elif self._batch_number == 2:
+            elif self._batch_number == 1:
                 next_batch = self._create_fast_final()
-            elif self.batch_number == 3:
+            elif self.batch_number == 2:
                 next_batch = self._create_long_exploration(n=self.top_n)
             else:
                 next_batch = self._create_n_pipelines(
@@ -583,15 +577,15 @@ def add_result(
         """
         cached_data = cached_data or {}
         if pipeline.model_family != ModelFamily.ENSEMBLE:
-            if self.batch_number >= 3:
+            if self.batch_number >= 2:
                 super().add_result(
                     score_to_minimize,
                     pipeline,
                     trained_pipeline_results,
                 )
 
         if (
-            self.batch_number == 2
+            self.batch_number == 1
             and self._selected_cols is None
             and not is_time_series(self.problem_type)
         ):
diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py
@@ -180,7 +180,7 @@ def search(
 
     max_batches = None
     if mode == "fast":
-        max_batches = 4  # corresponds to end of 'fast' mode
+        max_batches = 3  # corresponds to end of 'fast' mode
     elif mode == "long" and max_time:
         max_batches = 999  # defers to stopping criterion
     elif mode == "long" and max_time is None:
diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py
@@ -266,8 +266,7 @@ def test_search_batch_times(caplog, X_y_binary, AutoMLTestEnv):
 
     assert len(batch_times) == 3
     assert len(batch_times[1]) == 2
-    assert len(batch_times[2]) == 2
-    assert len(batch_times[3]) == 6
+    assert len(batch_times[2]) == 6
 
     assert "Batch Time Stats" in out
     assert "Batch 1 time stats" in out
@@ -363,11 +362,11 @@ def test_pipeline_limits(
         automl.search()
     out = caplog.text
     if verbose:
-        assert "Using default limit of max_batches=3." in out
-        assert "Searching up to 3 batches for a total of" in out
+        assert "Using default limit of max_batches=2." in out
+        assert "Searching up to 2 batches for a total of" in out
     else:
-        assert "Using default limit of max_batches=3." not in out
-        assert "Searching up to 3 batches for a total of" not in out
+        assert "Using default limit of max_batches=2." not in out
+        assert "Searching up to 2 batches for a total of" not in out
     assert len(automl.results["pipeline_results"]) > 0
 
     caplog.clear()
@@ -1806,6 +1805,7 @@ def test_pipelines_in_batch_return_nan(
         y_train=y,
         problem_type="binary",
         max_batches=3,
+        automl_algorithm="iterative",
         allowed_component_graphs={"Name": [dummy_classifier_estimator_class]},
         n_jobs=1,
     )
@@ -1819,7 +1819,10 @@ def test_pipelines_in_batch_return_nan(
             automl.search()
     assert len(automl.errors) > 0
     for pipeline_name, pipeline_error in automl.errors.items():
-        assert "Label Encoder" in pipeline_error["Parameters"]
+        assert (
+            "Label Encoder" in pipeline_error["Parameters"]
+            or "Mock Classifier" in pipeline_error["Parameters"]
+        )
         assert isinstance(pipeline_error["Exception"], TypeError)
         assert "line" in pipeline_error["Traceback"]
 
@@ -1858,7 +1861,8 @@ def test_pipelines_in_batch_return_none(
         X_train=X,
         y_train=y,
         problem_type="binary",
-        max_batches=3,
+        max_batches=2,
+        automl_algorithm="iterative",
         allowed_component_graphs={"Name": [dummy_classifier_estimator_class]},
         n_jobs=1,
     )
@@ -2295,7 +2299,7 @@ def test_time_series_regression_with_parameters(ts_data):
         allowed_component_graphs={"Name_0": ["Imputer", "Linear Regressor"]},
         objective="auto",
         problem_configuration=problem_configuration,
-        max_batches=3,
+        max_batches=2,
     )
     assert (
         automl.automl_algorithm.search_parameters["pipeline"] == problem_configuration
@@ -2340,7 +2344,7 @@ def test_automl_accepts_component_graphs(graph_type, X_y_binary):
         problem_type="binary",
         allowed_component_graphs={"Dummy_Name": component_graph},
         objective="auto",
-        max_batches=3,
+        max_batches=2,
     )
     for pipeline_ in automl.allowed_pipelines:
         assert isinstance(pipeline_, BinaryClassificationPipeline)
@@ -4154,7 +4158,7 @@ def test_automl_drop_unknown_columns(columns, AutoMLTestEnv, X_y_binary, caplog)
         y_train=y,
         problem_type="binary",
         optimize_thresholds=False,
-        max_batches=3,
+        max_batches=2,
         verbose=True,
     )
     env = AutoMLTestEnv("binary")
@@ -4296,7 +4300,7 @@ def dummy_mock_get_preprocessing_components(*args, **kwargs):
             max_batches=1,
             verbose=verbose,
         )
-        env = AutoMLTestEnv("binary")
+        env = AutoMLTestEnv("regression")
         with env.test_context(score_return_value={automl.objective.name: 1.0}):
             automl.search()
 
@@ -4466,7 +4470,7 @@ def test_automl_ensembler_allowed_component_graphs(
         problem_type="regression",
         allowed_component_graphs=component_graphs,
         ensembling=True,
-        max_batches=4,
+        max_batches=3,
         verbose=True,
     )
     automl.search()
@@ -4569,7 +4573,7 @@ def test_automl_passes_known_in_advance_pipeline_parameters_to_all_pipelines(
         X_train=X,
         y_train=y,
         problem_type=problem_type,
-        max_batches=3,
+        max_batches=2,
         problem_configuration={
             "time_index": "date",
             "max_delay": 3,
@@ -4620,7 +4624,7 @@ def test_cv_ranking_scores(
         X_train=X,
         y_train=y,
         problem_type="binary",
-        max_batches=3,
+        max_batches=2,
         data_splitter=data_splitter,
         allowed_component_graphs={"Name": [dummy_classifier_estimator_class]},
         n_jobs=1,
@@ -4724,7 +4728,7 @@ def test_search_parameters_held_automl(
                 ],
             },
         }
-        batches = 2 if algorithm == "default" else batches
+    batches = 2 if algorithm == "default" else batches
 
     search_parameters = {
         "Imputer": {"numeric_impute_strategy": parameter},
@@ -4814,7 +4818,7 @@ def test_automl_accepts_features(
         y_train=y,
         problem_type="binary",
         optimize_thresholds=False,
-        max_batches=3,
+        max_batches=2,
         features=features,
         automl_algorithm=automl_algorithm,
     )
@@ -4858,7 +4862,7 @@ def test_automl_with_empty_features_list(
         y_train=y,
         problem_type="binary",
         optimize_thresholds=False,
-        max_batches=3,
+        max_batches=2,
         features=[],
         automl_algorithm=automl_algorithm,
     )
@@ -5071,7 +5075,7 @@ def test_default_algorithm_uses_n_jobs(X_y_binary, AutoMLTestEnv):
         X_train=X,
         y_train=y,
         problem_type="binary",
-        max_batches=3,
+        max_batches=2,
         automl_algorithm="default",
         n_jobs=2,
     )
@@ -5521,7 +5525,7 @@ def test_holdout_set_results_and_rankings(caplog, AutoMLTestEnv):
         X_train=X,
         y_train=y,
         problem_type="binary",
-        max_batches=3,
+        max_batches=2,
         automl_algorithm="default",
         verbose=True,
         holdout_set_size=0.1,
diff --git a/evalml/tests/automl_tests/test_automl_search_classification.py b/evalml/tests/automl_tests/test_automl_search_classification.py
@@ -1022,7 +1022,7 @@ def test_automl_search_dictionary_undersampler(
         optimize_thresholds=False,
         sampler_method="Undersampler",
         search_parameters=search_parameters,
-        max_batches=3,
+        max_batches=2,
     )
     # check that the sampling dict got set properly
     automl.search()
@@ -1077,7 +1077,7 @@ def test_automl_search_dictionary_oversampler(
         sampler_method="Oversampler",
         optimize_thresholds=False,
         search_parameters=search_parameters,
-        max_batches=3,
+        max_batches=2,
     )
     # check that the sampling dict got set properly
     pipelines = automl.allowed_pipelines
@@ -1122,7 +1122,7 @@ def test_automl_search_sampler_dictionary_keys(
         sampler_method=sampler,
         optimize_thresholds=False,
         search_parameters=search_parameters,
-        max_batches=3,
+        max_batches=2,
     )
     if errors:
         with pytest.raises(
@@ -1254,7 +1254,7 @@ def test_automl_passes_allow_long_running_models(
         objective="Log Loss Multiclass",
         allow_long_running_models=allow_long_running_models,
         automl_algorithm=algo,
-        max_batches=3,
+        max_batches=2,
         verbose=True,
     )
     assert (
@@ -1280,7 +1280,7 @@ def test_automl_threshold_score(fraud_100):
         X_train,
         y_train,
         problem_type="binary",
-        max_batches=4,
+        max_batches=2,
         ensembling=True,
         verbose=False,
         automl_algorithm="default",
diff --git a/evalml/tests/automl_tests/test_default_algorithm.py b/evalml/tests/automl_tests/test_default_algorithm.py
diff --git a/evalml/tests/automl_tests/test_search.py b/evalml/tests/automl_tests/test_search.py
diff --git a/evalml/tests/pipeline_tests/test_pipeline_utils.py b/evalml/tests/pipeline_tests/test_pipeline_utils.py