Filter failed trials from plots (#4725)

shrutipatel31 · meta-codesync[bot] · commit 9a00edcf1902 · 2025-12-30T11:30:30.000-08:00
Summary: Pull Request resolved: #4725. This diff updates the default trial status filtering in Ax analysis plots to also exclude `FAILED` trials. Previously, the analysis plots (`ArmEffectsPlot`, `ObjectivePFeasibleFrontierPlot`, `ScatterPlot`) excluded only `ABANDONED` and `STALE` trials by default. This change adds `FAILED` to the exclusion list because failed trials typically don't have valid observation data and shouldn't be included in visualizations. Updated `get_trial_statuses_with_fallback()` in `utils.py` to exclude `TrialStatus.FAILED` from the default set. Reviewed By: bernardbeckerman Differential Revision: D89913203 Privacy Context Container: L1307644 fbshipit-source-id: 7c67b7a8b448e09f7b3fb7ce293648235df92b6a
diff --git a/ax/analysis/plotly/arm_effects.py b/ax/analysis/plotly/arm_effects.py
@@ -91,7 +91,7 @@ def __init__(
                 against the status quo arm from the same trial.
             trial_index: If present, only use arms from the trial with the given index.
             trial_statuses: If present, only use arms from trials with the given
-                statuses. By default, exclude STALE and ABANDONED trials.
+                statuses. By default, exclude STALE, ABANDONED, and FAILED trials.
             additional_arms: If present, include these arms in the plot in addition to
                 the arms in the experiment. These arms will be marked as belonging to a
                 trial with index -1.
diff --git a/ax/analysis/plotly/objective_p_feasible_frontier.py b/ax/analysis/plotly/objective_p_feasible_frontier.py
@@ -82,7 +82,7 @@ def __init__(
             label: A label to use in the plot in place of the metric name.
             trial_index: If present, only use arms from the trial with the given index.
             trial_statuses: If present, only use arms from trials with the given
-                statuses. By default, exclude STALE and ABANDONED trials.
+                statuses. By default, exclude STALE, ABANDONED, and FAILED trials.
             num_points_to_generate: The number of points to generate on the frontier.
                 Ideally this should be sufficiently large to provide a frontier with
                 reasonably good coverage.
diff --git a/ax/analysis/plotly/scatter.py b/ax/analysis/plotly/scatter.py
@@ -153,7 +153,7 @@ def __init__(
                 against the status quo arm from the same trial.
             trial_index: If present, only use arms from the trial with the given index.
             trial_statuses: If present, only use arms from trials with the given
-                statuses. By default, exclude STALE and ABANDONED trials.
+                statuses. By default, exclude STALE, ABANDONED, and FAILED trials.
             additional_arms: If present, include these arms in the plot in addition to
                 the arms in the experiment. These arms will be marked as belonging to a
                 trial with index -1.
diff --git a/ax/analysis/plotly/tests/test_arm_effects.py b/ax/analysis/plotly/tests/test_arm_effects.py
@@ -18,6 +18,7 @@
 from ax.utils.common.testutils import TestCase
 from ax.utils.testing.core_stubs import (
     get_branin_experiment,
+    get_non_failed_arm_names,
     get_offline_experiments,
     get_online_experiments,
 )
@@ -75,9 +76,13 @@ def setUp(self) -> None:
 
     def test_trial_statuses_behavior(self) -> None:
         # When neither trial_statuses nor trial_index is provided,
-        # should use default statuses (excluding ABANDONED and STALE)
+        # should use default statuses (excluding ABANDONED, STALE, and FAILED)
         analysis = ArmEffectsPlot(metric_name="foo")
-        expected_statuses = {*TrialStatus} - {TrialStatus.ABANDONED, TrialStatus.STALE}
+        expected_statuses = {*TrialStatus} - {
+            TrialStatus.ABANDONED,
+            TrialStatus.STALE,
+            TrialStatus.FAILED,
+        }
         self.assertEqual(set(none_throws(analysis.trial_statuses)), expected_statuses)
 
         # When trial_statuses is explicitly provided, it should be used
@@ -129,9 +134,11 @@ def test_compute_raw(self) -> None:
             },
         )
 
-        # Check that we have one row per arm and that each arm appears only once
-        self.assertEqual(len(card.df), len(self.client._experiment.arms_by_name))
-        for arm_name in self.client._experiment.arms_by_name:
+        # Check that we have one row per arm from non-failed trials and that each
+        # arm appears only once
+        non_failed_arms = get_non_failed_arm_names(self.client._experiment)
+        self.assertEqual(len(card.df), len(non_failed_arms))
+        for arm_name in non_failed_arms:
             self.assertEqual((card.df["arm_name"] == arm_name).sum(), 1)
 
         # Check that all SEMs are NaN
@@ -158,9 +165,11 @@ def test_compute_with_modeled(self) -> None:
             },
         )
 
-        # Check that we have one row per arm and that each arm appears only once
-        self.assertEqual(len(card.df), len(self.client._experiment.arms_by_name))
-        for arm_name in self.client._experiment.arms_by_name:
+        # Check that we have one row per arm from non-failed trials and that each
+        # arm appears only once
+        non_failed_arms = get_non_failed_arm_names(self.client._experiment)
+        self.assertEqual(len(card.df), len(non_failed_arms))
+        for arm_name in non_failed_arms:
             self.assertEqual((card.df["arm_name"] == arm_name).sum(), 1)
 
         # Check that all SEMs are not NaN
diff --git a/ax/analysis/plotly/tests/test_objective_p_feasible_frontier.py b/ax/analysis/plotly/tests/test_objective_p_feasible_frontier.py
@@ -50,9 +50,13 @@ def setUp(self) -> None:
 
     def test_trial_statuses_behavior(self) -> None:
         # When neither trial_statuses nor trial_index is provided,
-        # should use default statuses (excluding ABANDONED and STALE)
+        # should use default statuses (excluding ABANDONED, STALE, and FAILED)
         analysis = ObjectivePFeasibleFrontierPlot()
-        expected_statuses = {*TrialStatus} - {TrialStatus.ABANDONED, TrialStatus.STALE}
+        expected_statuses = {*TrialStatus} - {
+            TrialStatus.ABANDONED,
+            TrialStatus.STALE,
+            TrialStatus.FAILED,
+        }
         self.assertEqual(set(none_throws(analysis.trial_statuses)), expected_statuses)
 
         # When trial_statuses is explicitly provided, it should be used
diff --git a/ax/analysis/plotly/tests/test_scatter.py b/ax/analysis/plotly/tests/test_scatter.py
@@ -17,7 +17,11 @@
 from ax.core.trial_status import TrialStatus
 from ax.exceptions.core import UserInputError
 from ax.utils.common.testutils import TestCase
-from ax.utils.testing.core_stubs import get_offline_experiments, get_online_experiments
+from ax.utils.testing.core_stubs import (
+    get_non_failed_arm_names,
+    get_offline_experiments,
+    get_online_experiments,
+)
 from ax.utils.testing.mock import mock_botorch_optimize
 from ax.utils.testing.modeling_stubs import get_default_generation_strategy_at_MBM_node
 from pyre_extensions import assert_is_instance, none_throws
@@ -69,9 +73,13 @@ def setUp(self) -> None:
 
     def test_trial_statuses_behavior(self) -> None:
         # When neither trial_statuses nor trial_index is provided,
-        # should use default statuses (excluding ABANDONED and STALE)
+        # should use default statuses (excluding ABANDONED, STALE, and FAILED)
         analysis = ScatterPlot(x_metric_name="foo", y_metric_name="bar")
-        expected_statuses = {*TrialStatus} - {TrialStatus.ABANDONED, TrialStatus.STALE}
+        expected_statuses = {*TrialStatus} - {
+            TrialStatus.ABANDONED,
+            TrialStatus.STALE,
+            TrialStatus.FAILED,
+        }
         self.assertEqual(set(none_throws(analysis.trial_statuses)), expected_statuses)
 
         # When trial_statuses is explicitly provided, it should be used
@@ -133,9 +141,11 @@ def test_compute_raw(self) -> None:
         )
         self.assertIsNotNone(card.blob)
 
-        # Check that we have one row per arm and that each arm appears only once
-        self.assertEqual(len(card.df), len(self.client._experiment.arms_by_name))
-        for arm_name in self.client._experiment.arms_by_name:
+        # Check that we have one row per arm from non-failed trials and that each
+        # arm appears only once
+        non_failed_arms = get_non_failed_arm_names(self.client._experiment)
+        self.assertEqual(len(card.df), len(non_failed_arms))
+        for arm_name in non_failed_arms:
             self.assertEqual((card.df["arm_name"] == arm_name).sum(), 1)
 
         # Check that all SEMs are NaN
@@ -191,9 +201,11 @@ def test_compute_with_modeled(self) -> None:
 
         self.assertIsNotNone(card.blob)
 
-        # Check that we have one row per arm and that each arm appears only once
-        self.assertEqual(len(card.df), len(self.client._experiment.arms_by_name))
-        for arm_name in self.client._experiment.arms_by_name:
+        # Check that we have one row per arm from non-failed trials and that each
+        # arm appears only once
+        non_failed_arms = get_non_failed_arm_names(self.client._experiment)
+        self.assertEqual(len(card.df), len(non_failed_arms))
+        for arm_name in non_failed_arms:
             self.assertEqual((card.df["arm_name"] == arm_name).sum(), 1)
 
         # Check that all SEMs are not NaN
diff --git a/ax/analysis/plotly/tests/test_utils.py b/ax/analysis/plotly/tests/test_utils.py
@@ -33,15 +33,20 @@ def test_get_trial_statuses_with_fallback_with_trial_index(self) -> None:
 
     def test_get_trial_statuses_with_fallback_default(self) -> None:
         # When neither trial_statuses nor trial_index is provided,
-        # should return all statuses except ABANDONED and STALE
+        # should return all statuses except ABANDONED, STALE, and FAILED
         result = none_throws(
             get_trial_statuses_with_fallback(trial_statuses=None, trial_index=None)
         )
 
-        expected_statuses = {*TrialStatus} - {TrialStatus.ABANDONED, TrialStatus.STALE}
+        expected_statuses = {*TrialStatus} - {
+            TrialStatus.ABANDONED,
+            TrialStatus.STALE,
+            TrialStatus.FAILED,
+        }
         self.assertEqual(set(result), expected_statuses)
         self.assertNotIn(TrialStatus.ABANDONED, result)
         self.assertNotIn(TrialStatus.STALE, result)
+        self.assertNotIn(TrialStatus.FAILED, result)
 
     def test_get_trial_statuses_with_fallback_explicit_takes_precedence(self) -> None:
         # When both trial_statuses and trial_index are provided,
diff --git a/ax/analysis/plotly/utils.py b/ax/analysis/plotly/utils.py
@@ -224,12 +224,14 @@ def get_trial_statuses_with_fallback(
 ) -> list[TrialStatus] | None:
     """Get the default trial statuses to plot.
 
-    By default, include all trials except those that are abandoned or stale.
+    By default, include all trials except those that are abandoned, stale, or failed.
     If trial_index is provided, then we only filter based on trial_index,
     and therefore this function returns None.
     """
     if trial_index is not None:
         return None
     elif trial_statuses is not None:
         return [*trial_statuses]
-    return [*{*TrialStatus} - {TrialStatus.ABANDONED, TrialStatus.STALE}]
+    return [
+        *{*TrialStatus} - {TrialStatus.ABANDONED, TrialStatus.STALE, TrialStatus.FAILED}
+    ]
diff --git a/ax/utils/testing/core_stubs.py b/ax/utils/testing/core_stubs.py
@@ -1817,6 +1817,16 @@ def add_arm(
         return self
 
 
+def get_non_failed_arm_names(experiment: Experiment) -> set[str]:
+    """Get the names of all arms from non-failed trials."""
+    return {
+        arm.name
+        for trial in experiment.trials.values()
+        if trial.status != TrialStatus.FAILED
+        for arm in trial.arms
+    }
+
+
 ##############################
 # Parameters
 ##############################