Add FreshLILOLabelCheck transition criterion (#4994)

ItsMrLin · meta-codesync[bot] · commit e2056d2c591f · 2026-03-13T13:32:55.000-07:00
Summary:
Pull Request resolved: #4994

Add a hash-aware transition criterion for LILO GS loops. `FreshLILOLabelCheck` counts only trials whose LILO input hash matches the current experiment state, ensuring transitions are gated on *fresh* labels (produced under current data + LLM messages).

The `require_sufficient` flag controls the transition direction:
- `require_sufficient=True` (LILO_LABELING -> MBG): is_met when fresh count >= threshold. "Enough fresh labels -- proceed to BO generation."
- `require_sufficient=False` (MBG -> LILO_LABELING): is_met when fresh count < threshold. "Labels are stale -- relabel before generating."

Non-LILO experiments (no pairwise DerivedMetric) short-circuit: `require_sufficient=True` -> always met, `require_sufficient=False` -> never met. This prevents false relabeling triggers on non-LILO experiments.

Reviewed By: saitcakmak
Differential Revision: D95284285
fbshipit-source-id: 457fdfa99d8a5f9f99345d3d9dc6a46d1debf8d1
diff --git a/ax/generation_strategy/tests/test_transition_criterion.py b/ax/generation_strategy/tests/test_transition_criterion.py
@@ -7,11 +7,15 @@
 
 
 from logging import Logger
+from unittest.mock import MagicMock
 
 import pandas as pd
 from ax.adapter.registry import Generators
+from ax.core.arm import Arm
 from ax.core.auxiliary import AuxiliaryExperiment, AuxiliaryExperimentPurpose
 from ax.core.data import Data
+from ax.core.derived_metric import DerivedMetric
+from ax.core.experiment import Experiment
 from ax.core.trial_status import TrialStatus
 from ax.exceptions.core import DataRequiredError, UserInputError
 from ax.exceptions.generation_strategy import MaxParallelismReachedException
@@ -24,11 +28,14 @@
 from ax.generation_strategy.transition_criterion import (
     AutoTransitionAfterGen,
     AuxiliaryExperimentCheck,
+    FreshLILOLabelCheck,
     IsSingleObjective,
     MaxGenerationParallelism,
     MaxTrialsAwaitingData,
     MinTrials,
 )
+from ax.utils.common.constants import Keys
+from ax.utils.common.hash_utils import compute_lilo_input_hash
 from ax.utils.common.logger import get_logger
 from ax.utils.common.testutils import TestCase
 from ax.utils.testing.core_stubs import (
@@ -41,6 +48,13 @@
 logger: Logger = get_logger(__name__)
 
 
+def _mock_node(trials_from_node: set[int]) -> MagicMock:
+    """Return a stand-in GenerationNode exposing only ``trials_from_node``."""
+    # MagicMock keyword arguments are assigned as attributes of the new mock.
+    stub = MagicMock(trials_from_node=trials_from_node)
+    return stub
+
+
 class TestTransitionCriterion(TestCase):
     def setUp(self) -> None:
         super().setUp()
@@ -614,3 +628,189 @@ def test_max_generation_parallelism_block_error(self) -> None:
                 experiment=self.experiment,
                 trials_from_node={0, 1, 2},
             )
+
+    def test_fresh_lilo_label_check(self) -> None:
+        """Verify FreshLILOLabelCheck counts only hash-fresh trials."""
+        exp = get_branin_experiment()
+
+        # A pairwise-named DerivedMetric is what marks the experiment as LILO.
+        pairwise_metric = DerivedMetric(
+            name=Keys.PAIRWISE_PREFERENCE_QUERY.value,
+            input_metric_names=["branin"],
+        )
+        exp.add_tracking_metric(pairwise_metric)
+
+        criterion = FreshLILOLabelCheck(
+            threshold=2,
+            transition_to="next_node",
+            only_in_statuses=[TrialStatus.COMPLETED],
+        )
+
+        def _add_trial(idx: int, exp: Experiment = exp) -> None:
+            """Create a completed one-arm trial and attach ``branin`` data.
+
+            ``idx`` must equal the index the experiment assigns to the trial.
+            """
+            arm_name = f"{idx}_0"
+            trial = exp.new_batch_trial()
+            trial.add_arm(Arm(name=arm_name, parameters={"x1": float(idx), "x2": 0.0}))
+            trial.mark_running(no_runner_required=True)
+            trial.mark_completed()
+            row = {
+                "trial_index": idx,
+                "arm_name": arm_name,
+                "metric_name": "branin",
+                "metric_signature": "branin",
+                "mean": float(idx),
+                "sem": 0.1,
+            }
+            exp.attach_data(Data(df=pd.DataFrame([row])))
+
+        # Three completed trials with data; none carries a hash stamp yet.
+        for i in range(3):
+            _add_trial(i)
+
+        current_hash = compute_lilo_input_hash(exp, ["branin"])
+        trials_from_node = {0, 1, 2}
+
+        with self.subTest("no_hashes_none_count"):
+            # Unstamped trials never count as fresh labels.
+            count = criterion.num_contributing_to_threshold(exp, trials_from_node)
+            self.assertEqual(count, 0)
+
+        # Stamp trials 0 and 1 with the current hash.
+        exp.trials[0]._properties[Keys.LILO_INPUT_HASH] = current_hash
+        exp.trials[1]._properties[Keys.LILO_INPUT_HASH] = current_hash
+
+        with self.subTest("fresh_hashes_count"):
+            count = criterion.num_contributing_to_threshold(exp, trials_from_node)
+            # Trials 0, 1 (fresh hash). Trial 2 (no hash → excluded).
+            self.assertEqual(count, 2)
+            self.assertTrue(criterion.is_met(exp, _mock_node(trials_from_node)))
+
+        # Make trial 1 stale.
+        exp.trials[1]._properties[Keys.LILO_INPUT_HASH] = "stale_hash"
+
+        with self.subTest("stale_hash_excluded"):
+            count = criterion.num_contributing_to_threshold(exp, trials_from_node)
+            # Trial 0 (fresh). Trial 1 (stale) and trial 2 (no hash) excluded.
+            self.assertEqual(count, 1)
+            self.assertFalse(criterion.is_met(exp, _mock_node(trials_from_node)))
+
+        # Make trial 0 stale too.
+        exp.trials[0]._properties[Keys.LILO_INPUT_HASH] = "another_stale"
+
+        with self.subTest("not_enough_fresh"):
+            count = criterion.num_contributing_to_threshold(exp, trials_from_node)
+            # All stamped trials are stale, trial 2 has no hash → 0.
+            self.assertEqual(count, 0)
+            self.assertFalse(criterion.is_met(exp, _mock_node(trials_from_node)))
+
+        with self.subTest("data_change_invalidates"):
+            # Re-stamp trials 0 and 1 as fresh first; otherwise the stale
+            # stamps above would make the final zero count pass vacuously.
+            for idx in (0, 1):
+                exp.trials[idx]._properties[Keys.LILO_INPUT_HASH] = current_hash
+            count = criterion.num_contributing_to_threshold(exp, trials_from_node)
+            self.assertEqual(count, 2)
+            # New data changes the current hash, so both stamps go stale;
+            # trials 2 and 3 were never stamped.
+            _add_trial(3)
+            trials_from_node.add(3)
+            count = criterion.num_contributing_to_threshold(exp, trials_from_node)
+            self.assertEqual(count, 0)
+
+    def test_fresh_lilo_label_check_require_sufficient(self) -> None:
+        """Check that ``require_sufficient`` flips the direction of ``is_met``."""
+        exp = get_branin_experiment()
+
+        # Registering the pairwise DerivedMetric makes this a LILO experiment.
+        exp.add_tracking_metric(
+            DerivedMetric(
+                name=Keys.PAIRWISE_PREFERENCE_QUERY.value,
+                input_metric_names=["branin"],
+            )
+        )
+
+        # Two completed single-arm trials, each with one ``branin`` observation.
+        for trial_idx in range(2):
+            arm_name = f"{trial_idx}_0"
+            batch = exp.new_batch_trial()
+            batch.add_arm(
+                Arm(name=arm_name, parameters={"x1": float(trial_idx), "x2": 0.0})
+            )
+            batch.mark_running(no_runner_required=True)
+            batch.mark_completed()
+            observation = {
+                "trial_index": trial_idx,
+                "arm_name": arm_name,
+                "metric_name": "branin",
+                "metric_signature": "branin",
+                "mean": float(trial_idx),
+                "sem": 0.1,
+            }
+            exp.attach_data(Data(df=pd.DataFrame([observation])))
+
+        # Stamp both trials with the current hash, i.e. mark their labels as
+        # produced under the present experiment state.
+        fresh_hash = compute_lilo_input_hash(exp, ["branin"])
+        for trial_idx in (0, 1):
+            exp.trials[trial_idx]._properties[Keys.LILO_INPUT_HASH] = fresh_hash
+        node_trials = {0, 1}
+
+        # Identical threshold and status filter; only the direction differs.
+        proceed_gate = FreshLILOLabelCheck(
+            threshold=2,
+            transition_to="MBG",
+            require_sufficient=True,
+            only_in_statuses=[TrialStatus.COMPLETED],
+        )
+        relabel_gate = FreshLILOLabelCheck(
+            threshold=2,
+            transition_to="LILO",
+            require_sufficient=False,
+            only_in_statuses=[TrialStatus.COMPLETED],
+        )
+
+        with self.subTest("sufficient_met_when_enough_fresh"):
+            # Both labels fresh (2 >= 2): the forward gate opens.
+            self.assertTrue(proceed_gate.is_met(exp, _mock_node(node_trials)))
+
+        with self.subTest("insufficient_not_met_when_enough_fresh"):
+            # Both labels fresh (2 >= 2): no relabeling is requested.
+            self.assertFalse(relabel_gate.is_met(exp, _mock_node(node_trials)))
+
+        # Invalidate trial 0's stamp so only one fresh label remains.
+        exp.trials[0]._properties[Keys.LILO_INPUT_HASH] = "stale"
+
+        with self.subTest("sufficient_not_met_when_stale"):
+            # One fresh label (1 < 2): the forward gate stays closed.
+            self.assertFalse(proceed_gate.is_met(exp, _mock_node(node_trials)))
+
+        with self.subTest("insufficient_met_when_stale"):
+            # One fresh label (1 < 2): relabeling is requested.
+            self.assertTrue(relabel_gate.is_met(exp, _mock_node(node_trials)))
+
+    def test_fresh_lilo_label_check_non_lilo_fallback(self) -> None:
+        """Without a pairwise DerivedMetric the criterion short-circuits:
+        ``require_sufficient=True`` is met, ``False`` is not."""
+        # Stock Branin experiment: nothing LILO-related is registered on it.
+        non_lilo_exp = get_branin_experiment()
+        empty_node = _mock_node(set())
+
+        proceed_gate = FreshLILOLabelCheck(
+            threshold=32,
+            transition_to="MBG",
+            require_sufficient=True,
+        )
+        relabel_gate = FreshLILOLabelCheck(
+            threshold=32,
+            transition_to="LILO",
+            require_sufficient=False,
+        )
+
+        with self.subTest("non_lilo_sufficient_always_met"):
+            self.assertTrue(proceed_gate.is_met(non_lilo_exp, empty_node))
+
+        with self.subTest("non_lilo_insufficient_never_met"):
+            self.assertFalse(relabel_gate.is_met(non_lilo_exp, empty_node))
diff --git a/ax/generation_strategy/transition_criterion.py b/ax/generation_strategy/transition_criterion.py
@@ -17,6 +17,8 @@
 from ax.core.utils import get_trial_indices_with_required_metrics
 from ax.exceptions.core import DataRequiredError, UserInputError
 from ax.exceptions.generation_strategy import MaxParallelismReachedException
+from ax.utils.common.constants import Keys
+from ax.utils.common.hash_utils import get_current_lilo_hash
 
 if TYPE_CHECKING:
     from ax.generation_strategy.generation_node import GenerationNode
@@ -644,6 +646,135 @@ def __init__(
         )
 
 
+class FreshLILOLabelCheck(TrialBasedCriterion):
+    """Transition criterion based on the freshness of LILO preference labels.
+
+    LILO (Language-in-the-Loop) trials are stamped with a hash of the
+    experiment state (metric data + LLM messages) at labeling time.
+    When the experiment state changes (new data arrives, or the user updates
+    LLM messages), old labels become stale.  This criterion gates transitions
+    based on how many *fresh* labels exist.
+
+    The ``require_sufficient`` flag controls the direction:
+
+    - **``require_sufficient=True``** (LILO_LABELING -> MBG): ``is_met``
+      when the number of fresh labels >= ``threshold``.  "We have enough
+      fresh labels -- proceed to BO generation."
+    - **``require_sufficient=False``** (MBG -> LILO_LABELING): ``is_met``
+      when the number of fresh labels < ``threshold``.  "Labels are stale
+      -- relabel before generating."
+
+    **Non-LILO fallback** (no pairwise ``DerivedMetric`` on the experiment):
+    ``require_sufficient=True`` -> always met (proceed normally).
+    ``require_sufficient=False`` -> never met (never trigger relabeling).
+    The fallback short-circuits *before* the count comparison so that a
+    non-LILO experiment with fewer than ``threshold`` trials does not
+    falsely trigger relabeling.
+
+    Args:
+        threshold: Number of fresh trials for the sufficiency check.
+        transition_to: The GenerationNode to transition to when met.
+        require_sufficient: If ``True``, ``is_met`` when fresh count >=
+            threshold.  If ``False``, ``is_met`` when fresh count <
+            threshold.  Defaults to ``True``.
+        only_in_statuses: Only count trials with these statuses.
+        not_in_statuses: Exclude trials with these statuses.
+        use_all_trials_in_exp: Count all experiment trials, not just
+            those from the current node.
+        continue_trial_generation: Continue generating arms for the
+            same trial after transition.
+        count_only_trials_with_data: Only count trials that have data.
+    """
+
+    def __init__(
+        self,
+        threshold: int,
+        transition_to: str,
+        require_sufficient: bool = True,
+        only_in_statuses: list[TrialStatus] | None = None,
+        not_in_statuses: list[TrialStatus] | None = None,
+        use_all_trials_in_exp: bool | None = False,
+        continue_trial_generation: bool | None = False,
+        count_only_trials_with_data: bool = False,
+    ) -> None:
+        self.require_sufficient: bool = require_sufficient
+        super().__init__(
+            threshold=threshold,
+            transition_to=transition_to,
+            only_in_statuses=only_in_statuses,
+            not_in_statuses=not_in_statuses,
+            use_all_trials_in_exp=use_all_trials_in_exp,
+            continue_trial_generation=continue_trial_generation,
+            count_only_trials_with_data=count_only_trials_with_data,
+        )
+
+    def num_contributing_to_threshold(
+        self, experiment: Experiment, trials_from_node: set[int]
+    ) -> int:
+        """Count trials toward threshold, excluding those with stale hashes.
+
+        Applies the base status/data/node filters, then keeps only trials whose
+        LILO input hash matches the current experiment state.  Without a
+        current hash (non-LILO experiment) the plain filtered count is returned.
+        """
+        current_hash = get_current_lilo_hash(experiment)
+        return self._count_fresh(experiment, trials_from_node, current_hash)
+
+    def _count_fresh(
+        self,
+        experiment: Experiment,
+        trials_from_node: set[int],
+        current_hash: str | None,
+    ) -> int:
+        """Count candidate trials whose stamped hash equals ``current_hash``.
+
+        Private helper so ``is_met`` can reuse a hash it already fetched
+        instead of calling ``get_current_lilo_hash`` a second time.  When
+        ``current_hash`` is ``None`` every candidate trial is counted.
+        """
+        # Candidate indices after the standard status-based filtering.
+        candidates = self.all_trials_to_check(experiment)
+        if self.count_only_trials_with_data:
+            with_data = get_trial_indices_with_required_metrics(
+                experiment=experiment,
+                df=experiment.lookup_data().df,
+                require_data_for_all_metrics=False,
+            )
+            candidates = candidates.intersection(with_data)
+        if not bool(self.use_all_trials_in_exp):
+            candidates = trials_from_node.intersection(candidates)
+        if current_hash is None:
+            return len(candidates)
+
+        # Trials without a stamp (regular Sobol/MBG trials) yield ``None``,
+        # which never equals the non-None ``current_hash``, so they are
+        # excluded and cannot inflate the fresh-label count.
+        trials = experiment.trials  # looked up once, not per candidate
+        return sum(
+            trials[idx]._properties.get(Keys.LILO_INPUT_HASH) == current_hash
+            for idx in candidates
+        )
+
+    def is_met(self, experiment: Experiment, curr_node: GenerationNode) -> bool:
+        """Check whether the freshness condition is satisfied.
+
+        For non-LILO experiments (no pairwise ``DerivedMetric``), this
+        short-circuits: ``require_sufficient=True`` → always met,
+        ``require_sufficient=False`` → never met.
+        """
+        # Fetch the hash once; it drives the short-circuit and the count.
+        current_hash = get_current_lilo_hash(experiment)
+        if current_hash is None:
+            return self.require_sufficient
+        fresh = self._count_fresh(
+            experiment=experiment,
+            trials_from_node=curr_node.trials_from_node,
+            current_hash=current_hash,
+        )
+        enough = fresh >= self.threshold
+        return enough if self.require_sufficient else not enough
+
+
 class AuxiliaryExperimentCheck(TransitionCriterion):
     """A class to transition from one GenerationNode to another by checking if certain
     types of Auxiliary Experiment purposes exists.
diff --git a/ax/storage/json_store/registry.py b/ax/storage/json_store/registry.py