Topology Assignment for Leader Worker Set

Alexandre James · changlan · commit 04d5fd74c328 · 2026-02-18T10:23:13.000-08:00
GitOrigin-RevId: 726c4815749fca66b2214ecfb8ef4e1edeef8855
diff --git a/axlearn/cloud/gcp/job.py b/axlearn/cloud/gcp/job.py
@@ -60,6 +60,42 @@ class _ServiceType(enum.Enum):
     EXTERNAL_NAME = "ExternalName"
 
 
+def get_topology_assignment() -> Optional[list[list[str]]]:
+    """Retrieves TPU topology assignments from the environment variable.
+
+    When TPU slice auto-provisioning is enabled, Bastion passes topology assignments
+    through an environment variable. These assignments specify which TPU slices should be
+    used for the job, enabling precise control over TPU resource allocation.
+
+    Example topology assignment:
+        [["sub-block-id", "sub-block-id"]]
+
+    This is the assignment for a job asking for tpu-7x-256, that needs 128 chips, using
+    2 sub-blocks (64 chips per sub-block). This job will run on a TPU slice formed by
+    2 sub-blocks. Each inner array represents the TPU slice info for a job's replica.
+
+    Returns:
+        A list of lists of strings representing topology assignments, where each inner list
+        contains slice identifiers for a particular job replica. Returns None if the
+        environment variable is not set or if parsing fails.
+    """
+    topology_assignments_env = os.environ.get(BASTION_JOB_TOPOLOGY_ASSIGNMENT_ENV_VAR)
+    if not topology_assignments_env:
+        logging.info("No %s environment variable set.", BASTION_JOB_TOPOLOGY_ASSIGNMENT_ENV_VAR)
+        return None
+
+    try:
+        return json.loads(topology_assignments_env)
+    except json.JSONDecodeError as e:
+        logging.warning(
+            "Failed to parse topology assignments from env var %s, value: %s, error: %s",
+            BASTION_JOB_TOPOLOGY_ASSIGNMENT_ENV_VAR,
+            topology_assignments_env,
+            e,
+        )
+        return None
+
+
 class GCPJob(Job):
     """Base GCP Job definition."""
 
@@ -173,41 +209,6 @@ def _delete(self):
         # fully blocking; after the call returns there can be a delay before everything is deleted.
         delete_k8s_jobset(cfg.name, namespace=cfg.namespace)
 
-    def _get_topology_assignment(self) -> Optional[list[list[str]]]:
-        """Retrieves TPU topology assignments from the environment variable.
-
-        When TPU slice auto-provisioning is enabled, Bastion passes topology assignments
-        through an environment variable. These assignments specify which TPU slices should be
-        used for the job, enabling precise control over TPU resource allocation.
-
-        Example topology assignment:
-            [["sub-block-id", "sub-block-id"]]
-
-        This is the assignment for a job asking for tpu-7x-256, that needs 128 chips, using
-        2 sub-blocks (64 chips per sub-block). This job will run on a TPU slice formed by
-        2 sub-blocks. Each inner array represents the TPU slice info for a job's replica.
-
-        Returns:
-            A list of lists of strings representing topology assignments, where each inner list
-            contains slice identifiers for a particular job replica. Returns None if the
-            environment variable is not set or if parsing fails.
-        """
-        topology_assignments_env = os.environ.get(BASTION_JOB_TOPOLOGY_ASSIGNMENT_ENV_VAR)
-        if not topology_assignments_env:
-            logging.info("No %s environment variable set.", BASTION_JOB_TOPOLOGY_ASSIGNMENT_ENV_VAR)
-            return None
-
-        try:
-            return json.loads(topology_assignments_env)
-        except json.JSONDecodeError as e:
-            logging.warning(
-                "Failed to parse topology assignments from env var %s, value: %s, error: %s",
-                BASTION_JOB_TOPOLOGY_ASSIGNMENT_ENV_VAR,
-                topology_assignments_env,
-                e,
-            )
-            return None
-
     def _lookup_system_by_node_selectors(
         self, node_selector: dict[str, str]
     ) -> Optional[tuple[str, _SystemCharacteristics]]:
@@ -382,7 +383,7 @@ def _build_jobset(self) -> Nested[Any]:
         # Bastion passes the job metadata to the runner through env vars
         # If the job has topology assigned, its also in the env var
         # Try to parse the env var and get the topology assignments.
-        topology_assignment = self._get_topology_assignment()
+        topology_assignment = get_topology_assignment()
         if cfg.enable_tpu_slice_auto_provisioning and topology_assignment:
             slice_selection_dict = self._get_tpu_replicated_job_topology_selection(
                 replicated_jobs, topology_assignment
@@ -563,6 +564,7 @@ class Config(GCPJob.Config):
         gke_gateway_route: bool = False
         http_route: Optional[LWSHTTPRoute.Config] = None
         health_check_policy: Optional[LWSHealthCheckPolicy.Config] = None
+        enable_tpu_slice_auto_provisioning: Optional[bool] = None
 
     @classmethod
     def set_defaults(cls, fv):
@@ -635,6 +637,12 @@ def define_flags(cls, fv: flags.FlagValues):
             "Enable gke_gateway_route with notary-proxy sidecars for direct gateway routing",
             **common_kwargs,
         )
+        flags.DEFINE_boolean(
+            "enable_tpu_slice_auto_provisioning",
+            None,
+            "Auto provision TPU slices based on the topology assignment.",
+            **common_kwargs,
+        )
 
     @classmethod
     def from_flags(cls, fv: flags.FlagValues, **kwargs):
@@ -653,6 +661,15 @@ def __init__(self, cfg: Config, *, bundler: BaseDockerBundler):
         super().__init__(cfg)
         cfg: GKELeaderWorkerSet.Config = self.config
         self._bundler = bundler
+
+        # Pass enable_tpu_slice_auto_provisioning from GKELeaderWorkerSet to the builder
+        builder_cfg = cfg.builder
+        if (
+            hasattr(builder_cfg, "enable_tpu_slice_auto_provisioning")
+            and cfg.enable_tpu_slice_auto_provisioning is not None
+        ):
+            builder_cfg.enable_tpu_slice_auto_provisioning = cfg.enable_tpu_slice_auto_provisioning
+
         # This instantiatees a builder for constructing replicated job specs, which will be managed
         # together under the leaderworkerset represented by this class.
         # Note the distinction from bundlers, which are responsible for bundling any code assets
@@ -683,9 +700,42 @@ def _build_leaderworkerset(self) -> Nested[Any]:
         """
         cfg: GKELeaderWorkerSet.Config = self.config
         annotations = maybe_instantiate(cfg.annotations or {})
+        labels = {}
+
+        # If a topology assignment is present and slice auto provisioning is enabled,
+        # set the necessary label and annotations.
+        topology_assignment = get_topology_assignment()
+        if cfg.enable_tpu_slice_auto_provisioning and topology_assignment:
+            # Add TPU slice selection
+            logging.info("Adding slice selection: %s to leader worker set", topology_assignment)
+
+            # Note, we use async here rather than the jobset sync. Async will immediately create
+            # the pods before the slice has been created. Once sync is supported for leader worker
+            # set we should consider switching.
+            labels["tpu-provisioner.cloud.google.com/slice-autoprovisioning"] = "async"
+
+            # For Leader worker sets, we only support topology assignments to workers.
+            # The format of the topology assignments (list of subblock groups) is what
+            # is expected by the TPU provisioner.
+            annotations.update(
+                {
+                    "tpu-provisioner.cloud.google.com/slice-selection": json.dumps(
+                        {
+                            "workers": topology_assignment,
+                        }
+                    )
+                }
+            )
+
+            # Remove the exclusive topology annotations. The TPU provisioner ensures replica
+            # affinity by injecting slice-based node selectors, so the exclusive topology
+            # annotations are not needed.
+            exclusive_topology_annotation = exclusive_topology_annotations_leaderworkerset()
+            for key in exclusive_topology_annotation:
+                annotations.pop(key, None)
 
         return dict(
-            metadata=dict(name=cfg.name, annotations=annotations),
+            metadata=dict(name=cfg.name, annotations=annotations, labels=labels),
             spec=dict(
                 replicas=cfg.num_replicas,
                 leaderWorkerTemplate=self._builder(),
diff --git a/axlearn/cloud/gcp/job_test.py b/axlearn/cloud/gcp/job_test.py
@@ -3,6 +3,7 @@
 """Tests jobs by launching commands on TPUs/VMs."""
 # pylint: disable=protected-access
 
+import json
 from typing import Optional, cast
 from unittest import mock
 
@@ -543,3 +544,113 @@ def test_delete(self):
             gke_job = cfg.instantiate(bundler=mock.create_autospec(Bundler))
             gke_job._delete()  # pylint: disable=protected-access
             mock_delete.assert_called()
+
+    @parameterized.parameters(
+        # Test when auto provisioning is enabled with topology assignment
+        dict(
+            enable_tpu_slice_auto_provisioning=True,
+            topology_assignment=[["subblock-1", "subblock-2"]],
+            expect_label=True,
+            expect_annotation=True,
+        ),
+        # Test when auto provisioning is disabled
+        dict(
+            enable_tpu_slice_auto_provisioning=False,
+            topology_assignment=[["subblock-1", "subblock-2"]],
+            expect_label=False,
+            expect_annotation=False,
+        ),
+        # Test when auto provisioning is None (not set)
+        dict(
+            enable_tpu_slice_auto_provisioning=None,
+            topology_assignment=[["subblock-1", "subblock-2"]],
+            expect_label=False,
+            expect_annotation=False,
+        ),
+        # Test when auto provisioning is enabled but no topology assignment
+        dict(
+            enable_tpu_slice_auto_provisioning=True,
+            topology_assignment=None,
+            expect_label=False,
+            expect_annotation=False,
+        ),
+    )
+    def test_build_leaderworkerset(
+        self,
+        enable_tpu_slice_auto_provisioning,
+        topology_assignment,
+        expect_label,
+        expect_annotation,
+    ):
+        """Test _build_leaderworkerset with enable_tpu_slice_auto_provisioning."""
+        cfg, bundler_cfg = self._job_config(
+            command="test-command",
+            bundler_cls=CloudBuildBundler,
+            enable_tpu_slice_auto_provisioning=enable_tpu_slice_auto_provisioning,
+        )
+
+        # Mock the builder to return a simple leader worker template
+        mock_leader_worker_template = {
+            "size": 8,
+            "workerTemplate": {
+                "metadata": {"labels": {"test-label": "test-value"}},
+                "spec": {"containers": []},
+            },
+        }
+
+        # Create a mock builder that returns our mock template
+        mock_builder = mock.Mock()
+        mock_builder.return_value = mock_leader_worker_template
+
+        # Create the GKE job instance first
+        gke_job = cfg.instantiate(bundler=bundler_cfg.instantiate())
+
+        # Replace the builder with our mock (this is what we're testing)
+        gke_job._builder = mock_builder
+
+        # Mock get_topology_assignment
+        with mock.patch(
+            f"{job.__name__}.get_topology_assignment",
+            return_value=topology_assignment,
+        ):
+            # Build the leaderworkerset
+            lws_spec = gke_job._build_leaderworkerset()
+
+            # Check metadata
+            self.assertIn("metadata", lws_spec)
+            self.assertIn("name", lws_spec["metadata"])
+            self.assertEqual(cfg.name, lws_spec["metadata"]["name"])
+
+            # Check labels
+            labels = lws_spec["metadata"].get("labels", {})
+            slice_auto_provisioning_label = (
+                "tpu-provisioner.cloud.google.com/slice-autoprovisioning"
+            )
+            if expect_label:
+                self.assertIn(slice_auto_provisioning_label, labels)
+                self.assertEqual("async", labels[slice_auto_provisioning_label])
+            else:
+                self.assertNotIn(slice_auto_provisioning_label, labels)
+
+            # Check annotations
+            annotations = lws_spec["metadata"].get("annotations", {})
+            slice_selection_annotation = "tpu-provisioner.cloud.google.com/slice-selection"
+            if expect_annotation:
+                self.assertIn(slice_selection_annotation, annotations)
+                slice_selection = json.loads(annotations[slice_selection_annotation])
+                self.assertIn("workers", slice_selection)
+                self.assertEqual(topology_assignment, slice_selection["workers"])
+            else:
+                self.assertNotIn(slice_selection_annotation, annotations)
+
+            # Verify exclusive topology annotations are removed when auto provisioning
+            if expect_annotation:
+                self.assertNotIn(
+                    "leaderworkerset.sigs.k8s.io/subgroup-exclusive-topology",
+                    annotations,
+                )
+
+            # Check spec
+            self.assertIn("spec", lws_spec)
+            self.assertIn("replicas", lws_spec["spec"])
+            self.assertIn("leaderWorkerTemplate", lws_spec["spec"])
diff --git a/axlearn/cloud/gcp/lws_utils.py b/axlearn/cloud/gcp/lws_utils.py
@@ -99,6 +99,20 @@ class TPULeaderWorkerTemplate(TPUJobBuilder):
 
     Config = TPUJobBuilder.Config
 
+    def _build_pod(self) -> Nested[Any]:
+        cfg: TPUJobBuilder.Config = self.config
+
+        # Add the inject-slice-selector label for slice auto-provisioned jobs.
+        pod = super()._build_pod()
+        if cfg.enable_tpu_slice_auto_provisioning:
+            pod["metadata"]["labels"].update(
+                {
+                    "tpu-provisioner.cloud.google.com/inject-slice-selector": "true",
+                }
+            )
+
+        return pod
+
     def __call__(self) -> Sequence[Nested[Any]]:
         system = USER_FACING_NAME_TO_SYSTEM_CHARACTERISTICS[self._tpu_type]
         return dict(  # pytype: disable=bad-return-type
diff --git a/axlearn/cloud/gcp/lws_utils_test.py b/axlearn/cloud/gcp/lws_utils_test.py