Skip to content

Commit 3cdef67

Browse files
committed
Adding an extra slice to the Pathways cluster to swap in when there is a slice failure
1 parent 96b5ddd commit 3cdef67

File tree

1 file changed: +2 -2 lines changed

1 file changed

+2
-2
lines changed

axlearn/cloud/gcp/pathways_utils.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -356,7 +356,7 @@ def _build_pathways_head_sidecar_containers(self) -> list[Nested[Any]]:
356356
args=[
357357
f"--server_port={_PATHWAYS_RESOURCE_MANAGER_PORT}",
358358
"--node_type=resource_manager",
359-
f"--instance_count={pathways_instance_count}",
359+
f"--instance_count={pathways_instance_count + 1}",
360360
f"--instance_type={pathways_tpu_version}:{system.topology}",
361361
f"--gcs_scratch_location={staging_location}",
362362
"--temporary_flags_for_debugging=temporary_flag_for_debugging_pipe_break_on_missing_keepalive=true",
@@ -625,7 +625,7 @@ def __call__(self) -> Sequence[Nested[Any]]:
625625
),
626626
dict(
627627
name=_PATHWAYS_WORKER_REPLICATED_JOB_NAME,
628-
replicas=cfg.accelerator.num_replicas,
628+
replicas=cfg.accelerator.num_replicas + 1,
629629
template=self._build_pathways_worker_job(),
630630
),
631631
]

0 commit comments

Comments
 (0)