Skip to content

Commit 4426abc

Browse files
committed
Added the changes to the jobset for elastic training
1 parent 9bbc96b commit 4426abc

File tree

1 file changed

+11
-4
lines changed

1 file changed

+11
-4
lines changed

axlearn/cloud/gcp/pathways_utils.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -291,6 +291,8 @@ def _build_pathways_head_sidecar_containers(self) -> list[Nested[Any]]:
291291
f"--resource_manager_address=localhost:{_PATHWAYS_RESOURCE_MANAGER_PORT}",
292292
f"--server_port={_PATHWAYS_PROXY_PORT}",
293293
f"--gcs_scratch_location={staging_location}",
294+
# TODO: Make the value of --num_elastic_slices configurable instead of always using cfg.accelerator.num_replicas.
295+
f"--num_elastic_slices={cfg.accelerator.num_replicas}"
294296
]
295297
cmd_args.extend(xla_flags_from_options(self._xla_options).split())
296298

@@ -547,14 +549,19 @@ def _build_pathways_worker_job(
547549
annotations.update(
548550
{"alpha.jobset.sigs.k8s.io/exclusive-topology": "cloud.google.com/gke-nodepool"}
549551
)
552+
# Default backoffLimit value for suspend and resume.
553+
# References:
554+
# https://github.com/google/pathways-job/blob/4417de7aa23d3c2316e400a3a327512834374475/internal/controller/pathwaysjob_controller.go#L651
555+
# backoffLimit = system.vms_per_slice * 4
556+
557+
# NOTE: This backoffLimit is the default scaled up by `large_number`; it is just for verifying elastic fast-resume.
558+
large_number = 1000
559+
backoffLimit = system.vms_per_slice * 4 * large_number
550560

551561
spec = dict(
552562
parallelism=system.vms_per_slice,
553563
completions=system.vms_per_slice,
554-
# Default value for suspend and resume.
555-
# References:
556-
# https://github.com/google/pathways-job/blob/4417de7aa23d3c2316e400a3a327512834374475/internal/controller/pathwaysjob_controller.go#L651
557-
backoffLimit=system.vms_per_slice * 4,
564+
backoffLimit=backoffLimit,
558565
template=self._build_pathways_worker_pod(pathways_worker_replicated_job_index),
559566
)
560567
worker_job = dict(

0 commit comments

Comments
 (0)