@@ -291,6 +291,8 @@ def _build_pathways_head_sidecar_containers(self) -> list[Nested[Any]]:
291
291
f"--resource_manager_address=localhost:{ _PATHWAYS_RESOURCE_MANAGER_PORT } " ,
292
292
f"--server_port={ _PATHWAYS_PROXY_PORT } " ,
293
293
f"--gcs_scratch_location={ staging_location } " ,
294
+ # TODO: Make the number of elastic slices configurable; it is currently tied to cfg.accelerator.num_replicas.
295
+ f"--num_elastic_slices={ cfg .accelerator .num_replicas } "
294
296
]
295
297
cmd_args .extend (xla_flags_from_options (self ._xla_options ).split ())
296
298
@@ -547,14 +549,19 @@ def _build_pathways_worker_job(
547
549
annotations .update (
548
550
{"alpha.jobset.sigs.k8s.io/exclusive-topology" : "cloud.google.com/gke-nodepool" }
549
551
)
552
+ # Default value for suspend and resume.
553
+ # References:
554
+ # https://github.com/google/pathways-job/blob/4417de7aa23d3c2316e400a3a327512834374475/internal/controller/pathwaysjob_controller.go#L651
555
+ # backoffLimit = system.vms_per_slice * 4
556
+
557
+ # NOTE: This backoffLimit is inflated to 1000x the default (system.vms_per_slice * 4) solely to verify elastic fast-resume.
558
+ large_number = 1000
559
+ backoffLimit = system .vms_per_slice * 4 * large_number
550
560
551
561
spec = dict (
552
562
parallelism = system .vms_per_slice ,
553
563
completions = system .vms_per_slice ,
554
- # Default value for suspend and resume.
555
- # References:
556
- # https://github.com/google/pathways-job/blob/4417de7aa23d3c2316e400a3a327512834374475/internal/controller/pathwaysjob_controller.go#L651
557
- backoffLimit = system .vms_per_slice * 4 ,
564
+ backoffLimit = backoffLimit ,
558
565
template = self ._build_pathways_worker_pod (pathways_worker_replicated_job_index ),
559
566
)
560
567
worker_job = dict (
0 commit comments