Skip to content

Commit 3ae1f24

Browse files
Increase backoff limit to handle preemption (#1313)
1 parent ffbea72 commit 3ae1f24

File tree

1 file changed

+6
-2
lines changed

1 file changed

+6
-2
lines changed

axlearn/cloud/gcp/pathways_utils.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,10 @@
6464
# Add node-selector for cpu workload to avoid sharing nodes with system services.
6565
_PATHWAYS_HEAD_NODE_POOL_SELECTOR_KEY = "axlearn/nodepool_type"
6666
_PATHWAYS_HEAD_NODE_POOL_SELECTOR_VALUE = "workload"
67+
# The back off limit of pathways pods.
68+
# Note that the head pod will back off exactly this many times.
69+
# The workers share a combined budget of #workers * _PATHWAYS_BACK_OFF_LIMIT retries.
70+
_PATHWAYS_BACK_OFF_LIMIT = 32
6771

6872

6973
def parse_xla_flag_value(value: str) -> Union[int, bool, str]:
@@ -449,7 +453,7 @@ def _build_pathways_head_job(self):
449453
spec = dict(
450454
parallelism=1,
451455
completions=1,
452-
backoffLimit=0,
456+
backoffLimit=_PATHWAYS_BACK_OFF_LIMIT,
453457
template=self._build_pathways_head_pod(),
454458
)
455459
head_job = dict(
@@ -605,7 +609,7 @@ def _build_pathways_worker_job(
605609
# Default value for suspend and resume.
606610
# References:
607611
# https://github.com/google/pathways-job/blob/4417de7aa23d3c2316e400a3a327512834374475/internal/controller/pathwaysjob_controller.go#L651
608-
backoffLimit=system.vms_per_slice * 4,
612+
backoffLimit=system.vms_per_slice * _PATHWAYS_BACK_OFF_LIMIT,
609613
template=self._build_pathways_worker_pod(pathways_worker_replicated_job_index),
610614
)
611615
worker_job = dict(

0 commit comments

Comments
 (0)