File tree Expand file tree Collapse file tree 1 file changed +6
-2
lines changed Expand file tree Collapse file tree 1 file changed +6
-2
lines changed Original file line number Diff line number Diff line change 64
64
# Add node-selector for cpu workload to avoid sharing nodes with system services.
65
65
_PATHWAYS_HEAD_NODE_POOL_SELECTOR_KEY = "axlearn/nodepool_type"
66
66
_PATHWAYS_HEAD_NODE_POOL_SELECTOR_VALUE = "workload"
67
+ # The backoff limit for Pathways pods.
68
+ # Note that the head pod will back off exactly this many times.
69
+ # Worker pods share a combined budget of #workers * _PATHWAYS_BACK_OFF_LIMIT retries.
70
+ _PATHWAYS_BACK_OFF_LIMIT = 32
67
71
68
72
69
73
def parse_xla_flag_value (value : str ) -> Union [int , bool , str ]:
@@ -449,7 +453,7 @@ def _build_pathways_head_job(self):
449
453
spec = dict (
450
454
parallelism = 1 ,
451
455
completions = 1 ,
452
- backoffLimit = 0 ,
456
+ backoffLimit = _PATHWAYS_BACK_OFF_LIMIT ,
453
457
template = self ._build_pathways_head_pod (),
454
458
)
455
459
head_job = dict (
@@ -605,7 +609,7 @@ def _build_pathways_worker_job(
605
609
# Default value for suspend and resume.
606
610
# References:
607
611
# https://github.com/google/pathways-job/blob/4417de7aa23d3c2316e400a3a327512834374475/internal/controller/pathwaysjob_controller.go#L651
608
- backoffLimit = system .vms_per_slice * 4 ,
612
+ backoffLimit = system .vms_per_slice * _PATHWAYS_BACK_OFF_LIMIT ,
609
613
template = self ._build_pathways_worker_pod (pathways_worker_replicated_job_index ),
610
614
)
611
615
worker_job = dict (
You can’t perform that action at this time.
0 commit comments