Skip to content

Commit b52dfd8

Browse files
committed
Add the pipe-break-on-missing-keepalive debugging flag so the cluster fails quickly
1 parent ada6783 commit b52dfd8

File tree

1 file changed

+3
-0
lines changed

1 file changed

+3
-0
lines changed

axlearn/cloud/gcp/pathways_utils.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,7 @@ def _build_pathways_head_sidecar_containers(self) -> list[Nested[Any]]:
294294
f"--resource_manager_address=localhost:{_PATHWAYS_RESOURCE_MANAGER_PORT}",
295295
f"--server_port={_PATHWAYS_PROXY_PORT}",
296296
f"--gcs_scratch_location={staging_location}",
297+
"--temporary_flags_for_debugging=temporary_flag_for_debugging_pipe_break_on_missing_keepalive=true",
297298
]
298299
cmd_args.extend(xla_flags_from_options(self._xla_options).split())
299300

@@ -331,6 +332,7 @@ def _build_pathways_head_sidecar_containers(self) -> list[Nested[Any]]:
331332
f"--instance_type={pathways_tpu_version}:{system.topology}",
332333
f"--gcs_scratch_location={staging_location}",
333334
"--alsologtostderr",
335+
"--temporary_flags_for_debugging=temporary_flag_for_debugging_pipe_break_on_missing_keepalive=true",
334336
],
335337
),
336338
]
@@ -485,6 +487,7 @@ def _build_pathways_worker_container(
485487
f"--resource_manager_address={pathways_head_address}:"
486488
+ f"{_PATHWAYS_RESOURCE_MANAGER_PORT}",
487489
f"--gcs_scratch_location={cfg.output_dir}/pathways-staging",
490+
"--temporary_flags_for_debugging=temporary_flag_for_debugging_pipe_break_on_missing_keepalive=true",
488491
]
489492
mega_scale_args = xla_flags_from_options(self._mxla_options).split()
490493
worker_container["args"].extend(mega_scale_args)

0 commit comments

Comments
 (0)