Skip to content

Commit 1e1fd3e

Browse files
committed
Add a pipe-break-on-missing-keepalive flag that makes the cluster fail quickly
1 parent 2e70e3f commit 1e1fd3e

File tree

1 file changed

+3
-0
lines changed

1 file changed

+3
-0
lines changed

axlearn/cloud/gcp/pathways_utils.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -322,6 +322,7 @@ def _build_pathways_head_sidecar_containers(self) -> list[Nested[Any]]:
322322
f"--gcs_scratch_location={staging_location}",
323323
# This should be made configurable
324324
f"--num_elastic_slices={cfg.accelerator.num_replicas}",
325+
"--temporary_flags_for_debugging=temporary_flag_for_debugging_pipe_break_on_missing_keepalive=true",
325326
]
326327
cmd_args.extend(xla_flags_from_options(self._xla_options).split())
327328

@@ -359,6 +360,7 @@ def _build_pathways_head_sidecar_containers(self) -> list[Nested[Any]]:
359360
f"--instance_type={pathways_tpu_version}:{system.topology}",
360361
f"--gcs_scratch_location={staging_location}",
361362
"--alsologtostderr",
363+
"--temporary_flags_for_debugging=temporary_flag_for_debugging_pipe_break_on_missing_keepalive=true",
362364
],
363365
),
364366
]
@@ -513,6 +515,7 @@ def _build_pathways_worker_container(
513515
f"--resource_manager_address={pathways_head_address}:"
514516
+ f"{_PATHWAYS_RESOURCE_MANAGER_PORT}",
515517
f"--gcs_scratch_location={cfg.output_dir}/pathways-staging",
518+
"--temporary_flags_for_debugging=temporary_flag_for_debugging_pipe_break_on_missing_keepalive=true",
516519
]
517520
mega_scale_args = xla_flags_from_options(self._mxla_options).split()
518521
worker_container["args"].extend(mega_scale_args)

0 commit comments

Comments
 (0)