Skip to content

Commit 96b5ddd

Browse files
committed
Add the pipe-break-on-missing-keepalive flag so that the cluster fails quickly
1 parent 7a1fc85 commit 96b5ddd

File tree

1 file changed

+3
-0
lines changed

1 file changed

+3
-0
lines changed

axlearn/cloud/gcp/pathways_utils.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -322,6 +322,7 @@ def _build_pathways_head_sidecar_containers(self) -> list[Nested[Any]]:
322322
f"--gcs_scratch_location={staging_location}",
323323
# This should be made configurable
324324
f"--num_elastic_slices={cfg.accelerator.num_replicas}",
325+
"--temporary_flags_for_debugging=temporary_flag_for_debugging_pipe_break_on_missing_keepalive=true",
325326
]
326327
cmd_args.extend(xla_flags_from_options(self._xla_options).split())
327328

@@ -358,6 +359,7 @@ def _build_pathways_head_sidecar_containers(self) -> list[Nested[Any]]:
358359
f"--instance_count={pathways_instance_count}",
359360
f"--instance_type={pathways_tpu_version}:{system.topology}",
360361
f"--gcs_scratch_location={staging_location}",
362+
"--temporary_flags_for_debugging=temporary_flag_for_debugging_pipe_break_on_missing_keepalive=true",
361363
],
362364
),
363365
]
@@ -512,6 +514,7 @@ def _build_pathways_worker_container(
512514
f"--resource_manager_address={pathways_head_address}:"
513515
+ f"{_PATHWAYS_RESOURCE_MANAGER_PORT}",
514516
f"--gcs_scratch_location={cfg.output_dir}/pathways-staging",
517+
"--temporary_flags_for_debugging=temporary_flag_for_debugging_pipe_break_on_missing_keepalive=true",
515518
]
516519
mega_scale_args = xla_flags_from_options(self._mxla_options).split()
517520
worker_container["args"].extend(mega_scale_args)

0 commit comments

Comments
 (0)