Skip to content

Commit 6452a6b

Browse files
committed
Add the pipe-break-on-missing-keepalive debugging flag so that the cluster fails quickly
1 parent a712220 commit 6452a6b

File tree

1 file changed

+3
-0
lines changed

1 file changed

+3
-0
lines changed

axlearn/cloud/gcp/pathways_utils.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -324,6 +324,7 @@ def _build_pathways_head_sidecar_containers(self) -> list[Nested[Any]]:
324324
f"--gcs_scratch_location={staging_location}",
325325
# This should be made configurable
326326
f"--num_elastic_slices={cfg.accelerator.num_replicas}",
327+
"--temporary_flags_for_debugging=temporary_flag_for_debugging_pipe_break_on_missing_keepalive=true",
327328
"--vmodule=grpc_host_buffer=3,rpc_helper=3,host_buffer=3,ifrt_backend=3,grpc_service_impl=3",
328329
]
329330
cmd_args.extend(xla_flags_from_options(self._xla_options).split())
@@ -361,6 +362,7 @@ def _build_pathways_head_sidecar_containers(self) -> list[Nested[Any]]:
361362
f"--instance_count={pathways_instance_count}",
362363
f"--instance_type={pathways_tpu_version}:{system.topology}",
363364
f"--gcs_scratch_location={staging_location}",
365+
"--temporary_flags_for_debugging=temporary_flag_for_debugging_pipe_break_on_missing_keepalive=true",
364366
],
365367
),
366368
]
@@ -515,6 +517,7 @@ def _build_pathways_worker_container(
515517
f"--resource_manager_address={pathways_head_address}:"
516518
+ f"{_PATHWAYS_RESOURCE_MANAGER_PORT}",
517519
f"--gcs_scratch_location={cfg.output_dir}/pathways-staging",
520+
"--temporary_flags_for_debugging=temporary_flag_for_debugging_pipe_break_on_missing_keepalive=true",
518521
]
519522
mega_scale_args = xla_flags_from_options(self._mxla_options).split()
520523
worker_container["args"].extend(mega_scale_args)

0 commit comments

Comments
 (0)