Skip to content

Commit 56c3528

Browse files
felipemello1 and Felipe Mello authored
wandb hang fix - add timeout (meta-pytorch#460)
Co-authored-by: Felipe Mello <[email protected]>
1 parent 8711f02 commit 56c3528

File tree

1 file changed

+10
-3
lines changed

1 file changed

+10
-3
lines changed

src/forge/observability/metric_actors.py

Lines changed: 10 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -372,10 +372,17 @@ def get_fetcher_count(self) -> int:
372372

373373
@endpoint
374374
async def shutdown(self) -> None:
375-
# Finish per-rank logger_backends via fetchers
376375
if self.fetchers:
377-
tasks = [fetcher.shutdown.call() for fetcher in self.fetchers.values()]
378-
await asyncio.gather(*tasks, return_exceptions=True)
376+
try:
377+
tasks = [fetcher.shutdown.call() for fetcher in self.fetchers.values()]
378+
await asyncio.wait_for(
379+
asyncio.gather(*tasks, return_exceptions=True), timeout=2.0
380+
)
381+
except asyncio.TimeoutError:
382+
logger.warning(
383+
"Metric logging fetcher shutdown timed out likely due to the child process being terminated before the parent."
384+
)
385+
379386
# Finish global logger_backends
380387
for logger_backend_name, logger_backend in self.global_logger_backends.items():
381388
await logger_backend.finish()

0 commit comments

Comments
 (0)