From da8db1828eef9099c67f9a018c04a04a427bdd25 Mon Sep 17 00:00:00 2001 From: Tony Kao Date: Thu, 21 Aug 2025 09:56:59 -0700 Subject: [PATCH] torchx - fix race condition in which the local_scheduler LogIterator reads the log too early Summary: torchx/cli/test:cmd_run_test - test_run_with_log (https://www.internalfb.com/intern/test/281475186013299?ref_report_id=0) regularly fails because its assertion on the local_scheduler output finds the expected content missing. This causes noise for the oncall, because the failing release test blocks the torchx release. https://fburl.com/conveyor/a5u31rby The issue appears to be that the LogIterator aborts early if the content has not been written yet: https://www.internalfb.com/code/fbsource/[922fd5827417][history]/fbcode/torchx/schedulers/local_scheduler.py?lines=1185-1189 The proposed fix is to add a small delay before _log_fp is set up. Differential Revision: D80716088 --- torchx/schedulers/local_scheduler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/torchx/schedulers/local_scheduler.py b/torchx/schedulers/local_scheduler.py index c7cf7cc76..c039ebf54 100644 --- a/torchx/schedulers/local_scheduler.py +++ b/torchx/schedulers/local_scheduler.py @@ -1159,6 +1159,7 @@ def __iter__(self) -> "LogIterator": self._check_finished() # check to see if app has finished running if os.path.isfile(self._log_file): + time.sleep(0.1) # fix timing issue self._log_fp = open( self._log_file, mode="rt",