1- import os , typing , shutil , time , itertools , threading
1+ import os , typing , shutil , time , itertools , threading , subprocess
22from random import sample , seed
33
44import rich , rich .table
3636# from worker threads which could leave the scheduler in an undefined state.
3737abort_tests = threading .Event ()
3838
39- class TestTimeoutError (MFCException ):
40- pass
41-
4239# pylint: disable=too-many-branches, trailing-whitespace
4340def __filter (cases_ ) -> typing .List [TestCase ]:
4441 cases = cases_ [:]
@@ -309,32 +306,16 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
309306 global current_test_number
310307 start_time = time .time ()
311308
312- # Set timeout using threading.Timer (works in worker threads)
313- # Note: we intentionally do not use signal.alarm() here because signals
314- # only work in the main thread; sched.sched runs tests in worker threads.
315- # threading.Timer works correctly in this threaded context.
316- timeout_flag = threading .Event ()
317- timeout_timer = threading .Timer (TEST_TIMEOUT_SECONDS , timeout_flag .set )
318- timeout_timer .start ()
319-
320309 tol = case .compute_tolerance ()
321310 case .delete_output ()
322311 case .create_directory ()
323312
324313 if ARG ("dry_run" ):
325314 cons .print (f" [bold magenta]{ case .get_uuid ()} [/bold magenta] SKIP { case .trace } " )
326- timeout_timer .cancel ()
327315 return
328316
329317 try :
330- # Check timeout before starting
331- if timeout_flag .is_set ():
332- raise TestTimeoutError ("Test case exceeded 1 hour timeout" )
333- cmd = case .run ([PRE_PROCESS , SIMULATION ], gpus = devices )
334-
335- # Check timeout after simulation
336- if timeout_flag .is_set ():
337- raise TestTimeoutError ("Test case exceeded 1 hour timeout" )
318+ cmd = case .run ([PRE_PROCESS , SIMULATION ], gpus = devices , timeout = TEST_TIMEOUT_SECONDS )
338319
339320 out_filepath = os .path .join (case .get_dirpath (), "out_pre_sim.txt" )
340321
@@ -380,13 +361,9 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
380361 # Clean up output from golden comparison run
381362 case .delete_output ()
382363
383- # Check timeout before full pipeline
384- if timeout_flag .is_set ():
385- raise TestTimeoutError ("Test case exceeded 1 hour timeout" )
386-
387364 # Run full pipeline: this ensures simulation and post_process use
388365 # consistent configuration (parallel_io, file_per_process, etc.)
389- cmd = case .run ([PRE_PROCESS , SIMULATION , POST_PROCESS ], gpus = devices )
366+ cmd = case .run ([PRE_PROCESS , SIMULATION , POST_PROCESS ], gpus = devices , timeout = TEST_TIMEOUT_SECONDS )
390367
391368 out_post_filepath = os .path .join (case .get_dirpath (), "out_post.txt" )
392369 common .file_write (out_post_filepath , cmd .stdout )
@@ -419,7 +396,8 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
419396 progress_str = f"({ current_test_number :3d} /{ total_test_count :3d} )"
420397 cons .print (f" { progress_str } [bold magenta]{ case .get_uuid ()} [/bold magenta] { duration :6.2f} { case .trace } " )
421398
422- except TestTimeoutError as exc :
399+ except subprocess .TimeoutExpired as exc :
400+ # Subprocess timeout - the process was actually killed
423401 log_path = os .path .join (case .get_dirpath (), 'out_pre_sim.txt' )
424402 if os .path .exists (log_path ):
425403 log_msg = f"Check the log at: { log_path } "
@@ -428,12 +406,10 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
428406 f"Log file ({ log_path } ) may not exist if the timeout occurred early."
429407 )
430408 raise MFCException (
431- f"Test { case } exceeded 1 hour timeout. "
409+ f"Test { case } exceeded 1 hour timeout (process was killed) . "
432410 f"This may indicate a hung simulation or misconfigured case. "
433411 f"{ log_msg } "
434412 ) from exc
435- finally :
436- timeout_timer .cancel () # Cancel timeout timer
437413
438414
439415def handle_case (case : TestCase , devices : typing .Set [int ]):
0 commit comments