@@ -327,10 +327,19 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
327327 return
328328
329329 try :
330+ # Decide which targets to run in a single pipeline.
331+ # - Default: PRE_PROCESS + SIMULATION (used for golden comparison)
332+ # - With --test-all (-a): also include POST_PROCESS so that the configuration
333+ # used by simulation and post_process is consistent (e.g. parallel_io,
334+ # file_per_process, *_wrt flags). This ensures simulation writes the
335+ # Lustre-style restart/grid files (e.g. restart_data/lustre_x_cb.dat)
336+ # that post_process expects.
337+ targets = [PRE_PROCESS , SIMULATION , POST_PROCESS ] if ARG ("test_all" ) else [PRE_PROCESS , SIMULATION ]
338+
330339 # Check timeout before starting
331340 if timeout_flag .is_set ():
332341 raise TestTimeoutError ("Test case exceeded 1 hour timeout" )
333- cmd = case .run ([ PRE_PROCESS , SIMULATION ] , gpus = devices )
342+ cmd = case .run (targets , gpus = devices )
334343
335344 # Check timeout after the pipeline run (simulation, plus post_process with --test-all)
336345 if timeout_flag .is_set ():
@@ -342,6 +351,9 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
342351
343352 if cmd .returncode != 0 :
344353 cons .print (cmd .stdout )
354+ # If test_all is enabled and the failed case is multi-rank (ppn >= 2), provide extra debug info
355+ if ARG ("test_all" ) and getattr (case , "ppn" , 1 ) >= 2 :
356+ _print_multirank_debug_info (case )
345357 raise MFCException (f"Test { case } : Failed to execute MFC." )
346358
347359 pack , err = packer .pack (case .get_dirpath ())
@@ -377,16 +389,20 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
377389 raise MFCException (f"Test { case } : { msg } " )
378390
379391 if ARG ("test_all" ):
380- # Don't delete output here - we need restart_data from the simulation above
381- # Check timeout before launching the (potentially long) post-process run
382- if timeout_flag .is_set ():
383- raise TestTimeoutError ("Test case exceeded 1 hour timeout" )
384- # Run only POST_PROCESS since PRE_PROCESS and SIMULATION already ran successfully above
385- cmd = case .run ([POST_PROCESS ], gpus = devices )
386- out_filepath = os .path .join (case .get_dirpath (), "out_post.txt" )
387- common .file_write (out_filepath , cmd .stdout )
388-
389- # Check return code from post-process run
392+ # We already ran PRE_PROCESS, SIMULATION, and POST_PROCESS together
393+ # in the single pipeline above. At this point:
394+ # - A non-zero cmd.returncode (post_process or an earlier stage failed)
395+ # already raises right after the run above, so the returncode check below looks unreachable. NOTE(review): confirm cmd is not reassigned in between.
396+ # - If cmd.returncode == 0, post_process completed and should
397+ # have written its outputs (e.g., silo_hdf5) based on a
398+ # configuration that had "post_process" in ARGS["mfc"]["targets"],
399+ # so parallel_io and restart_data layout are consistent.
400+
401+ out_post_filepath = os .path .join (case .get_dirpath (), "out_post.txt" )
402+ # Write the full pipeline output to an explicit post-process log too,
403+ # even though it includes pre/sim messages. This is helpful for CI.
404+ common .file_write (out_post_filepath , cmd .stdout )
405+
390406 if cmd .returncode != 0 :
391407 cons .print (cmd .stdout )
392408
@@ -396,14 +412,15 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
396412
397413 raise MFCException (
398414 f"Test { case } : Failed to execute MFC (post-process). "
399- f"See log at: { out_filepath } "
415+ f"See log at: { out_post_filepath } "
400416 )
401417
418+ # After a successful post_process run, inspect Silo/HDF5 outputs
402419 silo_dir = os .path .join (case .get_dirpath (), 'silo_hdf5' , 'p0' )
403420 if os .path .isdir (silo_dir ):
404421 for silo_filename in os .listdir (silo_dir ):
405422 silo_filepath = os .path .join (silo_dir , silo_filename )
406- _process_silo_file (silo_filepath , case , out_filepath )
423+ _process_silo_file (silo_filepath , case , out_post_filepath )
407424
408425 case .delete_output ()
409426
0 commit comments