@@ -322,17 +322,73 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
322322 raise MFCException (f"Test { case } : { msg } " )
323323
324324 if ARG ("test_all" ):
325- case . delete_output ()
325+ # Don't delete output here - we need restart_data from the simulation above
326326 # Check timeout before launching the (potentially long) post-process run
327327 if timeout_flag .is_set ():
328328 raise TestTimeoutError ("Test case exceeded 1 hour timeout" )
329- cmd = case .run ([PRE_PROCESS , SIMULATION , POST_PROCESS ], gpus = devices )
329+ # Run only POST_PROCESS since PRE_PROCESS and SIMULATION already ran successfully above
330+ cmd = case .run ([POST_PROCESS ], gpus = devices )
330331 out_filepath = os .path .join (case .get_dirpath (), "out_post.txt" )
331332 common .file_write (out_filepath , cmd .stdout )
332333
333334 # Check return code from post-process run
334335 if cmd .returncode != 0 :
335336 cons .print (cmd .stdout )
337+
338+ # Extra debug for multi-rank restart/post-process issues
339+ if getattr (case , "ppn" , 1 ) >= 2 :
340+ case_dir = case .get_dirpath ()
341+ restart_dir = os .path .join (case_dir , "restart_data" )
342+
343+ cons .print ("[bold yellow]Multi-rank debug (ppn >= 2): inspecting restart_data and post_process.inp[/bold yellow]" )
344+ cons .print (f"[bold yellow] Case directory:[/bold yellow] { case_dir } " )
345+ cons .print (f"[bold yellow] restart_data directory:[/bold yellow] { restart_dir } " )
346+
347+ # List restart_data contents
348+ if os .path .isdir (restart_dir ):
349+ try :
350+ entries = sorted (os .listdir (restart_dir ))
351+ except OSError as exc :
352+ cons .print (f"[bold yellow] Could not list restart_data contents: { exc } [/bold yellow]" )
353+ else :
354+ cons .print (f"[bold yellow] restart_data entries ({ len (entries )} total, showing up to 20):[/bold yellow]" )
355+ for name in entries [:20 ]:
356+ cons .print (f" - { name } " )
357+ else :
358+ cons .print ("[bold yellow] restart_data directory does not exist[/bold yellow]" )
359+
360+ # Dump key case parameters relevant to restart/post-process
361+ params = getattr (case , "params" , {})
362+ def _param (name : str ):
363+ return params .get (name , "<unset>" )
364+
365+ cons .print ("[bold yellow] Selected case parameters relevant to restart:[/bold yellow]" )
366+ for key in (
367+ "t_step_start" ,
368+ "t_step_stop" ,
369+ "t_step_save" ,
370+ "n_start" ,
371+ "t_save" ,
372+ "parallel_io" ,
373+ "file_per_process" ,
374+ ):
375+ cons .print (f" { key } = { _param (key )} " )
376+
377+ # Show the beginning of post_process.inp if present
378+ ppi_path = os .path .join (case_dir , "post_process.inp" )
379+ if os .path .exists (ppi_path ):
380+ cons .print (f"[bold yellow] First lines of post_process.inp ({ ppi_path } ):[/bold yellow]" )
381+ try :
382+ with open (ppi_path , "r" , encoding = "utf-8" , errors = "replace" ) as f :
383+ for i , line in enumerate (f ):
384+ if i >= 40 :
385+ break
386+ cons .print (" " + line .rstrip ())
387+ except OSError as exc :
388+ cons .print (f"[bold yellow] Could not read post_process.inp: { exc } [/bold yellow]" )
389+ else :
390+ cons .print ("[bold yellow] post_process.inp not found in case directory[/bold yellow]" )
391+
336392 raise MFCException (
337393 f"Test { case } : Failed to execute MFC (post-process). "
338394 f"See log at: { out_filepath } "
0 commit comments