@@ -249,6 +249,61 @@ def _process_silo_file(silo_filepath: str, case: TestCase, out_filepath: str):
249249 )
250250
251251
def _print_multirank_debug_info(case: TestCase, *, max_entries: int = 20, max_lines: int = 40):
    """Print debug information for multi-rank post-process failures.

    Emits, through ``cons.print``:

    * the case directory and its ``restart_data`` sub-directory path,
    * a sorted, capped listing of ``restart_data``,
    * a fixed set of restart-related entries from ``case.params``,
    * the first lines of ``post_process.inp`` in the case directory.

    This helper is diagnostic only. Filesystem errors (``OSError``) are
    reported inline instead of being propagated, and a missing or ``None``
    ``params`` attribute is treated as an empty mapping, so that a problem
    while gathering debug output does not replace the error the caller is
    about to report.

    Args:
        case: Test case to inspect. Must provide ``get_dirpath()``; its
            ``params`` attribute is read if present.
        max_entries: Maximum number of ``restart_data`` entries to list
            (non-negative).
        max_lines: Maximum number of ``post_process.inp`` lines to echo
            (non-negative).
    """
    case_dir = case.get_dirpath()
    restart_dir = os.path.join(case_dir, "restart_data")

    cons.print("[bold yellow]Multi-rank debug (ppn >= 2): inspecting restart_data and post_process.inp[/bold yellow]")
    cons.print(f"[bold yellow] Case directory:[/bold yellow] {case_dir}")
    cons.print(f"[bold yellow] restart_data directory:[/bold yellow] {restart_dir}")

    # List restart_data contents. Attempt the listing directly instead of
    # checking isdir() first, so a directory that disappears in between is
    # still reported rather than raising.
    try:
        entries = sorted(os.listdir(restart_dir))
    except (FileNotFoundError, NotADirectoryError):
        cons.print("[bold yellow] restart_data directory does not exist[/bold yellow]")
    except OSError as exc:
        cons.print(f"[bold yellow] Could not list restart_data contents: {exc}[/bold yellow]")
    else:
        cons.print(f"[bold yellow] restart_data entries ({len(entries)} total, showing up to {max_entries}):[/bold yellow]")
        for name in entries[:max_entries]:
            cons.print(f" - {name}")

    # Dump key case parameters relevant to restart/post-process.
    # `or {}` also covers a `params` attribute that exists but is None, which
    # a plain getattr default would not.
    params = getattr(case, "params", None) or {}
    restart_keys = (
        "t_step_start",
        "t_step_stop",
        "t_step_save",
        "n_start",
        "t_save",
        "parallel_io",
        "file_per_process",
    )

    cons.print("[bold yellow] Selected case parameters relevant to restart:[/bold yellow]")
    for key in restart_keys:
        value = params.get(key, "<unset>")
        cons.print(f" {key} = {value}")

    # Show the beginning of post_process.inp if present.
    # NOTE(review): file content is handed to cons.print unescaped; if cons
    # interprets "[...]" as markup, bracketed text in the file could be
    # restyled or rejected -- confirm against the cons implementation.
    ppi_path = os.path.join(case_dir, "post_process.inp")
    try:
        # errors="replace" keeps undecodable bytes from aborting the dump.
        with open(ppi_path, "r", encoding="utf-8", errors="replace") as ppi_file:
            cons.print(f"[bold yellow] First lines of post_process.inp ({ppi_path}):[/bold yellow]")
            for line_no, line in enumerate(ppi_file):
                if line_no >= max_lines:
                    break
                cons.print(" " + line.rstrip())
    except FileNotFoundError:
        cons.print("[bold yellow] post_process.inp not found in case directory[/bold yellow]")
    except OSError as exc:
        cons.print(f"[bold yellow] Could not read post_process.inp: {exc}[/bold yellow]")
306+
252307def _handle_case (case : TestCase , devices : typing .Set [int ]):
253308 # pylint: disable=global-statement, global-variable-not-assigned
254309 global current_test_number
@@ -337,57 +392,7 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
337392
338393 # Extra debug for multi-rank restart/post-process issues
339394 if getattr (case , "ppn" , 1 ) >= 2 :
340- case_dir = case .get_dirpath ()
341- restart_dir = os .path .join (case_dir , "restart_data" )
342-
343- cons .print ("[bold yellow]Multi-rank debug (ppn >= 2): inspecting restart_data and post_process.inp[/bold yellow]" )
344- cons .print (f"[bold yellow] Case directory:[/bold yellow] { case_dir } " )
345- cons .print (f"[bold yellow] restart_data directory:[/bold yellow] { restart_dir } " )
346-
347- # List restart_data contents
348- if os .path .isdir (restart_dir ):
349- try :
350- entries = sorted (os .listdir (restart_dir ))
351- except OSError as exc :
352- cons .print (f"[bold yellow] Could not list restart_data contents: { exc } [/bold yellow]" )
353- else :
354- cons .print (f"[bold yellow] restart_data entries ({ len (entries )} total, showing up to 20):[/bold yellow]" )
355- for name in entries [:20 ]:
356- cons .print (f" - { name } " )
357- else :
358- cons .print ("[bold yellow] restart_data directory does not exist[/bold yellow]" )
359-
360- # Dump key case parameters relevant to restart/post-process
361- params = getattr (case , "params" , {})
362- def _param (name : str ):
363- return params .get (name , "<unset>" )
364-
365- cons .print ("[bold yellow] Selected case parameters relevant to restart:[/bold yellow]" )
366- for key in (
367- "t_step_start" ,
368- "t_step_stop" ,
369- "t_step_save" ,
370- "n_start" ,
371- "t_save" ,
372- "parallel_io" ,
373- "file_per_process" ,
374- ):
375- cons .print (f" { key } = { _param (key )} " )
376-
377- # Show the beginning of post_process.inp if present
378- ppi_path = os .path .join (case_dir , "post_process.inp" )
379- if os .path .exists (ppi_path ):
380- cons .print (f"[bold yellow] First lines of post_process.inp ({ ppi_path } ):[/bold yellow]" )
381- try :
382- with open (ppi_path , "r" , encoding = "utf-8" , errors = "replace" ) as f :
383- for i , line in enumerate (f ):
384- if i >= 40 :
385- break
386- cons .print (" " + line .rstrip ())
387- except OSError as exc :
388- cons .print (f"[bold yellow] Could not read post_process.inp: { exc } [/bold yellow]" )
389- else :
390- cons .print ("[bold yellow] post_process.inp not found in case directory[/bold yellow]" )
395+ _print_multirank_debug_info (case )
391396
392397 raise MFCException (
393398 f"Test { case } : Failed to execute MFC (post-process). "
0 commit comments