Skip to content

Commit 6d830f3

Browse files
committed
fix brokenness
1 parent 0589b85 commit 6d830f3

File tree

1 file changed

+30
-13
lines changed

1 file changed

+30
-13
lines changed

toolchain/mfc/test/test.py

Lines changed: 30 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -327,10 +327,19 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
327327
return
328328

329329
try:
330+
# Decide which targets to run in a single pipeline.
331+
# - Default: PRE_PROCESS + SIMULATION (used for golden comparison)
332+
# - With --test-all (-a): also include POST_PROCESS so that the configuration
333+
# used by simulation and post_process is consistent (e.g. parallel_io,
334+
# file_per_process, *_wrt flags). This ensures simulation writes the
335+
# Lustre-style restart/grid files (e.g. restart_data/lustre_x_cb.dat)
336+
# that post_process expects.
337+
targets = [PRE_PROCESS, SIMULATION, POST_PROCESS] if ARG("test_all") else [PRE_PROCESS, SIMULATION]
338+
330339
# Check timeout before starting
331340
if timeout_flag.is_set():
332341
raise TestTimeoutError("Test case exceeded 1 hour timeout")
333-
cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices)
342+
cmd = case.run(targets, gpus=devices)
334343

335344
# Check timeout after the pipeline run (simulation, plus post_process with --test-all)
336345
if timeout_flag.is_set():
@@ -342,6 +351,9 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
342351

343352
if cmd.returncode != 0:
344353
cons.print(cmd.stdout)
354+
# If test_all is enabled and the pipeline failed for a multi-rank case (ppn >= 2), provide extra debug info
355+
if ARG("test_all") and getattr(case, "ppn", 1) >= 2:
356+
_print_multirank_debug_info(case)
345357
raise MFCException(f"Test {case}: Failed to execute MFC.")
346358

347359
pack, err = packer.pack(case.get_dirpath())
@@ -377,16 +389,20 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
377389
raise MFCException(f"Test {case}: {msg}")
378390

379391
if ARG("test_all"):
380-
# Don't delete output here - we need restart_data from the simulation above
381-
# Check timeout before launching the (potentially long) post-process run
382-
if timeout_flag.is_set():
383-
raise TestTimeoutError("Test case exceeded 1 hour timeout")
384-
# Run only POST_PROCESS since PRE_PROCESS and SIMULATION already ran successfully above
385-
cmd = case.run([POST_PROCESS], gpus=devices)
386-
out_filepath = os.path.join(case.get_dirpath(), "out_post.txt")
387-
common.file_write(out_filepath, cmd.stdout)
388-
389-
# Check return code from post-process run
392+
# We already ran PRE_PROCESS, SIMULATION, and POST_PROCESS together
393+
# in the single pipeline above. At this point:
394+
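# - If cmd.returncode != 0, post_process (or an earlier stage)
395+
# failed and we want to surface that with verbose diagnostics. NOTE(review): the returncode check right after case.run() above already raises on failure, so this case looks unreachable here unless cmd is reassigned in between; confirm.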
396+
# - If cmd.returncode == 0, post_process completed and should
397+
# have written its outputs (e.g., silo_hdf5) based on a
398+
# configuration that had "post_process" in ARGS["mfc"]["targets"],
399+
# so parallel_io and restart_data layout are consistent.
400+
401+
out_post_filepath = os.path.join(case.get_dirpath(), "out_post.txt")
402+
# Write the full pipeline output to an explicit post-process log too,
403+
# even though it includes pre/sim messages. This is helpful for CI.
404+
common.file_write(out_post_filepath, cmd.stdout)
405+
390406
if cmd.returncode != 0:
391407
cons.print(cmd.stdout)
392408

@@ -396,14 +412,15 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
396412

397413
raise MFCException(
398414
f"Test {case}: Failed to execute MFC (post-process). "
399-
f"See log at: {out_filepath}"
415+
f"See log at: {out_post_filepath}"
400416
)
401417

418+
# After a successful post_process run, inspect Silo/HDF5 outputs
402419
silo_dir = os.path.join(case.get_dirpath(), 'silo_hdf5', 'p0')
403420
if os.path.isdir(silo_dir):
404421
for silo_filename in os.listdir(silo_dir):
405422
silo_filepath = os.path.join(silo_dir, silo_filename)
406-
_process_silo_file(silo_filepath, case, out_filepath)
423+
_process_silo_file(silo_filepath, case, out_post_filepath)
407424

408425
case.delete_output()
409426

0 commit comments

Comments
 (0)