diff --git a/toolchain/mfc/args.py b/toolchain/mfc/args.py index 169ee076a4..94d549510c 100644 --- a/toolchain/mfc/args.py +++ b/toolchain/mfc/args.py @@ -97,6 +97,7 @@ def add_common_arguments(p: argparse.ArgumentParser, mask = None): test.add_argument( "--no-examples", action="store_true", default=False, help="Do not test example cases." ) test.add_argument("--case-optimization", action="store_true", default=False, help="(GPU Optimization) Compile MFC targets with some case parameters hard-coded.") test.add_argument( "--dry-run", action="store_true", default=False, help="Build and generate case files but do not run tests.") + test.add_argument( "--timeout", type=int, default=1000, help="Timeout in seconds for 2-rank test cases (to catch hung MPI runs).") test_meg = test.add_mutually_exclusive_group() test_meg.add_argument("--generate", action="store_true", default=False, help="(Test Generation) Generate golden files.") diff --git a/toolchain/mfc/test/case.py b/toolchain/mfc/test/case.py index 927beb07fd..d7559f0f7a 100644 --- a/toolchain/mfc/test/case.py +++ b/toolchain/mfc/test/case.py @@ -147,7 +147,16 @@ def run(self, targets: List[Union[str, MFCTarget]], gpus: Set[int]) -> subproces *jobs, "-t", *target_names, *gpus_select, *ARG("--") ] - return common.system(command, print_cmd=False, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + # Enforce per-test timeout only for 2-rank cases (to catch hangs) + timeout = ARG("timeout") if self.ppn == 2 else None + return common.system( + command, + print_cmd=False, + text=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + timeout=timeout + ) def get_trace(self) -> str: return self.trace diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 0ba6e160f7..860b853c16 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -1,4 +1,4 @@ -import os, typing, shutil, time, itertools +import os, typing, shutil, time, itertools, subprocess from random import sample, seed import rich, rich.table @@ -194,10 +194,31 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): cons.print(f" [bold magenta]{case.get_uuid()}[/bold magenta] SKIP {case.trace}") return - cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices) - out_filepath = os.path.join(case.get_dirpath(), "out_pre_sim.txt") + try: + cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices) + except subprocess.TimeoutExpired as exc: + # Save any partial stdout we have + partial_output = "" + if exc.stdout: + try: + partial_output = exc.stdout.decode() if isinstance(exc.stdout, bytes) else exc.stdout + except Exception: + partial_output = str(exc.stdout) + + if partial_output: + common.file_write(out_filepath, partial_output) + + raise MFCException( + f"Test {case} (2-rank case): Timed out after {ARG('timeout')} seconds.\n" + f"This suggests the MPI job may have hung or deadlocked.\n" + f"Partial output (if any) saved to {out_filepath}.\n" + f"Case dictionary: {case.get_filepath()}.\n" + f"Command: {' '.join(str(x) for x in exc.cmd)}" + ) from exc + + # On normal completion, write full stdout common.file_write(out_filepath, cmd.stdout) if cmd.returncode != 0: @@ -238,8 +259,30 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): if ARG("test_all"): case.delete_output() - cmd = case.run([PRE_PROCESS, SIMULATION, POST_PROCESS], gpus=devices) out_filepath = os.path.join(case.get_dirpath(), "out_post.txt") + + try: + cmd = case.run([PRE_PROCESS, SIMULATION, POST_PROCESS], gpus=devices) + except subprocess.TimeoutExpired as exc: + # Save any partial stdout we have + partial_output = "" + if exc.stdout: + try: + partial_output = exc.stdout.decode() if isinstance(exc.stdout, bytes) else exc.stdout + except Exception: + partial_output = str(exc.stdout) + + if partial_output: + common.file_write(out_filepath, partial_output) + + raise MFCException( + f"Test {case} (2-rank case, post-process): Timed out after {ARG('timeout')} seconds.\n" + f"This suggests the MPI job may have hung or deadlocked.\n" + f"Partial output (if any) saved to {out_filepath}.\n" + f"Case dictionary: {case.get_filepath()}.\n" + f"Command: {' '.join(str(x) for x in exc.cmd)}" + ) from exc + common.file_write(out_filepath, cmd.stdout) for silo_filepath in os.listdir(os.path.join(case.get_dirpath(), 'silo_hdf5', 'p0')): diff --git a/toolchain/pyproject.toml b/toolchain/pyproject.toml index 517225f01b..cdffa9e51f 100644 --- a/toolchain/pyproject.toml +++ b/toolchain/pyproject.toml @@ -37,9 +37,9 @@ dependencies = [ "matplotlib", # Chemistry - "cantera==3.1.0", + "cantera>=3.1.0", #"pyrometheus == 1.0.5", - "pyrometheus @ git+https://github.com/wilfonba/pyrometheus-wilfong.git@OpenMPTest", + "pyrometheus @ git+https://github.com/sbryngelson/pyrometheus-bryngelson.git@OpenMPTest", # Frontier Profiling "astunparse==1.6.2",