Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions toolchain/mfc/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ def add_common_arguments(p: argparse.ArgumentParser, mask = None):
test.add_argument( "--no-examples", action="store_true", default=False, help="Do not test example cases." )
test.add_argument("--case-optimization", action="store_true", default=False, help="(GPU Optimization) Compile MFC targets with some case parameters hard-coded.")
test.add_argument( "--dry-run", action="store_true", default=False, help="Build and generate case files but do not run tests.")
test.add_argument( "--timeout", type=int, default=1000, help="Timeout in seconds for 2-rank test cases (to catch hung MPI runs).")

test_meg = test.add_mutually_exclusive_group()
test_meg.add_argument("--generate", action="store_true", default=False, help="(Test Generation) Generate golden files.")
Expand Down
11 changes: 10 additions & 1 deletion toolchain/mfc/test/case.py
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

High-level Suggestion

The current timeout logic is limited to 2-rank tests. It should be generalized to apply to all multi-process tests (where ppn > 1) to prevent hangs in any MPI-based test. [High-level, importance: 7]

Solution Walkthrough:

Before:

# toolchain/mfc/test/case.py
class TestCase(case.Case):
    def run(self, ...):
        # ...
        # Enforce per-test timeout only for 2-rank cases (to catch hangs)
        timeout = ARG("timeout") if self.ppn == 2 else None
        return common.system(..., timeout=timeout)

# toolchain/mfc/test/test.py
def _handle_case(...):
    try:
        case.run(...)
    except subprocess.TimeoutExpired as exc:
        raise MFCException(f"Test {case} (2-rank case): Timed out...")

After:

# toolchain/mfc/test/case.py
class TestCase(case.Case):
    def run(self, ...):
        # ...
        # Enforce per-test timeout for all multi-rank cases
        timeout = ARG("timeout") if self.ppn > 1 else None
        return common.system(..., timeout=timeout)

# toolchain/mfc/test/test.py
def _handle_case(...):
    try:
        case.run(...)
    except subprocess.TimeoutExpired as exc:
        raise MFCException(f"Test {case} (multi-rank case): Timed out...")

Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,16 @@ def run(self, targets: List[Union[str, MFCTarget]], gpus: Set[int]) -> subproces
*jobs, "-t", *target_names, *gpus_select, *ARG("--")
]

return common.system(command, print_cmd=False, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
# Enforce per-test timeout only for 2-rank cases (to catch hangs)
timeout = ARG("timeout") if self.ppn == 2 else None
return common.system(
command,
print_cmd=False,
text=True,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
timeout=timeout
)
Comment on lines +150 to +159
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggestion: Add validation to ensure the timeout value is positive before passing it to common.system. If the value is non-positive, it should be set to None to prevent an unhandled ValueError. [possible issue, importance: 8]

Suggested change
# Enforce per-test timeout only for 2-rank cases (to catch hangs)
timeout = ARG("timeout") if self.ppn == 2 else None
return common.system(
command,
print_cmd=False,
text=True,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
timeout=timeout
)
# Enforce per-test timeout only for 2-rank cases (to catch hangs)
timeout = None
if self.ppn == 2:
val = ARG("timeout")
if val > 0:
timeout = val
return common.system(
command,
print_cmd=False,
text=True,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
timeout=timeout
)


def get_trace(self) -> str:
return self.trace
Expand Down
51 changes: 47 additions & 4 deletions toolchain/mfc/test/test.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import os, typing, shutil, time, itertools
import os, typing, shutil, time, itertools, subprocess
from random import sample, seed

import rich, rich.table
Expand Down Expand Up @@ -194,10 +194,31 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
cons.print(f" [bold magenta]{case.get_uuid()}[/bold magenta] SKIP {case.trace}")
return

cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices)

out_filepath = os.path.join(case.get_dirpath(), "out_pre_sim.txt")

try:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggestion: Extract the duplicated subprocess.TimeoutExpired exception handling logic into a new helper function to reduce code redundancy and improve maintainability. [general, importance: 6]

cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices)
except subprocess.TimeoutExpired as exc:
# Save any partial stdout we have
partial_output = ""
if exc.stdout:
try:
partial_output = exc.stdout.decode() if isinstance(exc.stdout, bytes) else exc.stdout
except Exception:
partial_output = str(exc.stdout)

if partial_output:
common.file_write(out_filepath, partial_output)

raise MFCException(
f"Test {case} (2-rank case): Timed out after {ARG('timeout')} seconds.\n"
Copy link

Copilot AI Nov 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The error message hardcodes "2-rank case" but should dynamically reference the actual ppn value from the test case. Consider using f"Test {case} ({case.ppn}-rank case): Timed out..." to make the message more accurate and maintainable, especially if the timeout logic is ever extended to other ppn values.

Suggested change
f"Test {case} (2-rank case): Timed out after {ARG('timeout')} seconds.\n"
f"Test {case} ({case.ppn}-rank case): Timed out after {ARG('timeout')} seconds.\n"

Copilot uses AI. Check for mistakes.
f"This suggests the MPI job may have hung or deadlocked.\n"
f"Partial output (if any) saved to {out_filepath}.\n"
f"Case dictionary: {case.get_filepath()}.\n"
f"Command: {' '.join(str(x) for x in exc.cmd)}"
) from exc
Comment on lines +199 to +219
Copy link

Copilot AI Nov 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The timeout handling logic is duplicated between this block (lines 199-219) and lines 264-284. Consider extracting this into a helper function that takes the case, command callable, out_filepath, and optional context string (e.g., "post-process") as parameters. This would improve maintainability and ensure consistent error handling.

Copilot uses AI. Check for mistakes.

# On normal completion, write full stdout
common.file_write(out_filepath, cmd.stdout)

if cmd.returncode != 0:
Expand Down Expand Up @@ -238,8 +259,30 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):

if ARG("test_all"):
case.delete_output()
cmd = case.run([PRE_PROCESS, SIMULATION, POST_PROCESS], gpus=devices)
out_filepath = os.path.join(case.get_dirpath(), "out_post.txt")

try:
cmd = case.run([PRE_PROCESS, SIMULATION, POST_PROCESS], gpus=devices)
except subprocess.TimeoutExpired as exc:
# Save any partial stdout we have
partial_output = ""
if exc.stdout:
try:
partial_output = exc.stdout.decode() if isinstance(exc.stdout, bytes) else exc.stdout
except Exception:
partial_output = str(exc.stdout)

if partial_output:
common.file_write(out_filepath, partial_output)

raise MFCException(
f"Test {case} (2-rank case, post-process): Timed out after {ARG('timeout')} seconds.\n"
Copy link

Copilot AI Nov 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The error message hardcodes "2-rank case" but should dynamically reference the actual ppn value from the test case. Consider using f"Test {case} ({case.ppn}-rank case, post-process): Timed out..." to make the message more accurate and maintainable, especially if the timeout logic is ever extended to other ppn values.

Suggested change
f"Test {case} (2-rank case, post-process): Timed out after {ARG('timeout')} seconds.\n"
f"Test {case} ({case.ppn}-rank case, post-process): Timed out after {ARG('timeout')} seconds.\n"

Copilot uses AI. Check for mistakes.
f"This suggests the MPI job may have hung or deadlocked.\n"
f"Partial output (if any) saved to {out_filepath}.\n"
f"Case dictionary: {case.get_filepath()}.\n"
f"Command: {' '.join(str(x) for x in exc.cmd)}"
) from exc

common.file_write(out_filepath, cmd.stdout)

for silo_filepath in os.listdir(os.path.join(case.get_dirpath(), 'silo_hdf5', 'p0')):
Expand Down
4 changes: 2 additions & 2 deletions toolchain/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ dependencies = [
"matplotlib",

# Chemistry
"cantera==3.1.0",
"cantera>=3.1.0",
Copy link

Copilot AI Nov 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] The version constraint for cantera was changed from ==3.1.0 to >=3.1.0. While this allows for updates, it could potentially allow future major versions (e.g., 4.x) that might introduce breaking API changes. Consider using >=3.1.0,<4.0.0 to allow minor and patch updates while preventing breaking changes from major version updates.

Suggested change
"cantera>=3.1.0",
"cantera>=3.1.0,<4.0.0",

Copilot uses AI. Check for mistakes.
#"pyrometheus == 1.0.5",
"pyrometheus @ git+https://github.com/wilfonba/pyrometheus-wilfong.git@OpenMPTest",
"pyrometheus @ git+https://github.com/sbryngelson/pyrometheus-bryngelson.git@OpenMPTest",

# Frontier Profiling
"astunparse==1.6.2",
Expand Down
Loading