Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions toolchain/mfc/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ def add_common_arguments(p: argparse.ArgumentParser, mask = None):
test.add_argument( "--no-examples", action="store_true", default=False, help="Do not test example cases." )
test.add_argument("--case-optimization", action="store_true", default=False, help="(GPU Optimization) Compile MFC targets with some case parameters hard-coded.")
test.add_argument( "--dry-run", action="store_true", default=False, help="Build and generate case files but do not run tests.")
test.add_argument( "--timeout", type=int, default=1000, help="Timeout in seconds for 2-rank test cases (to catch hung MPI runs).")

test_meg = test.add_mutually_exclusive_group()
test_meg.add_argument("--generate", action="store_true", default=False, help="(Test Generation) Generate golden files.")
Expand Down
11 changes: 10 additions & 1 deletion toolchain/mfc/test/case.py
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

High-level Suggestion

The current timeout logic is limited to 2-rank tests. It should be generalized to apply to all multi-process tests (where ppn > 1) to prevent hangs in any MPI-based test. [High-level, importance: 7]

Solution Walkthrough:

Before:

# toolchain/mfc/test/case.py
class TestCase(case.Case):
    def run(self, ...):
        # ...
        # Enforce per-test timeout only for 2-rank cases (to catch hangs)
        timeout = ARG("timeout") if self.ppn == 2 else None
        return common.system(..., timeout=timeout)

# toolchain/mfc/test/test.py
def _handle_case(...):
    try:
        case.run(...)
    except subprocess.TimeoutExpired as exc:
        raise MFCException(f"Test {case} (2-rank case): Timed out...")

After:

# toolchain/mfc/test/case.py
class TestCase(case.Case):
    def run(self, ...):
        # ...
        # Enforce per-test timeout for all multi-rank cases
        timeout = ARG("timeout") if self.ppn > 1 else None
        return common.system(..., timeout=timeout)

# toolchain/mfc/test/test.py
def _handle_case(...):
    try:
        case.run(...)
    except subprocess.TimeoutExpired as exc:
        raise MFCException(f"Test {case} (multi-rank case): Timed out...")

Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,16 @@ def run(self, targets: List[Union[str, MFCTarget]], gpus: Set[int]) -> subproces
*jobs, "-t", *target_names, *gpus_select, *ARG("--")
]

return common.system(command, print_cmd=False, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
# Enforce per-test timeout only for 2-rank cases (to catch hangs)
timeout = ARG("timeout") if self.ppn == 2 else None
return common.system(
command,
print_cmd=False,
text=True,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
timeout=timeout
)
Comment on lines +150 to +159
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggestion: Add validation to ensure the timeout value is positive before passing it to common.system. If the value is non-positive, it should be set to None to prevent an unhandled ValueError. [possible issue, importance: 8]

Suggested change
# Enforce per-test timeout only for 2-rank cases (to catch hangs)
timeout = ARG("timeout") if self.ppn == 2 else None
return common.system(
command,
print_cmd=False,
text=True,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
timeout=timeout
)
# Enforce per-test timeout only for 2-rank cases (to catch hangs)
timeout = None
if self.ppn == 2:
val = ARG("timeout")
if val > 0:
timeout = val
return common.system(
command,
print_cmd=False,
text=True,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
timeout=timeout
)


def get_trace(self) -> str:
return self.trace
Expand Down
51 changes: 47 additions & 4 deletions toolchain/mfc/test/test.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import os, typing, shutil, time, itertools
import os, typing, shutil, time, itertools, subprocess
from random import sample, seed

import rich, rich.table
Expand Down Expand Up @@ -194,10 +194,31 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
cons.print(f" [bold magenta]{case.get_uuid()}[/bold magenta] SKIP {case.trace}")
return

cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices)

out_filepath = os.path.join(case.get_dirpath(), "out_pre_sim.txt")

try:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggestion: Extract the duplicated subprocess.TimeoutExpired exception handling logic into a new helper function to reduce code redundancy and improve maintainability. [general, importance: 6]

cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices)
except subprocess.TimeoutExpired as exc:
# Save any partial stdout we have
partial_output = ""
if exc.stdout:
try:
partial_output = exc.stdout.decode() if isinstance(exc.stdout, bytes) else exc.stdout
except Exception:
partial_output = str(exc.stdout)

if partial_output:
common.file_write(out_filepath, partial_output)

raise MFCException(
f"Test {case} (2-rank case): Timed out after {ARG('timeout')} seconds.\n"
Copy link

Copilot AI Nov 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The error message hardcodes "2-rank case" but should dynamically reference the actual ppn value from the test case. Consider using f"Test {case} ({case.ppn}-rank case): Timed out..." to make the message more accurate and maintainable, especially if the timeout logic is ever extended to other ppn values.

Suggested change
f"Test {case} (2-rank case): Timed out after {ARG('timeout')} seconds.\n"
f"Test {case} ({case.ppn}-rank case): Timed out after {ARG('timeout')} seconds.\n"

Copilot uses AI. Check for mistakes.
f"This suggests the MPI job may have hung or deadlocked.\n"
f"Partial output (if any) saved to {out_filepath}.\n"
f"Case dictionary: {case.get_filepath()}.\n"
f"Command: {' '.join(str(x) for x in exc.cmd)}"
) from exc
Comment on lines +199 to +219
Copy link

Copilot AI Nov 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The timeout handling logic is duplicated between this block (lines 199-219) and lines 264-284. Consider extracting this into a helper function that takes the case, command callable, out_filepath, and optional context string (e.g., "post-process") as parameters. This would improve maintainability and ensure consistent error handling.

Copilot uses AI. Check for mistakes.

# On normal completion, write full stdout
common.file_write(out_filepath, cmd.stdout)

if cmd.returncode != 0:
Expand Down Expand Up @@ -238,8 +259,30 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):

if ARG("test_all"):
case.delete_output()
cmd = case.run([PRE_PROCESS, SIMULATION, POST_PROCESS], gpus=devices)
out_filepath = os.path.join(case.get_dirpath(), "out_post.txt")

try:
cmd = case.run([PRE_PROCESS, SIMULATION, POST_PROCESS], gpus=devices)
except subprocess.TimeoutExpired as exc:
# Save any partial stdout we have
partial_output = ""
if exc.stdout:
try:
partial_output = exc.stdout.decode() if isinstance(exc.stdout, bytes) else exc.stdout
except Exception:
partial_output = str(exc.stdout)

if partial_output:
common.file_write(out_filepath, partial_output)

raise MFCException(
f"Test {case} (2-rank case, post-process): Timed out after {ARG('timeout')} seconds.\n"
Copy link

Copilot AI Nov 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The error message hardcodes "2-rank case" but should dynamically reference the actual ppn value from the test case. Consider using f"Test {case} ({case.ppn}-rank case, post-process): Timed out..." to make the message more accurate and maintainable, especially if the timeout logic is ever extended to other ppn values.

Suggested change
f"Test {case} (2-rank case, post-process): Timed out after {ARG('timeout')} seconds.\n"
f"Test {case} ({case.ppn}-rank case, post-process): Timed out after {ARG('timeout')} seconds.\n"

Copilot uses AI. Check for mistakes.
f"This suggests the MPI job may have hung or deadlocked.\n"
f"Partial output (if any) saved to {out_filepath}.\n"
f"Case dictionary: {case.get_filepath()}.\n"
f"Command: {' '.join(str(x) for x in exc.cmd)}"
) from exc

common.file_write(out_filepath, cmd.stdout)

for silo_filepath in os.listdir(os.path.join(case.get_dirpath(), 'silo_hdf5', 'p0')):
Expand Down
4 changes: 2 additions & 2 deletions toolchain/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ dependencies = [
"matplotlib",

# Chemistry
"cantera==3.1.0",
"cantera>=3.1.0",
Copy link

Copilot AI Nov 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] The version constraint for cantera was changed from ==3.1.0 to >=3.1.0. While this allows for updates, it could potentially allow future major versions (e.g., 4.x) that might introduce breaking API changes. Consider using >=3.1.0,<4.0.0 to allow minor and patch updates while preventing breaking changes from major version updates.

Suggested change
"cantera>=3.1.0",
"cantera>=3.1.0,<4.0.0",

Copilot uses AI. Check for mistakes.
#"pyrometheus == 1.0.5",
"pyrometheus @ git+https://github.com/wilfonba/pyrometheus-wilfong.git@OpenMPTest",
"pyrometheus @ git+https://github.com/sbryngelson/pyrometheus-bryngelson.git@OpenMPTest",

# Frontier Profiling
"astunparse==1.6.2",
Expand Down
Loading