Skip to content

Commit febfdcf

Browse files
committed
Hang timeout for 2-rank cases
1 parent 22af239 commit febfdcf

File tree

4 files changed

+60
-7
lines changed

4 files changed

+60
-7
lines changed

toolchain/mfc/args.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -97,6 +97,7 @@ def add_common_arguments(p: argparse.ArgumentParser, mask = None):
9797
test.add_argument( "--no-examples", action="store_true", default=False, help="Do not test example cases." )
9898
test.add_argument("--case-optimization", action="store_true", default=False, help="(GPU Optimization) Compile MFC targets with some case parameters hard-coded.")
9999
test.add_argument( "--dry-run", action="store_true", default=False, help="Build and generate case files but do not run tests.")
100+
test.add_argument( "--timeout", type=int, default=1000, help="Timeout in seconds for 2-rank test cases (to catch hung MPI runs).")
100101

101102
test_meg = test.add_mutually_exclusive_group()
102103
test_meg.add_argument("--generate", action="store_true", default=False, help="(Test Generation) Generate golden files.")

toolchain/mfc/test/case.py

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -147,7 +147,16 @@ def run(self, targets: List[Union[str, MFCTarget]], gpus: Set[int]) -> subproces
147147
*jobs, "-t", *target_names, *gpus_select, *ARG("--")
148148
]
149149

150-
return common.system(command, print_cmd=False, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
150+
# Enforce per-test timeout only for 2-rank cases (to catch hangs)
151+
timeout = ARG("timeout") if self.ppn == 2 else None
152+
return common.system(
153+
command,
154+
print_cmd=False,
155+
text=True,
156+
stdout=subprocess.PIPE,
157+
stderr=subprocess.STDOUT,
158+
timeout=timeout
159+
)
151160

152161
def get_trace(self) -> str:
153162
return self.trace

toolchain/mfc/test/test.py

Lines changed: 47 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
import os, typing, shutil, time, itertools
1+
import os, typing, shutil, time, itertools, subprocess
22
from random import sample, seed
33

44
import rich, rich.table
@@ -194,10 +194,31 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
194194
cons.print(f" [bold magenta]{case.get_uuid()}[/bold magenta] SKIP {case.trace}")
195195
return
196196

197-
cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices)
198-
199197
out_filepath = os.path.join(case.get_dirpath(), "out_pre_sim.txt")
200198

199+
try:
200+
cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices)
201+
except subprocess.TimeoutExpired as exc:
202+
# Save any partial stdout we have
203+
partial_output = ""
204+
if exc.stdout:
205+
try:
206+
partial_output = exc.stdout.decode() if isinstance(exc.stdout, bytes) else exc.stdout
207+
except Exception:
208+
partial_output = str(exc.stdout)
209+
210+
if partial_output:
211+
common.file_write(out_filepath, partial_output)
212+
213+
raise MFCException(
214+
f"Test {case} (2-rank case): Timed out after {ARG('timeout')} seconds.\n"
215+
f"This suggests the MPI job may have hung or deadlocked.\n"
216+
f"Partial output (if any) saved to {out_filepath}.\n"
217+
f"Case dictionary: {case.get_filepath()}.\n"
218+
f"Command: {' '.join(str(x) for x in exc.cmd)}"
219+
) from exc
220+
221+
# On normal completion, write full stdout
201222
common.file_write(out_filepath, cmd.stdout)
202223

203224
if cmd.returncode != 0:
@@ -238,8 +259,30 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
238259

239260
if ARG("test_all"):
240261
case.delete_output()
241-
cmd = case.run([PRE_PROCESS, SIMULATION, POST_PROCESS], gpus=devices)
242262
out_filepath = os.path.join(case.get_dirpath(), "out_post.txt")
263+
264+
try:
265+
cmd = case.run([PRE_PROCESS, SIMULATION, POST_PROCESS], gpus=devices)
266+
except subprocess.TimeoutExpired as exc:
267+
# Save any partial stdout we have
268+
partial_output = ""
269+
if exc.stdout:
270+
try:
271+
partial_output = exc.stdout.decode() if isinstance(exc.stdout, bytes) else exc.stdout
272+
except Exception:
273+
partial_output = str(exc.stdout)
274+
275+
if partial_output:
276+
common.file_write(out_filepath, partial_output)
277+
278+
raise MFCException(
279+
f"Test {case} (2-rank case, post-process): Timed out after {ARG('timeout')} seconds.\n"
280+
f"This suggests the MPI job may have hung or deadlocked.\n"
281+
f"Partial output (if any) saved to {out_filepath}.\n"
282+
f"Case dictionary: {case.get_filepath()}.\n"
283+
f"Command: {' '.join(str(x) for x in exc.cmd)}"
284+
) from exc
285+
243286
common.file_write(out_filepath, cmd.stdout)
244287

245288
for silo_filepath in os.listdir(os.path.join(case.get_dirpath(), 'silo_hdf5', 'p0')):

toolchain/pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -37,9 +37,9 @@ dependencies = [
3737
"matplotlib",
3838

3939
# Chemistry
40-
"cantera==3.1.0",
40+
"cantera>=3.1.0",
4141
#"pyrometheus == 1.0.5",
42-
"pyrometheus @ git+https://github.com/wilfonba/pyrometheus-wilfong.git@OpenMPTest",
42+
"pyrometheus @ git+https://github.com/sbryngelson/pyrometheus-bryngelson.git@OpenMPTest",
4343

4444
# Frontier Profiling
4545
"astunparse==1.6.2",

0 commit comments

Comments
 (0)