Skip to content

Commit 8d804e3

Browse files
committed
fix bug in hung processes / timeout
1 parent b30fea3 commit 8d804e3

File tree

3 files changed

+13
-34
lines changed

3 files changed

+13
-34
lines changed

.github/workflows/bench.yml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,9 @@ jobs:
101101
run: |
102102
set -e
103103
104+
# Capture path to monitoring script from PR checkout (it doesn't exist in master)
105+
MONITOR_SCRIPT="$(pwd)/pr/.github/scripts/monitor_slurm_job.sh"
106+
104107
# Function to submit and monitor using extracted script
105108
submit_and_monitor() {
106109
local dir=$1
@@ -124,8 +127,8 @@ jobs:
124127
return 1
125128
fi
126129
127-
# Use the monitoring script
128-
bash .github/scripts/monitor_slurm_job.sh "$job_id" "$output_file"
130+
# Use the monitoring script from PR checkout
131+
bash "$MONITOR_SCRIPT" "$job_id" "$output_file"
129132
}
130133
131134
# Run both jobs with monitoring

toolchain/mfc/test/case.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ def __init__(self, trace: str, mods: dict, ppn: int = None, override_tol: float
123123
self.override_tol = override_tol
124124
super().__init__({**BASE_CFG.copy(), **mods})
125125

126-
def run(self, targets: List[Union[str, MFCTarget]], gpus: Set[int]) -> subprocess.CompletedProcess:
126+
def run(self, targets: List[Union[str, MFCTarget]], gpus: Set[int], timeout: Optional[float] = None) -> subprocess.CompletedProcess:
127127
if gpus is not None and len(gpus) != 0:
128128
gpus_select = ["--gpus"] + [str(_) for _ in gpus]
129129
else:
@@ -146,7 +146,7 @@ def run(self, targets: List[Union[str, MFCTarget]], gpus: Set[int]) -> subproces
146146
*jobs, "-t", *target_names, *gpus_select, *ARG("--")
147147
]
148148

149-
return common.system(command, print_cmd=False, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
149+
return common.system(command, print_cmd=False, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, timeout=timeout)
150150

151151
def get_trace(self) -> str:
152152
return self.trace

toolchain/mfc/test/test.py

Lines changed: 6 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import os, typing, shutil, time, itertools, threading
1+
import os, typing, shutil, time, itertools, threading, subprocess
22
from random import sample, seed
33

44
import rich, rich.table
@@ -36,9 +36,6 @@
3636
# from worker threads which could leave the scheduler in an undefined state.
3737
abort_tests = threading.Event()
3838

39-
class TestTimeoutError(MFCException):
40-
pass
41-
4239
# pylint: disable=too-many-branches, trailing-whitespace
4340
def __filter(cases_) -> typing.List[TestCase]:
4441
cases = cases_[:]
@@ -309,32 +306,16 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
309306
global current_test_number
310307
start_time = time.time()
311308

312-
# Set timeout using threading.Timer (works in worker threads)
313-
# Note: we intentionally do not use signal.alarm() here because signals
314-
# only work in the main thread; sched.sched runs tests in worker threads.
315-
# threading.Timer works correctly in this threaded context.
316-
timeout_flag = threading.Event()
317-
timeout_timer = threading.Timer(TEST_TIMEOUT_SECONDS, timeout_flag.set)
318-
timeout_timer.start()
319-
320309
tol = case.compute_tolerance()
321310
case.delete_output()
322311
case.create_directory()
323312

324313
if ARG("dry_run"):
325314
cons.print(f" [bold magenta]{case.get_uuid()}[/bold magenta] SKIP {case.trace}")
326-
timeout_timer.cancel()
327315
return
328316

329317
try:
330-
# Check timeout before starting
331-
if timeout_flag.is_set():
332-
raise TestTimeoutError("Test case exceeded 1 hour timeout")
333-
cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices)
334-
335-
# Check timeout after simulation
336-
if timeout_flag.is_set():
337-
raise TestTimeoutError("Test case exceeded 1 hour timeout")
318+
cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices, timeout=TEST_TIMEOUT_SECONDS)
338319

339320
out_filepath = os.path.join(case.get_dirpath(), "out_pre_sim.txt")
340321

@@ -380,13 +361,9 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
380361
# Clean up output from golden comparison run
381362
case.delete_output()
382363

383-
# Check timeout before full pipeline
384-
if timeout_flag.is_set():
385-
raise TestTimeoutError("Test case exceeded 1 hour timeout")
386-
387364
# Run full pipeline: this ensures simulation and post_process use
388365
# consistent configuration (parallel_io, file_per_process, etc.)
389-
cmd = case.run([PRE_PROCESS, SIMULATION, POST_PROCESS], gpus=devices)
366+
cmd = case.run([PRE_PROCESS, SIMULATION, POST_PROCESS], gpus=devices, timeout=TEST_TIMEOUT_SECONDS)
390367

391368
out_post_filepath = os.path.join(case.get_dirpath(), "out_post.txt")
392369
common.file_write(out_post_filepath, cmd.stdout)
@@ -419,7 +396,8 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
419396
progress_str = f"({current_test_number:3d}/{total_test_count:3d})"
420397
cons.print(f" {progress_str} [bold magenta]{case.get_uuid()}[/bold magenta] {duration:6.2f} {case.trace}")
421398

422-
except TestTimeoutError as exc:
399+
except subprocess.TimeoutExpired as exc:
400+
# Subprocess timeout - the process was actually killed
423401
log_path = os.path.join(case.get_dirpath(), 'out_pre_sim.txt')
424402
if os.path.exists(log_path):
425403
log_msg = f"Check the log at: {log_path}"
@@ -428,12 +406,10 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
428406
f"Log file ({log_path}) may not exist if the timeout occurred early."
429407
)
430408
raise MFCException(
431-
f"Test {case} exceeded 1 hour timeout. "
409+
f"Test {case} exceeded 1 hour timeout (process was killed). "
432410
f"This may indicate a hung simulation or misconfigured case. "
433411
f"{log_msg}"
434412
) from exc
435-
finally:
436-
timeout_timer.cancel() # Cancel timeout timer
437413

438414

439415
def handle_case(case: TestCase, devices: typing.Set[int]):

0 commit comments

Comments
 (0)