
Commit f79f4d1

more fixing!
1 parent 0668fed commit f79f4d1

File tree

.github/scripts/monitor_slurm_job.sh
.github/workflows/bench.yml
toolchain/mfc/bench.py
toolchain/mfc/test/test.py

4 files changed: +136 -74 lines changed

.github/scripts/monitor_slurm_job.sh

Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
+#!/bin/bash
+# Monitor a SLURM job and stream its output in real-time
+# Usage: monitor_slurm_job.sh <job_id> <output_file>
+
+set -e
+
+if [ $# -ne 2 ]; then
+    echo "Usage: $0 <job_id> <output_file>"
+    exit 1
+fi
+
+job_id="$1"
+output_file="$2"
+
+echo "Submitted batch job $job_id"
+echo "Monitoring output file: $output_file"
+
+# Wait for file to appear (check job status if it takes a while)
+echo "Waiting for job to start..."
+while [ ! -f "$output_file" ]; do
+    # Check if job failed to start
+    if ! squeue -j "$job_id" &>/dev/null && [ ! -f "$output_file" ]; then
+        echo "ERROR: Job $job_id finished without creating output file"
+        exit 1
+    fi
+    sleep 5
+done
+
+echo "=== Streaming output for job $job_id ==="
+# Stream output while job runs
+tail -f "$output_file" &
+tail_pid=$!
+
+# Wait for job to complete with retry logic for transient squeue failures
+squeue_failures=0
+while true; do
+    if squeue -j "$job_id" &>/dev/null; then
+        squeue_failures=0
+    else
+        squeue_failures=$((squeue_failures + 1))
+        # Allow a few transient failures before concluding job is done
+        if [ $squeue_failures -ge 3 ]; then
+            break
+        fi
+    fi
+    sleep 5
+done
+
+# Stop tailing
+kill $tail_pid 2>/dev/null || true
+
+echo ""
+echo "=== Final output ==="
+cat "$output_file"
+
+# Check exit status
+exit_code=$(scontrol show job "$job_id" 2>/dev/null | grep -oP 'ExitCode=\K[0-9]+:[0-9]+' || echo "0:0")
+if [ "$exit_code" != "0:0" ]; then
+    echo "ERROR: Job $job_id failed with exit code $exit_code"
+    exit 1
+fi
+
+echo "Job $job_id completed successfully"
+exit 0
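
For local testing, the script can also be invoked by hand against an already-submitted job, e.g. bash .github/scripts/monitor_slurm_job.sh <job_id> <output_file>, with the placeholders matching the Usage line at the top of the script. It exits non-zero if the job never produces its output file or if scontrol reports a non-zero ExitCode, so callers such as the workflow below can rely on its exit status.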

.github/workflows/bench.yml

Lines changed: 3 additions & 38 deletions
@@ -101,7 +101,7 @@ jobs:
         run: |
           set -e
 
-          # Function to submit and monitor
+          # Function to submit and monitor using extracted script
           submit_and_monitor() {
             local dir=$1
             local device=$2
@@ -124,43 +124,8 @@ jobs:
              return 1
            fi
 
-            echo "Submitted batch job $job_id"
-            echo "Monitoring output file: $output_file"
-
-            # Wait for file to appear (check job status if it takes a while)
-            echo "Waiting for job to start..."
-            while [ ! -f "$output_file" ]; do
-              # Check if job failed to start
-              if ! squeue -j "$job_id" &>/dev/null && [ ! -f "$output_file" ]; then
-                echo "ERROR: Job $job_id finished without creating output file"
-                return 1
-              fi
-              sleep 5
-            done
-
-            echo "=== Streaming output for job $job_id ==="
-            # Stream output while job runs
-            tail -f "$output_file" &
-            tail_pid=$!
-
-            # Wait for job to complete (will wait up to GitHub Actions timeout)
-            while squeue -j "$job_id" &>/dev/null; do
-              sleep 5
-            done
-
-            # Stop tailing
-            kill $tail_pid 2>/dev/null || true
-
-            echo ""
-            echo "=== Final output ==="
-            cat "$output_file"
-
-            # Check exit status
-            exit_code=$(scontrol show job "$job_id" 2>/dev/null | grep -oP 'ExitCode=\K[0-9]+' || echo "0:0")
-            if [ "$exit_code" != "0" ] && [ "$exit_code" != "0:0" ]; then
-              echo "ERROR: Job $job_id failed with exit code $exit_code"
-              return 1
-            fi
+            # Use the monitoring script
+            bash .github/scripts/monitor_slurm_job.sh "$job_id" "$output_file"
           }
 
           # Run both jobs with monitoring

toolchain/mfc/bench.py

Lines changed: 12 additions & 8 deletions
@@ -68,13 +68,14 @@ def bench(targets = None):
                 ["--targets"] + [t.name for t in targets] +
                 ["--output-summary", summary_filepath] +
                 case.args +
-                ["--", "--gbpp", ARG('mem')],
+                ["--", "--gbpp", str(ARG('mem'))],
                 stdout=log_file,
                 stderr=subprocess.STDOUT)
 
-            # Check return code
-            if result.returncode != 0:
-                cons.print(f"[bold red]ERROR[/bold red]: Case {case.slug} failed with exit code {result.returncode}")
+            # Check return code (handle CompletedProcess or int defensively)
+            rc = result.returncode if hasattr(result, "returncode") else result
+            if rc != 0:
+                cons.print(f"[bold red]ERROR[/bold red]: Case {case.slug} failed with exit code {rc}")
                 cons.print(f"[bold red] Check log at: {log_filepath}[/bold red]")
                 failed_cases.append(case.slug)
                 cons.unindent()
@@ -111,7 +112,6 @@ def bench(targets = None):
 
             if validation_failed:
                 failed_cases.append(case.slug)
-                cons.unindent()
                 continue
 
             # Add to results
@@ -124,8 +124,8 @@ def bench(targets = None):
         except Exception as e:
             cons.print(f"[bold red]ERROR[/bold red]: Unexpected error running {case.slug}: {e}")
             failed_cases.append(case.slug)
-
-        cons.unindent()
+        finally:
+            cons.unindent()
 
     # Report results
     if failed_cases:
@@ -209,7 +209,11 @@ def diff():
                 cons.print(f"[bold red]Error[/bold red]: Benchmarking failed since grind time speedup for {target.name} below acceptable threshold (<0.95) - Case: {slug}")
                 err = 1
             except Exception as e:
-                cons.print(f"[bold red]ERROR[/bold red]: Failed to compute speedup for {target.name} in {slug}: {e}")
+                import traceback
+                cons.print(
+                    f"[bold red]ERROR[/bold red]: Failed to compute speedup for {target.name} in {slug}: {e}\n"
+                    f"{traceback.format_exc()}"
+                )
                 err = 1
 
         table.add_row(f"[magenta]{slug}[/magenta]", *speedups)
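
The return-code check above is written so that it works whether the helper producing result returns a subprocess.CompletedProcess or a bare integer. A minimal standalone sketch of that defensive pattern (the exit_code name below is illustrative and not part of bench.py):

import subprocess

def exit_code(result):
    # Accept either a CompletedProcess-like object or a plain integer return code.
    return result.returncode if hasattr(result, "returncode") else result

print(exit_code(subprocess.run(["true"], check=False)))  # prints 0
print(exit_code(1))                                      # prints 1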

toolchain/mfc/test/test.py

Lines changed: 56 additions & 28 deletions
@@ -1,4 +1,4 @@
-import os, typing, shutil, time, itertools, signal
+import os, typing, shutil, time, itertools, threading
 from random import sample, seed
 
 import rich, rich.table
@@ -30,12 +30,15 @@
 # Per-test timeout (1 hour)
 TEST_TIMEOUT_SECONDS = 3600
 
+# Global abort flag for thread-safe early termination
+# This flag is set when the failure rate exceeds the threshold, signaling
+# all worker threads to exit gracefully. This avoids raising exceptions
+# from worker threads which could leave the scheduler in an undefined state.
+abort_tests = threading.Event()
+
 class TestTimeoutError(MFCException):
     pass
 
-def timeout_handler(signum, frame):
-    raise TestTimeoutError("Test case exceeded 1 hour timeout")
-
 # pylint: disable=too-many-branches, trailing-whitespace
 def __filter(cases_) -> typing.List[TestCase]:
     cases = cases_[:]
@@ -173,6 +176,15 @@ def test():
         [ sched.Task(ppn=case.ppn, func=handle_case, args=[case], load=case.get_cell_count()) for case in cases ],
         ARG("jobs"), ARG("gpus"))
 
+    # Check if we aborted due to high failure rate
+    if abort_tests.is_set():
+        total_completed = nFAIL + nPASS
+        cons.print()
+        cons.unindent()
+        raise MFCException(
+            f"Excessive test failures: {nFAIL}/{total_completed} failed ({nFAIL/total_completed*100:.1f}%)"
+        )
+
     nSKIP = len(skipped_cases)
     cons.print()
     cons.unindent()
@@ -199,22 +211,32 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
     global current_test_number
     start_time = time.time()
 
-    # Set timeout alarm
-    signal.signal(signal.SIGALRM, timeout_handler)
-    signal.alarm(TEST_TIMEOUT_SECONDS)
+    # Set timeout using threading.Timer (works in worker threads)
+    # Note: signal.alarm() only works in the main thread, so we use
+    # threading.Timer which works correctly in worker threads spawned by sched.sched
+    timeout_flag = threading.Event()
+    timeout_timer = threading.Timer(TEST_TIMEOUT_SECONDS, timeout_flag.set)
+    timeout_timer.start()
 
     tol = case.compute_tolerance()
    case.delete_output()
    case.create_directory()
 
     if ARG("dry_run"):
         cons.print(f" [bold magenta]{case.get_uuid()}[/bold magenta] SKIP {case.trace}")
-        signal.alarm(0) # Cancel alarm
+        timeout_timer.cancel()
         return
 
     try:
+        # Check timeout before starting
+        if timeout_flag.is_set():
+            raise TestTimeoutError("Test case exceeded 1 hour timeout")
         cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices)
 
+        # Check timeout after simulation
+        if timeout_flag.is_set():
+            raise TestTimeoutError("Test case exceeded 1 hour timeout")
+
         out_filepath = os.path.join(case.get_dirpath(), "out_pre_sim.txt")
 
         common.file_write(out_filepath, cmd.stdout)
@@ -261,26 +283,28 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
         out_filepath = os.path.join(case.get_dirpath(), "out_post.txt")
         common.file_write(out_filepath, cmd.stdout)
 
-        for silo_filepath in os.listdir(os.path.join(case.get_dirpath(), 'silo_hdf5', 'p0')):
-            silo_filepath = os.path.join(case.get_dirpath(), 'silo_hdf5', 'p0', silo_filepath)
-            h5dump = f"{HDF5.get_install_dirpath(case.to_input_file())}/bin/h5dump"
+        silo_dir = os.path.join(case.get_dirpath(), 'silo_hdf5', 'p0')
+        if os.path.isdir(silo_dir):
+            for silo_filename in os.listdir(silo_dir):
+                silo_filepath = os.path.join(silo_dir, silo_filename)
+                h5dump = f"{HDF5.get_install_dirpath(case.to_input_file())}/bin/h5dump"
 
-            if not os.path.exists(h5dump or ""):
-                if not does_command_exist("h5dump"):
-                    raise MFCException("h5dump couldn't be found.")
+                if not os.path.exists(h5dump or ""):
+                    if not does_command_exist("h5dump"):
+                        raise MFCException("h5dump couldn't be found.")
 
-                h5dump = shutil.which("h5dump")
+                    h5dump = shutil.which("h5dump")
 
-            output, err = get_program_output([h5dump, silo_filepath])
+                output, err = get_program_output([h5dump, silo_filepath])
 
-            if err != 0:
-                raise MFCException(f"Test {case}: Failed to run h5dump. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.")
+                if err != 0:
+                    raise MFCException(f"Test {case}: Failed to run h5dump. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.")
 
-            if "nan," in output:
-                raise MFCException(f"Test {case}: Post Process has detected a NaN. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.")
+                if "nan," in output:
+                    raise MFCException(f"Test {case}: Post Process has detected a NaN. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.")
 
-            if "inf," in output:
-                raise MFCException(f"Test {case}: Post Process has detected an Infinity. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.")
+                if "inf," in output:
+                    raise MFCException(f"Test {case}: Post Process has detected an Infinity. You can find the run's output in {out_filepath}, and the case dictionary in {case.get_filepath()}.")
 
         case.delete_output()
 
@@ -298,14 +322,18 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
             f"Check the log at: {os.path.join(case.get_dirpath(), 'out_pre_sim.txt')}"
         ) from exc
     finally:
-        signal.alarm(0) # Cancel alarm
+        timeout_timer.cancel() # Cancel timeout timer
 
 
 def handle_case(case: TestCase, devices: typing.Set[int]):
     # pylint: disable=global-statement, global-variable-not-assigned
     global nFAIL, nPASS, nSKIP
     global errors
 
+    # Check if we should abort before processing this case
+    if abort_tests.is_set():
+        return # Exit gracefully if abort was requested
+
     nAttempts = 0
     if ARG('single'):
         max_attempts = max(ARG('max_attempts'), 3)
@@ -337,10 +365,10 @@ def handle_case(case: TestCase, devices: typing.Set[int]):
         failure_rate = nFAIL / total_completed
         if failure_rate >= FAILURE_RATE_THRESHOLD:
             cons.print(f"\n[bold red]CRITICAL: {failure_rate*100:.1f}% failure rate detected after {total_completed} tests.[/bold red]")
-            cons.print(f"[bold red]This suggests a systemic issue (bad build, broken environment, etc.)[/bold red]")
-            cons.print(f"[bold red]Aborting remaining tests to fail fast.[/bold red]\n")
-            raise MFCException(
-                f"Excessive test failures: {nFAIL}/{total_completed} failed ({failure_rate*100:.1f}%)"
-            )
+            cons.print("[bold red]This suggests a systemic issue (bad build, broken environment, etc.)[/bold red]")
+            cons.print("[bold red]Aborting remaining tests to fail fast.[/bold red]\n")
+            # Set abort flag instead of raising exception from worker thread
+            abort_tests.set()
+            return # Exit gracefully
 
     return
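
For reference, a minimal self-contained sketch of the two threading patterns the diff relies on: a threading.Timer that merely sets a flag (signal.alarm() only works in the main thread), and a shared threading.Event that lets worker threads abort gracefully instead of raising. The run_with_timeout and worker helpers below are illustrative and not part of test.py:

import threading, time

TIMEOUT_SECONDS = 2              # illustrative value; test.py uses 3600
abort_tests = threading.Event()  # shared "stop early" flag

def run_with_timeout(work):
    # Arm a timer that only sets a flag; safe to use from worker threads.
    timed_out = threading.Event()
    timer = threading.Timer(TIMEOUT_SECONDS, timed_out.set)
    timer.start()
    try:
        work()
        if timed_out.is_set():
            raise TimeoutError("work exceeded the timeout")
    finally:
        timer.cancel()

def worker(work):
    # Workers check the shared flag and return quietly instead of raising,
    # so the scheduler's threads wind down in a well-defined state.
    if abort_tests.is_set():
        return
    try:
        run_with_timeout(work)
    except Exception:
        abort_tests.set()

threads = [threading.Thread(target=worker, args=(lambda: time.sleep(0.1),)) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()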

0 commit comments
