Skip to content

Commit 87fcd74

Browse files
committed
better monitoring
1 parent db91d6a commit 87fcd74

File tree

3 files changed

+141
-104
lines changed

3 files changed

+141
-104
lines changed

.github/scripts/monitor_slurm_job.sh

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,15 +50,24 @@ while [ ! -f "$output_file" ]; do
5050
done
5151

5252
echo "=== Streaming output for job $job_id ==="
53-
# Stream output while job runs
54-
tail -f "$output_file" &
53+
# Stream output while job runs (explicitly redirect to ensure output visibility)
54+
tail -f "$output_file" 2>&1 &
5555
tail_pid=$!
5656

57+
# Give tail a moment to start and show initial output
58+
sleep 2
59+
5760
# Wait for job to complete with retry logic for transient squeue failures
5861
squeue_failures=0
62+
heartbeat_counter=0
5963
while true; do
6064
if squeue -j "$job_id" &>/dev/null; then
6165
squeue_failures=0
66+
# Print heartbeat every 60 seconds (12 iterations * 5 sec)
67+
heartbeat_counter=$((heartbeat_counter + 1))
68+
if [ $((heartbeat_counter % 12)) -eq 0 ]; then
69+
echo "[$(date +%H:%M:%S)] Job $job_id still running..."
70+
fi
6271
else
6372
squeue_failures=$((squeue_failures + 1))
6473
# Check if job actually completed using sacct (if available)
@@ -68,6 +77,7 @@ while true; do
6877
# Consider job done only if it reached a terminal state
6978
case "$state" in
7079
COMPLETED|FAILED|CANCELLED|TIMEOUT|OUT_OF_MEMORY)
80+
echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state"
7181
break
7282
;;
7383
*)

.github/workflows/bench.yml

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ jobs:
108108
local interface=$3
109109
local cluster=$4
110110
111+
echo "[$dir] Submitting benchmark for $device-$interface on $cluster..."
111112
cd "$dir"
112113
113114
# Submit job
@@ -119,38 +120,61 @@ jobs:
119120
output_file="${job_slug}.out"
120121
121122
if [ -z "$job_id" ]; then
122-
echo "ERROR: Failed to submit job"
123+
echo "[$dir] ERROR: Failed to submit job"
123124
echo "$submit_output"
124125
return 1
125126
fi
126127
128+
echo "[$dir] Job ID: $job_id, monitoring output file: $output_file"
129+
127130
# Use the monitoring script
128131
bash .github/scripts/monitor_slurm_job.sh "$job_id" "$output_file"
132+
133+
echo "[$dir] Monitoring complete for job $job_id"
129134
}
130135
131136
# Run both jobs with monitoring
137+
echo "=========================================="
138+
echo "Starting parallel benchmark jobs..."
139+
echo "=========================================="
140+
132141
(submit_and_monitor pr ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}) &
133142
pr_pid=$!
143+
echo "PR job started in background (PID: $pr_pid)"
134144
135145
(submit_and_monitor master ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}) &
136146
master_pid=$!
147+
echo "Master job started in background (PID: $master_pid)"
148+
149+
echo "Waiting for both jobs to complete..."
137150
138151
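# Wait and capture exit codes. NOTE(review): inside `if ! wait "$pid"; then`, `$?` is the status of the negated condition (always 0), so pr_exit/master_exit stay 0 and failures go undetected; use `wait "$pid" || pr_exit=$?` instead.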
139152
pr_exit=0
140153
master_exit=0
141154
142155
if ! wait "$pr_pid"; then
143156
pr_exit=$?
157+
echo "PR job exited with code: $pr_exit"
158+
else
159+
echo "PR job completed successfully"
144160
fi
161+
145162
if ! wait "$master_pid"; then
146163
master_exit=$?
164+
echo "Master job exited with code: $master_exit"
165+
else
166+
echo "Master job completed successfully"
147167
fi
148168
149169
# Explicitly check and quote to avoid test errors
150170
if [ "${pr_exit}" -ne 0 ] || [ "${master_exit}" -ne 0 ]; then
151-
echo "One or both benchmark jobs failed: pr_exit=${pr_exit}, master_exit=${master_exit}"
171+
echo "ERROR: One or both benchmark jobs failed: pr_exit=${pr_exit}, master_exit=${master_exit}"
152172
exit 1
153173
fi
174+
175+
echo "=========================================="
176+
echo "Both benchmark jobs completed successfully!"
177+
echo "=========================================="
154178
155179
- name: Generate & Post Comment
156180
run: |

toolchain/mfc/bench.py

Lines changed: 103 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -29,118 +29,121 @@ def bench(targets = None):
2929
cons.print()
3030
cons.print(f"[bold]Benchmarking {format_list_to_string(ARG('targets'), 'magenta')} ([magenta]{os.path.relpath(bench_dirpath)}[/magenta]):[/bold]")
3131
cons.indent()
32-
cons.print()
3332

34-
CASES = [ BenchCase(**case) for case in file_load_yaml(MFC_BENCH_FILEPATH) ]
33+
try:
34+
cons.print()
3535

36-
for case in CASES:
37-
case.args = case.args + ARG("--")
38-
case.path = os.path.abspath(case.path)
36+
CASES = [ BenchCase(**case) for case in file_load_yaml(MFC_BENCH_FILEPATH) ]
3937

40-
# Validate case file exists early
41-
if not os.path.exists(case.path):
42-
raise MFCException(f"Benchmark case file not found: {case.path}")
38+
for case in CASES:
39+
case.args = case.args + ARG("--")
40+
case.path = os.path.abspath(case.path)
4341

44-
results = {
45-
"metadata": {
46-
"invocation": sys.argv[1:],
47-
"lock": dataclasses.asdict(CFG())
48-
},
49-
"cases": {},
50-
}
42+
# Validate case file exists early
43+
if not os.path.exists(case.path):
44+
raise MFCException(f"Benchmark case file not found: {case.path}")
5145

52-
failed_cases = []
46+
results = {
47+
"metadata": {
48+
"invocation": sys.argv[1:],
49+
"lock": dataclasses.asdict(CFG())
50+
},
51+
"cases": {},
52+
}
5353

54-
for i, case in enumerate(CASES):
55-
summary_filepath = os.path.join(bench_dirpath, f"{case.slug}.yaml")
56-
log_filepath = os.path.join(bench_dirpath, f"{case.slug}.out")
54+
failed_cases = []
5755

58-
cons.print(f"{str(i+1).zfill(len(CASES) // 10 + 1)}/{len(CASES)}: {case.slug} @ [bold]{os.path.relpath(case.path)}[/bold]")
59-
cons.indent()
60-
cons.print()
61-
cons.print(f"> Log: [bold]{os.path.relpath(log_filepath)}[/bold]")
62-
cons.print(f"> Summary: [bold]{os.path.relpath(summary_filepath)}[/bold]")
63-
64-
try:
65-
with open(log_filepath, "w") as log_file:
66-
result = system(
67-
["./mfc.sh", "run", case.path, "--case-optimization"] +
68-
["--targets"] + [t.name for t in targets] +
69-
["--output-summary", summary_filepath] +
70-
case.args +
71-
["--", "--gbpp", str(ARG('mem'))],
72-
stdout=log_file,
73-
stderr=subprocess.STDOUT)
74-
75-
# Check return code (handle CompletedProcess or int defensively)
76-
rc = result.returncode if hasattr(result, "returncode") else result
77-
if rc != 0:
78-
cons.print(f"[bold red]ERROR[/bold red]: Case {case.slug} failed with exit code {rc}")
79-
cons.print(f"[bold red] Check log at: {log_filepath}[/bold red]")
80-
failed_cases.append(case.slug)
81-
continue
56+
for i, case in enumerate(CASES):
57+
summary_filepath = os.path.join(bench_dirpath, f"{case.slug}.yaml")
58+
log_filepath = os.path.join(bench_dirpath, f"{case.slug}.out")
8259

83-
# Validate summary file exists
84-
if not os.path.exists(summary_filepath):
85-
cons.print(f"[bold red]ERROR[/bold red]: Summary file not created for {case.slug}")
86-
cons.print(f"[bold red] Expected: {summary_filepath}[/bold red]")
87-
failed_cases.append(case.slug)
88-
continue
89-
90-
# Load summary
91-
summary = file_load_yaml(summary_filepath)
92-
93-
# Validate all targets have required data
94-
validation_failed = False
95-
for target in targets:
96-
if target.name not in summary:
97-
cons.print(f"[bold red]ERROR[/bold red]: Target {target.name} missing from summary for {case.slug}")
98-
validation_failed = True
99-
break
100-
101-
if "exec" not in summary[target.name]:
102-
cons.print(f"[bold red]ERROR[/bold red]: 'exec' time missing for {target.name} in {case.slug}")
103-
validation_failed = True
104-
break
105-
106-
if target.name == "simulation" and "grind" not in summary[target.name]:
107-
cons.print(f"[bold red]ERROR[/bold red]: 'grind' time missing for simulation in {case.slug}")
108-
validation_failed = True
109-
break
110-
111-
if validation_failed:
60+
cons.print(f"{str(i+1).zfill(len(CASES) // 10 + 1)}/{len(CASES)}: {case.slug} @ [bold]{os.path.relpath(case.path)}[/bold]")
61+
cons.indent()
62+
cons.print()
63+
cons.print(f"> Log: [bold]{os.path.relpath(log_filepath)}[/bold]")
64+
cons.print(f"> Summary: [bold]{os.path.relpath(summary_filepath)}[/bold]")
65+
66+
try:
67+
with open(log_filepath, "w") as log_file:
68+
result = system(
69+
["./mfc.sh", "run", case.path, "--case-optimization"] +
70+
["--targets"] + [t.name for t in targets] +
71+
["--output-summary", summary_filepath] +
72+
case.args +
73+
["--", "--gbpp", str(ARG('mem'))],
74+
stdout=log_file,
75+
stderr=subprocess.STDOUT)
76+
77+
# Check return code (handle CompletedProcess or int defensively)
78+
rc = result.returncode if hasattr(result, "returncode") else result
79+
if rc != 0:
80+
cons.print(f"[bold red]ERROR[/bold red]: Case {case.slug} failed with exit code {rc}")
81+
cons.print(f"[bold red] Check log at: {log_filepath}[/bold red]")
82+
failed_cases.append(case.slug)
83+
continue
84+
85+
# Validate summary file exists
86+
if not os.path.exists(summary_filepath):
87+
cons.print(f"[bold red]ERROR[/bold red]: Summary file not created for {case.slug}")
88+
cons.print(f"[bold red] Expected: {summary_filepath}[/bold red]")
89+
failed_cases.append(case.slug)
90+
continue
91+
92+
# Load summary
93+
summary = file_load_yaml(summary_filepath)
94+
95+
# Validate all targets have required data
96+
validation_failed = False
97+
for target in targets:
98+
if target.name not in summary:
99+
cons.print(f"[bold red]ERROR[/bold red]: Target {target.name} missing from summary for {case.slug}")
100+
validation_failed = True
101+
break
102+
103+
if "exec" not in summary[target.name]:
104+
cons.print(f"[bold red]ERROR[/bold red]: 'exec' time missing for {target.name} in {case.slug}")
105+
validation_failed = True
106+
break
107+
108+
if target.name == "simulation" and "grind" not in summary[target.name]:
109+
cons.print(f"[bold red]ERROR[/bold red]: 'grind' time missing for simulation in {case.slug}")
110+
validation_failed = True
111+
break
112+
113+
if validation_failed:
114+
failed_cases.append(case.slug)
115+
continue
116+
117+
# Add to results
118+
results["cases"][case.slug] = {
119+
"description": dataclasses.asdict(case),
120+
"output_summary": summary,
121+
}
122+
cons.print(f"[bold green]✓[/bold green] Case {case.slug} completed successfully")
123+
124+
except Exception as e:
125+
cons.print(f"[bold red]ERROR[/bold red]: Unexpected error running {case.slug}: {e}")
126+
cons.print(f"[dim]{traceback.format_exc()}[/dim]")
112127
failed_cases.append(case.slug)
113-
continue
114-
115-
# Add to results
116-
results["cases"][case.slug] = {
117-
"description": dataclasses.asdict(case),
118-
"output_summary": summary,
119-
}
120-
cons.print(f"[bold green]✓[/bold green] Case {case.slug} completed successfully")
121-
122-
except Exception as e:
123-
cons.print(f"[bold red]ERROR[/bold red]: Unexpected error running {case.slug}: {e}")
124-
cons.print(f"[dim]{traceback.format_exc()}[/dim]")
125-
failed_cases.append(case.slug)
126-
finally:
127-
cons.unindent()
128-
129-
# Report results
130-
if failed_cases:
131-
cons.print()
132-
cons.print(f"[bold red]Failed cases ({len(failed_cases)}):[/bold red]")
133-
for slug in failed_cases:
134-
cons.print(f" - {slug}")
135-
cons.print()
136-
raise MFCException(f"Benchmarking failed: {len(failed_cases)}/{len(CASES)} cases failed")
128+
finally:
129+
cons.unindent()
130+
131+
# Report results
132+
if failed_cases:
133+
cons.print()
134+
cons.print(f"[bold red]Failed cases ({len(failed_cases)}):[/bold red]")
135+
for slug in failed_cases:
136+
cons.print(f" - {slug}")
137+
cons.print()
138+
raise MFCException(f"Benchmarking failed: {len(failed_cases)}/{len(CASES)} cases failed")
137139

138-
# Write output
139-
file_dump_yaml(ARG("output"), results)
140+
# Write output
141+
file_dump_yaml(ARG("output"), results)
140142

141-
cons.print(f"Wrote results to [bold magenta]{os.path.relpath(ARG('output'))}[/bold magenta].")
143+
cons.print(f"Wrote results to [bold magenta]{os.path.relpath(ARG('output'))}[/bold magenta].")
142144

143-
cons.unindent()
145+
finally:
146+
cons.unindent()
144147

145148

146149
# TODO: This function is too long and not nicely written at all. Someone should

0 commit comments

Comments
 (0)