Skip to content

Commit e8090bd

Browse files
committed
benchmarks hardening!
1 parent 6c380cd commit e8090bd

File tree

4 files changed

+263
-80
lines changed

4 files changed

+263
-80
lines changed

.github/workflows/bench.yml

Lines changed: 79 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@ jobs:
2323
filters: ".github/file-filter.yml"
2424

2525
self:
26-
name: "${{ matrix.name }} (${{ matrix.device }})"
26+
name: "${{ matrix.name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})"
2727
if: ${{ github.repository=='MFlowCode/MFC' && needs.file-changes.outputs.checkall=='true' && ((github.event_name=='pull_request_review' && github.event.review.state=='approved') || (github.event_name=='pull_request' && (github.event.pull_request.user.login=='sbryngelson' || github.event.pull_request.user.login=='wilfonba'))) }}
2828
needs: file-changes
2929
strategy:
@@ -73,7 +73,7 @@ jobs:
7373
runs-on:
7474
group: ${{ matrix.group }}
7575
labels: ${{ matrix.labels }}
76-
timeout-minutes: 1400
76+
timeout-minutes: 480
7777
env:
7878
ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16
7979
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
@@ -99,9 +99,83 @@ jobs:
9999
100100
- name: Bench (Master v. PR)
101101
run: |
102-
(cd pr && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh .github/workflows/${{ matrix.cluster }}/bench.sh ${{ matrix.device }} ${{ matrix.interface }}) &
103-
(cd master && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh .github/workflows/${{ matrix.cluster }}/bench.sh ${{ matrix.device }} ${{ matrix.interface }}) &
104-
wait %1 && wait %2
102+
set -e
103+
104+
# Function to submit and monitor
105+
submit_and_monitor() {
106+
local dir=$1
107+
local device=$2
108+
local interface=$3
109+
local cluster=$4
110+
111+
cd "$dir"
112+
113+
# Submit job
114+
submit_output=$(bash .github/workflows/$cluster/submit-bench.sh \
115+
.github/workflows/$cluster/bench.sh $device $interface 2>&1)
116+
117+
job_id=$(echo "$submit_output" | grep -oP 'Submitted batch job \K[0-9]+' || echo "")
118+
job_slug="bench-$device-$interface"
119+
output_file="${job_slug}.out"
120+
121+
if [ -z "$job_id" ]; then
122+
echo "ERROR: Failed to submit job"
123+
echo "$submit_output"
124+
return 1
125+
fi
126+
127+
echo "Submitted batch job $job_id"
128+
echo "Monitoring output file: $output_file"
129+
130+
# Wait for file to appear (check job status if it takes a while)
131+
echo "Waiting for job to start..."
132+
while [ ! -f "$output_file" ]; do
133+
# Check if job failed to start
134+
if ! squeue -j "$job_id" &>/dev/null && [ ! -f "$output_file" ]; then
135+
echo "ERROR: Job $job_id finished without creating output file"
136+
return 1
137+
fi
138+
sleep 5
139+
done
140+
141+
echo "=== Streaming output for job $job_id ==="
142+
# Stream output while job runs
143+
tail -f "$output_file" &
144+
tail_pid=$!
145+
146+
# Wait for job to complete (will wait up to GitHub Actions timeout)
147+
while squeue -j "$job_id" &>/dev/null; do
148+
sleep 5
149+
done
150+
151+
# Stop tailing
152+
kill $tail_pid 2>/dev/null || true
153+
154+
echo ""
155+
echo "=== Final output ==="
156+
cat "$output_file"
157+
158+
# Check exit status
159+
exit_code=$(scontrol show job "$job_id" 2>/dev/null | grep -oP 'ExitCode=\K[0-9]+' || echo "0:0")
160+
if [ "$exit_code" != "0" ] && [ "$exit_code" != "0:0" ]; then
161+
echo "ERROR: Job $job_id failed with exit code $exit_code"
162+
return 1
163+
fi
164+
}
165+
166+
# Run both jobs with monitoring
167+
(submit_and_monitor pr ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}) &
168+
pr_pid=$!
169+
170+
(submit_and_monitor master ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}) &
171+
master_pid=$!
172+
173+
wait $pr_pid && pr_exit=$? || pr_exit=$?
174+
wait $master_pid && master_exit=$? || master_exit=$?
175+
176+
if [ $pr_exit -ne 0 ] || [ $master_exit -ne 0 ]; then
177+
exit 1
178+
fi
105179
106180
- name: Generate & Post Comment
107181
run: |

.github/workflows/test.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,11 +93,11 @@ jobs:
9393
OPT2: ${{ matrix.debug == 'debug' && '-% 20' || '' }}
9494

9595
self:
96-
name: Self Hosted
96+
name: "${{ matrix.lbl == 'gt' && 'Georgia Tech | Phoenix' || 'Oak Ridge | Frontier' }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})"
9797
if: github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true'
9898
needs: file-changes
9999
continue-on-error: false
100-
timeout-minutes: 1400
100+
timeout-minutes: 480
101101
strategy:
102102
matrix:
103103
device: ['gpu']

toolchain/mfc/bench.py

Lines changed: 85 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ class BenchCase:
1616
path: str
1717
args: typing.List[str]
1818

19+
# pylint: disable=too-many-locals, too-many-branches, too-many-statements
1920
def bench(targets = None):
2021
if targets is None:
2122
targets = ARG("targets")
@@ -36,6 +37,10 @@ def bench(targets = None):
3637
case.args = case.args + ARG("--")
3738
case.path = os.path.abspath(case.path)
3839

40+
# Validate case file exists early
41+
if not os.path.exists(case.path):
42+
raise MFCException(f"Benchmark case file not found: {case.path}")
43+
3944
results = {
4045
"metadata": {
4146
"invocation": sys.argv[1:],
@@ -44,6 +49,8 @@ def bench(targets = None):
4449
"cases": {},
4550
}
4651

52+
failed_cases = []
53+
4754
for i, case in enumerate(CASES):
4855
summary_filepath = os.path.join(bench_dirpath, f"{case.slug}.yaml")
4956
log_filepath = os.path.join(bench_dirpath, f"{case.slug}.out")
@@ -54,21 +61,82 @@ def bench(targets = None):
5461
cons.print(f"> Log: [bold]{os.path.relpath(log_filepath)}[/bold]")
5562
cons.print(f"> Summary: [bold]{os.path.relpath(summary_filepath)}[/bold]")
5663

57-
with open(log_filepath, "w") as log_file:
58-
system(
59-
["./mfc.sh", "run", case.path, "--case-optimization"] +
60-
["--targets"] + [t.name for t in targets] +
61-
["--output-summary", summary_filepath] +
62-
case.args +
63-
["--", "--gbpp", ARG('mem')],
64-
stdout=log_file,
65-
stderr=subprocess.STDOUT)
66-
67-
results["cases"][case.slug] = {
68-
"description": dataclasses.asdict(case),
69-
"output_summary": file_load_yaml(summary_filepath),
70-
}
64+
try:
65+
with open(log_filepath, "w") as log_file:
66+
result = system(
67+
["./mfc.sh", "run", case.path, "--case-optimization"] +
68+
["--targets"] + [t.name for t in targets] +
69+
["--output-summary", summary_filepath] +
70+
case.args +
71+
["--", "--gbpp", ARG('mem')],
72+
stdout=log_file,
73+
stderr=subprocess.STDOUT)
74+
75+
# Check return code
76+
if result.returncode != 0:
77+
cons.print(f"[bold red]ERROR[/bold red]: Case {case.slug} failed with exit code {result.returncode}")
78+
cons.print(f"[bold red] Check log at: {log_filepath}[/bold red]")
79+
failed_cases.append(case.slug)
80+
cons.unindent()
81+
continue
82+
83+
# Validate summary file exists
84+
if not os.path.exists(summary_filepath):
85+
cons.print(f"[bold red]ERROR[/bold red]: Summary file not created for {case.slug}")
86+
cons.print(f"[bold red] Expected: {summary_filepath}[/bold red]")
87+
failed_cases.append(case.slug)
88+
cons.unindent()
89+
continue
90+
91+
# Load summary
92+
summary = file_load_yaml(summary_filepath)
93+
94+
# Validate all targets have required data
95+
validation_failed = False
96+
for target in targets:
97+
if target.name not in summary:
98+
cons.print(f"[bold red]ERROR[/bold red]: Target {target.name} missing from summary for {case.slug}")
99+
validation_failed = True
100+
break
101+
102+
if "exec" not in summary[target.name]:
103+
cons.print(f"[bold red]ERROR[/bold red]: 'exec' time missing for {target.name} in {case.slug}")
104+
validation_failed = True
105+
break
106+
107+
if target.name == "simulation" and "grind" not in summary[target.name]:
108+
cons.print(f"[bold red]ERROR[/bold red]: 'grind' time missing for simulation in {case.slug}")
109+
validation_failed = True
110+
break
111+
112+
if validation_failed:
113+
failed_cases.append(case.slug)
114+
cons.unindent()
115+
continue
116+
117+
# Add to results
118+
results["cases"][case.slug] = {
119+
"description": dataclasses.asdict(case),
120+
"output_summary": summary,
121+
}
122+
cons.print(f"[bold green]✓[/bold green] Case {case.slug} completed successfully")
123+
124+
except Exception as e:
125+
cons.print(f"[bold red]ERROR[/bold red]: Unexpected error running {case.slug}: {e}")
126+
failed_cases.append(case.slug)
127+
128+
cons.unindent()
129+
130+
# Report results
131+
if failed_cases:
132+
cons.print()
133+
cons.print(f"[bold red]Failed cases ({len(failed_cases)}):[/bold red]")
134+
for slug in failed_cases:
135+
cons.print(f" - {slug}")
136+
cons.print()
137+
raise MFCException(f"Benchmarking failed: {len(failed_cases)}/{len(CASES)} cases failed")
71138

139+
# Write output
72140
file_dump_yaml(ARG("output"), results)
73141

74142
cons.print(f"Wrote results to [bold magenta]{os.path.relpath(ARG('output'))}[/bold magenta].")
@@ -140,8 +208,9 @@ def diff():
140208
if grind_time_value <0.95:
141209
cons.print(f"[bold red]Error[/bold red]: Benchmarking failed since grind time speedup for {target.name} below acceptable threshold (<0.95) - Case: {slug}")
142210
err = 1
143-
except Exception as _:
144-
pass
211+
except Exception as e:
212+
cons.print(f"[bold red]ERROR[/bold red]: Failed to compute speedup for {target.name} in {slug}: {e}")
213+
err = 1
145214

146215
table.add_row(f"[magenta]{slug}[/magenta]", *speedups)
147216

0 commit comments

Comments
 (0)