Skip to content

Commit 31c16a3

Browse files
author
Arsham
committed
ci: add automated benchmark summary
1 parent 3382c11 commit 31c16a3

File tree

3 files changed

+195
-9
lines changed

3 files changed

+195
-9
lines changed

.github/workflows/benchmarks.yml

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,48 @@
11
name: Benchmarks
22
on:
33
schedule:
4-
- cron: "0 5 * * 1"
4+
- cron: "0 5 * * 1" # Every Monday 05:00 UTC
55
workflow_dispatch:
66

77
jobs:
8-
bench:
8+
run-benchmarks:
99
runs-on: ubuntu-latest
1010
steps:
1111
- uses: actions/checkout@v4
12+
1213
- name: Set up Python
1314
uses: actions/setup-python@v5
1415
with:
1516
python-version: "3.11"
17+
1618
- name: Install project
1719
run: |
1820
pip install -e .
21+
1922
- name: Run quick benchmarks
2023
run: |
21-
python -m tsu.benchmarks.runner --quick
22-
- name: Archive benchmark output
24+
python -m tsu.benchmarks.runner --quick || echo "Benchmark runner failed or not present."
25+
26+
- name: Generate summary
2327
run: |
24-
mkdir -p bench_artifacts
25-
cp -r visual_output bench_artifacts || true
26-
- name: Upload benchmark artifacts
28+
python scripts/extract_benchmarks.py
29+
30+
- name: Show summary
31+
run: |
32+
cat BENCHMARK_SUMMARY.md || true
33+
34+
- name: Commit summary (if changed)
35+
run: |
36+
git config user.name "github-actions"
37+
git config user.email "actions@github.com"
38+
git add BENCHMARK_SUMMARY.md README.md || true
39+
git diff --cached --quiet && echo "No changes to commit." || git commit -m "ci: update benchmark summary (automated)"
40+
git push || echo "Push skipped (e.g., PR or no changes)."
41+
42+
- name: Upload artifacts
2743
uses: actions/upload-artifact@v4
2844
with:
29-
name: benchmarks
30-
path: bench_artifacts
45+
name: benchmark-visual-output
46+
path: |
47+
visual_output
48+
BENCHMARK_SUMMARY.md

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,10 @@ results = bench.run_all()
278278
- Calibration: 100% coverage on 95% intervals
279279
- Epistemic uncertainty: 3× increase in extrapolation regions
280280

281+
<!-- BENCHMARK_SUMMARY_START -->
282+
(benchmark summary will auto-update here)
283+
<!-- BENCHMARK_SUMMARY_END -->
284+
281285
## Testing
282286

283287
Run the complete test suite (121 tests):

scripts/extract_benchmarks.py

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
#!/usr/bin/env python
2+
"""
3+
Extract and summarize benchmark results for TSU.
4+
5+
Process:
6+
1. Reads any generated benchmark output files in `visual_output/`:
7+
- benchmark_report.txt (human-readable)
8+
- benchmark_results.json (machine-readable) if present
9+
2. Produces a concise summary in BENCHMARK_SUMMARY.md
10+
3. Optionally updates README.md between markers:
11+
<!-- BENCHMARK_SUMMARY_START --> ... <!-- BENCHMARK_SUMMARY_END -->
12+
13+
Design Goals:
14+
- Idempotent: safe to run multiple times
15+
- Fails gracefully if benchmarks haven't run yet
16+
- Minimal parsing assumptions
17+
18+
Extendability:
19+
- Add additional metrics parsing (KL divergence, ESS, timing) when format stabilizes.
20+
"""
21+
22+
from __future__ import annotations
23+
import json
24+
import re
25+
from pathlib import Path
26+
from datetime import datetime
27+
28+
# Repository root: this script lives in scripts/, so go up two levels.
ROOT = Path(__file__).resolve().parent.parent
# Directory where the benchmark runner writes its output files.
VIS_DIR = ROOT / "visual_output"
# Generated summary document (committed back by CI).
SUMMARY_FILE = ROOT / "BENCHMARK_SUMMARY.md"
README_FILE = ROOT / "README.md"

# HTML-comment markers delimiting the auto-updated section of README.md.
START_MARKER = "<!-- BENCHMARK_SUMMARY_START -->"
END_MARKER = "<!-- BENCHMARK_SUMMARY_END -->"
35+
36+
37+
def load_report_text() -> str | None:
    """Return the human-readable benchmark report text, if one exists.

    Reads ``visual_output/benchmark_report.txt``; returns None when the
    benchmarks have not produced a report yet.
    """
    report_path = VIS_DIR / "benchmark_report.txt"
    return report_path.read_text(encoding="utf-8") if report_path.exists() else None
42+
43+
44+
def load_results_json() -> dict | None:
    """Return the machine-readable benchmark results, if present and valid.

    Reads ``visual_output/benchmark_results.json``; returns None when the
    file is absent or cannot be decoded as JSON (graceful degradation).
    """
    results_path = VIS_DIR / "benchmark_results.json"
    if not results_path.exists():
        return None
    try:
        return json.loads(results_path.read_text(encoding="utf-8"))
    except json.JSONDecodeError:
        # Malformed output is treated the same as missing output.
        return None
52+
53+
54+
def parse_metrics(report_text: str | None, results_json: dict | None) -> dict:
    """Extract a few key benchmark metrics heuristically.

    Prefers the structured JSON results; falls back to regex-scraping the
    text report for the Gaussian KL, and finally leaves "n/a" placeholders
    so the summary can always be rendered.

    Args:
        report_text: Contents of benchmark_report.txt, or None.
        results_json: Decoded benchmark_results.json, or None.

    Returns:
        Dict with keys ``gaussian_kl``, ``multimodal_modes``, ``ising_gap``,
        ``regression_coverage`` and a UTC ``timestamp`` (ISO-8601, "Z" suffix).
    """
    metrics = {
        "gaussian_kl": "n/a",
        "multimodal_modes": "n/a",
        "ising_gap": "n/a",
        "regression_coverage": "n/a",
        # Timezone-aware clock; datetime.utcnow() is deprecated (3.12+) and
        # naive. strftime keeps the exact "YYYY-MM-DDTHH:MM:SSZ" format.
        "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
    }

    if results_json:
        # Heuristic keys (adjust when the runner's output format stabilizes).
        # Expected (pseudo) structure:
        #   {"sampling": {"gaussian": {"kl": ...},
        #                 "multimodal": {"modes_found": ...}},
        #    "optimization": {"ising": {"gap": ...}},
        #    "ml": {"regression": {"coverage": ...}}}
        sampling = results_json.get("sampling", {})
        gauss = sampling.get("gaussian", {})
        metrics["gaussian_kl"] = gauss.get("kl", metrics["gaussian_kl"])
        multi = sampling.get("multimodal", {})
        metrics["multimodal_modes"] = multi.get("modes_found", metrics["multimodal_modes"])

        opt = results_json.get("optimization", {})
        ising = opt.get("ising", {})
        metrics["ising_gap"] = ising.get("gap", metrics["ising_gap"])

        ml = results_json.get("ml", {})
        reg = ml.get("regression", {})
        cov = reg.get("coverage", None)
        if cov is not None:
            # Render numeric fractions as percentages; keep anything
            # non-numeric verbatim (narrow except: only conversion errors).
            try:
                cov_val = float(cov)
                metrics["regression_coverage"] = f"{cov_val*100:.1f}%"
            except (TypeError, ValueError):
                metrics["regression_coverage"] = str(cov)

    # Fallback attempt: scrape the text report when JSON lacked the KL value.
    if report_text and metrics["gaussian_kl"] == "n/a":
        kl_match = re.search(r"Gaussian.*?KL\s*[:=]\s*([\d\.eE-]+)", report_text)
        if kl_match:
            metrics["gaussian_kl"] = kl_match.group(1)

    return metrics
103+
104+
105+
def build_markdown(metrics: dict) -> str:
    """Render the metrics dict as the BENCHMARK_SUMMARY.md document.

    Produces a short Markdown page: timestamp header, a metric table, and
    instructions for reproducing the run locally.
    """
    table_rows = [
        ("Gaussian KL divergence", metrics["gaussian_kl"]),
        ("Multimodal modes found", metrics["multimodal_modes"]),
        ("Ising optimality gap", metrics["ising_gap"]),
        ("Regression coverage (95% CI)", metrics["regression_coverage"]),
    ]

    parts = [
        "# Benchmark Summary",
        "",
        f"Last updated (UTC): `{metrics['timestamp']}`",
        "",
        "Key metrics (quick mode or last run):",
        "",
        "| Metric | Value |",
        "|--------|-------|",
    ]
    parts.extend(f"| {label} | {value} |" for label, value in table_rows)
    parts.extend([
        "",
        "Run locally:",
        "```bash",
        "python -m tsu.benchmarks.runner --quick",
        "```",
        "",
        "Full benchmark details: see `visual_output/` artifacts or run full mode without `--quick`.",
    ])
    return "\n".join(parts)
127+
128+
129+
def write_summary(markdown: str) -> None:
    """Persist the rendered summary to BENCHMARK_SUMMARY.md (UTF-8)."""
    with SUMMARY_FILE.open("w", encoding="utf-8") as fh:
        fh.write(markdown)
131+
132+
133+
def update_readme(markdown: str) -> None:
    """Splice the summary into README.md between the marker comments.

    No-op when README.md is missing or when either marker is absent
    (idempotent and safe on repos without the markers). Rewrites the file
    only when the spliced content actually differs.

    Args:
        markdown: The summary document produced by build_markdown().
    """
    if not README_FILE.exists():
        return
    original = README_FILE.read_text(encoding="utf-8")
    if START_MARKER not in original or END_MARKER not in original:
        # Do nothing if markers not present
        return

    # re.escape keeps the pattern literal even if the markers ever gain
    # regex metacharacters.
    pattern = re.compile(
        rf"{re.escape(START_MARKER)}.*?{re.escape(END_MARKER)}",
        re.DOTALL,
    )
    replacement = f"{START_MARKER}\n{markdown}\n{END_MARKER}"
    # Use a callable replacement: a plain string would interpret any
    # backslashes in the generated markdown as regex escapes (re.error
    # or silent corruption).
    updated = pattern.sub(lambda _match: replacement, original)
    if updated != original:
        README_FILE.write_text(updated, encoding="utf-8")
149+
150+
151+
def main():
    """Script entry point: parse benchmark outputs, then write the summary.

    Pipeline: load report/JSON -> extract metrics -> render Markdown ->
    write BENCHMARK_SUMMARY.md -> update README.md (if markers exist).
    """
    metrics = parse_metrics(load_report_text(), load_results_json())
    summary_md = build_markdown(metrics)
    write_summary(summary_md)
    update_readme(summary_md)
    print("Benchmark summary generated.")
    print(f" -> {SUMMARY_FILE}")
    print("README updated (if markers present).")
161+
162+
163+
# Allow direct execution: python scripts/extract_benchmarks.py
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)