#!/usr/bin/env python
"""
Extract and summarize benchmark results for TSU.

Process:
1. Reads any generated benchmark output files in `visual_output/`:
   - benchmark_report.txt (human-readable)
   - benchmark_results.json (machine-readable), if present
2. Produces a concise summary in BENCHMARK_SUMMARY.md
3. Optionally updates README.md between markers:
   <!-- BENCHMARK_SUMMARY_START --> ... <!-- BENCHMARK_SUMMARY_END -->

Design Goals:
- Idempotent: safe to run multiple times
- Fails gracefully if benchmarks haven't run yet
- Minimal parsing assumptions

Extendability:
- Add parsing for additional metrics (KL divergence, ESS, timing) once the
  report format stabilizes; a hypothetical sketch follows parse_metrics() below.
"""
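# Typical invocation (assumes this script lives one directory below the
# repository root, consistent with ROOT below; the path is illustrative):
#
#   python scripts/summarize_benchmarks.py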

from __future__ import annotations
import json
import re
from pathlib import Path
from datetime import datetime, timezone

ROOT = Path(__file__).resolve().parent.parent
VIS_DIR = ROOT / "visual_output"
SUMMARY_FILE = ROOT / "BENCHMARK_SUMMARY.md"
README_FILE = ROOT / "README.md"

START_MARKER = "<!-- BENCHMARK_SUMMARY_START -->"
END_MARKER = "<!-- BENCHMARK_SUMMARY_END -->"


def load_report_text() -> str | None:
    """Return the text report's contents, or None if it doesn't exist."""
    txt_path = VIS_DIR / "benchmark_report.txt"
    if txt_path.exists():
        return txt_path.read_text(encoding="utf-8")
    return None


def load_results_json() -> dict | None:
    """Return the parsed JSON results, or None if missing or malformed."""
    json_path = VIS_DIR / "benchmark_results.json"
    if json_path.exists():
        try:
            return json.loads(json_path.read_text(encoding="utf-8"))
        except json.JSONDecodeError:
            return None
    return None


def parse_metrics(report_text: str | None, results_json: dict | None) -> dict:
    """
    Extract a few key metrics heuristically.
    Falls back to simple placeholders when a metric is unavailable.
    """
    metrics = {
        "gaussian_kl": "n/a",
        "multimodal_modes": "n/a",
        "ising_gap": "n/a",
        "regression_coverage": "n/a",
        # datetime.utcnow() is deprecated since Python 3.12; use an aware UTC
        # datetime and normalize the offset to the conventional "Z" suffix.
        "timestamp": datetime.now(timezone.utc)
        .isoformat(timespec="seconds")
        .replace("+00:00", "Z"),
    }

    if results_json:
        # Heuristic keys (adjust when stable)
        # Example expected structure (pseudo):
        # {
        #   "sampling": {"gaussian": {"kl": 0.0023}, "multimodal": {"modes_found": 3}},
        #   "optimization": {"ising": {"gap": 0.0}},
        #   "ml": {"regression": {"coverage": 1.0}}
        # }
        sampling = results_json.get("sampling", {})
        gauss = sampling.get("gaussian", {})
        metrics["gaussian_kl"] = gauss.get("kl", metrics["gaussian_kl"])
        multi = sampling.get("multimodal", {})
        metrics["multimodal_modes"] = multi.get("modes_found", metrics["multimodal_modes"])

        opt = results_json.get("optimization", {})
        ising = opt.get("ising", {})
        metrics["ising_gap"] = ising.get("gap", metrics["ising_gap"])

        ml = results_json.get("ml", {})
        reg = ml.get("regression", {})
        cov = reg.get("coverage", None)
        if cov is not None:
            # Format as a percentage if the value is a fraction
            try:
                cov_val = float(cov)
                metrics["regression_coverage"] = f"{cov_val * 100:.1f}%"
            except (TypeError, ValueError):
                metrics["regression_coverage"] = str(cov)

    # Fallback: scan the text report for hints if the JSON lacked the metric
    if report_text and metrics["gaussian_kl"] == "n/a":
        kl_match = re.search(r"Gaussian.*?KL\s*[:=]\s*([\d.eE+-]+)", report_text)
        if kl_match:
            metrics["gaussian_kl"] = kl_match.group(1)

    return metrics


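# --- Extension sketch (see "Extendability" in the module docstring) ---
# A minimal, hypothetical example of the kind of parser that could be added
# once the report format stabilizes. The "ESS" label and line format assumed
# below are illustrative, not something the current benchmarks emit.
def parse_ess(report_text: str | None) -> str:
    """Heuristically extract an effective-sample-size value, if reported."""
    if report_text:
        # Hypothetical line format assumed: "ESS: 1234.5"
        match = re.search(r"ESS\s*[:=]\s*([\d.eE+-]+)", report_text)
        if match:
            return match.group(1)
    return "n/a"

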
def build_markdown(metrics: dict) -> str:
    lines = []
    lines.append("# Benchmark Summary")
    lines.append("")
    lines.append(f"Last updated (UTC): `{metrics['timestamp']}`")
    lines.append("")
    lines.append("Key metrics (quick mode or last run):")
    lines.append("")
    lines.append("| Metric | Value |")
    lines.append("|--------|-------|")
    lines.append(f"| Gaussian KL divergence | {metrics['gaussian_kl']} |")
    lines.append(f"| Multimodal modes found | {metrics['multimodal_modes']} |")
    lines.append(f"| Ising optimality gap | {metrics['ising_gap']} |")
    lines.append(f"| Regression coverage (95% CI) | {metrics['regression_coverage']} |")
    lines.append("")
    lines.append("Run locally:")
    lines.append("```bash")
    lines.append("python -m tsu.benchmarks.runner --quick")
    lines.append("```")
    lines.append("")
    lines.append("Full benchmark details: see `visual_output/` artifacts or run full mode without `--quick`.")
    return "\n".join(lines)


def write_summary(markdown: str):
    # Ensure the generated file ends with a trailing newline.
    SUMMARY_FILE.write_text(markdown + "\n", encoding="utf-8")


def update_readme(markdown: str):
    if not README_FILE.exists():
        return
    original = README_FILE.read_text(encoding="utf-8")
    if START_MARKER not in original or END_MARKER not in original:
        # Leave the README untouched if the markers are not present
        return

    # Escape the markers so the pattern stays correct even if they are ever
    # changed to contain regex metacharacters.
    pattern = re.compile(
        re.escape(START_MARKER) + r".*?" + re.escape(END_MARKER),
        re.DOTALL,
    )
    replacement = f"{START_MARKER}\n{markdown}\n{END_MARKER}"
    # Use a callable replacement so backslashes in the markdown are not
    # interpreted as regex group references.
    updated = pattern.sub(lambda _match: replacement, original)
    if updated != original:
        README_FILE.write_text(updated, encoding="utf-8")


def main():
    report_text = load_report_text()
    results_json = load_results_json()
    metrics = parse_metrics(report_text, results_json)
    md = build_markdown(metrics)
    write_summary(md)
    update_readme(md)
    print("Benchmark summary generated.")
    print(f" -> {SUMMARY_FILE}")
    print("README updated (if markers present).")


if __name__ == "__main__":
    main()