Add missing parse_benchmark_results.py script

mwiewior · claude · mwiewior · commit bf06cdda8fb5 · 2025-10-24T19:58:44.000+02:00
The compare_benchmark_results.sh script was calling this file, but it didn't exist. This script:
- Parses baseline and PR CSV benchmark results
- Compares polars_bio performance against a threshold
- Generates JSON and markdown reports
- Detects performance regressions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
diff --git a/benchmarks/parse_benchmark_results.py b/benchmarks/parse_benchmark_results.py
@@ -0,0 +1,240 @@
+#!/usr/bin/env python3
+"""Parse and compare benchmark CSV results between baseline and PR."""
+
+import argparse
+import csv
+import json
+import sys
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+
+def parse_csv(csv_path: Path) -> Dict[str, float]:
+    """Parse a benchmark CSV into a mapping of library name to mean time.
+
+    Every data row contributes one entry, keyed by its ``Library`` column
+    and valued by its ``Mean (s)`` column converted to ``float``. If a
+    library appears in more than one row, the last row wins.
+
+    Expected format:
+    Library,Min (s),Max (s),Mean (s),Speedup
+    polars_bio,0.051166,0.066682,0.056735,1.00x
+    ...
+
+    Args:
+        csv_path: Path of the CSV file to read.
+
+    Returns:
+        Dict mapping each library name to its mean time.
+
+    Raises:
+        OSError: If the file cannot be opened.
+        KeyError: If the ``Library`` or ``Mean (s)`` column is missing.
+        ValueError: If a ``Mean (s)`` value is not a valid float.
+    """
+    # newline="" hands line-ending handling to the csv module, as its
+    # documentation requires (otherwise newlines inside quoted fields are
+    # altered). The explicit encoding keeps parsing locale-independent.
+    with open(csv_path, newline="", encoding="utf-8") as f:
+        return {
+            row["Library"]: float(row["Mean (s)"]) for row in csv.DictReader(f)
+        }
+
+
+def compare_results(
+    baseline: Dict[str, float],
+    pr: Dict[str, float],
+    threshold: float,
+) -> Tuple[Dict, List[str]]:
+    """Build a baseline-vs-PR comparison and collect regressions.
+
+    Only libraries present in both mappings are compared. For each of them
+    the absolute difference and the percentage change relative to the
+    baseline are recorded. A regression is reported only for the
+    ``polars_bio`` entry, when its percentage change exceeds ``threshold``.
+
+    Args:
+        baseline: Library name -> baseline time.
+        pr: Library name -> PR time.
+        threshold: Percentage increase above which ``polars_bio`` counts as
+            regressed.
+
+    Returns:
+        (summary_dict, regressions_list), where ``summary_dict`` has the keys
+        ``baseline``, ``pr``, ``changes`` and ``regressions`` (a count), and
+        ``regressions_list`` holds one formatted line per regression.
+    """
+    baseline_times = {}
+    pr_times = {}
+    changes = {}
+    regressions = []
+
+    for library, baseline_time in baseline.items():
+        # Libraries that were not measured in the PR run are left out.
+        if library not in pr:
+            continue
+        pr_time = pr[library]
+
+        # A non-positive baseline cannot serve as a divisor; report 0 then.
+        change_pct = 0
+        if baseline_time > 0:
+            change_pct = ((pr_time - baseline_time) / baseline_time) * 100
+
+        baseline_times[library] = baseline_time
+        pr_times[library] = pr_time
+        changes[library] = {
+            "absolute": pr_time - baseline_time,
+            "percentage": change_pct,
+        }
+
+        # Other libraries are informational only; they never count as
+        # regressions.
+        if library != "polars_bio" or not change_pct > threshold:
+            continue
+        regressions.append(
+            f"{library}: {baseline_time:.3f}s -> {pr_time:.3f}s ({change_pct:+.1f}%)"
+        )
+
+    summary = {
+        "baseline": baseline_times,
+        "pr": pr_times,
+        "changes": changes,
+        "regressions": len(regressions),
+    }
+    return summary, regressions
+
+
+def generate_report(
+    operation: str,
+    summary: Dict,
+    regressions: List[str],
+    baseline_tag: str,
+    pr_ref: str,
+    threshold: float,
+) -> str:
+    """Render the comparison of one operation as a markdown section.
+
+    The section consists of a heading, the baseline/PR/threshold header, a
+    regression notice (with one bullet per entry of ``regressions`` when
+    ``summary["regressions"]`` is positive) and a table with one row per
+    library in ``summary["baseline"]``, sorted by library name.
+
+    Args:
+        operation: Name used for the section heading.
+        summary: Dict with ``baseline``, ``pr``, ``changes`` and
+            ``regressions`` keys.
+        regressions: Pre-formatted regression lines.
+        baseline_tag: Label shown for the baseline.
+        pr_ref: Label shown for the PR.
+        threshold: Regression threshold percentage shown in the header.
+
+    Returns:
+        The markdown text, ending with a newline.
+    """
+    regression_count = summary["regressions"]
+
+    report = [
+        f"## {operation}",
+        "",
+        f"**Baseline:** {baseline_tag}  ",
+        f"**PR:** {pr_ref}  ",
+        f"**Threshold:** {threshold}%",
+        "",
+    ]
+
+    if regression_count > 0:
+        report.append(f"⚠️ **{regression_count} regression(s) detected**")
+        report += ["", "### Regressions", ""]
+        report += [f"- {entry}" for entry in regressions]
+        report.append("")
+    else:
+        report += ["✓ **No regressions detected**", ""]
+
+    # Performance comparison table
+    report += [
+        "### Performance Comparison",
+        "",
+        "| Library | Baseline (s) | PR (s) | Change |",
+        "|---------|--------------|--------|---------|",
+    ]
+
+    for library in sorted(summary["baseline"]):
+        before = summary["baseline"][library]
+        after = summary["pr"][library]
+        pct = summary["changes"][library]["percentage"]
+
+        # Any slowdown is marked red; only improvements beyond 5% are
+        # marked green.
+        if pct > 0:
+            marker = "🔴 "
+        elif pct < -5:
+            marker = "🟢 "
+        else:
+            marker = ""
+
+        report.append(
+            f"| {library} | {before:.3f} | {after:.3f} | {marker}{pct:+.1f}% |"
+        )
+
+    report.append("")
+    return "\n".join(report)
+
+
+def _write_text_file(path: Path, text: str) -> None:
+    """Create the parent directories of ``path`` and write ``text`` as UTF-8."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    # The markdown report contains emoji, which a locale-default encoding
+    # such as cp1252 cannot encode; always write UTF-8.
+    with open(path, "w", encoding="utf-8") as f:
+        f.write(text)
+
+
+def main() -> int:
+    """Compare a baseline and a PR benchmark CSV from the command line.
+
+    Parses both CSV files, compares them against ``--threshold`` and writes
+    whichever of the optional outputs were requested: the raw parsed values
+    (``--output-json``), the comparison summary (``--output-comparison``)
+    and the markdown report (``--output-report``). A one-line polars_bio
+    summary is printed to stdout.
+
+    The operation name used in the outputs is the part of the baseline CSV
+    file name before the first underscore.
+
+    Returns:
+        1 if either CSV file could not be parsed, otherwise 0. Detected
+        regressions do not change the exit code.
+    """
+    parser = argparse.ArgumentParser(description="Compare benchmark CSV results")
+    parser.add_argument("baseline_csv", type=Path, help="Baseline CSV file")
+    parser.add_argument("pr_csv", type=Path, help="PR CSV file")
+    parser.add_argument(
+        "--threshold",
+        type=float,
+        default=150.0,
+        help="Regression threshold percentage (default: 150)",
+    )
+    parser.add_argument(
+        "--baseline-tag", default="baseline", help="Baseline version tag"
+    )
+    parser.add_argument("--pr-ref", default="PR", help="PR reference name")
+    parser.add_argument(
+        "--output-json", type=Path, help="Output JSON file for detailed results"
+    )
+    parser.add_argument(
+        "--output-comparison", type=Path, help="Output JSON file for comparison summary"
+    )
+    parser.add_argument(
+        "--output-report", type=Path, help="Output markdown report file"
+    )
+
+    args = parser.parse_args()
+
+    # Parse CSVs. This is the top-level boundary of the script, so any
+    # failure is reported on stderr and turned into a non-zero exit code.
+    try:
+        baseline = parse_csv(args.baseline_csv)
+        pr = parse_csv(args.pr_csv)
+    except Exception as e:
+        print(f"Error parsing CSV files: {e}", file=sys.stderr)
+        return 1
+
+    # Compare results
+    operation = args.baseline_csv.stem.split("_")[0]
+    summary, regressions = compare_results(baseline, pr, args.threshold)
+
+    # Only polars_bio is checked for regressions, so make it visible when it
+    # could not be compared instead of silently reporting "no regression".
+    if "polars_bio" not in summary["changes"]:
+        print(
+            "Warning: polars_bio is missing from the baseline or PR results; "
+            "regression check skipped",
+            file=sys.stderr,
+        )
+
+    # Generate outputs
+    if args.output_json:
+        _write_text_file(
+            args.output_json,
+            json.dumps({"baseline": baseline, "pr": pr}, indent=2),
+        )
+
+    if args.output_comparison:
+        _write_text_file(
+            args.output_comparison,
+            json.dumps(
+                {
+                    "operation": operation,
+                    "summary": summary,
+                    "regressions": regressions,
+                },
+                indent=2,
+            ),
+        )
+
+    if args.output_report:
+        _write_text_file(
+            args.output_report,
+            generate_report(
+                operation,
+                summary,
+                regressions,
+                args.baseline_tag,
+                args.pr_ref,
+                args.threshold,
+            ),
+        )
+
+    # Print summary
+    print(
+        f"  polars_bio: {baseline.get('polars_bio', 0):.3f}s -> "
+        f"{pr.get('polars_bio', 0):.3f}s",
+        end="",
+    )
+
+    if summary["regressions"] > 0:
+        print(f" ⚠️ {summary['regressions']} regression(s)")
+    else:
+        change = summary["changes"].get("polars_bio", {}).get("percentage", 0)
+        print(f" ({change:+.1f}%)")
+
+    return 0
+
+
+# Run the CLI only when executed as a script; main()'s return value becomes
+# the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())