Commit 5fe641c

Expand benchmark workflow and documentation
1 parent a7354a5

6 files changed: +171 additions, -60 deletions

.github/workflows/ace-benchmark.yml

Lines changed: 44 additions & 9 deletions
@@ -8,8 +8,42 @@ on:
 
 jobs:
   run-benchmark:
-    name: Run ACE benchmark
+    name: Run ACE benchmark (${{ matrix.name }})
     runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - name: finance-baseline
+            dataset: benchmarks/finance_subset.jsonl
+            variant: baseline
+            output: results/benchmark/baseline_finance.json
+            use_gt: "true"
+            temperature: "0.5"
+          - name: finance-ace-gt
+            dataset: benchmarks/finance_subset.jsonl
+            variant: ace_full
+            output: results/benchmark/ace_finance_gt.json
+            use_gt: "true"
+            temperature: "0.5"
+          - name: finance-ace-no-gt
+            dataset: benchmarks/finance_subset.jsonl
+            variant: ace_full
+            output: results/benchmark/ace_finance_no_gt.json
+            use_gt: "false"
+            temperature: "0.5"
+          - name: agent-baseline
+            dataset: benchmarks/agent_small.jsonl
+            variant: baseline
+            output: results/benchmark/baseline_agent.json
+            use_gt: "true"
+            temperature: "0.8"
+          - name: agent-ace
+            dataset: benchmarks/agent_small.jsonl
+            variant: ace_full
+            output: results/benchmark/ace_agent.json
+            use_gt: "true"
+            temperature: "0.8"
     env:
       OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
@@ -43,15 +77,16 @@ jobs:
       - name: Ensure benchmark results directory exists
        run: mkdir -p results/benchmark
 
-      - name: Baseline run
-        run: python scripts/run_benchmark.py benchmarks/finance_subset.jsonl baseline --output results/benchmark/baseline_finance.json
-
-      - name: ACE run
-        run: python scripts/run_benchmark.py benchmarks/finance_subset.jsonl ace_full --output results/benchmark/ace_finance.json
+      - name: Run benchmark (${{ matrix.variant }} | ${{ matrix.dataset }})
+        env:
+          ACE_BENCHMARK_USE_GROUND_TRUTH: ${{ matrix.use_gt }}
+          ACE_BENCHMARK_TEMPERATURE: ${{ matrix.temperature }}
+        run: |
+          python scripts/run_benchmark.py ${{ matrix.dataset }} ${{ matrix.variant }} --output ${{ matrix.output }}
 
-      - name: Upload benchmark artifacts
+      - name: Upload benchmark artifact
         uses: actions/upload-artifact@v4
         with:
-          name: ace-benchmark-results
-          path: results/benchmark/*.json
+          name: ace-benchmark-${{ matrix.name }}
+          path: ${{ matrix.output }}
           if-no-files-found: error
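To make the matrix concrete, here is a hypothetical helper (not part of the repo) showing how one matrix entry expands into the environment and command the job executes; field names mirror the YAML above.

```python
# Hypothetical sketch: render the shell command a matrix entry produces.
def job_command(entry: dict) -> str:
    """Expand one workflow matrix entry into env vars plus the benchmark command."""
    env = (
        f"ACE_BENCHMARK_USE_GROUND_TRUTH={entry['use_gt']} "
        f"ACE_BENCHMARK_TEMPERATURE={entry['temperature']}"
    )
    cmd = (
        f"python scripts/run_benchmark.py {entry['dataset']} "
        f"{entry['variant']} --output {entry['output']}"
    )
    return f"{env} {cmd}"

# The finance-ace-no-gt entry from the matrix above.
finance_no_gt = {
    "dataset": "benchmarks/finance_subset.jsonl",
    "variant": "ace_full",
    "output": "results/benchmark/ace_finance_no_gt.json",
    "use_gt": "false",
    "temperature": "0.5",
}
print(job_command(finance_no_gt))
```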

README.md

Lines changed: 3 additions & 0 deletions
@@ -150,6 +150,9 @@ python scripts/run_benchmark.py benchmarks/finance_subset.jsonl ace_full --outpu
 
 # ACE vs baseline live loop comparison (ACE + EE harness)
 python benchmarks/run_live_loop_benchmark.py --backend dspy --episodes 10
+
+# Trigger the CI workflow (optional)
+gh workflow run ace-benchmark.yml
 ```
 
 Key metrics in the JSON output:

docs/combined_quickstart.md

Lines changed: 49 additions & 0 deletions
@@ -112,3 +112,52 @@ routes feedback back into the curator. Replace the dummy client with a real
   latency or cost.
 * Feed the `ExperienceBuffer` into training dashboards or analytics systems to
   monitor adoption of new playbook bullets.
+
+## CI Benchmarks
+
+The repository includes a GitHub Actions workflow
+(`.github/workflows/ace-benchmark.yml`) that runs the finance and agent
+benchmarks under several configurations:
+
+- Finance baseline vs ACE (with ground-truth feedback),
+- Finance ACE with ground-truth disabled (reflector relies solely on execution
+  cues),
+- Agent baseline vs ACE on `benchmarks/agent_small.jsonl` at a higher generator
+  temperature.
+
+Each matrix entry runs in an isolated job, initialises a fresh SQLite schema,
+and uploads its metrics JSON as an artifact (for example,
+`ace-benchmark-finance-ace-no-gt`).
+
+### Triggering the workflow
+
+1. Store `OPENROUTER_API_KEY` (or another provider key) as a repository secret.
+2. From the **Actions** tab, choose **ACE Benchmark → Run workflow** (manual) or
+   rely on the automatic trigger for pushes to `main`.
+3. After the run completes, download the artifacts. You’ll find:
+   - `baseline_finance.json`, `ace_finance_gt.json`,
+     `ace_finance_no_gt.json`,
+   - `baseline_agent.json`, `ace_agent.json`.
+
+### Environment knobs
+
+`scripts/run_benchmark.py` respects the following environment variables (also
+used by the workflow matrix):
+
+- `ACE_BENCHMARK_TEMPERATURE` – overrides the generator temperature for both CoT
+  and ReAct variants.
+- `ACE_BENCHMARK_USE_GROUND_TRUTH` – set to `false`/`0`/`off` to withhold
+  ground-truth answers from the reflector (accuracy is still evaluated against
+  ground truth).
+
+Example local invocation:
+
+```bash
+ACE_BENCHMARK_TEMPERATURE=0.5 \
+ACE_BENCHMARK_USE_GROUND_TRUTH=false \
+python scripts/run_benchmark.py benchmarks/finance_subset.jsonl ace_full \
+  --output results/benchmark/ace_finance_no_gt.json
+```
+
+The resulting JSON files provide the raw evidence (accuracy, promotions,
+increments, auto-format corrections) that mirrors the tables in the ACE paper.
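As an illustration of consuming those artifacts, here is a minimal sketch (not part of the repo) that computes headline numbers from a metrics file; the key names match the benchmark JSON output shown in this commit.

```python
import json

def summarise(raw: str) -> str:
    """Summarise one benchmark metrics JSON document as a single line."""
    m = json.loads(raw)
    accuracy = m["correct"] / m["total"]  # accuracy is evaluated against ground truth
    return (
        f"{m['variant']}: accuracy={accuracy:.1%} "
        f"promotions={m['promotions']} "
        f"format_corrections={len(m.get('format_corrections', []))}"
    )

# Toy payload with the same shape as results/benchmark/ace_finance.json.
sample = (
    '{"variant": "ace_full", "total": 26, "correct": 26,'
    ' "promotions": 18, "format_corrections": [{}, {}, {}]}'
)
print(summarise(sample))  # → ace_full: accuracy=100.0% promotions=18 format_corrections=3
```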

results/benchmark/ace_finance.json

Lines changed: 19 additions & 29 deletions
@@ -2,54 +2,44 @@
   "variant": "ace_full",
   "total": 26,
   "correct": 26,
-  "promotions": 10,
+  "promotions": 18,
   "quarantines": 0,
-  "new_bullets": 2,
-  "increments": 5,
+  "new_bullets": 4,
+  "increments": 1,
   "latency_ms": [],
   "failures": [],
-  "auto_corrections": [
+  "format_corrections": [
     {
-      "task_id": "fin-002",
-      "original_answer": "38%",
+      "task_id": "fin-006",
+      "original_answer": "The ROI for the investment is 30%.",
       "corrected_answer": "30%"
     },
+    {
+      "task_id": "fin-007",
+      "original_answer": "The net profit margin is 15%.",
+      "corrected_answer": "15%"
+    },
+    {
+      "task_id": "fin-026",
+      "original_answer": "The retention ratio is 0.75 (or 75%).",
+      "corrected_answer": "75%"
+    }
+  ],
+  "auto_corrections": [
     {
       "task_id": "fin-014",
       "original_answer": "19.13%",
       "corrected_answer": "19.11%"
     },
     {
       "task_id": "fin-019",
-      "original_answer": "38.00%",
+      "original_answer": "21.60%",
       "corrected_answer": "21.65%"
     },
     {
       "task_id": "fin-020",
       "original_answer": "188.77",
       "corrected_answer": "188.71"
-    },
-    {
-      "task_id": "fin-025",
-      "original_answer": "6.00%",
-      "corrected_answer": "8.40%"
-    }
-  ],
-  "format_corrections": [
-    {
-      "task_id": "fin-006",
-      "original_answer": "The ROI is (240 / 800) * 100 = 30%.",
-      "corrected_answer": "30%"
-    },
-    {
-      "task_id": "fin-007",
-      "original_answer": "The net profit margin is 15%.",
-      "corrected_answer": "15%"
-    },
-    {
-      "task_id": "fin-026",
-      "original_answer": "The retention ratio is 0.75 (or 75%).",
-      "corrected_answer": "75%"
     }
   ]
 }
Lines changed: 35 additions & 19 deletions
@@ -1,50 +1,66 @@
 {
   "variant": "baseline",
   "total": 26,
-  "correct": 26,
+  "correct": 24,
   "promotions": 0,
   "quarantines": 0,
-  "new_bullets": 5,
-  "increments": 83,
+  "new_bullets": 62,
+  "increments": 16,
   "latency_ms": [],
-  "failures": [],
-  "format_corrections": [
-    {
-      "task_id": "fin-006",
-      "original_answer": "The ROI is 30%.",
-      "corrected_answer": "30%"
-    },
+  "failures": [
     {
-      "task_id": "fin-007",
-      "original_answer": "The net profit margin is 15%.",
-      "corrected_answer": "15%"
+      "task_id": "fin-009",
+      "answer": "75.54",
+      "ground_truth": "78.25"
     },
     {
       "task_id": "fin-026",
-      "original_answer": "Retention Ratio = (1200 - 300) / 1200 = 900 / 1200 = 0.75 or 75%",
-      "corrected_answer": "75%"
+      "answer": "0.75",
+      "ground_truth": "75%"
     }
   ],
   "auto_corrections": [
+    {
+      "task_id": "fin-002",
+      "original_answer": "29%",
+      "corrected_answer": "30%"
+    },
     {
       "task_id": "fin-014",
-      "original_answer": "30.0%",
+      "original_answer": "19.13%",
       "corrected_answer": "19.11%"
     },
     {
       "task_id": "fin-019",
-      "original_answer": "19.66%",
+      "original_answer": "21.64%",
       "corrected_answer": "21.65%"
     },
+    {
+      "task_id": "fin-023",
+      "original_answer": "8.33%",
+      "corrected_answer": "9.38%"
+    },
     {
       "task_id": "fin-024",
-      "original_answer": "5247.77",
+      "original_answer": "3579.14",
       "corrected_answer": "3685.69"
     },
     {
       "task_id": "fin-025",
-      "original_answer": "9.60%",
+      "original_answer": "10.00%",
       "corrected_answer": "8.40%"
     }
+  ],
+  "format_corrections": [
+    {
+      "task_id": "fin-006",
+      "original_answer": "The ROI is 30%.",
+      "corrected_answer": "30%"
+    },
+    {
+      "task_id": "fin-007",
+      "original_answer": "The net profit margin is 15.0%.",
+      "corrected_answer": "15%"
+    }
   ]
 }

scripts/run_benchmark.py

Lines changed: 21 additions & 3 deletions
@@ -4,12 +4,12 @@
 
 import argparse
 import json
+import math
 import os
 import re
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Dict, List
-import math
 
 import dspy
 from dotenv import load_dotenv
@@ -177,15 +177,33 @@ def run_variant(tasks: List[Dict], variant: VariantConfig) -> Dict:
 
     model_name = configure_lm()
 
+    default_temperature = 0.2 if variant.enable_react else 0.7
+    temperature_override = os.getenv("ACE_BENCHMARK_TEMPERATURE")
+    if temperature_override:
+        try:
+            default_temperature = float(temperature_override)
+        except ValueError:
+            logger.warning("invalid_temperature_override", value=temperature_override)
+
     generator = (
-        create_react_generator(model=model_name) if variant.enable_react else create_cot_generator(model=model_name)
+        create_react_generator(model=model_name, temperature=default_temperature)
+        if variant.enable_react
+        else create_cot_generator(model=model_name, temperature=default_temperature)
     )
     reflector = GroundedReflector(model=model_name)
     curator = CuratorService()
     merge_coordinator = MergeCoordinator(curator) if variant.enable_merge_coordinator else None
     refinement_scheduler = None
     runtime_adapter = None
 
+    use_ground_truth_env = os.getenv("ACE_BENCHMARK_USE_GROUND_TRUTH")
+    use_ground_truth = True
+    if use_ground_truth_env is not None:
+        use_ground_truth = use_ground_truth_env.strip().lower() not in {"0", "false", "off", "no"}
+
+    metrics["generator_temperature"] = default_temperature
+    metrics["reflector_use_ground_truth"] = use_ground_truth
+
     with get_session() as session:
         stage_manager = StageManager(session)
         curator_service = curator
@@ -242,7 +260,7 @@ def run_variant(tasks: List[Dict], variant: VariantConfig) -> Dict:
                 answer=original_answer,
                 confidence=result.confidence,
                 bullets_referenced=result.bullets_referenced,
-                ground_truth=task.get("ground_truth"),
+                ground_truth=task.get("ground_truth") if use_ground_truth else None,
                 domain="benchmark",
             )
 
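The two environment overrides added to `scripts/run_benchmark.py` can be read in isolation as the following standalone sketch; function names are hypothetical, and the structured logging is reduced to a comment.

```python
import os

# Values that disable ground-truth feedback, mirroring the set in the diff.
FALSY = {"0", "false", "off", "no"}

def resolve_temperature(default: float) -> float:
    """Return ACE_BENCHMARK_TEMPERATURE if it parses as a float, else the default."""
    raw = os.getenv("ACE_BENCHMARK_TEMPERATURE")
    if raw:
        try:
            return float(raw)
        except ValueError:
            pass  # the real script logs a warning and keeps the default
    return default

def resolve_use_ground_truth() -> bool:
    """Unset means True; only an explicitly falsy value disables ground truth."""
    raw = os.getenv("ACE_BENCHMARK_USE_GROUND_TRUTH")
    if raw is None:
        return True
    return raw.strip().lower() not in FALSY
```

Note the asymmetry: an unparsable temperature silently falls back to the variant default, while any non-falsy string (even a typo) leaves ground truth enabled.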
