Skip to content

Commit 14b8ad5

Browse files
Add Opus correctness reviewer
Adds a focused correctness checker that uses Claude Opus to verify factual claims in explanations. Instead of abstract scoring dimensions, it identifies specific errors and warnings:

- Instruction semantics (e.g., lea as address calc vs memory access)
- Complexity/performance claims (e.g., O(2^n) vs O(n))
- Optimisation level characterisation
- Register usage and calling conventions

Usage:
  prompt-test run --review        # Run + review in one step
  prompt-test review results.json # Review existing results

Each issue is classified as error (would mislead a student) or warning (imprecise but not strictly wrong).

🤖 Generated by LLM (Claude, via OpenClaw)
1 parent 9988c18 commit 14b8ad5

File tree

3 files changed

+258
-5
lines changed

3 files changed

+258
-5
lines changed

prompt_testing/README.md

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,16 @@ Simple framework for testing prompt changes against curated test cases.
88
# Run all test cases with the current production prompt
99
uv run prompt-test run
1010

11+
# Run with Opus correctness review (catches factual errors)
12+
uv run prompt-test run --review
13+
1114
# Run specific cases or categories
12-
uv run prompt-test run --cases basic_loop_001 basic_inline_001
15+
uv run prompt-test run --cases basic_loop_001 --cases basic_inline_001
1316
uv run prompt-test run --categories loop_optimization
1417

18+
# Review existing results with Opus
19+
uv run prompt-test review results/20250221_120000_current.json
20+
1521
# Compare two result files
1622
uv run prompt-test compare results_a.json results_b.json
1723

@@ -23,10 +29,20 @@ uv run prompt-test list
2329

2430
1. **Test cases** live in `test_cases/*.yaml` — each has source code, compiler flags, and real assembly output
2531
2. `prompt-test run` sends each case to the Claude API using the current prompt and saves all outputs
26-
3. You read the outputs and decide if they're good
27-
4. To compare prompt changes: run once before, once after, then `prompt-test compare`
32+
3. `--review` flag runs each output through Opus for **correctness checking** — it identifies specific factual errors rather than giving abstract scores
33+
4. You read the outputs (and any flagged issues) and decide if they're good
34+
5. To compare prompt changes: run once before, once after, then `prompt-test compare`
35+
36+
### Correctness Review
37+
38+
The `--review` flag uses Claude Opus to check explanations for factual errors. Unlike generic scoring, it looks for specific issues:
39+
40+
- **Instruction semantics**: Is `lea` correctly described as address computation, not memory access?
41+
- **Complexity claims**: Does it claim O(n) when it's actually O(2^n)?
42+
- **Optimisation characterisation**: Does it correctly identify unoptimised code?
43+
- **Register usage**: Are calling conventions right?
2844

29-
No automated scoring, no Claude-as-judge, no web UI. The human is the judge.
45+
Each issue is flagged as an **error** (would mislead a student) or **warning** (imprecise but not wrong).
3046

3147
## Test Case Format
3248

@@ -68,6 +84,7 @@ prompt_testing/
6884
├── results/ # Saved test run outputs (JSON, gitignored)
6985
├── ce_api/ # Compiler Explorer API client
7086
├── runner.py # Test runner
87+
├── reviewer.py # Opus correctness checker
7188
├── cli.py # CLI commands
7289
├── enricher.py # CE API enrichment
7390
├── file_utils.py # File I/O helpers

prompt_testing/cli.py

Lines changed: 89 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
Simple commands:
44
prompt-test run Run all test cases, save results
55
prompt-test run --cases foo bar Run specific cases
6+
prompt-test run --review Also run Opus correctness review
7+
prompt-test review results.json Review existing results with Opus
68
prompt-test compare A B Compare two result files side by side
79
prompt-test list List available test cases
810
prompt-test enrich Enrich test cases with real CE assembly
@@ -40,21 +42,29 @@ def cli(ctx, project_root):
4042
@click.option("--categories", multiple=True, help="Filter by category")
4143
@click.option("--output", help="Output filename")
4244
@click.option("--max-concurrent", type=int, default=5)
45+
@click.option("--review", is_flag=True, help="Also run Opus correctness review on results")
46+
@click.option("--review-model", default="claude-opus-4-6", help="Model for correctness review")
4347
@click.pass_context
44-
def run(ctx, prompt, cases, categories, output, max_concurrent):
48+
def run(ctx, prompt, cases, categories, output, max_concurrent, review, review_model):
4549
"""Run test cases and save results for review."""
4650
tester = PromptTester(ctx.obj["project_root"], max_concurrent=max_concurrent)
4751
results = tester.run(
4852
prompt_version=prompt,
4953
case_ids=list(cases) if cases else None,
5054
categories=list(categories) if categories else None,
5155
)
56+
57+
if review:
58+
results = asyncio.run(_run_reviews(ctx.obj["project_root"], results, review_model))
59+
5260
tester.save(results, output)
5361

5462
# Summary
5563
click.echo(
5664
f"\n{results['successful']}/{results['total_cases']} succeeded, total cost: ${results['total_cost_usd']:.4f}"
5765
)
66+
if review:
67+
_print_review_summary(results)
5868

5969

6070
@cli.command()
@@ -184,6 +194,84 @@ def compilers(ctx, language, search, limit): # noqa: ARG001
184194
click.echo(f"... and {len(results) - limit} more")
185195

186196

197+
async def _run_reviews(project_root: Path, results: dict, model: str) -> dict:
198+
"""Run correctness reviews on all successful results."""
199+
from prompt_testing.reviewer import CorrectnessReviewer
200+
201+
reviewer = CorrectnessReviewer(model=model)
202+
test_dir = project_root / "prompt_testing" / "test_cases"
203+
all_cases = load_all_test_cases(str(test_dir))
204+
cases_by_id = {c["id"]: c for c in all_cases}
205+
206+
successful = [r for r in results["results"] if r["success"]]
207+
click.echo(f"\nReviewing {len(successful)} results with {model}...")
208+
209+
review_cost = 0.0
210+
errors_found = 0
211+
212+
for i, result in enumerate(successful, 1):
213+
case = cases_by_id.get(result["case_id"])
214+
if not case:
215+
continue
216+
217+
review = await reviewer.review_test_result(case, result["explanation"])
218+
result["review"] = review
219+
220+
status = "✓" if review.get("correct") else "✗"
221+
n_issues = len(review.get("issues", []))
222+
if not review.get("correct"):
223+
errors_found += 1
224+
# Opus pricing: $15/M in, $75/M out
225+
cost = review.get("reviewer_input_tokens", 0) * 15 / 1e6 + review.get("reviewer_output_tokens", 0) * 75 / 1e6
226+
review_cost += cost
227+
click.echo(f" [{i}/{len(successful)}] {status} {result['case_id']} ({n_issues} issues, ${cost:.4f})")
228+
229+
results["review_model"] = model
230+
results["review_cost_usd"] = round(review_cost, 6)
231+
results["total_cost_usd"] = round(results["total_cost_usd"] + review_cost, 6)
232+
results["errors_found"] = errors_found
233+
return results
234+
235+
236+
def _print_review_summary(results: dict) -> None:
237+
"""Print a summary of correctness reviews."""
238+
reviewed = [r for r in results["results"] if r.get("review")]
239+
correct = sum(1 for r in reviewed if r["review"].get("correct"))
240+
incorrect = len(reviewed) - correct
241+
242+
click.echo(f"\nCorrectness: {correct}/{len(reviewed)} passed")
243+
if incorrect:
244+
click.echo(f"\n{incorrect} case(s) with issues:")
245+
for r in reviewed:
246+
review = r["review"]
247+
if not review.get("correct"):
248+
click.echo(f"\n {r['case_id']}:")
249+
for issue in review.get("issues", []):
250+
sev = "🔴" if issue["severity"] == "error" else "🟡"
251+
click.echo(f" {sev} {issue['claim']}")
252+
click.echo(f" → {issue['correction']}")
253+
254+
click.echo(f"\nReview cost: ${results.get('review_cost_usd', 0):.4f} ({results.get('review_model', '?')})")
255+
256+
257+
@cli.command()
258+
@click.argument("results_file")
259+
@click.option("--model", default="claude-opus-4-6", help="Reviewer model")
260+
@click.pass_context
261+
def review(ctx, results_file, model):
262+
"""Run Opus correctness review on existing results."""
263+
results_dir = ctx.obj["project_root"] / "prompt_testing" / "results"
264+
path = results_dir / results_file if not Path(results_file).is_absolute() else Path(results_file)
265+
266+
results = json.loads(path.read_text())
267+
results = asyncio.run(_run_reviews(ctx.obj["project_root"], results, model))
268+
269+
# Save updated results
270+
path.write_text(json.dumps(results, indent=2))
271+
click.echo(f"\nUpdated {path}")
272+
_print_review_summary(results)
273+
274+
187275
def main():
188276
cli()
189277

prompt_testing/reviewer.py

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
"""Correctness reviewer using a powerful model to check explanations.
2+
3+
Uses Opus to verify factual claims in explanations generated by cheaper models.
4+
Instead of abstract scoring dimensions, asks specific questions about correctness.
5+
"""
6+
7+
import json
8+
from typing import Any
9+
10+
from anthropic import AsyncAnthropic
11+
12+
# System prompt for the reviewer model. Asks for a bare JSON verdict (no
# markdown fencing) listing concrete factual issues rather than abstract
# scores. NOTE(review): internal indentation of the JSON example was
# reconstructed — the scrape stripped leading whitespace; confirm against
# the original file.
REVIEW_SYSTEM_PROMPT = """\
You are an expert reviewer of assembly language explanations. Your job is to \
verify the factual correctness of explanations generated by another AI model.

You will receive:
1. Source code and compilation options
2. The assembly output
3. An explanation of that assembly

Your task is to check the explanation for factual errors. Focus on:
- **Instruction semantics**: Are instructions correctly described? (e.g., does \
`lea` actually access memory, or just compute an address?)
- **Register usage**: Are calling conventions and register purposes correct?
- **Optimisation claims**: Are claims about what optimisations were applied accurate?
- **Complexity/performance claims**: Are any Big-O or performance claims correct?
- **Optimisation level characterisation**: If the code is unoptimised (no flags), \
does the explanation say so confidently rather than hedging?
- **Completeness**: Are important aspects of the assembly missed entirely?

Respond with a JSON object (no markdown fencing):
{
  "correct": true/false,
  "issues": [
    {
      "severity": "error" | "warning",
      "claim": "The specific claim from the explanation",
      "correction": "What's actually correct",
      "location": "Brief quote from the explanation where this appears"
    }
  ],
  "summary": "One-line overall assessment"
}

"error" = factually wrong (would mislead a student)
"warning" = imprecise, misleading, or could be better but not strictly wrong

If the explanation is fully correct, return {"correct": true, "issues": [], \
"summary": "..."}."""

# User-message template; placeholders are filled by CorrectnessReviewer.review.
REVIEW_USER_TEMPLATE = """\
## Source code ({language}, compiled with {compiler} {options})
```
{code}
```

## Assembly ({arch})
```
{assembly}
```

## Explanation to review
{explanation}"""
64+
65+
66+
class CorrectnessReviewer:
67+
"""Reviews explanations for factual correctness using a powerful model."""
68+
69+
def __init__(self, model: str = "claude-opus-4-6"):
70+
self.model = model
71+
self.client = AsyncAnthropic()
72+
73+
async def review(
74+
self,
75+
*,
76+
language: str,
77+
compiler: str,
78+
options: list[str],
79+
arch: str,
80+
code: str,
81+
assembly: str,
82+
explanation: str,
83+
) -> dict[str, Any]:
84+
"""Review a single explanation for correctness.
85+
86+
Returns a dict with 'correct' (bool), 'issues' (list), 'summary' (str).
87+
"""
88+
user_prompt = REVIEW_USER_TEMPLATE.format(
89+
language=language,
90+
compiler=compiler,
91+
options=" ".join(options) if options else "(no flags)",
92+
code=code,
93+
arch=arch or "unknown",
94+
assembly=assembly,
95+
explanation=explanation,
96+
)
97+
98+
msg = await self.client.messages.create(
99+
model=self.model,
100+
max_tokens=2048,
101+
temperature=0.0,
102+
system=REVIEW_SYSTEM_PROMPT,
103+
messages=[{"role": "user", "content": user_prompt}],
104+
)
105+
106+
text = msg.content[0].text.strip()
107+
108+
# Parse JSON response
109+
try:
110+
result = json.loads(text)
111+
except json.JSONDecodeError:
112+
# Try to extract JSON from markdown fencing
113+
if "```" in text:
114+
json_part = text.split("```")[1]
115+
if json_part.startswith("json"):
116+
json_part = json_part[4:]
117+
result = json.loads(json_part.strip())
118+
else:
119+
result = {
120+
"correct": None,
121+
"issues": [],
122+
"summary": f"Failed to parse reviewer response: {text[:200]}",
123+
}
124+
125+
result["reviewer_model"] = self.model
126+
result["reviewer_input_tokens"] = msg.usage.input_tokens
127+
result["reviewer_output_tokens"] = msg.usage.output_tokens
128+
129+
return result
130+
131+
async def review_test_result(
132+
self,
133+
test_case: dict[str, Any],
134+
explanation: str,
135+
) -> dict[str, Any]:
136+
"""Review a test result using the test case data."""
137+
inp = test_case["input"]
138+
asm_text = "\n".join(a["text"] for a in inp["asm"] if isinstance(a, dict) and "text" in a)
139+
140+
return await self.review(
141+
language=inp.get("language", "unknown"),
142+
compiler=inp.get("compiler", "unknown"),
143+
options=inp.get("compilationOptions", []),
144+
arch=inp.get("instructionSet", "unknown"),
145+
code=inp.get("code", ""),
146+
assembly=asm_text,
147+
explanation=explanation,
148+
)

0 commit comments

Comments
 (0)