Skip to content

Commit 14b8ad5

Browse files
Add Opus correctness reviewer
Adds a focused correctness checker that uses Claude Opus to verify factual claims in explanations. Instead of abstract scoring dimensions, it identifies specific errors and warnings:

- Instruction semantics (e.g., lea as address calc vs memory access)
- Complexity/performance claims (e.g., O(2^n) vs O(n))
- Optimisation level characterisation
- Register usage and calling conventions

Usage:
  prompt-test run --review        # Run + review in one step
  prompt-test review results.json # Review existing results

Each issue is classified as error (would mislead a student) or warning (imprecise but not strictly wrong).

🤖 Generated by LLM (Claude, via OpenClaw)
1 parent 9988c18 commit 14b8ad5

File tree

3 files changed

+258
-5
lines changed

3 files changed

+258
-5
lines changed

prompt_testing/README.md

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,16 @@ Simple framework for testing prompt changes against curated test cases.
88
# Run all test cases with the current production prompt
99
uv run prompt-test run
1010

11+
# Run with Opus correctness review (catches factual errors)
12+
uv run prompt-test run --review
13+
1114
# Run specific cases or categories
12-
uv run prompt-test run --cases basic_loop_001 basic_inline_001
15+
uv run prompt-test run --cases basic_loop_001 --cases basic_inline_001
1316
uv run prompt-test run --categories loop_optimization
1417

18+
# Review existing results with Opus
19+
uv run prompt-test review results/20250221_120000_current.json
20+
1521
# Compare two result files
1622
uv run prompt-test compare results_a.json results_b.json
1723

@@ -23,10 +29,20 @@ uv run prompt-test list
2329

2430
1. **Test cases** live in `test_cases/*.yaml` — each has source code, compiler flags, and real assembly output
2531
2. `prompt-test run` sends each case to the Claude API using the current prompt and saves all outputs
26-
3. You read the outputs and decide if they're good
27-
4. To compare prompt changes: run once before, once after, then `prompt-test compare`
32+
3. `--review` flag runs each output through Opus for **correctness checking** — it identifies specific factual errors rather than giving abstract scores
33+
4. You read the outputs (and any flagged issues) and decide if they're good
34+
5. To compare prompt changes: run once before, once after, then `prompt-test compare`
35+
36+
### Correctness Review
37+
38+
The `--review` flag uses Claude Opus to check explanations for factual errors. Unlike generic scoring, it looks for specific issues:
39+
40+
- **Instruction semantics**: Is `lea` correctly described as address computation, not memory access?
41+
- **Complexity claims**: Does it claim O(n) when it's actually O(2^n)?
42+
- **Optimisation characterisation**: Does it correctly identify unoptimised code?
43+
- **Register usage**: Are calling conventions right?
2844

29-
No automated scoring, no Claude-as-judge, no web UI. The human is the judge.
45+
Each issue is flagged as an **error** (would mislead a student) or **warning** (imprecise but not wrong).
3046

3147
## Test Case Format
3248

@@ -68,6 +84,7 @@ prompt_testing/
6884
├── results/ # Saved test run outputs (JSON, gitignored)
6985
├── ce_api/ # Compiler Explorer API client
7086
├── runner.py # Test runner
87+
├── reviewer.py # Opus correctness checker
7188
├── cli.py # CLI commands
7289
├── enricher.py # CE API enrichment
7390
├── file_utils.py # File I/O helpers

prompt_testing/cli.py

Lines changed: 89 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
Simple commands:
44
prompt-test run Run all test cases, save results
55
prompt-test run --cases foo bar Run specific cases
6+
prompt-test run --review Also run Opus correctness review
7+
prompt-test review results.json Review existing results with Opus
68
prompt-test compare A B Compare two result files side by side
79
prompt-test list List available test cases
810
prompt-test enrich Enrich test cases with real CE assembly
@@ -40,21 +42,29 @@ def cli(ctx, project_root):
4042
@click.option("--categories", multiple=True, help="Filter by category")
4143
@click.option("--output", help="Output filename")
4244
@click.option("--max-concurrent", type=int, default=5)
45+
@click.option("--review", is_flag=True, help="Also run Opus correctness review on results")
46+
@click.option("--review-model", default="claude-opus-4-6", help="Model for correctness review")
4347
@click.pass_context
44-
def run(ctx, prompt, cases, categories, output, max_concurrent):
48+
def run(ctx, prompt, cases, categories, output, max_concurrent, review, review_model):
4549
"""Run test cases and save results for review."""
4650
tester = PromptTester(ctx.obj["project_root"], max_concurrent=max_concurrent)
4751
results = tester.run(
4852
prompt_version=prompt,
4953
case_ids=list(cases) if cases else None,
5054
categories=list(categories) if categories else None,
5155
)
56+
57+
if review:
58+
results = asyncio.run(_run_reviews(ctx.obj["project_root"], results, review_model))
59+
5260
tester.save(results, output)
5361

5462
# Summary
5563
click.echo(
5664
f"\n{results['successful']}/{results['total_cases']} succeeded, total cost: ${results['total_cost_usd']:.4f}"
5765
)
66+
if review:
67+
_print_review_summary(results)
5868

5969

6070
@cli.command()
@@ -184,6 +194,84 @@ def compilers(ctx, language, search, limit): # noqa: ARG001
184194
click.echo(f"... and {len(results) - limit} more")
185195

186196

197+
async def _run_reviews(project_root: Path, results: dict, model: str) -> dict:
198+
"""Run correctness reviews on all successful results."""
199+
from prompt_testing.reviewer import CorrectnessReviewer
200+
201+
reviewer = CorrectnessReviewer(model=model)
202+
test_dir = project_root / "prompt_testing" / "test_cases"
203+
all_cases = load_all_test_cases(str(test_dir))
204+
cases_by_id = {c["id"]: c for c in all_cases}
205+
206+
successful = [r for r in results["results"] if r["success"]]
207+
click.echo(f"\nReviewing {len(successful)} results with {model}...")
208+
209+
review_cost = 0.0
210+
errors_found = 0
211+
212+
for i, result in enumerate(successful, 1):
213+
case = cases_by_id.get(result["case_id"])
214+
if not case:
215+
continue
216+
217+
review = await reviewer.review_test_result(case, result["explanation"])
218+
result["review"] = review
219+
220+
status = "✓" if review.get("correct") else "✗"
221+
n_issues = len(review.get("issues", []))
222+
if not review.get("correct"):
223+
errors_found += 1
224+
# Opus pricing: $15/M in, $75/M out
225+
cost = review.get("reviewer_input_tokens", 0) * 15 / 1e6 + review.get("reviewer_output_tokens", 0) * 75 / 1e6
226+
review_cost += cost
227+
click.echo(f" [{i}/{len(successful)}] {status} {result['case_id']} ({n_issues} issues, ${cost:.4f})")
228+
229+
results["review_model"] = model
230+
results["review_cost_usd"] = round(review_cost, 6)
231+
results["total_cost_usd"] = round(results["total_cost_usd"] + review_cost, 6)
232+
results["errors_found"] = errors_found
233+
return results
234+
235+
236+
def _print_review_summary(results: dict) -> None:
237+
"""Print a summary of correctness reviews."""
238+
reviewed = [r for r in results["results"] if r.get("review")]
239+
correct = sum(1 for r in reviewed if r["review"].get("correct"))
240+
incorrect = len(reviewed) - correct
241+
242+
click.echo(f"\nCorrectness: {correct}/{len(reviewed)} passed")
243+
if incorrect:
244+
click.echo(f"\n{incorrect} case(s) with issues:")
245+
for r in reviewed:
246+
review = r["review"]
247+
if not review.get("correct"):
248+
click.echo(f"\n {r['case_id']}:")
249+
for issue in review.get("issues", []):
250+
sev = "🔴" if issue["severity"] == "error" else "🟡"
251+
click.echo(f" {sev} {issue['claim']}")
252+
click.echo(f" → {issue['correction']}")
253+
254+
click.echo(f"\nReview cost: ${results.get('review_cost_usd', 0):.4f} ({results.get('review_model', '?')})")
255+
256+
257+
@cli.command()
258+
@click.argument("results_file")
259+
@click.option("--model", default="claude-opus-4-6", help="Reviewer model")
260+
@click.pass_context
261+
def review(ctx, results_file, model):
262+
"""Run Opus correctness review on existing results."""
263+
results_dir = ctx.obj["project_root"] / "prompt_testing" / "results"
264+
path = results_dir / results_file if not Path(results_file).is_absolute() else Path(results_file)
265+
266+
results = json.loads(path.read_text())
267+
results = asyncio.run(_run_reviews(ctx.obj["project_root"], results, model))
268+
269+
# Save updated results
270+
path.write_text(json.dumps(results, indent=2))
271+
click.echo(f"\nUpdated {path}")
272+
_print_review_summary(results)
273+
274+
187275
def main():
188276
cli()
189277

prompt_testing/reviewer.py

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
"""Correctness reviewer using a powerful model to check explanations.
2+
3+
Uses Opus to verify factual claims in explanations generated by cheaper models.
4+
Instead of abstract scoring dimensions, asks specific questions about correctness.
5+
"""
6+
7+
import json
8+
from typing import Any
9+
10+
from anthropic import AsyncAnthropic
11+
12+
# System prompt for the reviewer model. Asks for a bare JSON verdict (no
# markdown fencing) listing concrete factual issues rather than abstract
# scores. NOTE(review): internal indentation of the JSON example was
# reconstructed — the scrape stripped leading whitespace; confirm against
# the original file.
REVIEW_SYSTEM_PROMPT = """\
You are an expert reviewer of assembly language explanations. Your job is to \
verify the factual correctness of explanations generated by another AI model.

You will receive:
1. Source code and compilation options
2. The assembly output
3. An explanation of that assembly

Your task is to check the explanation for factual errors. Focus on:
- **Instruction semantics**: Are instructions correctly described? (e.g., does \
`lea` actually access memory, or just compute an address?)
- **Register usage**: Are calling conventions and register purposes correct?
- **Optimisation claims**: Are claims about what optimisations were applied accurate?
- **Complexity/performance claims**: Are any Big-O or performance claims correct?
- **Optimisation level characterisation**: If the code is unoptimised (no flags), \
does the explanation say so confidently rather than hedging?
- **Completeness**: Are important aspects of the assembly missed entirely?

Respond with a JSON object (no markdown fencing):
{
  "correct": true/false,
  "issues": [
    {
      "severity": "error" | "warning",
      "claim": "The specific claim from the explanation",
      "correction": "What's actually correct",
      "location": "Brief quote from the explanation where this appears"
    }
  ],
  "summary": "One-line overall assessment"
}

"error" = factually wrong (would mislead a student)
"warning" = imprecise, misleading, or could be better but not strictly wrong

If the explanation is fully correct, return {"correct": true, "issues": [], \
"summary": "..."}."""

# User-message template; placeholders are filled by CorrectnessReviewer.review.
REVIEW_USER_TEMPLATE = """\
## Source code ({language}, compiled with {compiler} {options})
```
{code}
```

## Assembly ({arch})
```
{assembly}
```

## Explanation to review
{explanation}"""
64+
65+
66+
class CorrectnessReviewer:
67+
"""Reviews explanations for factual correctness using a powerful model."""
68+
69+
def __init__(self, model: str = "claude-opus-4-6"):
70+
self.model = model
71+
self.client = AsyncAnthropic()
72+
73+
async def review(
74+
self,
75+
*,
76+
language: str,
77+
compiler: str,
78+
options: list[str],
79+
arch: str,
80+
code: str,
81+
assembly: str,
82+
explanation: str,
83+
) -> dict[str, Any]:
84+
"""Review a single explanation for correctness.
85+
86+
Returns a dict with 'correct' (bool), 'issues' (list), 'summary' (str).
87+
"""
88+
user_prompt = REVIEW_USER_TEMPLATE.format(
89+
language=language,
90+
compiler=compiler,
91+
options=" ".join(options) if options else "(no flags)",
92+
code=code,
93+
arch=arch or "unknown",
94+
assembly=assembly,
95+
explanation=explanation,
96+
)
97+
98+
msg = await self.client.messages.create(
99+
model=self.model,
100+
max_tokens=2048,
101+
temperature=0.0,
102+
system=REVIEW_SYSTEM_PROMPT,
103+
messages=[{"role": "user", "content": user_prompt}],
104+
)
105+
106+
text = msg.content[0].text.strip()
107+
108+
# Parse JSON response
109+
try:
110+
result = json.loads(text)
111+
except json.JSONDecodeError:
112+
# Try to extract JSON from markdown fencing
113+
if "```" in text:
114+
json_part = text.split("```")[1]
115+
if json_part.startswith("json"):
116+
json_part = json_part[4:]
117+
result = json.loads(json_part.strip())
118+
else:
119+
result = {
120+
"correct": None,
121+
"issues": [],
122+
"summary": f"Failed to parse reviewer response: {text[:200]}",
123+
}
124+
125+
result["reviewer_model"] = self.model
126+
result["reviewer_input_tokens"] = msg.usage.input_tokens
127+
result["reviewer_output_tokens"] = msg.usage.output_tokens
128+
129+
return result
130+
131+
async def review_test_result(
132+
self,
133+
test_case: dict[str, Any],
134+
explanation: str,
135+
) -> dict[str, Any]:
136+
"""Review a test result using the test case data."""
137+
inp = test_case["input"]
138+
asm_text = "\n".join(a["text"] for a in inp["asm"] if isinstance(a, dict) and "text" in a)
139+
140+
return await self.review(
141+
language=inp.get("language", "unknown"),
142+
compiler=inp.get("compiler", "unknown"),
143+
options=inp.get("compilationOptions", []),
144+
arch=inp.get("instructionSet", "unknown"),
145+
code=inp.get("code", ""),
146+
assembly=asm_text,
147+
explanation=explanation,
148+
)

0 commit comments

Comments
 (0)