-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy patheval.json
More file actions
68 lines (68 loc) · 2.35 KB
/
eval.json
File metadata and controls
68 lines (68 loc) · 2.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://agent-run.dev/schemas/eval.json",
  "title": "EvalResult",
  "description": "Evaluation result for a scored agent run.",
  "type": "object",
  "required": ["pass", "score"],
  "properties": {
    "task_type": {
      "type": ["string", "null"],
      "description": "Category of task being evaluated (e.g., 'pr-review', 'bug-fix', 'test-gen')."
    },
    "eval_layer": {
      "type": ["string", "null"],
      "description": "Which evaluation layer scored this (e.g., 'convergence', 'quality', 'regression')."
    },
    "pass": {
      "type": "boolean",
      "description": "Whether the run met its acceptance criteria."
    },
    "score": {
      "type": "number",
      "minimum": 0,
      "maximum": 100,
      "description": "Numeric score (0-100). Interpretation depends on task_type."
    },
    "expected_outcome": {
      "type": ["string", "null"],
      "description": "What the eval expected the agent to produce."
    },
    "actual_outcome": {
      "type": ["string", "null"],
      "description": "What the agent actually produced."
    },
    "metrics": {
      "type": "object",
      "properties": {
        "cost_usd": {
          "type": "number",
          "minimum": 0,
          "description": "Cost in US dollars; must be non-negative."
        },
        "duration_s": {
          "type": "number",
          "minimum": 0,
          "description": "Duration in seconds; must be non-negative."
        },
        "tool_calls": {
          "type": "integer",
          "minimum": 0,
          "description": "Count of tool calls; a non-negative integer."
        },
        "retries": {
          "type": "integer",
          "minimum": 0,
          "description": "Count of retries; a non-negative integer."
        },
        "convergence_score": {
          "type": ["number", "null"],
          "minimum": 0,
          "maximum": 1,
          "description": "How directly the agent reached the solution (1.0 = optimal path, 0.0 = lost)."
        },
        "total_steps": {
          "type": ["integer", "null"],
          "minimum": 0,
          "description": "Total step count; a non-negative integer, or null."
        },
        "optimal_steps": {
          "type": ["integer", "null"],
          "minimum": 0,
          "description": "Optimal step count; a non-negative integer, or null."
        }
      },
      "additionalProperties": true,
      "description": "Quantitative metrics for this evaluation."
    },
    "regression_from": {
      "type": ["string", "null"],
      "description": "Run ID this was compared against for regression detection."
    },
    "detail": {
      "type": ["string", "null"],
      "description": "Human-readable evaluation notes or failure explanation."
    },
    "benchmark_id": {
      "type": ["string", "null"],
      "description": "Identifier of the benchmark pack used (e.g., 'agent-run/bench/bug-fix@1.0')."
    }
  },
  "additionalProperties": false
}