{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://agent-run.dev/schemas/eval.json",
  "title": "EvalResult",
  "description": "Evaluation result for a scored agent run.",
  "type": "object",
  "required": ["pass", "score"],
  "properties": {
    "task_type": {
      "type": ["string", "null"],
      "description": "Category of task being evaluated (e.g., 'pr-review', 'bug-fix', 'test-gen')."
    },
    "eval_layer": {
      "type": ["string", "null"],
      "description": "Which evaluation layer scored this (e.g., 'convergence', 'quality', 'regression')."
    },
    "pass": {
      "type": "boolean",
      "description": "Whether the run met its acceptance criteria."
    },
    "score": {
      "type": "number",
      "minimum": 0,
      "maximum": 100,
      "description": "Numeric score from 0 to 100 inclusive. Interpretation depends on task_type."
    },
    "expected_outcome": {
      "type": ["string", "null"],
      "description": "What the eval expected the agent to produce."
    },
    "actual_outcome": {
      "type": ["string", "null"],
      "description": "What the agent actually produced."
    },
    "metrics": {
      "type": "object",
      "properties": {
        "cost_usd": { "type": "number", "minimum": 0 },
        "duration_s": { "type": "number", "minimum": 0 },
        "tool_calls": { "type": "integer", "minimum": 0 },
        "retries": { "type": "integer", "minimum": 0 },
        "convergence_score": {
          "type": ["number", "null"],
          "minimum": 0,
          "maximum": 1,
          "description": "How directly the agent reached the solution (1.0 = optimal path, 0.0 = lost)."
        },
        "total_steps": { "type": ["integer", "null"], "minimum": 0 },
        "optimal_steps": { "type": ["integer", "null"], "minimum": 0 }
      },
      "additionalProperties": true,
      "description": "Quantitative metrics for this evaluation."
    },
    "regression_from": {
      "type": ["string", "null"],
      "description": "ID of the run that this result was compared against for regression detection."
    },
    "detail": {
      "type": ["string", "null"],
      "description": "Human-readable evaluation notes or failure explanation."
    },
    "benchmark_id": {
      "type": ["string", "null"],
      "description": "Identifier of the benchmark pack used (e.g., 'agent-run/bench/bug-fix@1.0')."
    }
  },
  "additionalProperties": false
}