Skip to content

Commit 27b92d6

Browse files
Add RunLedger replay gate
1 parent 12cd338 commit 27b92d6

File tree

9 files changed

+215
-0
lines changed

9 files changed

+215
-0
lines changed

.github/workflows/runledger.yml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
name: runledger-gate
2+
on:
3+
pull_request:
4+
5+
jobs:
6+
evals:
7+
runs-on: ubuntu-latest
8+
steps:
9+
- uses: actions/checkout@v4
10+
- uses: actions/setup-python@v5
11+
with:
12+
python-version: "3.11"
13+
- name: Install RunLedger
14+
run: |
15+
python -m pip install --upgrade pip
16+
python -m pip install runledger
17+
- name: Run deterministic evals (replay)
18+
run: runledger run evals/runledger --mode replay --baseline baselines/runledger-demo.json
19+
- name: Upload artifacts
20+
if: always()
21+
uses: actions/upload-artifact@v4
22+
with:
23+
name: runledger-artifacts
24+
path: runledger_out/**

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,3 +28,4 @@ yarn.lock
2828
test-injection/
2929
notepad.md
3030
oauth-success.html
31+
runledger_out/

README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -954,3 +954,14 @@ I have no affiliation with any project or model mentioned here. This is purely p
954954
- Fun fact: That PR was discovered and fixed thanks to OhMyOpenCode's Librarian, Explore, and Oracle setup.
955955

956956
*Special thanks to [@junhoyeo](https://github.com/junhoyeo) for this amazing hero image.*
957+
958+
## RunLedger CI gate
959+
960+
This repo includes a deterministic CI gate for tool-using agents:
961+
962+
```bash
963+
runledger run evals/runledger --mode replay --baseline baselines/runledger-demo.json
964+
```
965+
966+
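It replays the recorded tool calls stored under `evals/runledger/cassettes/` and fails the PR on schema, tool-call, or budget regressions; the budgets and pass-rate threshold are configured in `evals/runledger/suite.yaml`.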
967+

baselines/runledger-demo.json

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
{
2+
"aggregates": {
3+
"cases_error": 0,
4+
"cases_fail": 0,
5+
"cases_pass": 1,
6+
"cases_total": 1,
7+
"metrics": {
8+
"cost_usd": {
9+
"max": null,
10+
"mean": null,
11+
"min": null,
12+
"p50": null,
13+
"p95": null
14+
},
15+
"steps": {
16+
"max": null,
17+
"mean": null,
18+
"min": null,
19+
"p50": null,
20+
"p95": null
21+
},
22+
"tokens_in": {
23+
"max": null,
24+
"mean": null,
25+
"min": null,
26+
"p50": null,
27+
"p95": null
28+
},
29+
"tokens_out": {
30+
"max": null,
31+
"mean": null,
32+
"min": null,
33+
"p50": null,
34+
"p95": null
35+
},
36+
"tool_calls": {
37+
"max": 1.0,
38+
"mean": 1.0,
39+
"min": 1.0,
40+
"p50": 1.0,
41+
"p95": 1.0
42+
},
43+
"tool_errors": {
44+
"max": 0.0,
45+
"mean": 0.0,
46+
"min": 0.0,
47+
"p50": 0.0,
48+
"p95": 0.0
49+
},
50+
"wall_ms": {
51+
"max": 51.0,
52+
"mean": 51.0,
53+
"min": 51.0,
54+
"p50": 51.0,
55+
"p95": 51.0
56+
}
57+
},
58+
"pass_rate": 1.0
59+
},
60+
"cases": [
61+
{
62+
"assertions": {
63+
"failed": 0,
64+
"total": 2
65+
},
66+
"cost_usd": null,
67+
"failed_assertions": null,
68+
"failure_reason": null,
69+
"id": "t1",
70+
"replay": {
71+
"cassette_path": "evals/runledger/cassettes/t1.jsonl",
72+
"cassette_sha256": "3ca88ba1cde6952e1927e83ba82cc13948f96157d200ef01a7a15b5e586883e5"
73+
},
74+
"status": "pass",
75+
"steps": null,
76+
"tokens_in": null,
77+
"tokens_out": null,
78+
"tool_calls": 1,
79+
"tool_calls_by_name": {
80+
"search_docs": 1
81+
},
82+
"tool_errors": 0,
83+
"tool_errors_by_name": {},
84+
"wall_ms": 51
85+
}
86+
],
87+
"generated_at": "2025-12-19T12:11:00.657302Z",
88+
"policy_snapshot": {
89+
"thresholds": {
90+
"min_pass_rate": 1.0
91+
}
92+
},
93+
"run": {
94+
"ci": null,
95+
"exit_status": "success",
96+
"git_sha": null,
97+
"mode": "replay",
98+
"run_id": "20251219-121100-485aac"
99+
},
100+
"runledger_version": "0.1.0",
101+
"schema_version": 1,
102+
"suite": {
103+
"agent_command": [
104+
"python",
105+
"evals/runledger/agent/agent.py"
106+
],
107+
"cases_total": 1,
108+
"name": "runledger-demo",
109+
"suite_config_hash": null,
110+
"suite_path": "evals/runledger/suite.yaml",
111+
"tool_mode": "replay"
112+
}
113+
}

evals/runledger/agent/agent.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import json
2+
import sys
3+
4+
def send(payload):
    """Emit *payload* on stdout as a single JSON line and flush right away.

    The flush after every message keeps the line from sitting in the
    output buffer while the reader on the other end of the pipe waits.
    """
    encoded = json.dumps(payload)
    out = sys.stdout
    out.write(f"{encoded}\n")
    out.flush()
7+
8+
def main():
    """Run the demo agent loop over JSON-lines messages read from stdin.

    Behaviour per message ``type``:

    - ``task_start``: send one ``search_docs`` tool call whose query is the
      value at ``input.ticket`` (empty string when it is missing).
    - ``tool_result``: send the fixed ``final_output`` and stop reading.
    - anything else, and blank lines: ignored.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    for raw_line in sys.stdin:
        text = raw_line.strip()
        if not text:
            continue
        msg = json.loads(text)
        msg_type = msg.get("type")
        if msg_type == "task_start":
            # dict.get's default only applies when the key is absent, so an
            # explicit `"input": null` would make a chained .get() raise.
            # Treat any non-object input as "no ticket".
            task_input = msg.get("input")
            ticket = task_input.get("ticket", "") if isinstance(task_input, dict) else ""
            send({"type": "tool_call", "name": "search_docs", "call_id": "c1", "args": {"q": ticket}})
        elif msg_type == "tool_result":
            # The tool result content is not inspected; the reply is fixed.
            send({"type": "final_output", "output": {"category": "account", "reply": "Reset password instructions sent."}})
            break
21+
# Start the agent loop only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()

evals/runledger/cases/t1.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
id: t1
2+
description: triage a login ticket
3+
input:
4+
ticket: reset password
5+
cassette: cassettes/t1.jsonl
6+
assertions:
7+
- type: required_fields
8+
fields:
9+
- category
10+
- reply

evals/runledger/cassettes/t1.jsonl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"args": {"q": "reset password"}, "ok": true, "result": {"hits": [{"snippet": "Use the reset link.", "title": "Reset password"}]}, "tool": "search_docs"}

evals/runledger/schema.json

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
{
2+
"type": "object",
3+
"properties": {
4+
"category": {
5+
"type": "string"
6+
},
7+
"reply": {
8+
"type": "string"
9+
}
10+
},
11+
"required": [
12+
"category",
13+
"reply"
14+
]
15+
}

evals/runledger/suite.yaml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
suite_name: runledger-demo
2+
agent_command:
3+
- python
4+
- agent/agent.py
5+
mode: replay
6+
cases_path: cases
7+
tool_registry:
8+
- search_docs
9+
assertions:
10+
- type: json_schema
11+
schema_path: schema.json
12+
budgets:
13+
max_wall_ms: 20000
14+
max_tool_calls: 1
15+
max_tool_errors: 0
16+
regression:
17+
min_pass_rate: 1.0
18+
baseline_path: ../../baselines/runledger-demo.json

0 commit comments

Comments
 (0)