Add RunLedger replay gate

ZackMitchell910 · ZackMitchell910 · commit 27b92d6be1c1 · 2025-12-19T05:11:19.000-07:00
diff --git a/.github/workflows/runledger.yml b/.github/workflows/runledger.yml
@@ -0,0 +1,24 @@
+name: runledger-gate
+on:
+  pull_request:
+
+jobs:
+  evals:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: Install RunLedger
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install runledger
+      - name: Run deterministic evals (replay)
+        run: runledger run evals/runledger --mode replay --baseline baselines/runledger-demo.json
+      - name: Upload artifacts
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: runledger-artifacts
+          path: runledger_out/**
diff --git a/.gitignore b/.gitignore
@@ -28,3 +28,4 @@ yarn.lock
 test-injection/
 notepad.md
 oauth-success.html
+runledger_out/
diff --git a/README.md b/README.md
@@ -954,3 +954,14 @@ I have no affiliation with any project or model mentioned here. This is purely p
     - Fun fact: That PR was discovered and fixed thanks to OhMyOpenCode's Librarian, Explore, and Oracle setup.
 
 *Special thanks to [@junhoyeo](https://github.com/junhoyeo) for this amazing hero image.*
+
+## RunLedger CI gate
+
+This repo includes a deterministic CI gate for tool-using agents:
+
+```bash
+runledger run evals/runledger --mode replay --baseline baselines/runledger-demo.json
+```
+
+It replays recorded tool calls and fails the PR on schema/tool/budget regressions.
+
diff --git a/baselines/runledger-demo.json b/baselines/runledger-demo.json
@@ -0,0 +1,113 @@
+{
+  "aggregates": {
+    "cases_error": 0,
+    "cases_fail": 0,
+    "cases_pass": 1,
+    "cases_total": 1,
+    "metrics": {
+      "cost_usd": {
+        "max": null,
+        "mean": null,
+        "min": null,
+        "p50": null,
+        "p95": null
+      },
+      "steps": {
+        "max": null,
+        "mean": null,
+        "min": null,
+        "p50": null,
+        "p95": null
+      },
+      "tokens_in": {
+        "max": null,
+        "mean": null,
+        "min": null,
+        "p50": null,
+        "p95": null
+      },
+      "tokens_out": {
+        "max": null,
+        "mean": null,
+        "min": null,
+        "p50": null,
+        "p95": null
+      },
+      "tool_calls": {
+        "max": 1.0,
+        "mean": 1.0,
+        "min": 1.0,
+        "p50": 1.0,
+        "p95": 1.0
+      },
+      "tool_errors": {
+        "max": 0.0,
+        "mean": 0.0,
+        "min": 0.0,
+        "p50": 0.0,
+        "p95": 0.0
+      },
+      "wall_ms": {
+        "max": 51.0,
+        "mean": 51.0,
+        "min": 51.0,
+        "p50": 51.0,
+        "p95": 51.0
+      }
+    },
+    "pass_rate": 1.0
+  },
+  "cases": [
+    {
+      "assertions": {
+        "failed": 0,
+        "total": 2
+      },
+      "cost_usd": null,
+      "failed_assertions": null,
+      "failure_reason": null,
+      "id": "t1",
+      "replay": {
+        "cassette_path": "evals/runledger/cassettes/t1.jsonl",
+        "cassette_sha256": "3ca88ba1cde6952e1927e83ba82cc13948f96157d200ef01a7a15b5e586883e5"
+      },
+      "status": "pass",
+      "steps": null,
+      "tokens_in": null,
+      "tokens_out": null,
+      "tool_calls": 1,
+      "tool_calls_by_name": {
+        "search_docs": 1
+      },
+      "tool_errors": 0,
+      "tool_errors_by_name": {},
+      "wall_ms": 51
+    }
+  ],
+  "generated_at": "2025-12-19T12:11:00.657302Z",
+  "policy_snapshot": {
+    "thresholds": {
+      "min_pass_rate": 1.0
+    }
+  },
+  "run": {
+    "ci": null,
+    "exit_status": "success",
+    "git_sha": null,
+    "mode": "replay",
+    "run_id": "20251219-121100-485aac"
+  },
+  "runledger_version": "0.1.0",
+  "schema_version": 1,
+  "suite": {
+    "agent_command": [
+      "python",
+      "evals/runledger/agent/agent.py"
+    ],
+    "cases_total": 1,
+    "name": "runledger-demo",
+    "suite_config_hash": null,
+    "suite_path": "evals/runledger/suite.yaml",
+    "tool_mode": "replay"
+  }
+}
diff --git a/evals/runledger/agent/agent.py b/evals/runledger/agent/agent.py
@@ -0,0 +1,22 @@
+import json
+import sys
+
+def send(payload):
+    sys.stdout.write(json.dumps(payload) + "\n")
+    sys.stdout.flush()
+
+def main():
+    for line in sys.stdin:
+        line = line.strip()
+        if not line:
+            continue
+        msg = json.loads(line)
+        if msg.get("type") == "task_start":
+            ticket = msg.get("input", {}).get("ticket", "")
+            send({"type": "tool_call", "name": "search_docs", "call_id": "c1", "args": {"q": ticket}})
+        elif msg.get("type") == "tool_result":
+            send({"type": "final_output", "output": {"category": "account", "reply": "Reset password instructions sent."}})
+            break
+
+if __name__ == "__main__":
+    main()
diff --git a/evals/runledger/cases/t1.yaml b/evals/runledger/cases/t1.yaml
@@ -0,0 +1,10 @@
+id: t1
+description: triage a login ticket
+input:
+  ticket: reset password
+cassette: cassettes/t1.jsonl
+assertions:
+- type: required_fields
+  fields:
+  - category
+  - reply
diff --git a/evals/runledger/cassettes/t1.jsonl b/evals/runledger/cassettes/t1.jsonl
@@ -0,0 +1 @@
+{"args": {"q": "reset password"}, "ok": true, "result": {"hits": [{"snippet": "Use the reset link.", "title": "Reset password"}]}, "tool": "search_docs"}
diff --git a/evals/runledger/schema.json b/evals/runledger/schema.json
@@ -0,0 +1,15 @@
+{
+  "type": "object",
+  "properties": {
+    "category": {
+      "type": "string"
+    },
+    "reply": {
+      "type": "string"
+    }
+  },
+  "required": [
+    "category",
+    "reply"
+  ]
+}
diff --git a/evals/runledger/suite.yaml b/evals/runledger/suite.yaml
@@ -0,0 +1,18 @@
+suite_name: runledger-demo
+agent_command:
+- python
+- agent/agent.py
+mode: replay
+cases_path: cases
+tool_registry:
+- search_docs
+assertions:
+- type: json_schema
+  schema_path: schema.json
+budgets:
+  max_wall_ms: 20000
+  max_tool_calls: 1
+  max_tool_errors: 0
+regression:
+  min_pass_rate: 1.0
+baseline_path: ../../baselines/runledger-demo.json

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+{"args": {"q": "reset password"}, "ok": true, "result": {"hits": [{"snippet": "Use the reset link.", "title": "Reset password"}]}, "tool": "search_docs"}`