Skip to content

Commit aa20a81

Browse files
committed
Add tests
1 parent bb8a048 commit aa20a81

File tree

15 files changed

+155
-13
lines changed

15 files changed

+155
-13
lines changed

evals/packages/db/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
test.db

evals/packages/db/drizzle/0003_familiar_miss_america.sql

Lines changed: 0 additions & 1 deletion
This file was deleted.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
ALTER TABLE `taskMetrics` ADD `toolUsage` text;

evals/packages/db/drizzle/meta/0003_snapshot.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"version": "6",
33
"dialect": "sqlite",
4-
"id": "a7a893e2-373a-4706-bcd4-772e2525db62",
4+
"id": "61d48d20-f662-445d-9962-cf9cb165cbe7",
55
"prevId": "f49d9b0b-fda9-467a-9adb-c941d6cbf7ce",
66
"tables": {
77
"runs": {
@@ -165,7 +165,7 @@
165165
},
166166
"toolUsage": {
167167
"name": "toolUsage",
168-
"type": "blob",
168+
"type": "text",
169169
"primaryKey": false,
170170
"notNull": false,
171171
"autoincrement": false

evals/packages/db/drizzle/meta/_journal.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@
2626
{
2727
"idx": 3,
2828
"version": "6",
29-
"when": 1744933023667,
30-
"tag": "0003_familiar_miss_america",
29+
"when": 1744950664129,
30+
"tag": "0003_sweet_chimera",
3131
"breakpoints": true
3232
}
3333
]

evals/packages/db/package.json

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
"scripts": {
77
"lint": "eslint src/**/*.ts --max-warnings=0",
88
"check-types": "tsc --noEmit",
9+
"test": "vitest --globals --run",
910
"format": "prettier --write src",
1011
"drizzle-kit": "dotenvx run -f ../../.env -- tsx node_modules/drizzle-kit/bin.cjs",
1112
"db:generate": "pnpm drizzle-kit generate",
@@ -29,6 +30,8 @@
2930
"devDependencies": {
3031
"@evals/eslint-config": "workspace:^",
3132
"@evals/typescript-config": "workspace:^",
32-
"drizzle-kit": "^0.30.5"
33+
"drizzle-kit": "^0.30.5",
34+
"execa": "^9.5.2",
35+
"vitest": "^3.0.9"
3336
}
3437
}
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
import { createRun, finishRun } from "../runs.js"
2+
import { createTask } from "../tasks.js"
3+
import { createTaskMetrics } from "../taskMetrics.js"
4+
5+
describe("finishRun", () => {
	/**
	 * Seeds a single run with two passing Go tasks, each with its own metrics row,
	 * then finishes the run and checks the metrics returned for the run: every
	 * numeric field is expected to be the total across both tasks, and tool usage
	 * is expected to be merged per tool (attempts and failures added up).
	 */
	it("aggregates task metrics, including tool usage", async () => {
		const run = await createRun({ model: "gpt-4.1-mini", socketPath: "/tmp/roo.sock" })

		// One entry per task to seed; `metrics` is passed verbatim to createTaskMetrics.
		const fixtures = [
			{
				exercise: "go/say",
				metrics: {
					duration: 45_000,
					tokensIn: 100_000,
					tokensOut: 2_000,
					tokensContext: 102_000,
					cacheWrites: 0,
					cacheReads: 0,
					cost: 0.05,
					toolUsage: {
						read_file: { attempts: 3, failures: 0 },
						apply_diff: { attempts: 3, failures: 1 },
					},
				},
			},
			{
				exercise: "go/octal",
				metrics: {
					duration: 30_000,
					tokensIn: 75_000,
					tokensOut: 1_000,
					tokensContext: 76_000,
					cacheWrites: 0,
					cacheReads: 0,
					cost: 0.04,
					toolUsage: {
						read_file: { attempts: 3, failures: 0 },
						apply_diff: { attempts: 2, failures: 0 },
					},
				},
			},
		]

		// Insert sequentially: each task row needs the id of its freshly created metrics row.
		for (const { exercise, metrics } of fixtures) {
			const { id: taskMetricsId } = await createTaskMetrics(metrics)

			await createTask({
				runId: run.id,
				taskMetricsId,
				language: "go",
				exercise,
				passed: true,
				startedAt: new Date(),
				finishedAt: new Date(),
			})
		}

		const { taskMetrics } = await finishRun(run.id)

		expect(taskMetrics).toEqual({
			id: expect.any(Number),
			tokensIn: 175_000,
			tokensOut: 3_000,
			tokensContext: 178_000,
			cacheWrites: 0,
			cacheReads: 0,
			// Floating-point total: compare with a tolerance rather than exact equality so
			// the assertion does not depend on how the aggregate is summed.
			cost: expect.closeTo(0.09, 5),
			duration: 75_000,
			toolUsage: {
				read_file: { attempts: 6, failures: 0 },
				apply_diff: { attempts: 5, failures: 1 },
			},
			createdAt: expect.any(Date),
		})
	})
})

evals/packages/db/src/queries/runs.ts

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
11
import { desc, eq, inArray, sql, sum } from "drizzle-orm"
22

3+
import { ToolUsage } from "@evals/types"
4+
35
import { RecordNotFoundError, RecordNotCreatedError } from "./errors.js"
46
import type { InsertRun, UpdateRun } from "../schema.js"
57
import { insertRunSchema, schema } from "../schema.js"
68
import { db } from "../db.js"
79
import { createTaskMetrics } from "./taskMetrics.js"
10+
import { getTasks } from "./tasks.js"
811

912
const table = schema.runs
1013

@@ -71,17 +74,30 @@ export const finishRun = async (runId: number) => {
7174
throw new RecordNotFoundError()
7275
}
7376

77+
const tasks = await getTasks(runId)
78+
79+
const toolUsage = tasks.reduce((acc, task) => {
80+
Object.entries(task.taskMetrics?.toolUsage || {}).forEach(([key, { attempts, failures }]) => {
81+
const tool = key as keyof ToolUsage
82+
acc[tool] ??= { attempts: 0, failures: 0 }
83+
acc[tool].attempts += attempts
84+
acc[tool].failures += failures
85+
})
86+
87+
return acc
88+
}, {} as ToolUsage)
89+
7490
const { passed, failed, ...rest } = values
75-
const taskMetrics = await createTaskMetrics(rest)
91+
const taskMetrics = await createTaskMetrics({ ...rest, toolUsage })
7692
await updateRun(runId, { taskMetricsId: taskMetrics.id, passed, failed })
7793

78-
const run = await db.query.runs.findFirst({ where: eq(table.id, runId), with: { taskMetrics: true } })
94+
const run = await findRun(runId)
7995

8096
if (!run) {
8197
throw new RecordNotFoundError()
8298
}
8399

84-
return run
100+
return { ...run, taskMetrics }
85101
}
86102

87103
export const deleteRun = async (runId: number) => {

evals/packages/db/src/schema.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ export const taskMetrics = sqliteTable("taskMetrics", {
8484
cacheReads: integer({ mode: "number" }).notNull(),
8585
cost: real().notNull(),
8686
duration: integer({ mode: "number" }).notNull(),
87-
toolUsage: blob({ mode: "json" }).$type<ToolUsage>(),
87+
toolUsage: text({ mode: "json" }).$type<ToolUsage>(),
8888
createdAt: integer({ mode: "timestamp" }).notNull(),
8989
})
9090

evals/packages/db/tsconfig.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
{
22
"extends": "@evals/typescript-config/base.json",
3+
"compilerOptions": {
4+
"types": ["vitest/globals"]
5+
},
36
"include": ["src"],
47
"exclude": ["node_modules"]
58
}

0 commit comments

Comments
 (0)