Skip to content

Commit 61e23cc

Browse files
authored
Record tool use errors encountered during eval runs (#2816)
1 parent a244a9d commit 61e23cc

File tree

20 files changed

+544
-46
lines changed

20 files changed

+544
-46
lines changed

evals/apps/cli/src/index.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import {
2929
updateTask,
3030
createTaskMetrics,
3131
updateTaskMetrics,
32+
createToolError,
3233
} from "@evals/db"
3334
import { IpcServer, IpcClient } from "@evals/ipc"
3435

@@ -255,6 +256,12 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
255256
rooTaskId = payload[0]
256257
}
257258

259+
if (eventName === RooCodeEventName.TaskToolFailed) {
260+
// eslint-disable-next-line @typescript-eslint/no-unused-vars
261+
const [_taskId, toolName, error] = payload
262+
await createToolError({ taskId: task.id, toolName, error })
263+
}
264+
258265
if (
259266
(eventName === RooCodeEventName.TaskTokenUsageUpdated || eventName === RooCodeEventName.TaskCompleted) &&
260267
taskMetricsId
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
CREATE TABLE `toolErrors` (
2+
`id` integer PRIMARY KEY AUTOINCREMENT NOT NULL,
3+
`runId` integer,
4+
`taskId` integer,
5+
`toolName` text NOT NULL,
6+
`error` text NOT NULL,
7+
`createdAt` integer NOT NULL,
8+
FOREIGN KEY (`runId`) REFERENCES `runs`(`id`) ON UPDATE no action ON DELETE no action,
9+
FOREIGN KEY (`taskId`) REFERENCES `tasks`(`id`) ON UPDATE no action ON DELETE no action
10+
);
Lines changed: 367 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,367 @@
1+
{
2+
"version": "6",
3+
"dialect": "sqlite",
4+
"id": "ae766c54-aff4-4ce6-b492-24813790c279",
5+
"prevId": "61d48d20-f662-445d-9962-cf9cb165cbe7",
6+
"tables": {
7+
"runs": {
8+
"name": "runs",
9+
"columns": {
10+
"id": {
11+
"name": "id",
12+
"type": "integer",
13+
"primaryKey": true,
14+
"notNull": true,
15+
"autoincrement": true
16+
},
17+
"taskMetricsId": {
18+
"name": "taskMetricsId",
19+
"type": "integer",
20+
"primaryKey": false,
21+
"notNull": false,
22+
"autoincrement": false
23+
},
24+
"model": {
25+
"name": "model",
26+
"type": "text",
27+
"primaryKey": false,
28+
"notNull": true,
29+
"autoincrement": false
30+
},
31+
"description": {
32+
"name": "description",
33+
"type": "text",
34+
"primaryKey": false,
35+
"notNull": false,
36+
"autoincrement": false
37+
},
38+
"settings": {
39+
"name": "settings",
40+
"type": "blob",
41+
"primaryKey": false,
42+
"notNull": false,
43+
"autoincrement": false
44+
},
45+
"pid": {
46+
"name": "pid",
47+
"type": "integer",
48+
"primaryKey": false,
49+
"notNull": false,
50+
"autoincrement": false
51+
},
52+
"socketPath": {
53+
"name": "socketPath",
54+
"type": "text",
55+
"primaryKey": false,
56+
"notNull": true,
57+
"autoincrement": false
58+
},
59+
"concurrency": {
60+
"name": "concurrency",
61+
"type": "integer",
62+
"primaryKey": false,
63+
"notNull": true,
64+
"autoincrement": false,
65+
"default": 2
66+
},
67+
"passed": {
68+
"name": "passed",
69+
"type": "integer",
70+
"primaryKey": false,
71+
"notNull": true,
72+
"autoincrement": false,
73+
"default": 0
74+
},
75+
"failed": {
76+
"name": "failed",
77+
"type": "integer",
78+
"primaryKey": false,
79+
"notNull": true,
80+
"autoincrement": false,
81+
"default": 0
82+
},
83+
"createdAt": {
84+
"name": "createdAt",
85+
"type": "integer",
86+
"primaryKey": false,
87+
"notNull": true,
88+
"autoincrement": false
89+
}
90+
},
91+
"indexes": {},
92+
"foreignKeys": {
93+
"runs_taskMetricsId_taskMetrics_id_fk": {
94+
"name": "runs_taskMetricsId_taskMetrics_id_fk",
95+
"tableFrom": "runs",
96+
"tableTo": "taskMetrics",
97+
"columnsFrom": ["taskMetricsId"],
98+
"columnsTo": ["id"],
99+
"onDelete": "no action",
100+
"onUpdate": "no action"
101+
}
102+
},
103+
"compositePrimaryKeys": {},
104+
"uniqueConstraints": {},
105+
"checkConstraints": {}
106+
},
107+
"taskMetrics": {
108+
"name": "taskMetrics",
109+
"columns": {
110+
"id": {
111+
"name": "id",
112+
"type": "integer",
113+
"primaryKey": true,
114+
"notNull": true,
115+
"autoincrement": true
116+
},
117+
"tokensIn": {
118+
"name": "tokensIn",
119+
"type": "integer",
120+
"primaryKey": false,
121+
"notNull": true,
122+
"autoincrement": false
123+
},
124+
"tokensOut": {
125+
"name": "tokensOut",
126+
"type": "integer",
127+
"primaryKey": false,
128+
"notNull": true,
129+
"autoincrement": false
130+
},
131+
"tokensContext": {
132+
"name": "tokensContext",
133+
"type": "integer",
134+
"primaryKey": false,
135+
"notNull": true,
136+
"autoincrement": false
137+
},
138+
"cacheWrites": {
139+
"name": "cacheWrites",
140+
"type": "integer",
141+
"primaryKey": false,
142+
"notNull": true,
143+
"autoincrement": false
144+
},
145+
"cacheReads": {
146+
"name": "cacheReads",
147+
"type": "integer",
148+
"primaryKey": false,
149+
"notNull": true,
150+
"autoincrement": false
151+
},
152+
"cost": {
153+
"name": "cost",
154+
"type": "real",
155+
"primaryKey": false,
156+
"notNull": true,
157+
"autoincrement": false
158+
},
159+
"duration": {
160+
"name": "duration",
161+
"type": "integer",
162+
"primaryKey": false,
163+
"notNull": true,
164+
"autoincrement": false
165+
},
166+
"toolUsage": {
167+
"name": "toolUsage",
168+
"type": "text",
169+
"primaryKey": false,
170+
"notNull": false,
171+
"autoincrement": false
172+
},
173+
"createdAt": {
174+
"name": "createdAt",
175+
"type": "integer",
176+
"primaryKey": false,
177+
"notNull": true,
178+
"autoincrement": false
179+
}
180+
},
181+
"indexes": {},
182+
"foreignKeys": {},
183+
"compositePrimaryKeys": {},
184+
"uniqueConstraints": {},
185+
"checkConstraints": {}
186+
},
187+
"tasks": {
188+
"name": "tasks",
189+
"columns": {
190+
"id": {
191+
"name": "id",
192+
"type": "integer",
193+
"primaryKey": true,
194+
"notNull": true,
195+
"autoincrement": true
196+
},
197+
"runId": {
198+
"name": "runId",
199+
"type": "integer",
200+
"primaryKey": false,
201+
"notNull": true,
202+
"autoincrement": false
203+
},
204+
"taskMetricsId": {
205+
"name": "taskMetricsId",
206+
"type": "integer",
207+
"primaryKey": false,
208+
"notNull": false,
209+
"autoincrement": false
210+
},
211+
"language": {
212+
"name": "language",
213+
"type": "text",
214+
"primaryKey": false,
215+
"notNull": true,
216+
"autoincrement": false
217+
},
218+
"exercise": {
219+
"name": "exercise",
220+
"type": "text",
221+
"primaryKey": false,
222+
"notNull": true,
223+
"autoincrement": false
224+
},
225+
"passed": {
226+
"name": "passed",
227+
"type": "integer",
228+
"primaryKey": false,
229+
"notNull": false,
230+
"autoincrement": false
231+
},
232+
"startedAt": {
233+
"name": "startedAt",
234+
"type": "integer",
235+
"primaryKey": false,
236+
"notNull": false,
237+
"autoincrement": false
238+
},
239+
"finishedAt": {
240+
"name": "finishedAt",
241+
"type": "integer",
242+
"primaryKey": false,
243+
"notNull": false,
244+
"autoincrement": false
245+
},
246+
"createdAt": {
247+
"name": "createdAt",
248+
"type": "integer",
249+
"primaryKey": false,
250+
"notNull": true,
251+
"autoincrement": false
252+
}
253+
},
254+
"indexes": {
255+
"tasks_language_exercise_idx": {
256+
"name": "tasks_language_exercise_idx",
257+
"columns": ["runId", "language", "exercise"],
258+
"isUnique": true
259+
}
260+
},
261+
"foreignKeys": {
262+
"tasks_runId_runs_id_fk": {
263+
"name": "tasks_runId_runs_id_fk",
264+
"tableFrom": "tasks",
265+
"tableTo": "runs",
266+
"columnsFrom": ["runId"],
267+
"columnsTo": ["id"],
268+
"onDelete": "no action",
269+
"onUpdate": "no action"
270+
},
271+
"tasks_taskMetricsId_taskMetrics_id_fk": {
272+
"name": "tasks_taskMetricsId_taskMetrics_id_fk",
273+
"tableFrom": "tasks",
274+
"tableTo": "taskMetrics",
275+
"columnsFrom": ["taskMetricsId"],
276+
"columnsTo": ["id"],
277+
"onDelete": "no action",
278+
"onUpdate": "no action"
279+
}
280+
},
281+
"compositePrimaryKeys": {},
282+
"uniqueConstraints": {},
283+
"checkConstraints": {}
284+
},
285+
"toolErrors": {
286+
"name": "toolErrors",
287+
"columns": {
288+
"id": {
289+
"name": "id",
290+
"type": "integer",
291+
"primaryKey": true,
292+
"notNull": true,
293+
"autoincrement": true
294+
},
295+
"runId": {
296+
"name": "runId",
297+
"type": "integer",
298+
"primaryKey": false,
299+
"notNull": false,
300+
"autoincrement": false
301+
},
302+
"taskId": {
303+
"name": "taskId",
304+
"type": "integer",
305+
"primaryKey": false,
306+
"notNull": false,
307+
"autoincrement": false
308+
},
309+
"toolName": {
310+
"name": "toolName",
311+
"type": "text",
312+
"primaryKey": false,
313+
"notNull": true,
314+
"autoincrement": false
315+
},
316+
"error": {
317+
"name": "error",
318+
"type": "text",
319+
"primaryKey": false,
320+
"notNull": true,
321+
"autoincrement": false
322+
},
323+
"createdAt": {
324+
"name": "createdAt",
325+
"type": "integer",
326+
"primaryKey": false,
327+
"notNull": true,
328+
"autoincrement": false
329+
}
330+
},
331+
"indexes": {},
332+
"foreignKeys": {
333+
"toolErrors_runId_runs_id_fk": {
334+
"name": "toolErrors_runId_runs_id_fk",
335+
"tableFrom": "toolErrors",
336+
"tableTo": "runs",
337+
"columnsFrom": ["runId"],
338+
"columnsTo": ["id"],
339+
"onDelete": "no action",
340+
"onUpdate": "no action"
341+
},
342+
"toolErrors_taskId_tasks_id_fk": {
343+
"name": "toolErrors_taskId_tasks_id_fk",
344+
"tableFrom": "toolErrors",
345+
"tableTo": "tasks",
346+
"columnsFrom": ["taskId"],
347+
"columnsTo": ["id"],
348+
"onDelete": "no action",
349+
"onUpdate": "no action"
350+
}
351+
},
352+
"compositePrimaryKeys": {},
353+
"uniqueConstraints": {},
354+
"checkConstraints": {}
355+
}
356+
},
357+
"views": {},
358+
"enums": {},
359+
"_meta": {
360+
"schemas": {},
361+
"tables": {},
362+
"columns": {}
363+
},
364+
"internal": {
365+
"indexes": {}
366+
}
367+
}

0 commit comments

Comments
 (0)