Skip to content

Commit e5472d0

Browse files
committed
Move evals
1 parent 900de7b commit e5472d0

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+3706
-0
lines changed

packages/evals/README.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
## Running Migrations
2+
3+
Update `src/schema.ts` as needed, and then run:
4+
5+
```sh
6+
pnpm db:generate
7+
```
8+
9+
Inspect the SQL in the migration file added to `drizzle/`.
10+
11+
If it looks okay, then run:
12+
13+
```sh
14+
pnpm db:migrate
15+
```

packages/evals/drizzle.config.ts

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
import { defineConfig } from "drizzle-kit"
2+
3+
// Fail fast unless either both Turso variables or a local database path are configured.
if ((!process.env.TURSO_CONNECTION_URL || !process.env.TURSO_AUTH_TOKEN) && !process.env.BENCHMARKS_DB_PATH) {
4+
throw new Error("TURSO_CONNECTION_URL and TURSO_AUTH_TOKEN or BENCHMARKS_DB_PATH must be set")
5+
}
6+
7+
// BENCHMARKS_DB_PATH takes precedence: when it is set the "sqlite" dialect is used,
// even if the Turso variables are also present.
const dialect = process.env.BENCHMARKS_DB_PATH ? "sqlite" : "turso"
8+
9+
// Credentials follow the same precedence as the dialect choice.
const dbCredentials = process.env.BENCHMARKS_DB_PATH
10+
? { url: process.env.BENCHMARKS_DB_PATH }
11+
// The non-null assertions rely on the guard above: without BENCHMARKS_DB_PATH,
// execution only reaches this point when both Turso variables are truthy.
: { url: process.env.TURSO_CONNECTION_URL!, authToken: process.env.TURSO_AUTH_TOKEN! }
12+
13+
// drizzle-kit configuration: the schema is read from ./src/schema.ts and output goes to ./drizzle.
export default defineConfig({
14+
out: "./drizzle",
15+
schema: "./src/schema.ts",
16+
dialect,
17+
dbCredentials,
18+
})
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
-- `runs`: `model`, `socketPath`, and `createdAt` are required; `passed` and `failed` are
-- integer columns defaulting to 0. `taskMetricsId` is a nullable foreign key to
-- `taskMetrics`.`id`; that table is created by a later statement in this same migration.
CREATE TABLE `runs` (
2+
`id` integer PRIMARY KEY AUTOINCREMENT NOT NULL,
3+
`taskMetricsId` integer,
4+
`model` text NOT NULL,
5+
`description` text,
6+
`pid` integer,
7+
`socketPath` text NOT NULL,
8+
`passed` integer DEFAULT 0 NOT NULL,
9+
`failed` integer DEFAULT 0 NOT NULL,
10+
`createdAt` integer NOT NULL,
11+
FOREIGN KEY (`taskMetricsId`) REFERENCES `taskMetrics`(`id`) ON UPDATE no action ON DELETE no action
12+
);
13+
--> statement-breakpoint
14+
-- `taskMetrics`: every column is NOT NULL. `cost` is the only `real` column;
-- all other columns are integers. The table has no foreign keys of its own.
CREATE TABLE `taskMetrics` (
15+
`id` integer PRIMARY KEY AUTOINCREMENT NOT NULL,
16+
`tokensIn` integer NOT NULL,
17+
`tokensOut` integer NOT NULL,
18+
`tokensContext` integer NOT NULL,
19+
`cacheWrites` integer NOT NULL,
20+
`cacheReads` integer NOT NULL,
21+
`cost` real NOT NULL,
22+
`duration` integer NOT NULL,
23+
`createdAt` integer NOT NULL
24+
);
25+
--> statement-breakpoint
26+
-- `tasks`: `runId` is a NOT NULL foreign key to `runs`.`id`; `taskMetricsId` is a nullable
-- foreign key to `taskMetrics`.`id`. `passed`, `startedAt`, and `finishedAt` are nullable,
-- while `language`, `exercise`, and `createdAt` are required.
CREATE TABLE `tasks` (
27+
`id` integer PRIMARY KEY AUTOINCREMENT NOT NULL,
28+
`runId` integer NOT NULL,
29+
`taskMetricsId` integer,
30+
`language` text NOT NULL,
31+
`exercise` text NOT NULL,
32+
`passed` integer,
33+
`startedAt` integer,
34+
`finishedAt` integer,
35+
`createdAt` integer NOT NULL,
36+
FOREIGN KEY (`runId`) REFERENCES `runs`(`id`) ON UPDATE no action ON DELETE no action,
37+
FOREIGN KEY (`taskMetricsId`) REFERENCES `taskMetrics`(`id`) ON UPDATE no action ON DELETE no action
38+
);
39+
--> statement-breakpoint
40+
-- Despite its name, this unique index also covers `runId`: the combination
-- (`runId`, `language`, `exercise`) must be unique within `tasks`.
CREATE UNIQUE INDEX `tasks_language_exercise_idx` ON `tasks` (`runId`,`language`,`exercise`);
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
-- Adds a nullable `settings` blob column to `runs`.
ALTER TABLE `runs` ADD `settings` blob;
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
-- Adds a required `concurrency` integer column to `runs`, with a default of 2.
ALTER TABLE `runs` ADD `concurrency` integer DEFAULT 2 NOT NULL;
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
-- Adds a nullable `toolUsage` text column to `taskMetrics`.
ALTER TABLE `taskMetrics` ADD `toolUsage` text;
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
-- `toolErrors`: `runId` and `taskId` are both nullable foreign keys (to `runs`.`id` and
-- `tasks`.`id` respectively); `toolName`, `error`, and `createdAt` are required.
CREATE TABLE `toolErrors` (
2+
`id` integer PRIMARY KEY AUTOINCREMENT NOT NULL,
3+
`runId` integer,
4+
`taskId` integer,
5+
`toolName` text NOT NULL,
6+
`error` text NOT NULL,
7+
`createdAt` integer NOT NULL,
8+
FOREIGN KEY (`runId`) REFERENCES `runs`(`id`) ON UPDATE no action ON DELETE no action,
9+
FOREIGN KEY (`taskId`) REFERENCES `tasks`(`id`) ON UPDATE no action ON DELETE no action
10+
);
Lines changed: 274 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,274 @@
1+
{
2+
"version": "6",
3+
"dialect": "sqlite",
4+
"id": "c0fa8491-b5c0-493d-aa32-ddf280259c30",
5+
"prevId": "00000000-0000-0000-0000-000000000000",
6+
"tables": {
7+
"runs": {
8+
"name": "runs",
9+
"columns": {
10+
"id": {
11+
"name": "id",
12+
"type": "integer",
13+
"primaryKey": true,
14+
"notNull": true,
15+
"autoincrement": true
16+
},
17+
"taskMetricsId": {
18+
"name": "taskMetricsId",
19+
"type": "integer",
20+
"primaryKey": false,
21+
"notNull": false,
22+
"autoincrement": false
23+
},
24+
"model": {
25+
"name": "model",
26+
"type": "text",
27+
"primaryKey": false,
28+
"notNull": true,
29+
"autoincrement": false
30+
},
31+
"description": {
32+
"name": "description",
33+
"type": "text",
34+
"primaryKey": false,
35+
"notNull": false,
36+
"autoincrement": false
37+
},
38+
"pid": {
39+
"name": "pid",
40+
"type": "integer",
41+
"primaryKey": false,
42+
"notNull": false,
43+
"autoincrement": false
44+
},
45+
"socketPath": {
46+
"name": "socketPath",
47+
"type": "text",
48+
"primaryKey": false,
49+
"notNull": true,
50+
"autoincrement": false
51+
},
52+
"passed": {
53+
"name": "passed",
54+
"type": "integer",
55+
"primaryKey": false,
56+
"notNull": true,
57+
"autoincrement": false,
58+
"default": 0
59+
},
60+
"failed": {
61+
"name": "failed",
62+
"type": "integer",
63+
"primaryKey": false,
64+
"notNull": true,
65+
"autoincrement": false,
66+
"default": 0
67+
},
68+
"createdAt": {
69+
"name": "createdAt",
70+
"type": "integer",
71+
"primaryKey": false,
72+
"notNull": true,
73+
"autoincrement": false
74+
}
75+
},
76+
"indexes": {},
77+
"foreignKeys": {
78+
"runs_taskMetricsId_taskMetrics_id_fk": {
79+
"name": "runs_taskMetricsId_taskMetrics_id_fk",
80+
"tableFrom": "runs",
81+
"tableTo": "taskMetrics",
82+
"columnsFrom": ["taskMetricsId"],
83+
"columnsTo": ["id"],
84+
"onDelete": "no action",
85+
"onUpdate": "no action"
86+
}
87+
},
88+
"compositePrimaryKeys": {},
89+
"uniqueConstraints": {},
90+
"checkConstraints": {}
91+
},
92+
"taskMetrics": {
93+
"name": "taskMetrics",
94+
"columns": {
95+
"id": {
96+
"name": "id",
97+
"type": "integer",
98+
"primaryKey": true,
99+
"notNull": true,
100+
"autoincrement": true
101+
},
102+
"tokensIn": {
103+
"name": "tokensIn",
104+
"type": "integer",
105+
"primaryKey": false,
106+
"notNull": true,
107+
"autoincrement": false
108+
},
109+
"tokensOut": {
110+
"name": "tokensOut",
111+
"type": "integer",
112+
"primaryKey": false,
113+
"notNull": true,
114+
"autoincrement": false
115+
},
116+
"tokensContext": {
117+
"name": "tokensContext",
118+
"type": "integer",
119+
"primaryKey": false,
120+
"notNull": true,
121+
"autoincrement": false
122+
},
123+
"cacheWrites": {
124+
"name": "cacheWrites",
125+
"type": "integer",
126+
"primaryKey": false,
127+
"notNull": true,
128+
"autoincrement": false
129+
},
130+
"cacheReads": {
131+
"name": "cacheReads",
132+
"type": "integer",
133+
"primaryKey": false,
134+
"notNull": true,
135+
"autoincrement": false
136+
},
137+
"cost": {
138+
"name": "cost",
139+
"type": "real",
140+
"primaryKey": false,
141+
"notNull": true,
142+
"autoincrement": false
143+
},
144+
"duration": {
145+
"name": "duration",
146+
"type": "integer",
147+
"primaryKey": false,
148+
"notNull": true,
149+
"autoincrement": false
150+
},
151+
"createdAt": {
152+
"name": "createdAt",
153+
"type": "integer",
154+
"primaryKey": false,
155+
"notNull": true,
156+
"autoincrement": false
157+
}
158+
},
159+
"indexes": {},
160+
"foreignKeys": {},
161+
"compositePrimaryKeys": {},
162+
"uniqueConstraints": {},
163+
"checkConstraints": {}
164+
},
165+
"tasks": {
166+
"name": "tasks",
167+
"columns": {
168+
"id": {
169+
"name": "id",
170+
"type": "integer",
171+
"primaryKey": true,
172+
"notNull": true,
173+
"autoincrement": true
174+
},
175+
"runId": {
176+
"name": "runId",
177+
"type": "integer",
178+
"primaryKey": false,
179+
"notNull": true,
180+
"autoincrement": false
181+
},
182+
"taskMetricsId": {
183+
"name": "taskMetricsId",
184+
"type": "integer",
185+
"primaryKey": false,
186+
"notNull": false,
187+
"autoincrement": false
188+
},
189+
"language": {
190+
"name": "language",
191+
"type": "text",
192+
"primaryKey": false,
193+
"notNull": true,
194+
"autoincrement": false
195+
},
196+
"exercise": {
197+
"name": "exercise",
198+
"type": "text",
199+
"primaryKey": false,
200+
"notNull": true,
201+
"autoincrement": false
202+
},
203+
"passed": {
204+
"name": "passed",
205+
"type": "integer",
206+
"primaryKey": false,
207+
"notNull": false,
208+
"autoincrement": false
209+
},
210+
"startedAt": {
211+
"name": "startedAt",
212+
"type": "integer",
213+
"primaryKey": false,
214+
"notNull": false,
215+
"autoincrement": false
216+
},
217+
"finishedAt": {
218+
"name": "finishedAt",
219+
"type": "integer",
220+
"primaryKey": false,
221+
"notNull": false,
222+
"autoincrement": false
223+
},
224+
"createdAt": {
225+
"name": "createdAt",
226+
"type": "integer",
227+
"primaryKey": false,
228+
"notNull": true,
229+
"autoincrement": false
230+
}
231+
},
232+
"indexes": {
233+
"tasks_language_exercise_idx": {
234+
"name": "tasks_language_exercise_idx",
235+
"columns": ["runId", "language", "exercise"],
236+
"isUnique": true
237+
}
238+
},
239+
"foreignKeys": {
240+
"tasks_runId_runs_id_fk": {
241+
"name": "tasks_runId_runs_id_fk",
242+
"tableFrom": "tasks",
243+
"tableTo": "runs",
244+
"columnsFrom": ["runId"],
245+
"columnsTo": ["id"],
246+
"onDelete": "no action",
247+
"onUpdate": "no action"
248+
},
249+
"tasks_taskMetricsId_taskMetrics_id_fk": {
250+
"name": "tasks_taskMetricsId_taskMetrics_id_fk",
251+
"tableFrom": "tasks",
252+
"tableTo": "taskMetrics",
253+
"columnsFrom": ["taskMetricsId"],
254+
"columnsTo": ["id"],
255+
"onDelete": "no action",
256+
"onUpdate": "no action"
257+
}
258+
},
259+
"compositePrimaryKeys": {},
260+
"uniqueConstraints": {},
261+
"checkConstraints": {}
262+
}
263+
},
264+
"views": {},
265+
"enums": {},
266+
"_meta": {
267+
"schemas": {},
268+
"tables": {},
269+
"columns": {}
270+
},
271+
"internal": {
272+
"indexes": {}
273+
}
274+
}

0 commit comments

Comments
 (0)