Skip to content

Commit 8b36711

Browse files
committed
More progress
1 parent b123075 commit 8b36711

File tree

25 files changed

+548
-480
lines changed

25 files changed

+548
-480
lines changed

benchmark/apps/cli/package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@
1111
"dependencies": {
1212
"@benchmark/db": "workspace:^",
1313
"@vscode/test-electron": "^2.4.0",
14-
"gluegun": "^5.1.2"
14+
"gluegun": "^5.1.2",
15+
"p-map": "^7.0.3"
1516
},
1617
"devDependencies": {
1718
"@benchmark/eslint-config": "workspace:^",

benchmark/apps/cli/src/index.ts

Lines changed: 28 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -2,19 +2,11 @@ import * as fs from "fs"
22
import * as path from "path"
33
import * as os from "os"
44

5+
import pMap from "p-map"
56
import { build, filesystem, GluegunPrompt, GluegunToolbox } from "gluegun"
67
import { runTests } from "@vscode/test-electron"
78

8-
import {
9-
type Language,
10-
languages,
11-
type Run,
12-
findRun,
13-
createRun,
14-
getPendingTask,
15-
createPendingTask,
16-
getTask,
17-
} from "@benchmark/db"
9+
import { type Language, languages, type Run, findRun, createRun, getTask, createTask, Task } from "@benchmark/db"
1810

1911
import { __dirname, extensionDevelopmentPath, extensionTestsPath, exercisesPath } from "./paths.js"
2012
import { getExercises } from "./exercises.js"
@@ -41,34 +33,33 @@ const run = async (toolbox: GluegunToolbox) => {
4133

4234
const runAll = async (id?: number) => {
4335
const run = await findOrCreateRun({ id })
44-
const exercises = getExercises()
45-
46-
for (const [language, languageExercises] of Object.entries(exercises)) {
47-
await Promise.all(
48-
languageExercises.map((exercise) =>
49-
findOrCreatePendingTask({ runId: run.id, language: language as Language, exercise }),
50-
),
51-
)
52-
}
5336

54-
for (const [language, languageExercises] of Object.entries(exercises)) {
55-
for (const exercise of languageExercises) {
56-
await runExercise({ run, language: language as Language, exercise })
57-
}
37+
const entries = Object.entries(getExercises()).flatMap(([language, languageExercises]) =>
38+
languageExercises.map((exercise) => ({ language: language as Language, exercise })),
39+
)
40+
41+
const tasks = await pMap(
42+
entries,
43+
async ({ language, exercise }) => findOrCreateTask({ runId: run.id, language, exercise }),
44+
{ concurrency: 10 },
45+
)
46+
47+
for (const task of tasks) {
48+
await runExercise({ run, task })
5849
}
5950
}
6051

6152
const runLanguage = async ({ id, language }: { id?: number; language: Language }) => {
6253
const run = await findOrCreateRun({ id })
63-
const exercises = getExercises()
64-
const languageExercises = exercises[language]
6554

66-
await Promise.all(
67-
languageExercises.map((exercise) => findOrCreatePendingTask({ runId: run.id, language, exercise })),
55+
const tasks = await pMap(
56+
getExercises()[language],
57+
async (exercise) => findOrCreateTask({ runId: run.id, language, exercise }),
58+
{ concurrency: 10 },
6859
)
6960

70-
for (const exercise of languageExercises) {
71-
await runExercise({ run, language, exercise })
61+
for (const task of tasks) {
62+
await runExercise({ run, task })
7263
}
7364
}
7465

@@ -82,21 +73,20 @@ const runLanguageExercise = async ({
8273
exercise: string
8374
}) => {
8475
const run = await findOrCreateRun({ id })
85-
await findOrCreatePendingTask({ runId: run.id, language, exercise })
86-
return runExercise({ run, language, exercise })
76+
const task = await findOrCreateTask({ runId: run.id, language, exercise })
77+
return runExercise({ run, task })
8778
}
8879

89-
const runExercise = async ({ run, language, exercise }: { run: Run; language: Language; exercise: string }) => {
80+
const runExercise = async ({ run, task }: { run: Run; task: Task }) => {
81+
const { language, exercise } = task
9082
const workspacePath = path.resolve(exercisesPath, language, exercise)
9183
const promptPath = path.resolve(exercisesPath, `prompts/${language}.md`)
9284

9385
if (!fs.existsSync(promptPath)) {
9486
throw new Error(`Prompt file does not exist: ${promptPath}`)
9587
}
9688

97-
const task = await getTask({ runId: run.id, language, exercise })
98-
99-
if (task) {
89+
if (task.finishedAt) {
10090
console.log(`Test result exists for ${language} / ${exercise}, skipping`)
10191
return false
10292
}
@@ -108,7 +98,7 @@ const runExercise = async ({ run, language, exercise }: { run: Run; language: La
10898
extensionTestsPath,
10999
launchArgs: [workspacePath, "--disable-extensions"],
110100
extensionTestsEnv: {
111-
RUN_ID: run.id.toString(),
101+
TASK_ID: task.id.toString(),
112102
LANGUAGE: language,
113103
EXERCISE: exercise,
114104
PROMPT_PATH: promptPath,
@@ -157,15 +147,15 @@ const findOrCreateRun = async ({ id, model = "anthropic/claude-3.7-sonnet" }: {
157147
socketPath: path.resolve(os.tmpdir(), `benchmark-${crypto.randomUUID()}.sock`),
158148
})
159149

160-
const findOrCreatePendingTask = async ({
150+
const findOrCreateTask = async ({
161151
runId,
162152
language,
163153
exercise,
164154
}: {
165155
runId: number
166156
language: Language
167157
exercise: string
168-
}) => (await getPendingTask({ runId, language, exercise })) || (await createPendingTask({ runId, language, exercise }))
158+
}) => (await getTask({ runId, language, exercise })) || (await createTask({ runId, language, exercise }))
169159

170160
const main = async () => {
171161
const cli = build()

benchmark/apps/web/src/app/home.tsx

Lines changed: 28 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,17 @@
33
import { useRouter } from "next/navigation"
44
import { Rocket } from "lucide-react"
55

6-
import { getRuns } from "@benchmark/db"
6+
import type { Run, TaskMetrics } from "@benchmark/db"
77

88
import { formatCurrency, formatDuration } from "@/lib"
99
import { Button, Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui"
10+
import { useMemo } from "react"
1011

11-
type Run = Awaited<ReturnType<typeof getRuns>>[number]
12-
13-
export function Home({ runs }: { runs: Run[] }) {
12+
export function Home({ runs }: { runs: (Run & { taskMetrics: TaskMetrics | null })[] }) {
1413
const router = useRouter()
1514

15+
const visibleRuns = useMemo(() => runs.filter((run) => run.taskMetrics !== null), [runs])
16+
1617
return (
1718
<>
1819
<Table className="border">
@@ -29,18 +30,30 @@ export function Home({ runs }: { runs: Run[] }) {
2930
</TableRow>
3031
</TableHeader>
3132
<TableBody>
32-
{runs.map((run) => (
33-
<TableRow key={run.id}>
34-
<TableCell>{run.id}</TableCell>
35-
<TableCell>{run.model}</TableCell>
36-
<TableCell>{new Date(run.createdAt).toLocaleString()}</TableCell>
37-
<TableCell>{run.passed}</TableCell>
38-
<TableCell>{run.failed}</TableCell>
39-
<TableCell>{(run.rate * 100).toFixed(1)}%</TableCell>
40-
<TableCell>{formatCurrency(run.cost)}</TableCell>
41-
<TableCell>{formatDuration(run.duration)}</TableCell>
33+
{visibleRuns.length ? (
34+
visibleRuns.map(({ taskMetrics, ...run }) => (
35+
<TableRow key={run.id}>
36+
<TableCell>{run.id}</TableCell>
37+
<TableCell>{run.model}</TableCell>
38+
<TableCell>{new Date(run.createdAt).toLocaleString()}</TableCell>
39+
<TableCell>{run.passed}</TableCell>
40+
<TableCell>{run.failed}</TableCell>
41+
<TableCell>{((run.passed / (run.passed + run.failed)) * 100).toFixed(1)}%</TableCell>
42+
<TableCell>{formatCurrency(taskMetrics!.cost)}</TableCell>
43+
<TableCell>{formatDuration(taskMetrics!.duration)}</TableCell>
44+
</TableRow>
45+
))
46+
) : (
47+
<TableRow>
48+
<TableCell colSpan={8} className="text-center">
49+
No benchmark runs yet.
50+
<Button variant="link" onClick={() => router.push("/runs/new")}>
51+
Launch
52+
</Button>
53+
one now.
54+
</TableCell>
4255
</TableRow>
43-
))}
56+
)}
4457
</TableBody>
4558
</Table>
4659
<Button
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
"use server"
2+
3+
import { revalidatePath } from "next/cache"
4+
5+
import * as db from "@benchmark/db"
6+
7+
/**
 * Server action that loads the tasks recorded for a benchmark run.
 *
 * Once the lookup has resolved, the `/runs/<runId>` route is passed to
 * `revalidatePath` before the tasks are handed back to the caller.
 *
 * @param runId - Identifier of the run whose tasks are requested.
 * @returns Whatever `db.getTasks` resolves to for that run.
 */
export async function getTasks(runId: number) {
  const runTasks = await db.getTasks(runId)
  const runPath = `/runs/${runId}`

  revalidatePath(runPath)

  return runTasks
}
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import { EventSourceStatus } from "@/hooks/use-event-source"
2+
import { cn } from "@/lib/utils"
3+
4+
/** Props accepted by `ConnectionStatus`. */
type ConnectionStatusProps = {
  /** Current event-source state; shown as text and used to pick the dot colour. */
  status: EventSourceStatus
  /** Optional client identifier; the "Client" row is rendered only when it is truthy. */
  clientId?: string
  /** Value shown in the "PID" row; may be null. */
  pid: number | null
}

/**
 * Small connection readout: a monospace column with the status, PID and
 * (optionally) client id, followed by a coloured dot with a pinging halo.
 *
 * The dot is green for "connected", amber for "waiting" and rose for "error";
 * for any other status no background colour class is applied.
 */
export const ConnectionStatus = ({ status, clientId, pid }: ConnectionStatusProps) => {
  // The halo and the solid dot always share one colour, so describe it once.
  const dotColor = {
    "bg-green-500": status === "connected",
    "bg-amber-500": status === "waiting",
    "bg-rose-500": status === "error",
  }

  const haloClassName = cn("absolute size-2.5 rounded-full opacity-50 animate-ping", dotColor)
  const dotClassName = cn("size-2.5 rounded-full", dotColor)

  return (
    <div className="flex items-center">
      <div className="flex flex-col items-end gap-1 font-mono text-xs border-r border-dotted pr-4 mr-4">
        <div>
          Status: <span className="capitalize">{status}</span>
        </div>
        <div>PID: {pid}</div>
        {clientId && <div>Client: {clientId}</div>}
      </div>
      <div className="relative">
        <div className={haloClassName} />
        <div className={dotClassName} />
      </div>
    </div>
  )
}
Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,9 @@
1-
import { findRun, getTasks, getPendingTasks } from "@benchmark/db"
1+
import { findRun } from "@benchmark/db"
22

3-
import { ShowRun } from "./show-run"
3+
import { Run } from "./run"
44

55
export default async function Page({ params }: { params: Promise<{ id: string }> }) {
66
const { id } = await params
77
const run = await findRun(Number(id))
8-
const tasks = await getTasks(run.id)
9-
const pendingTasks = await getPendingTasks(run.id)
10-
11-
if (!run) {
12-
return <div>Run not found</div>
13-
}
14-
15-
return <ShowRun run={{ ...run, tasks, pendingTasks }} />
8+
return <Run run={run} />
169
}
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
"use client"
2+
3+
import { LoaderCircle } from "lucide-react"
4+
5+
import * as db from "@benchmark/db"
6+
7+
import { useRunStatus } from "./use-run-status"
8+
import { TaskStatus } from "./task-status"
9+
import { ConnectionStatus } from "./connection-status"
10+
11+
/**
 * Client-side view of a single benchmark run.
 *
 * Shows the run header (id, model and, when present, description), then
 * either a spinner while `useRunStatus` has not produced a task list yet or
 * one row per task. A `ConnectionStatus` badge is pinned to the top-right
 * corner.
 *
 * @param run - The run record to display; also handed to `useRunStatus`.
 */
export function Run({ run }: { run: db.Run }) {
  const { tasks, status, clientId, runningTaskId } = useRunStatus(run)

  // A falsy task list is treated as "still loading" and rendered as a spinner.
  const taskRows = tasks ? (
    tasks.map((task) => (
      <div key={task.id} className="flex items-center gap-2">
        <TaskStatus task={task} runningTaskId={runningTaskId} />
        <div>
          {task.language}/{task.exercise}
        </div>
      </div>
    ))
  ) : (
    <LoaderCircle className="size-4 animate-spin" />
  )

  return (
    <>
      <div className="flex flex-col gap-2">
        <div className="border-b mb-2 pb-2">
          <div>Run #{run.id}</div>
          <div>{run.model}</div>
          {run.description && <div className="text-sm text-muted-foreground">{run.description}</div>}
        </div>
        {taskRows}
      </div>
      <div className="absolute top-5 right-5">
        <ConnectionStatus status={status} clientId={clientId} pid={run.pid} />
      </div>
    </>
  )
}
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
import { z } from "zod"
2+
3+
/**
 * Generic message envelope: a `type` of either "hello" or "data" plus a
 * `data` record with string keys and unvalidated values.
 */
export const messageSchema = z.object({
  type: z.enum(["hello", "data"]),
  // Values are left as `unknown`; consumers must narrow them before use.
  data: z.record(z.string(), z.unknown()),
})
7+
8+
/** Minimal task reference carried by every task event: just the numeric `id`. */
export const taskSchema = z.object({ id: z.number() })
11+
12+
// Payload of a "message" event: which task it belongs to, whether the message
// was created or updated, and the message itself.
const taskMessagePayloadSchema = z.object({
  taskId: z.string(),
  action: z.enum(["created", "updated"]),
  message: z.object({
    ts: z.number(),
    type: z.enum(["ask", "say"]),
    text: z.string(),
    partial: z.boolean().optional(),
  }),
})

// Payload of a "taskTokenUsageUpdated" event; the cache counters are optional.
const tokenUsagePayloadSchema = z.object({
  totalTokensIn: z.number(),
  totalTokensOut: z.number(),
  totalCacheWrites: z.number().optional(),
  totalCacheReads: z.number().optional(),
  totalCost: z.number(),
  contextTokens: z.number(),
})

/**
 * Union of all task events, discriminated on the `event` field.
 *
 * Every variant carries a `task` reference; "message" additionally carries a
 * `message` payload and "taskTokenUsageUpdated" a `usage` payload.
 */
export const taskEventSchema = z.discriminatedUnion("event", [
  z.object({ event: z.literal("client"), task: taskSchema }),
  z.object({ event: z.literal("taskStarted"), task: taskSchema }),
  z.object({ event: z.literal("taskFinished"), task: taskSchema }),
  z.object({ event: z.literal("message"), task: taskSchema, message: taskMessagePayloadSchema }),
  z.object({ event: z.literal("taskTokenUsageUpdated"), task: taskSchema, usage: tokenUsagePayloadSchema }),
])

0 commit comments

Comments
 (0)