Skip to content

Commit ef9b339

Browse files
authored
Evals improvements (#2555)
* Evals improvements
* Remove debugging
1 parent d3c65ce commit ef9b339

File tree

14 files changed

+214
-157
lines changed

14 files changed

+214
-157
lines changed

evals/apps/cli/package.json

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,12 @@
1616
"execa": "^9.5.2",
1717
"gluegun": "^5.1.2",
1818
"p-map": "^7.0.3",
19-
"p-wait-for": "^5.0.2"
19+
"p-wait-for": "^5.0.2",
20+
"ps-tree": "^1.2.0"
2021
},
2122
"devDependencies": {
2223
"@evals/eslint-config": "workspace:^",
23-
"@evals/typescript-config": "workspace:^"
24+
"@evals/typescript-config": "workspace:^",
25+
"@types/ps-tree": "^1.1.6"
2426
}
2527
}

evals/apps/cli/src/index.ts

Lines changed: 76 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import pMap from "p-map"
66
import pWaitFor from "p-wait-for"
77
import { execa, parseCommandString } from "execa"
88
import { build, filesystem, GluegunPrompt, GluegunToolbox } from "gluegun"
9+
import psTree from "ps-tree"
910

1011
import {
1112
type ExerciseLanguage,
@@ -36,8 +37,9 @@ import { getExercises } from "./exercises.js"
3637
type TaskResult = { success: boolean; retry: boolean }
3738
type TaskPromise = Promise<TaskResult>
3839

39-
const TASK_TIMEOUT = 10 * 60 * 1_000
40-
const UNIT_TEST_TIMEOUT = 60 * 1_000
40+
const TASK_START_DELAY = 10 * 1_000
41+
const TASK_TIMEOUT = 5 * 60 * 1_000
42+
const UNIT_TEST_TIMEOUT = 2 * 60 * 1_000
4143

4244
const testCommands: Record<ExerciseLanguage, { commands: string[]; timeout?: number; cwd?: string }> = {
4345
go: { commands: ["go test"] }, // timeout 15s bash -c "cd '$dir' && go test > /dev/null 2>&1"
@@ -98,13 +100,11 @@ const run = async (toolbox: GluegunToolbox) => {
98100
throw new Error("No tasks found.")
99101
}
100102

101-
console.log(await execa({ cwd: exercisesPath })`git config user.name "Roo Code"`)
102-
console.log(await execa({ cwd: exercisesPath })`git config user.email "[email protected]"`)
103-
console.log(await execa({ cwd: exercisesPath })`git checkout -f`)
104-
console.log(await execa({ cwd: exercisesPath })`git clean -fd`)
105-
console.log(
106-
await execa({ cwd: exercisesPath })`git checkout -b runs/${run.id}-${crypto.randomUUID().slice(0, 8)} main`,
107-
)
103+
await execa({ cwd: exercisesPath })`git config user.name "Roo Code"`
104+
await execa({ cwd: exercisesPath })`git config user.email "[email protected]"`
105+
await execa({ cwd: exercisesPath })`git checkout -f`
106+
await execa({ cwd: exercisesPath })`git clean -fd`
107+
await execa({ cwd: exercisesPath })`git checkout -b runs/${run.id}-${crypto.randomUUID().slice(0, 8)} main`
108108

109109
fs.writeFileSync(
110110
path.resolve(exercisesPath, "settings.json"),
@@ -145,11 +145,11 @@ const run = async (toolbox: GluegunToolbox) => {
145145
}
146146
}
147147

148-
let delay = 0
148+
let delay = TASK_START_DELAY
149149

150150
for (const task of tasks) {
151151
const promise = processTask(task, delay)
152-
delay = delay + 5_000
152+
delay = delay + TASK_START_DELAY
153153
runningPromises.push(promise)
154154
promise.then(() => processTaskResult(task, promise))
155155

@@ -162,10 +162,10 @@ const run = async (toolbox: GluegunToolbox) => {
162162
await Promise.all(runningPromises)
163163

164164
const result = await finishRun(run.id)
165-
console.log("[cli#run]", result)
165+
console.log(`${Date.now()} [cli#run]`, result)
166166

167-
console.log(await execa({ cwd: exercisesPath })`git add .`)
168-
console.log(await execa({ cwd: exercisesPath })`git commit -m ${`Run #${run.id}`} --no-verify`)
167+
await execa({ cwd: exercisesPath })`git add .`
168+
await execa({ cwd: exercisesPath })`git commit -m ${`Run #${run.id}`} --no-verify`
169169
}
170170

171171
const runExercise = async ({ run, task, server }: { run: Run; task: Task; server: IpcServer }): TaskPromise => {
@@ -180,9 +180,7 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
180180
// Don't await execa and store result as subprocess.
181181
// subprocess.stdout.pipe(process.stdout)
182182

183-
// Sleep for a random amount of time before opening a new VSCode window.
184-
await new Promise((resolve) => setTimeout(resolve, 1_000 + Math.random() * 5_000))
185-
console.log(`Opening new VS Code window at ${workspacePath}`)
183+
console.log(`${Date.now()} [cli#runExercise] Opening new VS Code window at ${workspacePath}`)
186184

187185
await execa({
188186
env: {
@@ -192,15 +190,15 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
192190
})`code --disable-workspace-trust -n ${workspacePath}`
193191

194192
// Give VSCode some time to spawn before connecting to its unix socket.
195-
await new Promise((resolve) => setTimeout(resolve, 1_000 + Math.random() * 4_000))
196-
console.log(`Connecting to ${taskSocketPath}`)
193+
await new Promise((resolve) => setTimeout(resolve, 3_000))
194+
console.log(`${Date.now()} [cli#runExercise] Connecting to ${taskSocketPath}`)
197195
const client = new IpcClient(taskSocketPath)
198196

199197
try {
200198
await pWaitFor(() => client.isReady, { interval: 250, timeout: 5_000 })
201199
// eslint-disable-next-line @typescript-eslint/no-unused-vars
202200
} catch (error) {
203-
console.log(`[cli#runExercise | ${language} / ${exercise}] unable to connect`)
201+
console.log(`${Date.now()} [cli#runExercise | ${language} / ${exercise}] unable to connect`)
204202
client.disconnect()
205203
return { success: false, retry: false }
206204
}
@@ -220,16 +218,20 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
220218
client.on(IpcMessageType.TaskEvent, async (taskEvent) => {
221219
const { eventName, payload } = taskEvent
222220

223-
server.broadcast({
224-
type: IpcMessageType.TaskEvent,
225-
origin: IpcOrigin.Server,
226-
relayClientId: client.clientId!,
227-
data: { ...taskEvent, taskId: task.id },
228-
})
221+
if (taskEvent.eventName !== RooCodeEventName.Message) {
222+
server.broadcast({
223+
type: IpcMessageType.TaskEvent,
224+
origin: IpcOrigin.Server,
225+
relayClientId: client.clientId!,
226+
data: { ...taskEvent, taskId: task.id },
227+
})
228+
}
229229

230230
if (!ignoreEvents.includes(eventName)) {
231-
console.log(`[cli#runExercise | ${language} / ${exercise}] taskEvent -> ${eventName}`)
232-
console.log(payload)
231+
console.log(
232+
`${Date.now()} [cli#runExercise | ${language} / ${exercise}] taskEvent -> ${eventName}`,
233+
payload,
234+
)
233235
}
234236

235237
if (eventName === RooCodeEventName.TaskStarted) {
@@ -279,11 +281,11 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
279281
})
280282

281283
client.on(IpcMessageType.Disconnect, async () => {
282-
console.log(`[cli#runExercise | ${language} / ${exercise}] disconnect`)
284+
console.log(`${Date.now()} [cli#runExercise | ${language} / ${exercise}] disconnect`)
283285
isClientDisconnected = true
284286
})
285287

286-
console.log(`[cli#runExercise | ${language} / ${exercise}] starting task`)
288+
console.log(`${Date.now()} [cli#runExercise | ${language} / ${exercise}] starting task`)
287289

288290
client.sendMessage({
289291
type: IpcMessageType.TaskCommand,
@@ -307,7 +309,7 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
307309
await pWaitFor(() => !!taskFinishedAt || isClientDisconnected, { interval: 1_000, timeout: TASK_TIMEOUT })
308310
// eslint-disable-next-line @typescript-eslint/no-unused-vars
309311
} catch (error) {
310-
console.log(`[cli#runExercise | ${language} / ${exercise}] time limit reached`)
312+
console.log(`${Date.now()} [cli#runExercise | ${language} / ${exercise}] time limit reached`)
311313

312314
// Cancel the task.
313315
if (rooTaskId && !isClientDisconnected) {
@@ -351,17 +353,56 @@ const runUnitTest = async ({ task }: { task: Task }) => {
351353
let passed = true
352354

353355
for (const command of commands) {
354-
const timeout = cmd.timeout ?? UNIT_TEST_TIMEOUT
355-
356356
try {
357-
const result = await execa({ cwd, shell: true, reject: false, timeout })`${command}`
357+
console.log(
358+
`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}] running "${command.join(" ")}"`,
359+
)
360+
const subprocess = execa({ cwd, shell: true, reject: false })`${command}`
361+
362+
const timeout = setTimeout(async () => {
363+
const descendants = await new Promise<number[]>((resolve, reject) => {
364+
psTree(subprocess.pid!, (err, children) => {
365+
if (err) {
366+
reject(err)
367+
}
368+
369+
resolve(children.map((p) => parseInt(p.PID)))
370+
})
371+
})
372+
373+
if (descendants.length > 0) {
374+
try {
375+
console.log(
376+
`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}] killing ${descendants.join(" ")}`,
377+
)
378+
379+
await execa`kill -9 ${descendants.join(" ")}`
380+
} catch (error) {
381+
console.error("Error killing descendant processes:", error)
382+
}
383+
}
384+
385+
console.log(
386+
`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}] killing ${subprocess.pid}`,
387+
)
388+
389+
await execa`kill -9 ${subprocess.pid!}`
390+
}, UNIT_TEST_TIMEOUT)
391+
392+
const result = await subprocess
393+
394+
console.log(
395+
`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}] "${command.join(" ")}" result -> ${JSON.stringify(result)}`,
396+
)
397+
398+
clearTimeout(timeout)
358399

359400
if (result.failed) {
360401
passed = false
361402
break
362403
}
363404
} catch (error) {
364-
console.log("[cli#runUnitTest]", error)
405+
console.log(`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}]`, error)
365406
passed = false
366407
break
367408
}
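
evals/apps/web/src/app/runs/[id]/run.tsx (file header missing from capture; path inferred from the exported `Run` component and its `./task-status` import)

Lines changed: 41 additions & 56 deletions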
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,44 @@
11
"use client"
22

3-
import { useState, useRef } from "react"
4-
import { LoaderCircle, SquareTerminal } from "lucide-react"
3+
import { useMemo } from "react"
4+
import { LoaderCircle } from "lucide-react"
55

66
import * as db from "@evals/db"
77

88
import { formatCurrency, formatDuration, formatTokens } from "@/lib"
99
import { useRunStatus } from "@/hooks/use-run-status"
10-
import {
11-
Drawer,
12-
DrawerContent,
13-
DrawerHeader,
14-
DrawerTitle,
15-
ScrollArea,
16-
Table,
17-
TableBody,
18-
TableCell,
19-
TableHead,
20-
TableHeader,
21-
TableRow,
22-
} from "@/components/ui"
10+
import { Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui"
2311

2412
import { TaskStatus } from "./task-status"
2513
import { ConnectionStatus } from "./connection-status"
2614

15+
type TaskMetrics = Pick<db.TaskMetrics, "tokensIn" | "tokensOut" | "tokensContext" | "duration" | "cost">
16+
2717
export function Run({ run }: { run: db.Run }) {
28-
const { tasks, status, output, outputCounts } = useRunStatus(run)
29-
const scrollAreaRef = useRef<HTMLDivElement>(null)
30-
const [selectedTask, setSelectedTask] = useState<db.Task>()
18+
const { tasks, status, tokenUsage, usageUpdatedAt } = useRunStatus(run)
19+
20+
const taskMetrics: Record<number, TaskMetrics> = useMemo(() => {
21+
const metrics: Record<number, TaskMetrics> = {}
22+
23+
tasks?.forEach((task) => {
24+
const usage = tokenUsage.get(task.id)
25+
26+
if (task.finishedAt && task.taskMetrics) {
27+
metrics[task.id] = task.taskMetrics
28+
} else if (usage) {
29+
metrics[task.id] = {
30+
tokensIn: usage.totalTokensIn,
31+
tokensOut: usage.totalTokensOut,
32+
tokensContext: usage.contextTokens,
33+
duration: usage.duration ?? 0,
34+
cost: usage.totalCost,
35+
}
36+
}
37+
})
38+
39+
return metrics
40+
// eslint-disable-next-line react-hooks/exhaustive-deps
41+
}, [tasks, tokenUsage, usageUpdatedAt])
3142

3243
return (
3344
<>
@@ -57,38 +68,33 @@ export function Run({ run }: { run: db.Run }) {
5768
<TableRow key={task.id}>
5869
<TableCell>
5970
<div className="flex items-center gap-2">
60-
<TaskStatus task={task} />
71+
<TaskStatus
72+
task={task}
73+
running={!!task.startedAt || !!tokenUsage.get(task.id)}
74+
/>
6175
<div>
6276
{task.language}/{task.exercise}
6377
</div>
64-
{(outputCounts[task.id] ?? 0) > 0 && (
65-
<div
66-
className="flex items-center gap-1 cursor-pointer"
67-
onClick={() => setSelectedTask(task)}>
68-
<SquareTerminal className="size-4" />
69-
<div className="font-mono text-xs text-foreground/50">
70-
{outputCounts[task.id]}
71-
</div>
72-
</div>
73-
)}
7478
</div>
7579
</TableCell>
76-
{task.taskMetrics ? (
80+
{taskMetrics[task.id] ? (
7781
<>
7882
<TableCell className="font-mono text-xs">
7983
<div className="flex items-center justify-evenly">
80-
<div>{formatTokens(task.taskMetrics.tokensIn)}</div>/
81-
<div>{formatTokens(task.taskMetrics.tokensOut)}</div>
84+
<div>{formatTokens(taskMetrics[task.id]!.tokensIn)}</div>/
85+
<div>{formatTokens(taskMetrics[task.id]!.tokensOut)}</div>
8286
</div>
8387
</TableCell>
8488
<TableCell className="font-mono text-xs">
85-
{formatTokens(task.taskMetrics.tokensContext)}
89+
{formatTokens(taskMetrics[task.id]!.tokensContext)}
8690
</TableCell>
8791
<TableCell className="font-mono text-xs">
88-
{formatDuration(task.taskMetrics.duration)}
92+
{taskMetrics[task.id]!.duration
93+
? formatDuration(taskMetrics[task.id]!.duration)
94+
: "-"}
8995
</TableCell>
9096
<TableCell className="font-mono text-xs">
91-
{formatCurrency(task.taskMetrics.cost)}
97+
{formatCurrency(taskMetrics[task.id]!.cost)}
9298
</TableCell>
9399
</>
94100
) : (
@@ -100,27 +106,6 @@ export function Run({ run }: { run: db.Run }) {
100106
</Table>
101107
)}
102108
</div>
103-
<Drawer open={!!selectedTask} onOpenChange={() => setSelectedTask(undefined)}>
104-
<DrawerContent>
105-
<div className="mx-auto w-full max-w-2xl">
106-
<DrawerHeader>
107-
<DrawerTitle>
108-
{selectedTask?.language}/{selectedTask?.exercise}
109-
</DrawerTitle>
110-
</DrawerHeader>
111-
<div className="font-mono text-xs pb-12">
112-
{selectedTask && (
113-
<ScrollArea viewportRef={scrollAreaRef} className="h-96 rounded-sm border">
114-
<div className="p-4">
115-
<h4 className="mb-4 text-sm font-medium leading-none">Tags</h4>
116-
{output.get(selectedTask.id)?.map((line, i) => <div key={i}>{line}</div>)}
117-
</div>
118-
</ScrollArea>
119-
)}
120-
</div>
121-
</div>
122-
</DrawerContent>
123-
</Drawer>
124109
</>
125110
)
126111
}

evals/apps/web/src/app/runs/[id]/task-status.tsx

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,15 @@ import { type Task } from "@evals/db"
44

55
type TaskStatusProps = {
66
task: Task
7+
running: boolean
78
}
89

9-
export const TaskStatus = ({ task }: TaskStatusProps) => {
10+
export const TaskStatus = ({ task, running }: TaskStatusProps) => {
1011
return task.passed === false ? (
1112
<CircleSlash className="size-4 text-destructive" />
1213
) : task.passed === true ? (
1314
<CircleCheck className="size-4 text-green-500" />
14-
) : task.startedAt ? (
15-
<LoaderCircle className="size-4 animate-spin" />
16-
) : task.finishedAt ? (
15+
) : running ? (
1716
<LoaderCircle className="size-4 animate-spin" />
1817
) : (
1918
<CircleDashed className="size-4" />

0 commit comments

Comments
 (0)