Skip to content

Commit c2e5c8d

Browse files
committed
Ironing out some kinks when running evals with high parallelism
1 parent e7a1db8 commit c2e5c8d

File tree

13 files changed

+297
-197
lines changed

13 files changed

+297
-197
lines changed

evals/apps/cli/src/index.ts

Lines changed: 96 additions & 128 deletions
Original file line numberDiff line numberDiff line change
@@ -33,13 +33,17 @@ import { IpcServer, IpcClient } from "@evals/ipc"
3333
import { __dirname, extensionDevelopmentPath, exercisesPath } from "./paths.js"
3434
import { getExercises } from "./exercises.js"
3535

36-
const maxConcurrency = 2
37-
const taskTimeLimit = 5 * 60 * 1_000
36+
type TaskResult = { success: boolean; retry: boolean }
37+
type TaskPromise = Promise<TaskResult>
38+
39+
const MAX_CONCURRENCY = 20
40+
const TASK_TIMEOUT = 5 * 60 * 1_000
41+
const UNIT_TEST_TIMEOUT = 60 * 1_000
3842

3943
const testCommands: Record<ExerciseLanguage, { commands: string[]; timeout?: number; cwd?: string }> = {
4044
go: { commands: ["go test"] }, // timeout 15s bash -c "cd '$dir' && go test > /dev/null 2>&1"
4145
java: { commands: ["./gradlew test"] }, // timeout --foreground 15s bash -c "cd '$dir' && ./gradlew test > /dev/null 2>&1"
42-
javascript: { commands: ["pnpm install", "pnpm test"], timeout: 30_000 }, // timeout 30s bash -c "cd '$dir' && pnpm install >/dev/null 2>&1 && pnpm test >/dev/null 2>&1"
46+
javascript: { commands: ["pnpm install", "pnpm test"] }, // timeout 15s bash -c "cd '$dir' && pnpm install >/dev/null 2>&1 && pnpm test >/dev/null 2>&1"
4347
python: { commands: ["uv run python3 -m pytest -o markers=task *_test.py"] }, // timeout 15s bash -c "cd '$dir' && uv run python3 -m pytest -o markers=task *_test.py"
4448
rust: { commands: ["cargo test"] }, // timeout 15s bash -c "cd '$dir' && cargo test > /dev/null 2>&1"
4549
}
@@ -107,130 +111,104 @@ const run = async (toolbox: GluegunToolbox) => {
107111
const server = new IpcServer(run.socketPath, () => {})
108112
server.listen()
109113

110-
// server.on(IpcMessageType.Connect, (clientId) => {
111-
// server.send(clientId, {
112-
// type: IpcMessageType.TaskEvent,
113-
// origin: IpcOrigin.Server,
114-
// data: { eventName: RooCodeEventName.Connect, taskId: -1 },
115-
// })
116-
// })
117-
118-
const runningPromises: Promise<void>[] = []
114+
const runningPromises: TaskPromise[] = []
119115

116+
// Retries aren't implemented yet, but the return values are set up to
117+
// support them.
120118
const processTask = async (task: Task) => {
121119
if (task.finishedAt === null) {
122-
await runExercise({ run, task, server })
120+
const { retry } = await runExercise({ run, task, server })
121+
122+
if (retry) {
123+
return { success: false, retry: true }
124+
}
123125
}
124126

125127
if (task.passed === null) {
126128
const passed = await runUnitTest({ task })
127129
await updateTask(task.id, { passed })
130+
return { success: passed, retry: false }
131+
} else {
132+
return { success: task.passed, retry: false }
128133
}
129134
}
130135

131-
for (const task of tasks) {
132-
const taskPromise = processTask(task)
133-
runningPromises.push(taskPromise)
136+
const processTaskResult = async (task: Task, promise: TaskPromise) => {
137+
const index = runningPromises.indexOf(promise)
134138

135-
taskPromise.finally(() => {
136-
const index = runningPromises.indexOf(taskPromise)
139+
if (index > -1) {
140+
runningPromises.splice(index, 1)
141+
}
142+
}
137143

138-
if (index > -1) {
139-
runningPromises.splice(index, 1)
140-
}
141-
})
144+
for (const task of tasks) {
145+
const promise = processTask(task)
146+
runningPromises.push(promise)
147+
promise.then(() => processTaskResult(task, promise))
142148

143-
if (runningPromises.length >= maxConcurrency) {
149+
if (runningPromises.length > MAX_CONCURRENCY) {
144150
await Promise.race(runningPromises)
145151
}
146152
}
147153

148154
await Promise.all(runningPromises)
149155

150156
const result = await finishRun(run.id)
151-
try {
152-
console.log("[cli#run]", result)
153-
// eslint-disable-next-line @typescript-eslint/no-unused-vars
154-
} catch (error) {
155-
// console.error(error)
156-
}
157+
console.log("[cli#run]", result)
157158

158159
console.log(await execa({ cwd: exercisesPath })`git add .`)
159160
console.log(await execa({ cwd: exercisesPath })`git commit -m ${`Run #${run.id}`} --no-verify`)
160161
}
161162

162-
const runExercise = async ({ run, task, server }: { run: Run; task: Task; server: IpcServer }) => {
163+
const runExercise = async ({ run, task, server }: { run: Run; task: Task; server: IpcServer }): TaskPromise => {
163164
const { language, exercise } = task
164165
const prompt = fs.readFileSync(path.resolve(exercisesPath, `prompts/${language}.md`), "utf-8")
165166
const dirname = path.dirname(run.socketPath)
167+
const workspacePath = path.resolve(exercisesPath, language, exercise)
166168
const taskSocketPath = path.resolve(dirname, `${dirname}/task-${task.id}.sock`)
167169

168-
const controller = new AbortController()
169-
const cancelSignal = controller.signal
170-
171170
// If debugging:
172171
// Use --wait --log trace or --verbose.
173-
const codeCommand = `code --disable-workspace-trust`
172+
// Don't await execa and store result as subprocess.
173+
// subprocess.stdout.pipe(process.stdout)
174+
175+
// Sleep for a random amount of time before opening a new VSCode window.
176+
await new Promise((resolve) => setTimeout(resolve, 1_000 + Math.random() * MAX_CONCURRENCY * 1_000))
177+
console.log(`Opening new VS Code window at ${workspacePath}`)
174178

175179
await execa({
176180
env: {
177181
ROO_CODE_IPC_SOCKET_PATH: taskSocketPath,
178182
},
179183
shell: "/bin/bash",
180-
cancelSignal,
181-
})`${codeCommand} -n ${path.resolve(exercisesPath, language, exercise)}`
184+
})`code --disable-workspace-trust -n ${workspacePath}`
182185

183-
// If debugging:
184-
// Don't await execa and store result as subprocess.
185-
// subprocess.stdout.pipe(process.stdout)
186-
187-
// Give VSCode some time to spawn before connectint to its unix socket.
188-
await new Promise((resolve) => setTimeout(resolve, 1_000))
186+
// Give VSCode some time to spawn before connecting to its unix socket.
187+
await new Promise((resolve) => setTimeout(resolve, 1_000 + Math.random() * 4_000))
189188
console.log(`Connecting to ${taskSocketPath}`)
189+
const client = new IpcClient(taskSocketPath)
190190

191-
const createClient = (taskSocketPath: string) => {
192-
const ipcClient = new IpcClient(taskSocketPath)
193-
194-
ipcClient.on(IpcMessageType.Ack, (ack) => {
195-
console.log(`[cli#runExercise | ${language} / ${exercise}] ack`, ack)
196-
})
197-
198-
return ipcClient
199-
}
200-
201-
let tries = 0
202-
let client = createClient(taskSocketPath)
203-
204-
while (++tries < 5) {
205-
try {
206-
await pWaitFor(() => client.isReady, { interval: 100, timeout: 5_000 })
207-
break
208-
} catch (error) {
209-
console.error(error)
210-
client.disconnect()
211-
client = createClient(taskSocketPath)
212-
}
191+
try {
192+
await pWaitFor(() => client.isReady, { interval: 250, timeout: 5_000 })
193+
// eslint-disable-next-line @typescript-eslint/no-unused-vars
194+
} catch (error) {
195+
console.log(`[cli#runExercise | ${language} / ${exercise}] unable to connect`)
196+
client.disconnect()
197+
return { success: false, retry: false }
213198
}
214199

215-
let isTaskFinished = false
200+
let taskStartedAt = Date.now()
201+
let taskFinishedAt: number | undefined
202+
let taskMetricsId: number | undefined
203+
let rooTaskId: string | undefined
216204
let isClientDisconnected = false
217205

218-
client.on(IpcMessageType.Disconnect, async () => {
219-
console.log(`[cli#runExercise | ${language} / ${exercise}] disconnect`)
220-
isTaskFinished = true
221-
isClientDisconnected = true
222-
})
223-
224206
const ignoreEvents: RooCodeEventName[] = [
225-
// RooCodeEventName.Message,
207+
RooCodeEventName.Message,
226208
RooCodeEventName.TaskTokenUsageUpdated,
227209
RooCodeEventName.TaskAskResponded,
228210
]
229211

230-
let taskStartedAt = Date.now()
231-
let taskMetricsId: number | undefined
232-
let rooTaskId: string | undefined
233-
234212
client.on(IpcMessageType.TaskEvent, async (taskEvent) => {
235213
const { eventName, payload } = taskEvent
236214

@@ -287,44 +265,43 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
287265
}
288266

289267
if (eventName === RooCodeEventName.TaskCompleted || eventName === RooCodeEventName.TaskAborted) {
268+
taskFinishedAt = Date.now()
290269
await updateTask(task.id, { finishedAt: new Date() })
291-
isTaskFinished = true
292270
}
293271
})
294272

295-
if (client.isReady) {
296-
client.sendMessage({
297-
type: IpcMessageType.TaskCommand,
298-
origin: IpcOrigin.Client,
299-
clientId: client.clientId!,
273+
client.on(IpcMessageType.Disconnect, async () => {
274+
console.log(`[cli#runExercise | ${language} / ${exercise}] disconnect`)
275+
isClientDisconnected = true
276+
})
277+
278+
console.log(`[cli#runExercise | ${language} / ${exercise}] starting task`)
279+
280+
client.sendMessage({
281+
type: IpcMessageType.TaskCommand,
282+
origin: IpcOrigin.Client,
283+
clientId: client.clientId!,
284+
data: {
285+
commandName: TaskCommandName.StartNewTask,
300286
data: {
301-
commandName: TaskCommandName.StartNewTask,
302-
data: {
303-
configuration: {
304-
...rooCodeDefaults,
305-
openRouterApiKey: process.env.OPENROUTER_API_KEY!,
306-
...run.settings,
307-
},
308-
text: prompt,
309-
newTab: true,
287+
configuration: {
288+
...rooCodeDefaults,
289+
openRouterApiKey: process.env.OPENROUTER_API_KEY!,
290+
...run.settings,
310291
},
292+
text: prompt,
293+
newTab: true,
311294
},
312-
})
313-
314-
console.log(`[cli#runExercise | ${language} / ${exercise}] starting task`)
315-
} else {
316-
console.log(`[cli#runExercise | ${language} / ${exercise}] unable to connect`)
317-
client.disconnect()
318-
isTaskFinished = true
319-
isClientDisconnected = true
320-
}
295+
},
296+
})
321297

322298
try {
323-
await pWaitFor(() => isTaskFinished, { interval: 1_000, timeout: taskTimeLimit })
299+
await pWaitFor(() => !!taskFinishedAt || isClientDisconnected, { interval: 1_000, timeout: TASK_TIMEOUT })
324300
// eslint-disable-next-line @typescript-eslint/no-unused-vars
325301
} catch (error) {
326302
console.log(`[cli#runExercise | ${language} / ${exercise}] time limit reached`)
327303

304+
// Cancel the task.
328305
if (rooTaskId && !isClientDisconnected) {
329306
client.sendMessage({
330307
type: IpcMessageType.TaskCommand,
@@ -333,35 +310,28 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
333310
data: { commandName: TaskCommandName.CancelTask, data: rooTaskId },
334311
})
335312

336-
await new Promise((resolve) => setTimeout(resolve, 2_000))
313+
// Give the server some time to cancel the task.
314+
await new Promise((resolve) => setTimeout(resolve, 5_000))
337315
}
338316

317+
// TODO: Notify clients that the task timed out.
339318
await updateTask(task.id, { finishedAt: new Date() })
340319
}
341320

342321
if (!isClientDisconnected) {
343-
try {
344-
if (rooTaskId) {
345-
client.sendMessage({
346-
type: IpcMessageType.TaskCommand,
347-
origin: IpcOrigin.Client,
348-
clientId: client.clientId!,
349-
data: { commandName: TaskCommandName.CloseTask, data: rooTaskId },
350-
})
351-
}
352-
353-
client.disconnect()
354-
} catch (error) {
355-
console.error(error)
322+
if (rooTaskId) {
323+
client.sendMessage({
324+
type: IpcMessageType.TaskCommand,
325+
origin: IpcOrigin.Client,
326+
clientId: client.clientId!,
327+
data: { commandName: TaskCommandName.CloseTask, data: rooTaskId },
328+
})
356329
}
330+
331+
client.disconnect()
357332
}
358333

359-
// try {
360-
// console.log(`[cli#runExercise | ${language} / ${exercise}] aborting subprocess`)
361-
// controller.abort()
362-
// await subprocess
363-
// } catch (error) {
364-
// }
334+
return { success: !!taskFinishedAt, retry: false }
365335
}
366336

367337
const runUnitTest = async ({ task }: { task: Task }) => {
@@ -373,15 +343,13 @@ const runUnitTest = async ({ task }: { task: Task }) => {
373343
let passed = true
374344

375345
for (const command of commands) {
376-
// const controller = new AbortController()
377-
// const cancelSignal = controller.signal
378-
// const timeout = setTimeout(() => controller.abort(), cmd.timeout ?? 15_000)
346+
const controller = new AbortController()
347+
const cancelSignal = controller.signal
348+
const timeout = setTimeout(() => controller.abort(), cmd.timeout ?? UNIT_TEST_TIMEOUT)
379349

380350
try {
381-
const result = await execa({ cwd, shell: true, reject: false /* , cancelSignal */ })`${command}`
382-
// console.log('[cli#run] execa result =', { ...result, cwd, command })
383-
384-
// clearTimeout(timeout)
351+
const result = await execa({ cwd, shell: true, reject: false, cancelSignal })`${command}`
352+
clearTimeout(timeout)
385353

386354
if (result.failed) {
387355
passed = false

evals/apps/web/src/app/runs/[id]/run.tsx

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"use client"
22

3-
import { useState, useRef, useEffect } from "react"
3+
import { useState, useRef } from "react"
44
import { LoaderCircle, SquareTerminal } from "lucide-react"
55

66
import * as db from "@evals/db"
@@ -13,7 +13,6 @@ import {
1313
DrawerHeader,
1414
DrawerTitle,
1515
ScrollArea,
16-
Separator,
1716
Table,
1817
TableBody,
1918
TableCell,
@@ -30,19 +29,6 @@ export function Run({ run }: { run: db.Run }) {
3029
const scrollAreaRef = useRef<HTMLDivElement>(null)
3130
const [selectedTask, setSelectedTask] = useState<db.Task>()
3231

33-
useEffect(() => {
34-
if (selectedTask) {
35-
const scrollArea = scrollAreaRef.current
36-
37-
if (scrollArea) {
38-
scrollArea.scrollTo({
39-
top: scrollArea.scrollHeight,
40-
behavior: "smooth",
41-
})
42-
}
43-
}
44-
}, [selectedTask, outputCounts])
45-
4632
return (
4733
<>
4834
<div>

0 commit comments

Comments
 (0)