Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
223 changes: 94 additions & 129 deletions evals/apps/cli/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,17 @@ import { IpcServer, IpcClient } from "@evals/ipc"
import { __dirname, extensionDevelopmentPath, exercisesPath } from "./paths.js"
import { getExercises } from "./exercises.js"

const maxConcurrency = 2
const taskTimeLimit = 5 * 60 * 1_000
type TaskResult = { success: boolean; retry: boolean }
type TaskPromise = Promise<TaskResult>

const MAX_CONCURRENCY = 20
const TASK_TIMEOUT = 10 * 60 * 1_000
const UNIT_TEST_TIMEOUT = 60 * 1_000

const testCommands: Record<ExerciseLanguage, { commands: string[]; timeout?: number; cwd?: string }> = {
go: { commands: ["go test"] }, // timeout 15s bash -c "cd '$dir' && go test > /dev/null 2>&1"
java: { commands: ["./gradlew test"] }, // timeout --foreground 15s bash -c "cd '$dir' && ./gradlew test > /dev/null 2>&1"
javascript: { commands: ["pnpm install", "pnpm test"], timeout: 30_000 }, // timeout 30s bash -c "cd '$dir' && pnpm install >/dev/null 2>&1 && pnpm test >/dev/null 2>&1"
javascript: { commands: ["pnpm install", "pnpm test"] }, // timeout 15s bash -c "cd '$dir' && pnpm install >/dev/null 2>&1 && pnpm test >/dev/null 2>&1"
python: { commands: ["uv run python3 -m pytest -o markers=task *_test.py"] }, // timeout 15s bash -c "cd '$dir' && uv run python3 -m pytest -o markers=task *_test.py"
rust: { commands: ["cargo test"] }, // timeout 15s bash -c "cd '$dir' && cargo test > /dev/null 2>&1"
}
Expand Down Expand Up @@ -107,130 +111,104 @@ const run = async (toolbox: GluegunToolbox) => {
const server = new IpcServer(run.socketPath, () => {})
server.listen()

// server.on(IpcMessageType.Connect, (clientId) => {
// server.send(clientId, {
// type: IpcMessageType.TaskEvent,
// origin: IpcOrigin.Server,
// data: { eventName: RooCodeEventName.Connect, taskId: -1 },
// })
// })

const runningPromises: Promise<void>[] = []
const runningPromises: TaskPromise[] = []

// Retries aren't implemented yet, but the return values are set up to
// support them.
const processTask = async (task: Task) => {
if (task.finishedAt === null) {
await runExercise({ run, task, server })
const { retry } = await runExercise({ run, task, server })

if (retry) {
return { success: false, retry: true }
}
}

if (task.passed === null) {
const passed = await runUnitTest({ task })
await updateTask(task.id, { passed })
return { success: passed, retry: false }
} else {
return { success: task.passed, retry: false }
}
}

for (const task of tasks) {
const taskPromise = processTask(task)
runningPromises.push(taskPromise)
const processTaskResult = async (task: Task, promise: TaskPromise) => {
const index = runningPromises.indexOf(promise)

taskPromise.finally(() => {
const index = runningPromises.indexOf(taskPromise)
if (index > -1) {
runningPromises.splice(index, 1)
}
}

if (index > -1) {
runningPromises.splice(index, 1)
}
})
for (const task of tasks) {
const promise = processTask(task)
runningPromises.push(promise)
promise.then(() => processTaskResult(task, promise))

if (runningPromises.length >= maxConcurrency) {
if (runningPromises.length > MAX_CONCURRENCY) {
await Promise.race(runningPromises)
}
}

await Promise.all(runningPromises)

const result = await finishRun(run.id)
try {
console.log("[cli#run]", result)
// eslint-disable-next-line @typescript-eslint/no-unused-vars
} catch (error) {
// console.error(error)
}
console.log("[cli#run]", result)

console.log(await execa({ cwd: exercisesPath })`git add .`)
console.log(await execa({ cwd: exercisesPath })`git commit -m ${`Run #${run.id}`} --no-verify`)
}

const runExercise = async ({ run, task, server }: { run: Run; task: Task; server: IpcServer }) => {
const runExercise = async ({ run, task, server }: { run: Run; task: Task; server: IpcServer }): TaskPromise => {
const { language, exercise } = task
const prompt = fs.readFileSync(path.resolve(exercisesPath, `prompts/${language}.md`), "utf-8")
const dirname = path.dirname(run.socketPath)
const workspacePath = path.resolve(exercisesPath, language, exercise)
const taskSocketPath = path.resolve(dirname, `${dirname}/task-${task.id}.sock`)

const controller = new AbortController()
const cancelSignal = controller.signal

// If debugging:
// Use --wait --log trace or --verbose.
const codeCommand = `code --disable-workspace-trust`
// Don't await execa and store result as subprocess.
// subprocess.stdout.pipe(process.stdout)

// Sleep for a random amount of time before opening a new VSCode window.
await new Promise((resolve) => setTimeout(resolve, 1_000 + Math.random() * MAX_CONCURRENCY * 1_000))
console.log(`Opening new VS Code window at ${workspacePath}`)

await execa({
env: {
ROO_CODE_IPC_SOCKET_PATH: taskSocketPath,
},
shell: "/bin/bash",
cancelSignal,
})`${codeCommand} -n ${path.resolve(exercisesPath, language, exercise)}`
})`code --disable-workspace-trust -n ${workspacePath}`

// If debugging:
// Don't await execa and store result as subprocess.
// subprocess.stdout.pipe(process.stdout)

// Give VSCode some time to spawn before connectint to its unix socket.
await new Promise((resolve) => setTimeout(resolve, 1_000))
// Give VSCode some time to spawn before connecting to its unix socket.
await new Promise((resolve) => setTimeout(resolve, 1_000 + Math.random() * 4_000))
console.log(`Connecting to ${taskSocketPath}`)
const client = new IpcClient(taskSocketPath)

const createClient = (taskSocketPath: string) => {
const ipcClient = new IpcClient(taskSocketPath)

ipcClient.on(IpcMessageType.Ack, (ack) => {
console.log(`[cli#runExercise | ${language} / ${exercise}] ack`, ack)
})

return ipcClient
}

let tries = 0
let client = createClient(taskSocketPath)

while (++tries < 5) {
try {
await pWaitFor(() => client.isReady, { interval: 100, timeout: 5_000 })
break
} catch (error) {
console.error(error)
client.disconnect()
client = createClient(taskSocketPath)
}
try {
await pWaitFor(() => client.isReady, { interval: 250, timeout: 5_000 })
// eslint-disable-next-line @typescript-eslint/no-unused-vars
} catch (error) {
console.log(`[cli#runExercise | ${language} / ${exercise}] unable to connect`)
client.disconnect()
return { success: false, retry: false }
}

let isTaskFinished = false
let taskStartedAt = Date.now()
let taskFinishedAt: number | undefined
let taskMetricsId: number | undefined
let rooTaskId: string | undefined
let isClientDisconnected = false

client.on(IpcMessageType.Disconnect, async () => {
console.log(`[cli#runExercise | ${language} / ${exercise}] disconnect`)
isTaskFinished = true
isClientDisconnected = true
})

const ignoreEvents: RooCodeEventName[] = [
// RooCodeEventName.Message,
RooCodeEventName.Message,
RooCodeEventName.TaskTokenUsageUpdated,
RooCodeEventName.TaskAskResponded,
]

let taskStartedAt = Date.now()
let taskMetricsId: number | undefined
let rooTaskId: string | undefined

client.on(IpcMessageType.TaskEvent, async (taskEvent) => {
const { eventName, payload } = taskEvent

Expand Down Expand Up @@ -287,44 +265,43 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
}

if (eventName === RooCodeEventName.TaskCompleted || eventName === RooCodeEventName.TaskAborted) {
taskFinishedAt = Date.now()
await updateTask(task.id, { finishedAt: new Date() })
isTaskFinished = true
}
})

if (client.isReady) {
client.sendMessage({
type: IpcMessageType.TaskCommand,
origin: IpcOrigin.Client,
clientId: client.clientId!,
client.on(IpcMessageType.Disconnect, async () => {
console.log(`[cli#runExercise | ${language} / ${exercise}] disconnect`)
isClientDisconnected = true
})

console.log(`[cli#runExercise | ${language} / ${exercise}] starting task`)

client.sendMessage({
type: IpcMessageType.TaskCommand,
origin: IpcOrigin.Client,
clientId: client.clientId!,
data: {
commandName: TaskCommandName.StartNewTask,
data: {
commandName: TaskCommandName.StartNewTask,
data: {
configuration: {
...rooCodeDefaults,
openRouterApiKey: process.env.OPENROUTER_API_KEY!,
...run.settings,
},
text: prompt,
newTab: true,
configuration: {
...rooCodeDefaults,
openRouterApiKey: process.env.OPENROUTER_API_KEY!,
...run.settings,
},
text: prompt,
newTab: true,
},
})

console.log(`[cli#runExercise | ${language} / ${exercise}] starting task`)
} else {
console.log(`[cli#runExercise | ${language} / ${exercise}] unable to connect`)
client.disconnect()
isTaskFinished = true
isClientDisconnected = true
}
},
})

try {
await pWaitFor(() => isTaskFinished, { interval: 1_000, timeout: taskTimeLimit })
await pWaitFor(() => !!taskFinishedAt || isClientDisconnected, { interval: 1_000, timeout: TASK_TIMEOUT })
// eslint-disable-next-line @typescript-eslint/no-unused-vars
} catch (error) {
console.log(`[cli#runExercise | ${language} / ${exercise}] time limit reached`)

// Cancel the task.
if (rooTaskId && !isClientDisconnected) {
client.sendMessage({
type: IpcMessageType.TaskCommand,
Expand All @@ -333,35 +310,28 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
data: { commandName: TaskCommandName.CancelTask, data: rooTaskId },
})

await new Promise((resolve) => setTimeout(resolve, 2_000))
// Give the server some time to cancel the task.
await new Promise((resolve) => setTimeout(resolve, 5_000))
}

// TODO: Notify clients that the task timed out.
await updateTask(task.id, { finishedAt: new Date() })
}

if (!isClientDisconnected) {
try {
if (rooTaskId) {
client.sendMessage({
type: IpcMessageType.TaskCommand,
origin: IpcOrigin.Client,
clientId: client.clientId!,
data: { commandName: TaskCommandName.CloseTask, data: rooTaskId },
})
}

client.disconnect()
} catch (error) {
console.error(error)
if (rooTaskId) {
client.sendMessage({
type: IpcMessageType.TaskCommand,
origin: IpcOrigin.Client,
clientId: client.clientId!,
data: { commandName: TaskCommandName.CloseTask, data: rooTaskId },
})
}

client.disconnect()
}

// try {
// console.log(`[cli#runExercise | ${language} / ${exercise}] aborting subprocess`)
// controller.abort()
// await subprocess
// } catch (error) {
// }
return { success: !!taskFinishedAt, retry: false }
}

const runUnitTest = async ({ task }: { task: Task }) => {
Expand All @@ -373,22 +343,17 @@ const runUnitTest = async ({ task }: { task: Task }) => {
let passed = true

for (const command of commands) {
// const controller = new AbortController()
// const cancelSignal = controller.signal
// const timeout = setTimeout(() => controller.abort(), cmd.timeout ?? 15_000)
const timeout = cmd.timeout ?? UNIT_TEST_TIMEOUT

try {
const result = await execa({ cwd, shell: true, reject: false /* , cancelSignal */ })`${command}`
// console.log('[cli#run] execa result =', { ...result, cwd, command })

// clearTimeout(timeout)
const result = await execa({ cwd, shell: true, reject: false, timeout })`${command}`

if (result.failed) {
passed = false
break
}
} catch (error) {
console.log("[cli#run] execa error =", error)
console.log("[cli#runUnitTest]", error)
passed = false
break
}
Expand Down
Loading