Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions packages/evals/src/cli/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import { runEvals } from "./runEvals.js"
import { processTask } from "./processTask.js"

const main = async () => {
const result = await run(
await run(
command({
name: "cli",
description: "Execute an eval run.",
Expand Down Expand Up @@ -43,7 +43,6 @@ const main = async () => {
process.argv.slice(2),
)

console.log(result)
process.exit(0)
}

Expand Down
2 changes: 1 addition & 1 deletion packages/evals/src/cli/processTask.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ export const processTask = async (taskId: number) => {
await runTask({ run, task, publish })

console.log(`[${Date.now()} | ${tag}] testing task ${task.id} (${task.language}/${task.exercise})...`)
const passed = await runUnitTest({ task })
const passed = await runUnitTest({ run, task })

console.log(`[${Date.now()} | ${tag}] task ${task.id} (${task.language}/${task.exercise}) -> ${passed}`)
await updateTask(task.id, { passed })
Expand Down
26 changes: 13 additions & 13 deletions packages/evals/src/cli/runEvals.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
import { execa } from "execa"
import PQueue from "p-queue"

import { findRun, finishRun, getTasks } from "../db/index.js"
import { exercisesPath } from "../exercises/index.js"

import { getTag, isDockerContainer } from "./utils.js"
import { getTag, isDockerContainer, resetEvalsRepo, commitEvalsRepoChanges } from "./utils.js"
import { processTask, processTaskInContainer } from "./processTask.js"
import { startHeartbeat, stopHeartbeat } from "./redis.js"

Expand All @@ -24,31 +23,32 @@ export const runEvals = async (runId: number) => {
const tag = getTag("runEvals", { run })
console.log(`[${Date.now()} | ${tag}] running ${tasks.length} task(s)`)

const cwd = exercisesPath
await execa({ cwd })`git config user.name "Roo Code"`
await execa({ cwd })`git config user.email "[email protected]"`
await execa({ cwd })`git checkout -f`
await execa({ cwd })`git clean -fd`
await execa({ cwd })`git checkout -b runs/${run.id}-${crypto.randomUUID().slice(0, 8)} main`
const containerized = isDockerContainer()

if (!containerized) {
await resetEvalsRepo({ run, cwd: exercisesPath })
}

const heartbeat = await startHeartbeat(run.id)
const queue = new PQueue({ concurrency: run.concurrency })

try {
const containerize = isDockerContainer()

await queue.addAll(
tasks
.filter((task) => task.finishedAt === null)
.map((task) => () => (containerize ? processTaskInContainer(task.id) : processTask(task.id))),
.map((task) => () => (containerized ? processTaskInContainer(task.id) : processTask(task.id))),
)

console.log(`[${Date.now()} | ${tag}] finishRun`)
const result = await finishRun(run.id)
console.log(`[${Date.now()} | ${tag}] result ->`, result)

await execa({ cwd: exercisesPath })`git add .`
await execa({ cwd: exercisesPath })`git commit -m ${`Run #${run.id}`} --no-verify`
// There's no need to commit the changes in the container since they
// will lost when the container is destroyed. I think we should
// store the diffs in the database instead.
if (!containerized) {
await commitEvalsRepoChanges({ run, cwd: exercisesPath })
}
} finally {
console.log(`[${Date.now()} | ${tag}] cleaning up`)
stopHeartbeat(run.id, heartbeat)
Expand Down
154 changes: 56 additions & 98 deletions packages/evals/src/cli/runTask.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,10 @@ import pWaitFor from "p-wait-for"
import { execa } from "execa"

import {
type TaskEvent,
TaskCommandName,
RooCodeEventName,
IpcOrigin,
IpcMessageType,
TaskCommandName,
type TaskEvent,
EVALS_SETTINGS,
EVALS_TIMEOUT,
} from "@roo-code/types"
Expand All @@ -27,49 +26,38 @@ type RunTaskOptions = {
publish: (taskEvent: TaskEvent) => Promise<void>
}

export const runTask = async ({ run, task, publish }: RunTaskOptions): Promise<{ success: boolean }> => {
const { language, exercise } = task
export const runTask = async ({ run, task, publish }: RunTaskOptions) => {
const tag = getTag("runTask", { run, task })
const log = (message: string, ...args: unknown[]) => console.log(`[${Date.now()} | ${tag}] ${message}`, ...args)
const logError = (message: string, ...args: unknown[]) =>
console.error(`[${Date.now()} | ${tag}] ${message}`, ...args)

const { language, exercise } = task
const prompt = fs.readFileSync(path.resolve(exercisesPath, `prompts/${language}.md`), "utf-8")
const workspacePath = path.resolve(exercisesPath, language, exercise)
const taskSocketPath = path.resolve(os.tmpdir(), `evals-${run.id}-${task.id}.sock`)

// Inject foot gun system prompt if present.
if (process.env.FOOTGUN_SYSTEM_PROMPT) {
const rooDir = path.join(workspacePath, ".roo")

if (!fs.existsSync(rooDir)) {
fs.mkdirSync(rooDir, { recursive: true })
}

fs.writeFileSync(path.join(rooDir, "system-prompt-code"), process.env.FOOTGUN_SYSTEM_PROMPT)
}

console.log(`[${Date.now()} | ${tag}] Opening new VS Code window at ${workspacePath}`)

const ipcSocketPath = path.resolve(os.tmpdir(), `evals-${run.id}-${task.id}.sock`)
const env = { ROO_CODE_IPC_SOCKET_PATH: ipcSocketPath }
const controller = new AbortController()
const cancelSignal = controller.signal
const containerized = isDockerContainer()

const codeCommand = isDockerContainer()
? `xvfb-run --auto-servernum --server-num=1 code --wait --log trace --disable-workspace-trust --disable-gpu --disable-lcd-text --no-sandbox --user-data-dir /roo/.vscode --password-store="basic"`
: `code --disable-workspace-trust`
const codeCommand = containerized
? `xvfb-run --auto-servernum --server-num=1 code --wait --log trace --disable-workspace-trust --disable-gpu --disable-lcd-text --no-sandbox --user-data-dir /roo/.vscode --password-store="basic" -n ${workspacePath}`
: `code --disable-workspace-trust -n ${workspacePath}`

console.log(`[${Date.now()} | ${tag}] ${codeCommand}`)
log(codeCommand)

// Sleep for a random amount of time between 5 and 10 seconds.
await new Promise((resolve) => setTimeout(resolve, Math.random() * 5_000 + 5_000))
// Sleep for a random amount of time between 5 and 10 seconds, unless we're
// running in a container, in which case there are no issues with flooding
// VSCode with new windows.
if (!containerized) {
await new Promise((resolve) => setTimeout(resolve, Math.random() * 5_000 + 5_000))
}

const subprocess = execa({
env: {
ROO_CODE_IPC_SOCKET_PATH: taskSocketPath,
},
shell: "/bin/bash",
cancelSignal,
})`${codeCommand} -n ${workspacePath}`
const subprocess = execa({ env, shell: "/bin/bash", cancelSignal })`${codeCommand}`

// If debugging:
subprocess.stdout.pipe(process.stdout)
// If debugging, add `--verbose` to `command` and uncomment the following line.
// subprocess.stdout.pipe(process.stdout)

// Give VSCode some time to spawn before connecting to its unix socket.
await new Promise((resolve) => setTimeout(resolve, 3_000))
Expand All @@ -78,26 +66,20 @@ export const runTask = async ({ run, task, publish }: RunTaskOptions): Promise<{

while (true) {
try {
console.log(`[${Date.now()} | ${tag}] connecting to ${taskSocketPath}`)
client = new IpcClient(taskSocketPath)
client = new IpcClient(ipcSocketPath)
await pWaitFor(() => client!.isReady, { interval: 250, timeout: 1_000 })
break
} catch (_error) {
if (client) {
client.disconnect()
}

client?.disconnect()
attempts--

if (attempts <= 0) {
console.error(`[${Date.now()} | ${tag}] unable to connect`)
return { success: false }
logError(`unable to connect to IPC socket -> ${ipcSocketPath}`)
return
}
}
}

console.log(`[${Date.now()} | ${tag}] connected to ${taskSocketPath}`)

let taskStartedAt = Date.now()
let taskFinishedAt: number | undefined
let taskMetricsId: number | undefined
Expand All @@ -106,18 +88,24 @@ export const runTask = async ({ run, task, publish }: RunTaskOptions): Promise<{

const ignoreEvents: Record<"broadcast" | "log", RooCodeEventName[]> = {
broadcast: [RooCodeEventName.Message],
log: [RooCodeEventName.TaskTokenUsageUpdated], // [RooCodeEventName.Message, RooCodeEventName.TaskAskResponded],
log: [RooCodeEventName.TaskTokenUsageUpdated, RooCodeEventName.TaskAskResponded],
}

client.on(IpcMessageType.TaskEvent, async (taskEvent) => {
const { eventName, payload } = taskEvent

// Publish all events except for these to Redis.
if (!ignoreEvents.broadcast.includes(eventName)) {
await publish({ ...taskEvent, taskId: task.id })
}

if (!ignoreEvents.log.includes(eventName)) {
console.log(`[${Date.now()} | ${tag}] ${eventName} ->`, payload)
// Log all events except for these.
// For message events we only log non-partial messages.
if (
!ignoreEvents.log.includes(eventName) &&
(eventName !== RooCodeEventName.Message || payload[0].message.partial !== true)
) {
log(`${eventName} ->`, payload)
}

if (eventName === RooCodeEventName.TaskStarted) {
Expand Down Expand Up @@ -177,77 +165,47 @@ export const runTask = async ({ run, task, publish }: RunTaskOptions): Promise<{
})

client.on(IpcMessageType.Disconnect, async () => {
console.log(`[${Date.now()} | ${tag}] disconnect`)
log(`disconnected from IPC socket -> ${ipcSocketPath}`)
isClientDisconnected = true
})

if (client.isReady) {
const configuration = {
...EVALS_SETTINGS,
...run.settings,
openRouterApiKey: process.env.OPENROUTER_API_KEY,
}

client.sendMessage({
type: IpcMessageType.TaskCommand,
origin: IpcOrigin.Client,
clientId: client.clientId!,
data: {
commandName: TaskCommandName.StartNewTask,
data: {
configuration,
text: prompt,
newTab: true,
},
client.sendCommand({
commandName: TaskCommandName.StartNewTask,
data: {
configuration: {
...EVALS_SETTINGS,
...run.settings,
openRouterApiKey: process.env.OPENROUTER_API_KEY,
},
})
} else {
console.error(`[${Date.now()} | ${tag}] unable to connect`)
client.disconnect()
taskFinishedAt = Date.now()
isClientDisconnected = true
}
text: prompt,
newTab: true,
},
})

try {
await pWaitFor(() => !!taskFinishedAt || isClientDisconnected, { interval: 1_000, timeout: EVALS_TIMEOUT })
// eslint-disable-next-line @typescript-eslint/no-unused-vars
} catch (error) {
console.log(`[${Date.now()} | ${tag}] time limit reached`)
} catch (_error) {
logError("time limit reached")

// Cancel the task.
if (rooTaskId && !isClientDisconnected) {
client.sendMessage({
type: IpcMessageType.TaskCommand,
origin: IpcOrigin.Client,
clientId: client.clientId!,
data: { commandName: TaskCommandName.CancelTask, data: rooTaskId },
})

// Allow some time for the task to cancel.
await new Promise((resolve) => setTimeout(resolve, 5_000))
log("cancelling task")
client.sendCommand({ commandName: TaskCommandName.CancelTask, data: rooTaskId })
await new Promise((resolve) => setTimeout(resolve, 5_000)) // Allow some time for the task to cancel.
}

await updateTask(task.id, { finishedAt: new Date() })
}

if (!isClientDisconnected) {
if (rooTaskId) {
client.sendMessage({
type: IpcMessageType.TaskCommand,
origin: IpcOrigin.Client,
clientId: client.clientId!,
data: { commandName: TaskCommandName.CloseTask, data: rooTaskId },
})

// Allow some time for the window to close.
await new Promise((resolve) => setTimeout(resolve, 2_000))
log("closing task")
client.sendCommand({ commandName: TaskCommandName.CloseTask, data: rooTaskId })
await new Promise((resolve) => setTimeout(resolve, 2_000)) // Allow some time for the window to close.
}

client.disconnect()
}

controller.abort()
await subprocess

return { success: !!taskFinishedAt }
}
Loading
Loading