Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion apps/web-evals/src/actions/runs.ts
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,10 @@ export async function createRun({ suite, exercises = [], systemPrompt, ...values

const dockerArgs = [
`--name evals-controller-${run.id}`,
"--rm",
// "--rm",
"--network evals_default",
"-v /var/run/docker.sock:/var/run/docker.sock",
"-v /tmp/evals:/var/log/evals",
"-e HOST_EXECUTION_METHOD=docker",
]

Expand Down
1 change: 1 addition & 0 deletions packages/evals/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ services:
- HOST_EXECUTION_METHOD=docker
volumes:
- /var/run/docker.sock:/var/run/docker.sock
- /tmp/evals:/var/log/evals
stdin_open: true
tty: true
profiles:
Expand Down
86 changes: 86 additions & 0 deletions packages/evals/src/cli/FileLogger.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import * as fs from "fs"
import * as path from "path"

/** Severity labels written into the prefix of each log line. */
export enum LogLevel {
	INFO = "INFO",
	ERROR = "ERROR",
	WARN = "WARN",
	DEBUG = "DEBUG",
}

/** Construction options for {@link FileLogger}. */
export interface LoggerOptions {
	/** Directory that holds the log file; created recursively if missing. */
	logDir: string
	/** Name of the log file, joined onto `logDir`. */
	filename: string
	/** Label included in the prefix of every log line. */
	tag: string
}

/**
 * Logger that echoes every line to the console and appends it to a file.
 *
 * Logging is best-effort: failures to create the directory, open the file or
 * write a line are reported via `console.error` and never thrown to the caller.
 */
export class FileLogger {
	private logStream: fs.WriteStream | undefined
	private readonly logFilePath: string
	private readonly tag: string

	constructor({ logDir, filename, tag }: LoggerOptions) {
		this.tag = tag
		this.logFilePath = path.join(logDir, filename)
		this.initializeLogger(logDir)
	}

	/**
	 * Creates the log directory and opens the log file in append mode.
	 *
	 * @param logDir - Directory to create before opening the file.
	 */
	private initializeLogger(logDir: string): void {
		try {
			fs.mkdirSync(logDir, { recursive: true })
		} catch (error) {
			console.error(`Failed to create log directory ${logDir}:`, error)
		}

		try {
			const stream = fs.createWriteStream(this.logFilePath, { flags: "a" })

			// Open and write failures are delivered asynchronously through the
			// stream's "error" event rather than thrown from createWriteStream.
			// Without a listener such an event terminates the process, so handle
			// it here and fall back to console-only logging.
			stream.on("error", (error) => {
				console.error(`Failed to write to log file ${this.logFilePath}:`, error)

				if (this.logStream === stream) {
					this.logStream = undefined
				}
			})

			this.logStream = stream
		} catch (error) {
			// Only synchronous failures (e.g. invalid arguments) land here.
			console.error(`Failed to create log file ${this.logFilePath}:`, error)
		}
	}

	/**
	 * Serializes the extra log arguments as a JSON array.
	 *
	 * Plain `JSON.stringify` renders `Error` instances as `{}` and throws on
	 * circular references and BigInt values; the replacer below handles all
	 * three so the log line is never lost.
	 *
	 * @param args - Extra values passed to a log method.
	 * @returns The JSON text, or an empty string when `args` is empty.
	 */
	private static serializeArgs(args: unknown[]): string {
		if (args.length === 0) {
			return ""
		}

		const seen = new WeakSet<object>()

		const replacer = (_key: string, value: unknown): unknown => {
			if (value instanceof Error) {
				return { name: value.name, message: value.message, stack: value.stack }
			}

			if (typeof value === "bigint") {
				return value.toString()
			}

			if (typeof value === "object" && value !== null) {
				// Objects already visited (cycles or repeated references) are
				// replaced by a marker instead of being serialized again.
				if (seen.has(value)) {
					return "[Circular]"
				}

				seen.add(value)
			}

			return value
		}

		try {
			return JSON.stringify(args, replacer)
		} catch {
			// Last resort (e.g. a throwing toJSON): keep the line, lose structure.
			return String(args)
		}
	}

	/**
	 * Formats one log line, prints it to the console and appends it to the file
	 * when the stream is available.
	 *
	 * @param level - Severity label for the line.
	 * @param message - Human-readable message.
	 * @param args - Optional extra values, appended as a JSON array.
	 */
	private writeToLog(level: LogLevel, message: string, ...args: unknown[]) {
		try {
			const timestamp = new Date().toISOString()
			const serializedArgs = FileLogger.serializeArgs(args)
			// Only add the separator when there is something to append, so the
			// file does not accumulate trailing spaces.
			const suffix = serializedArgs.length > 0 ? ` ${serializedArgs}` : ""
			const logLine = `[${timestamp} | ${level} | ${this.tag}] ${message}${suffix}\n`

			console.log(logLine.trim())

			if (this.logStream) {
				this.logStream.write(logLine)
			}
		} catch (error) {
			console.error(`Failed to write to log file ${this.logFilePath}:`, error)
		}
	}

	/** Logs `message` (plus optional `args`) at INFO level. */
	public info(message: string, ...args: unknown[]): void {
		this.writeToLog(LogLevel.INFO, message, ...args)
	}

	/** Logs `message` (plus optional `args`) at ERROR level. */
	public error(message: string, ...args: unknown[]): void {
		this.writeToLog(LogLevel.ERROR, message, ...args)
	}

	/** Logs `message` (plus optional `args`) at WARN level. */
	public warn(message: string, ...args: unknown[]): void {
		this.writeToLog(LogLevel.WARN, message, ...args)
	}

	/** Logs `message` (plus optional `args`) at DEBUG level. */
	public debug(message: string, ...args: unknown[]): void {
		this.writeToLog(LogLevel.DEBUG, message, ...args)
	}

	/** Alias for {@link FileLogger.info}. */
	public log(message: string, ...args: unknown[]): void {
		this.info(message, ...args)
	}

	/**
	 * Ends the file stream, if one is open. Later log calls still print to the
	 * console but no longer write to the file.
	 */
	public close(): void {
		if (this.logStream) {
			this.logStream.end()
			this.logStream = undefined
		}
	}
}
2 changes: 1 addition & 1 deletion packages/evals/src/cli/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ const main = async () => {
if (runId !== -1) {
await runEvals(runId)
} else {
await processTask(taskId)
await processTask({ taskId })
}
} catch (error) {
console.error(error)
Expand Down
86 changes: 71 additions & 15 deletions packages/evals/src/cli/processTask.ts
Original file line number Diff line number Diff line change
@@ -1,33 +1,42 @@
import { execa } from "execa"

import { RooCodeEventName, type TaskEvent } from "@roo-code/types"

import { findTask, updateTask, findRun } from "../db/index.js"

import { getTag } from "./utils.js"
import { FileLogger } from "./FileLogger.js"
import { redisClient, getPubSubKey, registerRunner, deregisterRunner } from "./redis.js"
import { runTask } from "./runTask.js"
import { runUnitTest } from "./runUnitTest.js"
import { execa } from "execa"

export const processTask = async (taskId: number) => {
export const processTask = async ({ taskId, logger }: { taskId: number; logger?: FileLogger }) => {
const task = await findTask(taskId)
const { language, exercise } = task
const run = await findRun(task.runId)
await registerRunner({ runId: run.id, taskId })

try {
const tag = getTag("processTask", { run, task })
logger =
logger ||
new FileLogger({
logDir: `/var/log/evals/runs/${run.id}`,
filename: `${language}-${exercise}.log`,
tag: getTag("runTask", { run, task }),
})

try {
const publish = async (e: TaskEvent) => {
const redis = await redisClient()
await redis.publish(getPubSubKey(run.id), JSON.stringify(e))
}

console.log(`[${Date.now()} | ${tag}] running task ${task.id} (${task.language}/${task.exercise})...`)
await runTask({ run, task, publish })
logger.info(`running task ${task.id} (${language}/${exercise})...`)
await runTask({ run, task, publish, logger })

console.log(`[${Date.now()} | ${tag}] testing task ${task.id} (${task.language}/${task.exercise})...`)
logger.info(`testing task ${task.id} (${language}/${exercise})...`)
const passed = await runUnitTest({ run, task })

console.log(`[${Date.now()} | ${tag}] task ${task.id} (${task.language}/${task.exercise}) -> ${passed}`)
logger.info(`task ${task.id} (${language}/${exercise}) -> ${passed}`)
await updateTask(task.id, { passed })

await publish({
Expand All @@ -39,18 +48,65 @@ export const processTask = async (taskId: number) => {
}
}

export const processTaskInContainer = async (taskId: number) => {
const args = [
`--name evals-task-${taskId}`,
export const processTaskInContainer = async ({
taskId,
logger,
maxRetries = 10,
}: {
taskId: number
logger: FileLogger
maxRetries?: number
}) => {
const baseArgs = [
"--rm",
"--network evals_default",
"-v /var/run/docker.sock:/var/run/docker.sock",
"-v /tmp/evals:/var/log/evals",
"-e HOST_EXECUTION_METHOD=docker",
]

const command = `pnpm --filter @roo-code/evals cli --taskId ${taskId}`
const subprocess = execa(`docker run ${args.join(" ")} evals-runner sh -c "${command}"`, { shell: true })
// subprocess.stdout?.on("data", (data) => console.log(data.toString()))
// subprocess.stderr?.on("data", (data) => console.error(data.toString()))
await subprocess
logger.info(command)

for (let attempt = 0; attempt <= maxRetries; attempt++) {
const containerName = `evals-task-${taskId}.${attempt}`
const args = [`--name ${containerName}`, ...baseArgs]
const isRetry = attempt > 0

if (isRetry) {
const delayMs = Math.pow(2, attempt - 1) * 1000 * (0.5 + Math.random())
logger.info(`retrying in ${delayMs}ms (attempt ${attempt + 1}/${maxRetries + 1})`)
await new Promise((resolve) => setTimeout(resolve, delayMs))
}

logger.info(
`${isRetry ? "retrying" : "executing"} container command (attempt ${attempt + 1}/${maxRetries + 1})`,
)

const subprocess = execa(`docker run ${args.join(" ")} evals-runner sh -c "${command}"`, { shell: true })
// subprocess.stdout?.on("data", (data) => console.log(data.toString()))
// subprocess.stderr?.on("data", (data) => console.error(data.toString()))

try {
const result = await subprocess
logger.info(`container process completed with exit code: ${result.exitCode}`)
return
} catch (error) {
if (error && typeof error === "object" && "exitCode" in error) {
logger.error(
`container process failed with exit code: ${error.exitCode} (attempt ${attempt + 1}/${maxRetries + 1})`,
)
} else {
logger.error(`container process failed with error: ${error} (attempt ${attempt + 1}/${maxRetries + 1})`)
}

if (attempt === maxRetries) {
break
}
}
}

logger.error(`all ${maxRetries + 1} attempts failed, giving up`)

// TODO: Mark task as failed.
}
29 changes: 23 additions & 6 deletions packages/evals/src/cli/runEvals.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import { exercisesPath } from "../exercises/index.js"
import { getTag, isDockerContainer, resetEvalsRepo, commitEvalsRepoChanges } from "./utils.js"
import { processTask, processTaskInContainer } from "./processTask.js"
import { startHeartbeat, stopHeartbeat } from "./redis.js"
import { FileLogger } from "./FileLogger.js"

export const runEvals = async (runId: number) => {
const run = await findRun(runId)
Expand All @@ -20,8 +21,13 @@ export const runEvals = async (runId: number) => {
throw new Error(`Run ${run.id} has no tasks.`)
}

const tag = getTag("runEvals", { run })
console.log(`[${Date.now()} | ${tag}] running ${tasks.length} task(s)`)
const logger = new FileLogger({
logDir: `/var/log/evals/runs/${run.id}`,
filename: `controller.log`,
tag: getTag("runEvals", { run }),
})

logger.info(`running ${tasks.length} task(s)`)

const containerized = isDockerContainer()

Expand All @@ -36,12 +42,22 @@ export const runEvals = async (runId: number) => {
await queue.addAll(
tasks
.filter((task) => task.finishedAt === null)
.map((task) => () => (containerized ? processTaskInContainer(task.id) : processTask(task.id))),
.map((task) => async () => {
	// Await the task so the promise returned by this callback settles only
	// when the work is done. Without the await, the callback returns
	// immediately: the queue cannot wait for the task, and asynchronous
	// rejections bypass this try/catch.
	// NOTE(review): assumes `queue.addAll` waits on the promises returned by
	// these callbacks (p-queue semantics) — confirm against the queue in use.
	try {
		if (containerized) {
			await processTaskInContainer({ taskId: task.id, logger })
		} else {
			await processTask({ taskId: task.id, logger })
		}
	} catch (error) {
		logger.error("error processing task", error)
	}
}),
)

console.log(`[${Date.now()} | ${tag}] finishRun`)
logger.info("finishRun")
const result = await finishRun(run.id)
console.log(`[${Date.now()} | ${tag}] result ->`, result)
logger.info("result ->", result)

// There's no need to commit the changes in the container since they
// will be lost when the container is destroyed. I think we should
Expand All @@ -50,7 +66,8 @@ export const runEvals = async (runId: number) => {
await commitEvalsRepoChanges({ run, cwd: exercisesPath })
}
} finally {
console.log(`[${Date.now()} | ${tag}] cleaning up`)
logger.info("cleaning up")
stopHeartbeat(run.id, heartbeat)
logger.close()
}
}
Loading