Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion apps/web-evals/src/actions/runs.ts
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,10 @@ export async function createRun({ suite, exercises = [], systemPrompt, ...values

const dockerArgs = [
`--name evals-controller-${run.id}`,
"--rm",
// "--rm",
"--network evals_default",
"-v /var/run/docker.sock:/var/run/docker.sock",
"-v /tmp/evals:/var/log/evals",
"-e HOST_EXECUTION_METHOD=docker",
]

Expand Down
1 change: 1 addition & 0 deletions packages/evals/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ services:
- HOST_EXECUTION_METHOD=docker
volumes:
- /var/run/docker.sock:/var/run/docker.sock
- /tmp/evals:/var/log/evals
stdin_open: true
tty: true
profiles:
Expand Down
86 changes: 86 additions & 0 deletions packages/evals/src/cli/FileLogger.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import * as fs from "fs"
import * as path from "path"

/** Severity labels embedded in every log line. */
export enum LogLevel {
	INFO = "INFO",
	ERROR = "ERROR",
	WARN = "WARN",
	DEBUG = "DEBUG",
}

/** Construction options for {@link FileLogger}. */
export interface LoggerOptions {
	/** Directory that holds the log file; created recursively when missing. */
	logDir: string
	/** Name of the log file inside `logDir`. */
	filename: string
	/** Label embedded in every log line, next to the timestamp and level. */
	tag: string
}

/**
 * Logger that mirrors every line to the console and appends it to a file.
 *
 * Logging is best-effort: a failure to create the directory, open the file or
 * write a line is reported through `console.error` and never thrown to the
 * caller. When the file is unavailable the logger keeps writing to the console.
 */
export class FileLogger {
	private logStream: fs.WriteStream | undefined
	private logFilePath: string
	private tag: string

	/**
	 * @param options Target directory, file name and tag for this logger.
	 */
	constructor({ logDir, filename, tag }: LoggerOptions) {
		this.tag = tag
		this.logFilePath = path.join(logDir, filename)
		this.initializeLogger(logDir)
	}

	/** Creates the log directory and opens the log file in append mode. */
	private initializeLogger(logDir: string): void {
		try {
			fs.mkdirSync(logDir, { recursive: true })
		} catch (error) {
			console.error(`Failed to create log directory ${logDir}:`, error)
		}

		try {
			const stream = fs.createWriteStream(this.logFilePath, { flags: "a" })

			// Open/write failures are delivered asynchronously as an 'error' event,
			// not as a synchronous throw. Without a listener the event is rethrown
			// as an uncaught exception and takes the whole process down.
			stream.on("error", (error) => {
				console.error(`Failed to write to log file ${this.logFilePath}:`, error)

				// Stop writing to the broken stream; console output keeps working.
				if (this.logStream === stream) {
					this.logStream = undefined
				}
			})

			this.logStream = stream
		} catch (error) {
			// Only synchronous failures (e.g. invalid arguments) end up here.
			console.error(`Failed to create log file ${this.logFilePath}:`, error)
		}
	}

	/**
	 * Renders the extra log arguments as a JSON array.
	 *
	 * `Error` instances are expanded to name/message/stack because their own
	 * fields are not enumerable and would otherwise serialize as `{}`. BigInt
	 * values are written as decimal strings. Anything JSON cannot represent
	 * (e.g. circular structures) yields a placeholder instead of an exception,
	 * so the log message itself is never lost.
	 *
	 * @returns The serialized arguments, or an empty string when there are none.
	 */
	private static serializeArgs(args: unknown[]): string {
		if (args.length === 0) {
			return ""
		}

		try {
			return JSON.stringify(args, (_key: string, value: unknown): unknown => {
				if (value instanceof Error) {
					return { ...value, name: value.name, message: value.message, stack: value.stack }
				}

				return typeof value === "bigint" ? value.toString() : value
			})
		} catch (error) {
			const reason = error instanceof Error ? error.message : String(error)
			// Keep the placeholder on one line; some engine messages span several.
			return `[unserializable args: ${reason.split("\n")[0]}]`
		}
	}

	/**
	 * Formats one line as `[timestamp | level | tag] message args`, prints it
	 * to the console and appends it to the log file when the stream is open.
	 */
	private writeToLog(level: LogLevel, message: string, ...args: unknown[]) {
		try {
			const timestamp = new Date().toISOString()
			const logLine = `[${timestamp} | ${level} | ${this.tag}] ${message} ${FileLogger.serializeArgs(args)}\n`

			console.log(logLine.trim())

			if (this.logStream) {
				this.logStream.write(logLine)
			}
		} catch (error) {
			console.error(`Failed to write to log file ${this.logFilePath}:`, error)
		}
	}

	/** Logs `message` (plus optional serialized `args`) at INFO level. */
	public info(message: string, ...args: unknown[]): void {
		this.writeToLog(LogLevel.INFO, message, ...args)
	}

	/** Logs `message` (plus optional serialized `args`) at ERROR level. */
	public error(message: string, ...args: unknown[]): void {
		this.writeToLog(LogLevel.ERROR, message, ...args)
	}

	/** Logs `message` (plus optional serialized `args`) at WARN level. */
	public warn(message: string, ...args: unknown[]): void {
		this.writeToLog(LogLevel.WARN, message, ...args)
	}

	/** Logs `message` (plus optional serialized `args`) at DEBUG level. */
	public debug(message: string, ...args: unknown[]): void {
		this.writeToLog(LogLevel.DEBUG, message, ...args)
	}

	/** Alias for {@link FileLogger.info}. */
	public log(message: string, ...args: unknown[]): void {
		this.info(message, ...args)
	}

	/**
	 * Ends the file stream. Later log calls still print to the console but are
	 * no longer written to the file. Calling it more than once is harmless.
	 */
	public close(): void {
		if (this.logStream) {
			this.logStream.end()
			this.logStream = undefined
		}
	}
}
2 changes: 1 addition & 1 deletion packages/evals/src/cli/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ const main = async () => {
if (runId !== -1) {
await runEvals(runId)
} else {
await processTask(taskId)
await processTask({ taskId })
}
} catch (error) {
console.error(error)
Expand Down
86 changes: 71 additions & 15 deletions packages/evals/src/cli/processTask.ts
Original file line number Diff line number Diff line change
@@ -1,33 +1,42 @@
import { execa } from "execa"

import { RooCodeEventName, type TaskEvent } from "@roo-code/types"

import { findTask, updateTask, findRun } from "../db/index.js"

import { getTag } from "./utils.js"
import { FileLogger } from "./FileLogger.js"
import { redisClient, getPubSubKey, registerRunner, deregisterRunner } from "./redis.js"
import { runTask } from "./runTask.js"
import { runUnitTest } from "./runUnitTest.js"
import { execa } from "execa"

export const processTask = async (taskId: number) => {
export const processTask = async ({ taskId, logger }: { taskId: number; logger?: FileLogger }) => {
const task = await findTask(taskId)
const { language, exercise } = task
const run = await findRun(task.runId)
await registerRunner({ runId: run.id, taskId })

try {
const tag = getTag("processTask", { run, task })
logger =
logger ||
new FileLogger({
logDir: `/var/log/evals/runs/${run.id}`,
filename: `${language}-${exercise}.log`,
tag: getTag("runTask", { run, task }),
})

try {
const publish = async (e: TaskEvent) => {
const redis = await redisClient()
await redis.publish(getPubSubKey(run.id), JSON.stringify(e))
}

console.log(`[${Date.now()} | ${tag}] running task ${task.id} (${task.language}/${task.exercise})...`)
await runTask({ run, task, publish })
logger.info(`running task ${task.id} (${language}/${exercise})...`)
await runTask({ run, task, publish, logger })

console.log(`[${Date.now()} | ${tag}] testing task ${task.id} (${task.language}/${task.exercise})...`)
logger.info(`testing task ${task.id} (${language}/${exercise})...`)
const passed = await runUnitTest({ run, task })

console.log(`[${Date.now()} | ${tag}] task ${task.id} (${task.language}/${task.exercise}) -> ${passed}`)
logger.info(`task ${task.id} (${language}/${exercise}) -> ${passed}`)
await updateTask(task.id, { passed })

await publish({
Expand All @@ -39,18 +48,65 @@ export const processTask = async (taskId: number) => {
}
}

export const processTaskInContainer = async (taskId: number) => {
const args = [
`--name evals-task-${taskId}`,
export const processTaskInContainer = async ({
taskId,
logger,
maxRetries = 10,
}: {
taskId: number
logger: FileLogger
maxRetries?: number
}) => {
const baseArgs = [
"--rm",
"--network evals_default",
"-v /var/run/docker.sock:/var/run/docker.sock",
"-v /tmp/evals:/var/log/evals",
"-e HOST_EXECUTION_METHOD=docker",
]

const command = `pnpm --filter @roo-code/evals cli --taskId ${taskId}`
const subprocess = execa(`docker run ${args.join(" ")} evals-runner sh -c "${command}"`, { shell: true })
// subprocess.stdout?.on("data", (data) => console.log(data.toString()))
// subprocess.stderr?.on("data", (data) => console.error(data.toString()))
await subprocess
logger.info(command)

for (let attempt = 0; attempt <= maxRetries; attempt++) {
const containerName = `evals-task-${taskId}.${attempt}`
const args = [`--name ${containerName}`, ...baseArgs]
const isRetry = attempt > 0

if (isRetry) {
const delayMs = Math.pow(2, attempt - 1) * 1000 * (0.5 + Math.random())
logger.info(`retrying in ${delayMs}ms (attempt ${attempt + 1}/${maxRetries + 1})`)
await new Promise((resolve) => setTimeout(resolve, delayMs))
}

logger.info(
`${isRetry ? "retrying" : "executing"} container command (attempt ${attempt + 1}/${maxRetries + 1})`,
)

const subprocess = execa(`docker run ${args.join(" ")} evals-runner sh -c "${command}"`, { shell: true })
// subprocess.stdout?.on("data", (data) => console.log(data.toString()))
// subprocess.stderr?.on("data", (data) => console.error(data.toString()))

try {
const result = await subprocess
logger.info(`container process completed with exit code: ${result.exitCode}`)
return
} catch (error) {
if (error && typeof error === "object" && "exitCode" in error) {
logger.error(
`container process failed with exit code: ${error.exitCode} (attempt ${attempt + 1}/${maxRetries + 1})`,
)
} else {
logger.error(`container process failed with error: ${error} (attempt ${attempt + 1}/${maxRetries + 1})`)
}

if (attempt === maxRetries) {
break
}
}
}

logger.error(`all ${maxRetries + 1} attempts failed, giving up`)

// TODO: Mark task as failed.
}
29 changes: 23 additions & 6 deletions packages/evals/src/cli/runEvals.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import { exercisesPath } from "../exercises/index.js"
import { getTag, isDockerContainer, resetEvalsRepo, commitEvalsRepoChanges } from "./utils.js"
import { processTask, processTaskInContainer } from "./processTask.js"
import { startHeartbeat, stopHeartbeat } from "./redis.js"
import { FileLogger } from "./FileLogger.js"

export const runEvals = async (runId: number) => {
const run = await findRun(runId)
Expand All @@ -20,8 +21,13 @@ export const runEvals = async (runId: number) => {
throw new Error(`Run ${run.id} has no tasks.`)
}

const tag = getTag("runEvals", { run })
console.log(`[${Date.now()} | ${tag}] running ${tasks.length} task(s)`)
const logger = new FileLogger({
logDir: `/var/log/evals/runs/${run.id}`,
filename: `controller.log`,
tag: getTag("runEvals", { run }),
})

logger.info(`running ${tasks.length} task(s)`)

const containerized = isDockerContainer()

Expand All @@ -36,12 +42,22 @@ export const runEvals = async (runId: number) => {
await queue.addAll(
tasks
.filter((task) => task.finishedAt === null)
.map((task) => () => (containerized ? processTaskInContainer(task.id) : processTask(task.id))),
.map((task) => async () => {
try {
if (containerized) {
await processTaskInContainer({ taskId: task.id, logger })
} else {
await processTask({ taskId: task.id, logger })
}
} catch (error) {
logger.error("error processing task", error)
}
}),
)

console.log(`[${Date.now()} | ${tag}] finishRun`)
logger.info("finishRun")
const result = await finishRun(run.id)
console.log(`[${Date.now()} | ${tag}] result ->`, result)
logger.info("result ->", result)

// There's no need to commit the changes in the container since they
// will be lost when the container is destroyed. I think we should
Expand All @@ -50,7 +66,8 @@ export const runEvals = async (runId: number) => {
await commitEvalsRepoChanges({ run, cwd: exercisesPath })
}
} finally {
console.log(`[${Date.now()} | ${tag}] cleaning up`)
logger.info("cleaning up")
stopHeartbeat(run.id, heartbeat)
logger.close()
}
}
Loading
Loading