Skip to content

Commit 52673b3

Browse files
authored
Harden evals with retry logic + centralize logs on Docker host (RooCodeInc#4440)
* Harden evals with retry logic + centralize logs on Docker host * Don't delete controller container on exit * Add randomness * More tweaks * More tweaks
1 parent 6a232a9 commit 52673b3

File tree

7 files changed

+205
-45
lines changed

7 files changed

+205
-45
lines changed

apps/web-evals/src/actions/runs.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,10 @@ export async function createRun({ suite, exercises = [], systemPrompt, ...values
5252

5353
const dockerArgs = [
5454
`--name evals-controller-${run.id}`,
55-
"--rm",
55+
// "--rm",
5656
"--network evals_default",
5757
"-v /var/run/docker.sock:/var/run/docker.sock",
58+
"-v /tmp/evals:/var/log/evals",
5859
"-e HOST_EXECUTION_METHOD=docker",
5960
]
6061

packages/evals/docker-compose.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ services:
7272
- HOST_EXECUTION_METHOD=docker
7373
volumes:
7474
- /var/run/docker.sock:/var/run/docker.sock
75+
- /tmp/evals:/var/log/evals
7576
stdin_open: true
7677
tty: true
7778
profiles:
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
import * as fs from "fs"
2+
import * as path from "path"
3+
4+
export enum LogLevel {
5+
INFO = "INFO",
6+
ERROR = "ERROR",
7+
WARN = "WARN",
8+
DEBUG = "DEBUG",
9+
}
10+
11+
export interface LoggerOptions {
12+
logDir: string
13+
filename: string
14+
tag: string
15+
}
16+
17+
export class FileLogger {
18+
private logStream: fs.WriteStream | undefined
19+
private logFilePath: string
20+
private tag: string
21+
22+
constructor({ logDir, filename, tag }: LoggerOptions) {
23+
this.tag = tag
24+
this.logFilePath = path.join(logDir, filename)
25+
this.initializeLogger(logDir)
26+
}
27+
28+
private initializeLogger(logDir: string): void {
29+
try {
30+
fs.mkdirSync(logDir, { recursive: true })
31+
} catch (error) {
32+
console.error(`Failed to create log directory ${logDir}:`, error)
33+
}
34+
35+
try {
36+
this.logStream = fs.createWriteStream(this.logFilePath, { flags: "a" })
37+
} catch (error) {
38+
console.error(`Failed to create log file ${this.logFilePath}:`, error)
39+
}
40+
}
41+
42+
private writeToLog(level: LogLevel, message: string, ...args: unknown[]) {
43+
try {
44+
const timestamp = new Date().toISOString()
45+
46+
const logLine = `[${timestamp} | ${level} | ${this.tag}] ${message} ${
47+
args.length > 0 ? JSON.stringify(args) : ""
48+
}\n`
49+
50+
console.log(logLine.trim())
51+
52+
if (this.logStream) {
53+
this.logStream.write(logLine)
54+
}
55+
} catch (error) {
56+
console.error(`Failed to write to log file ${this.logFilePath}:`, error)
57+
}
58+
}
59+
60+
public info(message: string, ...args: unknown[]): void {
61+
this.writeToLog(LogLevel.INFO, message, ...args)
62+
}
63+
64+
public error(message: string, ...args: unknown[]): void {
65+
this.writeToLog(LogLevel.ERROR, message, ...args)
66+
}
67+
68+
public warn(message: string, ...args: unknown[]): void {
69+
this.writeToLog(LogLevel.WARN, message, ...args)
70+
}
71+
72+
public debug(message: string, ...args: unknown[]): void {
73+
this.writeToLog(LogLevel.DEBUG, message, ...args)
74+
}
75+
76+
public log(message: string, ...args: unknown[]): void {
77+
this.info(message, ...args)
78+
}
79+
80+
public close(): void {
81+
if (this.logStream) {
82+
this.logStream.end()
83+
this.logStream = undefined
84+
}
85+
}
86+
}

packages/evals/src/cli/index.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ const main = async () => {
3232
if (runId !== -1) {
3333
await runEvals(runId)
3434
} else {
35-
await processTask(taskId)
35+
await processTask({ taskId })
3636
}
3737
} catch (error) {
3838
console.error(error)
Lines changed: 71 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,42 @@
1+
import { execa } from "execa"
2+
13
import { RooCodeEventName, type TaskEvent } from "@roo-code/types"
24

35
import { findTask, updateTask, findRun } from "../db/index.js"
46

57
import { getTag } from "./utils.js"
8+
import { FileLogger } from "./FileLogger.js"
69
import { redisClient, getPubSubKey, registerRunner, deregisterRunner } from "./redis.js"
710
import { runTask } from "./runTask.js"
811
import { runUnitTest } from "./runUnitTest.js"
9-
import { execa } from "execa"
1012

11-
export const processTask = async (taskId: number) => {
13+
export const processTask = async ({ taskId, logger }: { taskId: number; logger?: FileLogger }) => {
1214
const task = await findTask(taskId)
15+
const { language, exercise } = task
1316
const run = await findRun(task.runId)
1417
await registerRunner({ runId: run.id, taskId })
1518

16-
try {
17-
const tag = getTag("processTask", { run, task })
19+
logger =
20+
logger ||
21+
new FileLogger({
22+
logDir: `/var/log/evals/runs/${run.id}`,
23+
filename: `${language}-${exercise}.log`,
24+
tag: getTag("runTask", { run, task }),
25+
})
1826

27+
try {
1928
const publish = async (e: TaskEvent) => {
2029
const redis = await redisClient()
2130
await redis.publish(getPubSubKey(run.id), JSON.stringify(e))
2231
}
2332

24-
console.log(`[${Date.now()} | ${tag}] running task ${task.id} (${task.language}/${task.exercise})...`)
25-
await runTask({ run, task, publish })
33+
logger.info(`running task ${task.id} (${language}/${exercise})...`)
34+
await runTask({ run, task, publish, logger })
2635

27-
console.log(`[${Date.now()} | ${tag}] testing task ${task.id} (${task.language}/${task.exercise})...`)
36+
logger.info(`testing task ${task.id} (${language}/${exercise})...`)
2837
const passed = await runUnitTest({ run, task })
2938

30-
console.log(`[${Date.now()} | ${tag}] task ${task.id} (${task.language}/${task.exercise}) -> ${passed}`)
39+
logger.info(`task ${task.id} (${language}/${exercise}) -> ${passed}`)
3140
await updateTask(task.id, { passed })
3241

3342
await publish({
@@ -39,18 +48,65 @@ export const processTask = async (taskId: number) => {
3948
}
4049
}
4150

42-
export const processTaskInContainer = async (taskId: number) => {
43-
const args = [
44-
`--name evals-task-${taskId}`,
51+
export const processTaskInContainer = async ({
52+
taskId,
53+
logger,
54+
maxRetries = 10,
55+
}: {
56+
taskId: number
57+
logger: FileLogger
58+
maxRetries?: number
59+
}) => {
60+
const baseArgs = [
4561
"--rm",
4662
"--network evals_default",
4763
"-v /var/run/docker.sock:/var/run/docker.sock",
64+
"-v /tmp/evals:/var/log/evals",
4865
"-e HOST_EXECUTION_METHOD=docker",
4966
]
5067

5168
const command = `pnpm --filter @roo-code/evals cli --taskId ${taskId}`
52-
const subprocess = execa(`docker run ${args.join(" ")} evals-runner sh -c "${command}"`, { shell: true })
53-
// subprocess.stdout?.on("data", (data) => console.log(data.toString()))
54-
// subprocess.stderr?.on("data", (data) => console.error(data.toString()))
55-
await subprocess
69+
logger.info(command)
70+
71+
for (let attempt = 0; attempt <= maxRetries; attempt++) {
72+
const containerName = `evals-task-${taskId}.${attempt}`
73+
const args = [`--name ${containerName}`, ...baseArgs]
74+
const isRetry = attempt > 0
75+
76+
if (isRetry) {
77+
const delayMs = Math.pow(2, attempt - 1) * 1000 * (0.5 + Math.random())
78+
logger.info(`retrying in ${delayMs}ms (attempt ${attempt + 1}/${maxRetries + 1})`)
79+
await new Promise((resolve) => setTimeout(resolve, delayMs))
80+
}
81+
82+
logger.info(
83+
`${isRetry ? "retrying" : "executing"} container command (attempt ${attempt + 1}/${maxRetries + 1})`,
84+
)
85+
86+
const subprocess = execa(`docker run ${args.join(" ")} evals-runner sh -c "${command}"`, { shell: true })
87+
// subprocess.stdout?.on("data", (data) => console.log(data.toString()))
88+
// subprocess.stderr?.on("data", (data) => console.error(data.toString()))
89+
90+
try {
91+
const result = await subprocess
92+
logger.info(`container process completed with exit code: ${result.exitCode}`)
93+
return
94+
} catch (error) {
95+
if (error && typeof error === "object" && "exitCode" in error) {
96+
logger.error(
97+
`container process failed with exit code: ${error.exitCode} (attempt ${attempt + 1}/${maxRetries + 1})`,
98+
)
99+
} else {
100+
logger.error(`container process failed with error: ${error} (attempt ${attempt + 1}/${maxRetries + 1})`)
101+
}
102+
103+
if (attempt === maxRetries) {
104+
break
105+
}
106+
}
107+
}
108+
109+
logger.error(`all ${maxRetries + 1} attempts failed, giving up`)
110+
111+
// TODO: Mark task as failed.
56112
}

packages/evals/src/cli/runEvals.ts

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import { exercisesPath } from "../exercises/index.js"
66
import { getTag, isDockerContainer, resetEvalsRepo, commitEvalsRepoChanges } from "./utils.js"
77
import { processTask, processTaskInContainer } from "./processTask.js"
88
import { startHeartbeat, stopHeartbeat } from "./redis.js"
9+
import { FileLogger } from "./FileLogger.js"
910

1011
export const runEvals = async (runId: number) => {
1112
const run = await findRun(runId)
@@ -20,8 +21,13 @@ export const runEvals = async (runId: number) => {
2021
throw new Error(`Run ${run.id} has no tasks.`)
2122
}
2223

23-
const tag = getTag("runEvals", { run })
24-
console.log(`[${Date.now()} | ${tag}] running ${tasks.length} task(s)`)
24+
const logger = new FileLogger({
25+
logDir: `/var/log/evals/runs/${run.id}`,
26+
filename: `controller.log`,
27+
tag: getTag("runEvals", { run }),
28+
})
29+
30+
logger.info(`running ${tasks.length} task(s)`)
2531

2632
const containerized = isDockerContainer()
2733

@@ -36,12 +42,22 @@ export const runEvals = async (runId: number) => {
3642
await queue.addAll(
3743
tasks
3844
.filter((task) => task.finishedAt === null)
39-
.map((task) => () => (containerized ? processTaskInContainer(task.id) : processTask(task.id))),
45+
.map((task) => async () => {
46+
try {
47+
if (containerized) {
48+
await processTaskInContainer({ taskId: task.id, logger })
49+
} else {
50+
await processTask({ taskId: task.id, logger })
51+
}
52+
} catch (error) {
53+
logger.error("error processing task", error)
54+
}
55+
}),
4056
)
4157

42-
console.log(`[${Date.now()} | ${tag}] finishRun`)
58+
logger.info("finishRun")
4359
const result = await finishRun(run.id)
44-
console.log(`[${Date.now()} | ${tag}] result ->`, result)
60+
logger.info("result ->", result)
4561

4662
// There's no need to commit the changes in the container since they
4763
// will lost when the container is destroyed. I think we should
@@ -50,7 +66,8 @@ export const runEvals = async (runId: number) => {
5066
await commitEvalsRepoChanges({ run, cwd: exercisesPath })
5167
}
5268
} finally {
53-
console.log(`[${Date.now()} | ${tag}] cleaning up`)
69+
logger.info("cleaning up")
5470
stopHeartbeat(run.id, heartbeat)
71+
logger.close()
5572
}
5673
}

0 commit comments

Comments
 (0)