diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml new file mode 100644 index 0000000000..9d8f9fb49b --- /dev/null +++ b/.github/workflows/evals.yml @@ -0,0 +1,74 @@ +name: Evals + +on: + pull_request: + types: [labeled] + workflow_dispatch: + +env: + DOCKER_BUILDKIT: 1 + COMPOSE_DOCKER_CLI_BUILD: 1 + +jobs: + evals: + # Run if triggered manually or if PR has 'evals' label. + if: github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'evals') + runs-on: blacksmith-16vcpu-ubuntu-2404 + timeout-minutes: 45 + + defaults: + run: + working-directory: packages/evals + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Create environment + run: | + cat > .env.local << EOF + OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY || 'test-key-for-build' }} + EOF + + cat > .env.development << EOF + NODE_ENV=development + DATABASE_URL=postgresql://postgres:password@db:5432/evals_development + REDIS_URL=redis://redis:6379 + HOST_EXECUTION_METHOD=docker + EOF + + - name: Build image + uses: docker/build-push-action@v5 + with: + context: . + file: packages/evals/Dockerfile.runner + tags: evals-runner:latest + cache-from: type=gha + cache-to: type=gha,mode=max + push: false + load: true + + - name: Tag image + run: docker tag evals-runner:latest evals-runner + + - name: Start containers + run: | + docker compose up -d db redis + timeout 60 bash -c 'until docker compose exec -T db pg_isready -U postgres; do sleep 2; done' + timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done' + docker compose run --rm runner sh -c 'nc -z db 5432 && echo "✓ Runner -> Database connection successful"' + docker compose run --rm runner sh -c 'nc -z redis 6379 && echo "✓ Runner -> Redis connection successful"' + docker compose run --rm runner docker ps + + - name: Run database migrations + run: docker compose run --rm runner pnpm --filter @roo-code/evals db:migrate + + - name: Run evals + run: docker compose run --rm runner pnpm --filter @roo-code/evals cli --ci + + - name: Cleanup + if: always() + run: docker compose down -v --remove-orphans diff --git a/apps/web-evals/src/actions/exercises.ts b/apps/web-evals/src/actions/exercises.ts index 8cffa40ba3..17eb1ff085 100644 --- a/apps/web-evals/src/actions/exercises.ts +++ b/apps/web-evals/src/actions/exercises.ts @@ -1,37 +1,22 @@ "use server" -import * as fs from "fs/promises" import * as path from "path" import { fileURLToPath } from "url" -import { type ExerciseLanguage, exerciseLanguages } from "@roo-code/evals" +import { exerciseLanguages, listDirectories } from "@roo-code/evals" const __dirname = path.dirname(fileURLToPath(import.meta.url)) // /apps/web-evals/src/actions -const EXERCISES_BASE_PATH = path.resolve(__dirname, "../../../../../evals") - -export const listDirectories = async (relativePath: string) => { - try { - const targetPath = path.resolve(__dirname, relativePath) - const entries = await fs.readdir(targetPath, { withFileTypes: true }) - return entries.filter((entry) => entry.isDirectory() && !entry.name.startsWith(".")).map((entry) => entry.name) - } catch (error) { - console.error(`Error listing directories at ${relativePath}:`, error) - return [] - } -} +const EVALS_REPO_PATH = path.resolve(__dirname, "../../../../../evals") export const getExercises = async () => { const result = await Promise.all( exerciseLanguages.map(async (language) => { - const languagePath = path.join(EXERCISES_BASE_PATH, language) - const exercises = await listDirectories(languagePath) + const languagePath = path.join(EVALS_REPO_PATH, language) + const exercises = await listDirectories(__dirname, languagePath) return exercises.map((exercise) => `${language}/${exercise}`) }), ) return result.flat() } - -export const getExercisesForLanguage = async (language: ExerciseLanguage) => - listDirectories(path.join(EXERCISES_BASE_PATH, language)) diff --git a/apps/web-evals/src/actions/runs.ts b/apps/web-evals/src/actions/runs.ts index 80a4659567..90387d3257 100644 --- a/apps/web-evals/src/actions/runs.ts +++ b/apps/web-evals/src/actions/runs.ts @@ -1,7 +1,9 @@ "use server" -import { spawn } from "child_process" +import * as path from "path" import fs from "fs" +import { fileURLToPath } from "url" +import { spawn } from "child_process" import { revalidatePath } from "next/cache" import pMap from "p-map" @@ -12,11 +14,12 @@ import { createRun as _createRun, deleteRun as _deleteRun, createTask, + getExercisesForLanguage, } from "@roo-code/evals" import { CreateRun } from "@/lib/schemas" -import { getExercisesForLanguage } from "./exercises" +const EVALS_REPO_PATH = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "../../../../../evals") // eslint-disable-next-line @typescript-eslint/no-unused-vars export async function createRun({ suite, exercises = [], systemPrompt, ...values }: CreateRun) { @@ -37,9 +40,9 @@ export async function createRun({ suite, exercises = [], systemPrompt, ...values } } else { for (const language of exerciseLanguages) { - const exercises = await getExercisesForLanguage(language) + const exercises = await getExercisesForLanguage(EVALS_REPO_PATH, language) - await pMap(exercises, (exercise) => createTask({ ...values, runId: run.id, language, exercise }), { + await pMap(exercises, (exercise) => createTask({ runId: run.id, language, exercise }), { concurrency: 10, }) } diff --git a/apps/web-evals/src/app/api/health/route.ts b/apps/web-evals/src/app/api/health/route.ts new file mode 100644 index 0000000000..ca8a833942 --- /dev/null +++ b/apps/web-evals/src/app/api/health/route.ts @@ -0,0 +1,24 @@ +import { NextResponse } from "next/server" + +export async function GET() { + try { + return NextResponse.json( + { + status: "healthy", + timestamp: new Date().toISOString(), + uptime: process.uptime(), + environment: process.env.NODE_ENV || "production", + }, + { status: 200 }, + ) + } catch (error) { + return NextResponse.json( + { + status: "unhealthy", + timestamp: new Date().toISOString(), + error: error instanceof Error ? error.message : "Unknown error", + }, + { status: 503 }, + ) + } +} diff --git a/packages/evals/Dockerfile.runner b/packages/evals/Dockerfile.runner index c68b4f80c0..ec3277461c 100644 --- a/packages/evals/Dockerfile.runner +++ b/packages/evals/Dockerfile.runner @@ -13,6 +13,7 @@ RUN apt update && \ git \ vim \ jq \ + netcat-openbsd \ apt-transport-https \ ca-certificates \ gnupg \ diff --git a/packages/evals/Dockerfile.web b/packages/evals/Dockerfile.web index 55e8b5a298..b8713f69b9 100644 --- a/packages/evals/Dockerfile.web +++ b/packages/evals/Dockerfile.web @@ -8,7 +8,7 @@ RUN npm install -g npm@latest RUN npm install -g npm-run-all # Install system packages -RUN apt update && apt install -y curl git vim jq postgresql-client +RUN apt update && apt install -y curl git vim jq netcat-openbsd postgresql-client # Install Docker cli RUN apt install -y apt-transport-https ca-certificates gnupg lsb-release diff --git a/packages/evals/docker-compose.yml b/packages/evals/docker-compose.yml index 37d95dbb59..93e643e44b 100644 --- a/packages/evals/docker-compose.yml +++ b/packages/evals/docker-compose.yml @@ -17,8 +17,10 @@ services: db: container_name: evals-db image: postgres:15.4 - expose: - - 5432 + # expose: + # - 5432 + ports: + - "${EVALS_DB_PORT:-5432}:5432" volumes: - ./.docker/postgres:/var/lib/postgresql/data - ./.docker/scripts/postgres:/docker-entrypoint-initdb.d @@ -38,8 +40,10 @@ services: redis: container_name: evals-redis image: redis:7-alpine - expose: - - 6379 + # expose: + # - 6379 + ports: + - "${EVALS_REDIS_PORT:-6379}:6379" volumes: - ./.docker/redis:/data command: redis-server --appendonly yes diff --git a/packages/evals/package.json b/packages/evals/package.json index 554356e5b1..88195b134b 100644 --- a/packages/evals/package.json +++ b/packages/evals/package.json @@ -21,7 +21,8 @@ "db:start": "docker compose up -d db", "db:stop": "docker compose down db", "redis:start": "docker compose up -d redis", - "redis:stop": "docker compose down redis" + "redis:stop": "docker compose down redis", + "services:start": "docker compose up -d db redis" }, "dependencies": { "@roo-code/ipc": "workspace:^", diff --git a/packages/evals/src/cli/FileLogger.ts b/packages/evals/src/cli/FileLogger.ts deleted file mode 100644 index 443c1d2c53..0000000000 --- a/packages/evals/src/cli/FileLogger.ts +++ /dev/null @@ -1,86 +0,0 @@ -import * as fs from "fs" -import * as path from "path" - -export enum LogLevel { - INFO = "INFO", - ERROR = "ERROR", - WARN = "WARN", - DEBUG = "DEBUG", -} - -export interface LoggerOptions { - logDir: string - filename: string - tag: string -} - -export class FileLogger { - private logStream: fs.WriteStream | undefined - private logFilePath: string - private tag: string - - constructor({ logDir, filename, tag }: LoggerOptions) { - this.tag = tag - this.logFilePath = path.join(logDir, filename) - this.initializeLogger(logDir) - } - - private initializeLogger(logDir: string): void { - try { - fs.mkdirSync(logDir, { recursive: true }) - } catch (error) { - console.error(`Failed to create log directory ${logDir}:`, error) - } - - try { - this.logStream = fs.createWriteStream(this.logFilePath, { flags: "a" }) - } catch (error) { - console.error(`Failed to create log file ${this.logFilePath}:`, error) - } - } - - private writeToLog(level: LogLevel, message: string, ...args: unknown[]) { - try { - const timestamp = new Date().toISOString() - - const logLine = `[${timestamp} | ${level} | ${this.tag}] ${message} ${ - args.length > 0 ? JSON.stringify(args) : "" - }\n` - - console.log(logLine.trim()) - - if (this.logStream) { - this.logStream.write(logLine) - } - } catch (error) { - console.error(`Failed to write to log file ${this.logFilePath}:`, error) - } - } - - public info(message: string, ...args: unknown[]): void { - this.writeToLog(LogLevel.INFO, message, ...args) - } - - public error(message: string, ...args: unknown[]): void { - this.writeToLog(LogLevel.ERROR, message, ...args) - } - - public warn(message: string, ...args: unknown[]): void { - this.writeToLog(LogLevel.WARN, message, ...args) - } - - public debug(message: string, ...args: unknown[]): void { - this.writeToLog(LogLevel.DEBUG, message, ...args) - } - - public log(message: string, ...args: unknown[]): void { - this.info(message, ...args) - } - - public close(): void { - if (this.logStream) { - this.logStream.end() - this.logStream = undefined - } - } -} diff --git a/packages/evals/src/cli/index.ts b/packages/evals/src/cli/index.ts index 35c9203093..de62be8ae0 100644 --- a/packages/evals/src/cli/index.ts +++ b/packages/evals/src/cli/index.ts @@ -1,11 +1,12 @@ import * as fs from "fs" -import { command, run, number, option } from "cmd-ts" +import { run, command, option, flag, number, boolean } from "cmd-ts" -import { exercisesPath } from "../exercises/index.js" +import { EVALS_REPO_PATH } from "../exercises/index.js" +import { runCi } from "./runCi.js" import { runEvals } from "./runEvals.js" -import { processTask } from "./processTask.js" +import { processTask } from "./runTask.js" const main = async () => { await run( @@ -14,25 +15,22 @@ const main = async () => { description: "Execute an eval run.", version: "0.0.0", args: { + ci: flag({ type: boolean, long: "ci", defaultValue: () => false }), runId: option({ type: number, long: "runId", short: "r", defaultValue: () => -1 }), taskId: option({ type: number, long: "taskId", short: "t", defaultValue: () => -1 }), }, handler: async (args) => { - const { runId, taskId } = args - - if (runId === -1 && taskId === -1) { - throw new Error("Either runId or taskId must be provided.") - } - - if (runId !== -1 && taskId !== -1) { - throw new Error("Only one of runId or taskId must be provided.") - } + const { runId, taskId, ci } = args try { - if (runId !== -1) { + if (ci) { + await runCi({ concurrency: 3, exercisesPerLanguage: 5 }) + } else if (runId !== -1) { await runEvals(runId) - } else { + } else if (taskId !== -1) { await processTask({ taskId }) + } else { + throw new Error("Either runId or taskId must be provided.") } } catch (error) { console.error(error) @@ -46,9 +44,9 @@ const main = async () => { process.exit(0) } -if (!fs.existsSync(exercisesPath)) { +if (!fs.existsSync(EVALS_REPO_PATH)) { console.error( - `Exercises do not exist at ${exercisesPath}. Please run "git clone https://github.com/RooCodeInc/Roo-Code-Evals.git evals".`, + `Exercises do not exist at ${EVALS_REPO_PATH}. Please run "git clone https://github.com/RooCodeInc/Roo-Code-Evals.git evals".`, ) process.exit(1) diff --git a/packages/evals/src/cli/processTask.ts b/packages/evals/src/cli/processTask.ts deleted file mode 100644 index 2b70013864..0000000000 --- a/packages/evals/src/cli/processTask.ts +++ /dev/null @@ -1,112 +0,0 @@ -import { execa } from "execa" - -import { RooCodeEventName, type TaskEvent } from "@roo-code/types" - -import { findTask, updateTask, findRun } from "../db/index.js" - -import { getTag } from "./utils.js" -import { FileLogger } from "./FileLogger.js" -import { redisClient, getPubSubKey, registerRunner, deregisterRunner } from "./redis.js" -import { runTask } from "./runTask.js" -import { runUnitTest } from "./runUnitTest.js" - -export const processTask = async ({ taskId, logger }: { taskId: number; logger?: FileLogger }) => { - const task = await findTask(taskId) - const { language, exercise } = task - const run = await findRun(task.runId) - await registerRunner({ runId: run.id, taskId }) - - logger = - logger || - new FileLogger({ - logDir: `/var/log/evals/runs/${run.id}`, - filename: `${language}-${exercise}.log`, - tag: getTag("runTask", { run, task }), - }) - - try { - const publish = async (e: TaskEvent) => { - const redis = await redisClient() - await redis.publish(getPubSubKey(run.id), JSON.stringify(e)) - } - - logger.info(`running task ${task.id} (${language}/${exercise})...`) - await runTask({ run, task, publish, logger }) - - logger.info(`testing task ${task.id} (${language}/${exercise})...`) - const passed = await runUnitTest({ run, task }) - - logger.info(`task ${task.id} (${language}/${exercise}) -> ${passed}`) - await updateTask(task.id, { passed }) - - await publish({ - eventName: passed ? RooCodeEventName.EvalPass : RooCodeEventName.EvalFail, - taskId: task.id, - }) - } finally { - await deregisterRunner({ runId: run.id, taskId }) - } -} - -export const processTaskInContainer = async ({ - taskId, - logger, - maxRetries = 10, -}: { - taskId: number - logger: FileLogger - maxRetries?: number -}) => { - const baseArgs = [ - "--rm", - "--network evals_default", - "-v /var/run/docker.sock:/var/run/docker.sock", - "-v /tmp/evals:/var/log/evals", - "-e HOST_EXECUTION_METHOD=docker", - ] - - const command = `pnpm --filter @roo-code/evals cli --taskId ${taskId}` - logger.info(command) - - for (let attempt = 0; attempt <= maxRetries; attempt++) { - const containerName = `evals-task-${taskId}.${attempt}` - const args = [`--name ${containerName}`, ...baseArgs] - const isRetry = attempt > 0 - - if (isRetry) { - const delayMs = Math.pow(2, attempt - 1) * 1000 * (0.5 + Math.random()) - logger.info(`retrying in ${delayMs}ms (attempt ${attempt + 1}/${maxRetries + 1})`) - await new Promise((resolve) => setTimeout(resolve, delayMs)) - } - - logger.info( - `${isRetry ? "retrying" : "executing"} container command (attempt ${attempt + 1}/${maxRetries + 1})`, - ) - - const subprocess = execa(`docker run ${args.join(" ")} evals-runner sh -c "${command}"`, { shell: true }) - // subprocess.stdout?.on("data", (data) => console.log(data.toString())) - // subprocess.stderr?.on("data", (data) => console.error(data.toString())) - - try { - const result = await subprocess - logger.info(`container process completed with exit code: ${result.exitCode}`) - return - } catch (error) { - if (error && typeof error === "object" && "exitCode" in error) { - logger.error( - `container process failed with exit code: ${error.exitCode} (attempt ${attempt + 1}/${maxRetries + 1})`, - ) - } else { - logger.error(`container process failed with error: ${error} (attempt ${attempt + 1}/${maxRetries + 1})`) - } - - if (attempt === maxRetries) { - break - } - } - } - - logger.error(`all ${maxRetries + 1} attempts failed, giving up`) - - // TODO: Mark task as failed. -} diff --git a/packages/evals/src/cli/runCi.ts b/packages/evals/src/cli/runCi.ts new file mode 100644 index 0000000000..ca8a88e0e0 --- /dev/null +++ b/packages/evals/src/cli/runCi.ts @@ -0,0 +1,30 @@ +import pMap from "p-map" + +import { EVALS_REPO_PATH, exerciseLanguages, getExercisesForLanguage } from "../exercises/index.js" +import { createRun, createTask } from "../db/index.js" + +import { runEvals } from "./runEvals.js" + +export const runCi = async ({ + concurrency = 1, + exercisesPerLanguage, +}: { + concurrency?: number + exercisesPerLanguage?: number +} = {}) => { + console.log("Running evals in CI mode.") + + const run = await createRun({ model: "anthropic/claude-sonnet-4", socketPath: "", concurrency }) + + for (const language of exerciseLanguages) { + let exercises = await getExercisesForLanguage(EVALS_REPO_PATH, language) + + if (exercisesPerLanguage) { + exercises = exercises.slice(0, exercisesPerLanguage) + } + + await pMap(exercises, (exercise) => createTask({ runId: run.id, language, exercise }), { concurrency }) + } + + await runEvals(run.id) +} diff --git a/packages/evals/src/cli/runEvals.ts b/packages/evals/src/cli/runEvals.ts index 56bc6ce222..00199bbb44 100644 --- a/packages/evals/src/cli/runEvals.ts +++ b/packages/evals/src/cli/runEvals.ts @@ -1,12 +1,11 @@ import PQueue from "p-queue" import { findRun, finishRun, getTasks } from "../db/index.js" -import { exercisesPath } from "../exercises/index.js" +import { EVALS_REPO_PATH } from "../exercises/index.js" -import { getTag, isDockerContainer, resetEvalsRepo, commitEvalsRepoChanges } from "./utils.js" -import { processTask, processTaskInContainer } from "./processTask.js" +import { Logger, getTag, isDockerContainer, resetEvalsRepo, commitEvalsRepoChanges } from "./utils.js" import { startHeartbeat, stopHeartbeat } from "./redis.js" -import { FileLogger } from "./FileLogger.js" +import { processTask, processTaskInContainer } from "./runTask.js" export const runEvals = async (runId: number) => { const run = await findRun(runId) @@ -21,7 +20,7 @@ export const runEvals = async (runId: number) => { throw new Error(`Run ${run.id} has no tasks.`) } - const logger = new FileLogger({ + const logger = new Logger({ logDir: `/var/log/evals/runs/${run.id}`, filename: `controller.log`, tag: getTag("runEvals", { run }), @@ -32,7 +31,7 @@ export const runEvals = async (runId: number) => { const containerized = isDockerContainer() if (!containerized) { - await resetEvalsRepo({ run, cwd: exercisesPath }) + await resetEvalsRepo({ run, cwd: EVALS_REPO_PATH }) } const heartbeat = await startHeartbeat(run.id) @@ -63,7 +62,7 @@ export const runEvals = async (runId: number) => { // will lost when the container is destroyed. I think we should // store the diffs in the database instead. if (!containerized) { - await commitEvalsRepoChanges({ run, cwd: exercisesPath }) + await commitEvalsRepoChanges({ run, cwd: EVALS_REPO_PATH }) } } finally { logger.info("cleaning up") diff --git a/packages/evals/src/cli/runTask.ts b/packages/evals/src/cli/runTask.ts index 3b1ba61104..14028b493a 100644 --- a/packages/evals/src/cli/runTask.ts +++ b/packages/evals/src/cli/runTask.ts @@ -15,11 +15,21 @@ import { } from "@roo-code/types" import { IpcClient } from "@roo-code/ipc" -import { type Run, type Task, updateTask, createTaskMetrics, updateTaskMetrics, createToolError } from "../db/index.js" -import { exercisesPath } from "../exercises/index.js" - -import { isDockerContainer } from "./utils.js" -import { FileLogger } from "./FileLogger.js" +import { + type Run, + type Task, + findRun, + findTask, + updateTask, + createTaskMetrics, + updateTaskMetrics, + createToolError, +} from "../db/index.js" +import { EVALS_REPO_PATH } from "../exercises/index.js" + +import { Logger, getTag, isDockerContainer } from "./utils.js" +import { redisClient, getPubSubKey, registerRunner, deregisterRunner } from "./redis.js" +import { runUnitTest } from "./runUnitTest.js" class SubprocessTimeoutError extends Error { constructor(timeout: number) { @@ -28,17 +38,118 @@ class SubprocessTimeoutError extends Error { } } +export const processTask = async ({ taskId, logger }: { taskId: number; logger?: Logger }) => { + const task = await findTask(taskId) + const { language, exercise } = task + const run = await findRun(task.runId) + await registerRunner({ runId: run.id, taskId }) + + logger = + logger || + new Logger({ + logDir: `/var/log/evals/runs/${run.id}`, + filename: `${language}-${exercise}.log`, + tag: getTag("runTask", { run, task }), + }) + + try { + const publish = async (e: TaskEvent) => { + const redis = await redisClient() + await redis.publish(getPubSubKey(run.id), JSON.stringify(e)) + } + + logger.info(`running task ${task.id} (${language}/${exercise})...`) + await runTask({ run, task, publish, logger }) + + logger.info(`testing task ${task.id} (${language}/${exercise})...`) + const passed = await runUnitTest({ task, logger }) + + logger.info(`task ${task.id} (${language}/${exercise}) -> ${passed}`) + await updateTask(task.id, { passed }) + + await publish({ + eventName: passed ? RooCodeEventName.EvalPass : RooCodeEventName.EvalFail, + taskId: task.id, + }) + } finally { + await deregisterRunner({ runId: run.id, taskId }) + } +} + +export const processTaskInContainer = async ({ + taskId, + logger, + maxRetries = 10, +}: { + taskId: number + logger: Logger + maxRetries?: number +}) => { + const baseArgs = [ + "--rm", + "--network evals_default", + "-v /var/run/docker.sock:/var/run/docker.sock", + "-v /tmp/evals:/var/log/evals", + "-e HOST_EXECUTION_METHOD=docker", + ] + + const command = `pnpm --filter @roo-code/evals cli --taskId ${taskId}` + logger.info(command) + + for (let attempt = 0; attempt <= maxRetries; attempt++) { + const containerName = `evals-task-${taskId}.${attempt}` + const args = [`--name ${containerName}`, ...baseArgs] + const isRetry = attempt > 0 + + if (isRetry) { + const delayMs = Math.pow(2, attempt - 1) * 1000 * (0.5 + Math.random()) + logger.info(`retrying in ${delayMs}ms (attempt ${attempt + 1}/${maxRetries + 1})`) + await new Promise((resolve) => setTimeout(resolve, delayMs)) + } + + logger.info( + `${isRetry ? "retrying" : "executing"} container command (attempt ${attempt + 1}/${maxRetries + 1})`, + ) + + const subprocess = execa(`docker run ${args.join(" ")} evals-runner sh -c "${command}"`, { shell: true }) + // subprocess.stdout?.on("data", (data) => console.log(data.toString())) + // subprocess.stderr?.on("data", (data) => console.error(data.toString())) + + try { + const result = await subprocess + logger.info(`container process completed with exit code: ${result.exitCode}`) + return + } catch (error) { + if (error && typeof error === "object" && "exitCode" in error) { + logger.error( + `container process failed with exit code: ${error.exitCode} (attempt ${attempt + 1}/${maxRetries + 1})`, + ) + } else { + logger.error(`container process failed with error: ${error} (attempt ${attempt + 1}/${maxRetries + 1})`) + } + + if (attempt === maxRetries) { + break + } + } + } + + logger.error(`all ${maxRetries + 1} attempts failed, giving up`) + + // TODO: Mark task as failed. +} + type RunTaskOptions = { run: Run task: Task publish: (taskEvent: TaskEvent) => Promise - logger: FileLogger + logger: Logger } export const runTask = async ({ run, task, publish, logger }: RunTaskOptions) => { const { language, exercise } = task - const prompt = fs.readFileSync(path.resolve(exercisesPath, `prompts/${language}.md`), "utf-8") - const workspacePath = path.resolve(exercisesPath, language, exercise) + const prompt = fs.readFileSync(path.resolve(EVALS_REPO_PATH, `prompts/${language}.md`), "utf-8") + const workspacePath = path.resolve(EVALS_REPO_PATH, language, exercise) const ipcSocketPath = path.resolve(os.tmpdir(), `evals-${run.id}-${task.id}.sock`) const env = { ROO_CODE_IPC_SOCKET_PATH: ipcSocketPath } const controller = new AbortController() @@ -87,6 +198,7 @@ export const runTask = async ({ run, task, publish, logger }: RunTaskOptions) => let taskStartedAt = Date.now() let taskFinishedAt: number | undefined let taskAbortedAt: number | undefined + let taskTimedOut: boolean = false let taskMetricsId: number | undefined let rooTaskId: string | undefined let isClientDisconnected = false @@ -196,6 +308,7 @@ export const runTask = async ({ run, task, publish, logger }: RunTaskOptions) => timeout: EVALS_TIMEOUT, }) } catch (_error) { + taskTimedOut = true logger.error("time limit reached") if (rooTaskId && !isClientDisconnected) { @@ -207,16 +320,16 @@ export const runTask = async ({ run, task, publish, logger }: RunTaskOptions) => taskFinishedAt = Date.now() } - if (taskFinishedAt) { - logger.info("setting task finished at") - await updateTask(task.id, { finishedAt: new Date() }) - } - - if (!taskFinishedAt && isClientDisconnected) { + if (!taskFinishedAt && !taskTimedOut) { logger.error("client disconnected before task finished") throw new Error("Client disconnected before task completion.") } + // If the task was aborted unexpectedly or the client disconnected + // unexpectedly, then throw to trigger a retry. + logger.info("setting task finished at") + await updateTask(task.id, { finishedAt: new Date() }) + if (rooTaskId && !isClientDisconnected) { logger.info("closing task") client.sendCommand({ commandName: TaskCommandName.CloseTask, data: rooTaskId }) diff --git a/packages/evals/src/cli/runUnitTest.ts b/packages/evals/src/cli/runUnitTest.ts index 7785312e76..6f8fbac619 100644 --- a/packages/evals/src/cli/runUnitTest.ts +++ b/packages/evals/src/cli/runUnitTest.ts @@ -3,14 +3,14 @@ import * as path from "path" import { execa, parseCommandString } from "execa" import psTree from "ps-tree" -import type { Run, Task } from "../db/index.js" -import { type ExerciseLanguage, exercisesPath } from "../exercises/index.js" +import type { Task } from "../db/index.js" +import { type ExerciseLanguage, EVALS_REPO_PATH } from "../exercises/index.js" -import { getTag } from "./utils.js" +import { Logger } from "./utils.js" const UNIT_TEST_TIMEOUT = 2 * 60 * 1_000 -const testCommands: Record = { +const testCommands: Record = { go: { commands: ["go test"] }, java: { commands: ["./gradlew test"] }, javascript: { commands: ["pnpm install", "pnpm test"] }, @@ -18,22 +18,21 @@ const testCommands: Record { - const tag = getTag("runUnitTest", { run, task }) - const log = (message: string, ...args: unknown[]) => console.log(`[${Date.now()} | ${tag}] ${message}`, ...args) - const logError = (message: string, ...args: unknown[]) => - console.error(`[${Date.now()} | ${tag}] ${message}`, ...args) +type RunUnitTestOptions = { + task: Task + logger: Logger +} +export const runUnitTest = async ({ task, logger }: RunUnitTestOptions) => { const cmd = testCommands[task.language] - const exercisePath = path.resolve(exercisesPath, task.language, task.exercise) - const cwd = cmd.cwd ? path.resolve(exercisePath, cmd.cwd) : exercisePath + const cwd = path.resolve(EVALS_REPO_PATH, task.language, task.exercise) const commands = cmd.commands.map((cs) => parseCommandString(cs)) let passed = true for (const command of commands) { try { - log(`running "${command.join(" ")}"`) + logger.info(`running "${command.join(" ")}"`) const subprocess = execa({ cwd, shell: "/bin/bash", reject: false })`${command}` subprocess.stdout.pipe(process.stdout) subprocess.stderr.pipe(process.stderr) @@ -49,25 +48,27 @@ export const runUnitTest = async ({ run, task }: { run: Run; task: Task }) => { }) }) - log(`"${command.join(" ")}" timed out, killing ${subprocess.pid} + ${JSON.stringify(descendants)}`) + logger.info( + `"${command.join(" ")}" timed out, killing ${subprocess.pid} + ${JSON.stringify(descendants)}`, + ) if (descendants.length > 0) { for (const descendant of descendants) { try { - log(`killing descendant process ${descendant}`) + logger.info(`killing descendant process ${descendant}`) await execa`kill -9 ${descendant}` } catch (error) { - logError(`failed to kill descendant process ${descendant}:`, error) + logger.error(`failed to kill descendant process ${descendant}:`, error) } } } - log(`killing main process ${subprocess.pid}`) + logger.info(`killing main process ${subprocess.pid}`) try { await execa`kill -9 ${subprocess.pid!}` } catch (error) { - logError(`failed to kill main process ${subprocess.pid}:`, error) + logger.error(`failed to kill main process ${subprocess.pid}:`, error) } }, UNIT_TEST_TIMEOUT) @@ -80,7 +81,7 @@ export const runUnitTest = async ({ run, task }: { run: Run; task: Task }) => { break } } catch (error) { - logError(`unexpected error:`, error) + logger.error(`unexpected error:`, error) passed = false break } diff --git a/packages/evals/src/cli/utils.ts b/packages/evals/src/cli/utils.ts index cbabb451b9..bf1489d09b 100644 --- a/packages/evals/src/cli/utils.ts +++ b/packages/evals/src/cli/utils.ts @@ -1,4 +1,5 @@ import * as fs from "fs" +import * as path from "path" import { execa } from "execa" @@ -29,3 +30,87 @@ export const commitEvalsRepoChanges = async ({ run, cwd }: { run: Run; cwd: stri await execa({ cwd })`git add .` await execa({ cwd })`git commit -m ${`Run #${run.id}`} --no-verify` } + +enum LogLevel { + INFO = "INFO", + ERROR = "ERROR", + WARN = "WARN", + DEBUG = "DEBUG", +} + +interface LoggerOptions { + logDir: string + filename: string + tag: string +} + +export class Logger { + private logStream: fs.WriteStream | undefined + private logFilePath: string + private tag: string + + constructor({ logDir, filename, tag }: LoggerOptions) { + this.tag = tag + this.logFilePath = path.join(logDir, filename) + this.initializeLogger(logDir) + } + + private initializeLogger(logDir: string): void { + try { + fs.mkdirSync(logDir, { recursive: true }) + } catch (error) { + console.error(`Failed to create log directory ${logDir}:`, error) + } + + try { + this.logStream = fs.createWriteStream(this.logFilePath, { flags: "a" }) + } catch (error) { + console.error(`Failed to create log file ${this.logFilePath}:`, error) + } + } + + private writeToLog(level: LogLevel, message: string, ...args: unknown[]) { + try { + const timestamp = new Date().toISOString() + + const logLine = `[${timestamp} | ${level} | ${this.tag}] ${message} ${ + args.length > 0 ? JSON.stringify(args) : "" + }\n` + + console.log(logLine.trim()) + + if (this.logStream) { + this.logStream.write(logLine) + } + } catch (error) { + console.error(`Failed to write to log file ${this.logFilePath}:`, error) + } + } + + public info(message: string, ...args: unknown[]): void { + this.writeToLog(LogLevel.INFO, message, ...args) + } + + public error(message: string, ...args: unknown[]): void { + this.writeToLog(LogLevel.ERROR, message, ...args) + } + + public warn(message: string, ...args: unknown[]): void { + this.writeToLog(LogLevel.WARN, message, ...args) + } + + public debug(message: string, ...args: unknown[]): void { + this.writeToLog(LogLevel.DEBUG, message, ...args) + } + + public log(message: string, ...args: unknown[]): void { + this.info(message, ...args) + } + + public close(): void { + if (this.logStream) { + this.logStream.end() + this.logStream = undefined + } + } +} diff --git a/packages/evals/src/exercises/index.ts b/packages/evals/src/exercises/index.ts index 17e339f21a..7ba34f2a2b 100644 --- a/packages/evals/src/exercises/index.ts +++ b/packages/evals/src/exercises/index.ts @@ -4,15 +4,15 @@ import { fileURLToPath } from "url" const __dirname = path.dirname(fileURLToPath(import.meta.url)) -export const exercisesPath = path.resolve(__dirname, "..", "..", "..", "..", "..", "evals") +export const EVALS_REPO_PATH = path.resolve(__dirname, "..", "..", "..", "..", "..", "evals") export const exerciseLanguages = ["go", "java", "javascript", "python", "rust"] as const export type ExerciseLanguage = (typeof exerciseLanguages)[number] -const listDirectories = async (relativePath: string) => { +export const listDirectories = async (basePath: string, relativePath: string) => { try { - const targetPath = path.resolve(__dirname, relativePath) + const targetPath = path.resolve(basePath, relativePath) const entries = await fs.readdir(targetPath, { withFileTypes: true }) return entries.filter((entry) => entry.isDirectory() && !entry.name.startsWith(".")).map((entry) => entry.name) } catch (error) { @@ -21,5 +21,5 @@ const listDirectories = async (relativePath: string) => { } } -export const getExercisesForLanguage = async (language: ExerciseLanguage) => - listDirectories(path.join(exercisesPath, language)) +export const getExercisesForLanguage = async (basePath: string, language: ExerciseLanguage) => + listDirectories(__dirname, path.join(basePath, language))