From 580479a2a7b04ede55636648a521f7a04d403a18 Mon Sep 17 00:00:00 2001 From: cte Date: Wed, 4 Jun 2025 00:20:07 -0700 Subject: [PATCH 1/9] Improve Docker setup for evals --- .dockerignore | 16 +++- packages/evals/Dockerfile | 124 +++++++++++++++++++----------- packages/evals/docker-compose.yml | 7 ++ packages/evals/scripts/setup.sh | 2 +- 4 files changed, 99 insertions(+), 50 deletions(-) diff --git a/.dockerignore b/.dockerignore index eacfebecb2..2d1d624af6 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,18 +1,28 @@ # Build artifacts bin/ -!bin/roo-code-latest.vsix dist/ **/dist/ out/ **/out/ +src/webview-ui/ -# Dependencies +# dependencies node_modules/ **/node_modules/ -# Test and development files +# testing coverage/ **/.vscode-test/ +**/mock/ +# devtools knip.json .husky/ + +# monorepo +.turbo/ +**/.turbo/ + +# next.js +**/.next/ +.vercel diff --git a/packages/evals/Dockerfile b/packages/evals/Dockerfile index a39c5cb94c..bff55d1c69 100644 --- a/packages/evals/Dockerfile +++ b/packages/evals/Dockerfile @@ -1,16 +1,23 @@ +# docker build -f packages/evals/Dockerfile -t roo-code-evals . 
+# docker run -it roo-code-evals + FROM node:20-slim AS base - ENV PNPM_HOME="/pnpm" - ENV PATH="$PNPM_HOME:$PATH" + +# Install pnpm +ENV PNPM_HOME="/pnpm" +ENV PATH="$PNPM_HOME:$PATH" RUN corepack enable RUN npm install -g npm@latest RUN npm install -g npm-run-all -# Install dependencies + +# Install system packages RUN apt update && apt install -y sudo curl git vim jq # Create a `vscode` user RUN useradd -m vscode -s /bin/bash && \ echo "vscode ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/vscode && \ chmod 0440 /etc/sudoers.d/vscode + # Install VS Code # https://code.visualstudio.com/docs/setup/linux RUN apt install -y wget gpg apt-transport-https @@ -19,59 +26,84 @@ RUN install -D -o root -g root -m 644 packages.microsoft.gpg /etc/apt/keyrings/p RUN echo "deb [arch=amd64,arm64,armhf signed-by=/etc/apt/keyrings/packages.microsoft.gpg] https://packages.microsoft.com/repos/code stable main" | tee /etc/apt/sources.list.d/vscode.list > /dev/null RUN rm -f packages.microsoft.gpg RUN apt update && apt install -y code + # Install Xvfb RUN apt install -y xvfb + # [cpp] Install cmake 3.28.3 RUN apt install -y cmake + # [go] Install Go 1.22.2 RUN apt install -y golang-go + # [java] Install Java 21 RUN apt install -y default-jre + # [python] Install Python 3.12.3 and uv 0.6.6 RUN apt install -y python3 python3-venv python3-dev python3-pip + +WORKDIR /home/vscode +USER vscode + # [rust] Install Rust 1.85 RUN curl https://sh.rustup.rs -sSf | bash -s -- -y RUN echo 'source $HOME/.cargo/env' >> $HOME/.bashrc - WORKDIR /home/vscode - USER vscode - - # Copy evals - RUN git clone https://github.com/RooCodeInc/Roo-Code-Evals.git evals - - # Prepare evals - WORKDIR /home/vscode/evals/python - RUN curl -LsSf https://astral.sh/uv/install.sh | sh - RUN /home/vscode/.local/bin/uv sync - - WORKDIR /home/vscode/repo/benchmark - - # Install dependencies - COPY --chown=vscode:vscode ./evals/package.json ./evals/pnpm-lock.yaml ./evals/pnpm-workspace.yaml ./evals/.npmrc ./ - RUN mkdir -p apps/cli 
apps/web \ - config/eslint config/typescript \ - packages/db packages/ipc packages/lib packages/types - COPY --chown=vscode:vscode ./evals/apps/cli/package.json ./apps/cli/ - COPY --chown=vscode:vscode ./evals/apps/web/package.json ./apps/web/ - COPY --chown=vscode:vscode ./evals/config/eslint/package.json ./config/eslint/ - COPY --chown=vscode:vscode ./evals/config/typescript/package.json ./config/typescript/ - COPY --chown=vscode:vscode ./evals/packages/db/package.json ./packages/db/ - COPY --chown=vscode:vscode ./evals/packages/ipc/package.json ./packages/ipc/ - COPY --chown=vscode:vscode ./evals/packages/lib/package.json ./packages/lib/ - COPY --chown=vscode:vscode ./evals/packages/types/package.json ./packages/types/ - RUN pnpm install - - # Copy & install extension - COPY --chown=vscode:vscode ./bin/roo-code-latest.vsix ./ - RUN code --debug --install-extension ./roo-code-latest.vsix - - # Copy application code - COPY --chown=vscode:vscode ./evals ./ - - # Copy environment variables - COPY --chown=vscode:vscode ./evals/.env ./ - - # Push database schema - RUN pnpm --filter @roo-code/evals db:push --force - - EXPOSE 3000 - CMD ["pnpm", "web"] + +# Copy evals +RUN git clone https://github.com/RooCodeInc/Roo-Code-Evals.git evals + +# Prepare evals +WORKDIR /home/vscode/evals/python +RUN curl -LsSf https://astral.sh/uv/install.sh | sh +RUN /home/vscode/.local/bin/uv sync + +WORKDIR /home/vscode/repo + +# Install npm packages +RUN mkdir -p \ + scripts \ + apps/web-evals \ + packages/build \ + packages/cloud \ + packages/config-eslint \ + packages/config-typescript \ + packages/evals \ + packages/ipc \ + packages/telemetry \ + packages/types \ + src \ + webview-ui + +COPY --chown=vscode:vscode ./package.json ./ +COPY --chown=vscode:vscode ./pnpm-lock.yaml ./ +COPY --chown=vscode:vscode ./pnpm-workspace.yaml ./ +COPY --chown=vscode:vscode ./scripts/bootstrap.mjs ./scripts/ +COPY --chown=vscode:vscode ./apps/web-evals/package.json ./apps/web-evals/ +COPY 
--chown=vscode:vscode ./packages/build/package.json ./packages/build/ +COPY --chown=vscode:vscode ./packages/cloud/package.json ./packages/cloud/ +COPY --chown=vscode:vscode ./packages/config-eslint/package.json ./packages/config-eslint/ +COPY --chown=vscode:vscode ./packages/config-typescript/package.json ./packages/config-typescript/ +COPY --chown=vscode:vscode ./packages/evals/package.json ./packages/evals/ +COPY --chown=vscode:vscode ./packages/ipc/package.json ./packages/ipc/ +COPY --chown=vscode:vscode ./packages/telemetry/package.json ./packages/telemetry/ +COPY --chown=vscode:vscode ./packages/types/package.json ./packages/types/ +COPY --chown=vscode:vscode ./src/package.json ./src/ +COPY --chown=vscode:vscode ./webview-ui/package.json ./webview-ui/ +RUN pnpm install + +# Build the extension +COPY --chown=vscode:vscode . ./ +RUN pnpm vsix -- --out ../bin/roo-code.vsix + +# Copy & install extension +RUN code --debug --install-extension ./roo-code.vsix + +# Copy env variables +# COPY --chown=vscode:vscode packages/evals/.env ./ + +# Push database schema +# RUN pnpm --filter @roo-code/evals db:push --force + +# EXPOSE 3000 +# CMD ["pnpm", "web"] +CMD ["bash"] diff --git a/packages/evals/docker-compose.yml b/packages/evals/docker-compose.yml index e84d1df986..da2940a16f 100644 --- a/packages/evals/docker-compose.yml +++ b/packages/evals/docker-compose.yml @@ -1,3 +1,5 @@ +# docker compose up --build --no-start + services: postgres: container_name: postgres-evals @@ -11,3 +13,8 @@ services: - POSTGRES_USER=postgres - POSTGRES_PASSWORD=password - POSTGRES_DATABASES=evals_development,evals_test + runner: + container_name: runner-evals + build: + context: ../../ + dockerfile: packages/evals/Dockerfile diff --git a/packages/evals/scripts/setup.sh b/packages/evals/scripts/setup.sh index 33f08bfe09..464abc3c0c 100755 --- a/packages/evals/scripts/setup.sh +++ b/packages/evals/scripts/setup.sh @@ -28,7 +28,7 @@ build_extension() { echo "๐Ÿ”จ Building the Roo Code 
extension..." cd .. mkdir -p bin - pnpm build -- --out ../bin/roo-code-$(git rev-parse --short HEAD).vsix || exit 1 + pnpm vsix -- --out ../bin/roo-code-$(git rev-parse --short HEAD).vsix || exit 1 code --install-extension bin/roo-code-$(git rev-parse --short HEAD).vsix || exit 1 cd evals } From 19e8b04ce52b896172073cd950975984a50fc805 Mon Sep 17 00:00:00 2001 From: cte Date: Wed, 4 Jun 2025 02:07:13 -0700 Subject: [PATCH 2/9] More progress --- .dockerignore | 5 +- apps/web-evals/src/lib/server/runs.ts | 68 ++++++++++++++---- .../evals/{Dockerfile => Dockerfile.runner} | 18 ++--- packages/evals/Dockerfile.web | 72 +++++++++++++++++++ packages/evals/docker-compose.yml | 64 ++++++++++++----- 5 files changed, 180 insertions(+), 47 deletions(-) rename packages/evals/{Dockerfile => Dockerfile.runner} (87%) create mode 100644 packages/evals/Dockerfile.web diff --git a/.dockerignore b/.dockerignore index 2d1d624af6..2b4c07e517 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,4 +1,7 @@ -# Build artifacts +# git +.git + +# build artifacts bin/ dist/ **/dist/ diff --git a/apps/web-evals/src/lib/server/runs.ts b/apps/web-evals/src/lib/server/runs.ts index bb4a2b47d0..f4e77fe758 100644 --- a/apps/web-evals/src/lib/server/runs.ts +++ b/apps/web-evals/src/lib/server/runs.ts @@ -1,3 +1,5 @@ +/* eslint-disable @typescript-eslint/no-unused-vars */ + "use server" import { spawn } from "child_process" @@ -49,24 +51,60 @@ export async function createRun({ suite, exercises = [], systemPrompt, ...values revalidatePath("/runs") - try { - const logFile = fs.openSync(`/tmp/roo-code-evals-${run.id}.log`, "a") + // try { + // const logFile = fs.openSync(`/tmp/roo-code-evals-${run.id}.log`, "a") - const env: NodeJS.ProcessEnv = systemPrompt - ? { ...process.env, FOOTGUN_SYSTEM_PROMPT: systemPrompt } - : process.env + // const env: NodeJS.ProcessEnv = systemPrompt + // ? 
{ ...process.env, FOOTGUN_SYSTEM_PROMPT: systemPrompt } + // : process.env - const childProcess = spawn("pnpm", ["--filter", "@roo-code/evals", "cli", run.id.toString()], { - detached: true, - stdio: ["ignore", logFile, logFile], - env, - }) + // const childProcess = spawn("pnpm", ["--filter", "@roo-code/evals", "cli", run.id.toString()], { + // detached: true, + // stdio: ["ignore", logFile, logFile], + // env, + // }) - childProcess.unref() - await _updateRun(run.id, { pid: childProcess.pid }) - } catch (error) { - console.error(error) - } + // childProcess.unref() + // await _updateRun(run.id, { pid: childProcess.pid }) + // } catch (error) { + // console.error(error) + // } + + // try { + // const logFile = `/tmp/roo-code-evals-${run.id}.log` + + // const envVars = systemPrompt ? { ...process.env, FOOTGUN_SYSTEM_PROMPT: systemPrompt } : process.env + + // // Requires a docker socket mounted and host container running. + // const runOnHost = async () => { + // // Create and start a new runner container connected to the compose network + // const command = `docker run --rm --network evals_default evals-runner sh -c "pnpm --filter @roo-code/evals cli ${run.id}"` + + // const childProcess = spawn("sh", ["-c", command], { + // detached: true, + // stdio: ["ignore", "pipe", "pipe"], + // }) + + // // Redirect output to log file + // const logStream = fs.createWriteStream(logFile, { flags: "a" }) + + // if (childProcess.stdout) { + // childProcess.stdout.pipe(logStream) + // } + + // if (childProcess.stderr) { + // childProcess.stderr.pipe(logStream) + // } + + // return childProcess + // } + + // const childProcess = await runOnHost() + // childProcess.unref() + // await _updateRun(run.id, { pid: childProcess.pid }) + // } catch (error) { + // console.error(error) + // } return run } diff --git a/packages/evals/Dockerfile b/packages/evals/Dockerfile.runner similarity index 87% rename from packages/evals/Dockerfile rename to packages/evals/Dockerfile.runner index 
bff55d1c69..140d795e6d 100644 --- a/packages/evals/Dockerfile +++ b/packages/evals/Dockerfile.runner @@ -1,5 +1,5 @@ -# docker build -f packages/evals/Dockerfile -t roo-code-evals . -# docker run -it roo-code-evals +# docker build -f packages/evals/Dockerfile.runner -t evals-runner . +# docker run -it evals-runner FROM node:20-slim AS base @@ -62,7 +62,6 @@ WORKDIR /home/vscode/repo # Install npm packages RUN mkdir -p \ scripts \ - apps/web-evals \ packages/build \ packages/cloud \ packages/config-eslint \ @@ -78,7 +77,6 @@ COPY --chown=vscode:vscode ./package.json ./ COPY --chown=vscode:vscode ./pnpm-lock.yaml ./ COPY --chown=vscode:vscode ./pnpm-workspace.yaml ./ COPY --chown=vscode:vscode ./scripts/bootstrap.mjs ./scripts/ -COPY --chown=vscode:vscode ./apps/web-evals/package.json ./apps/web-evals/ COPY --chown=vscode:vscode ./packages/build/package.json ./packages/build/ COPY --chown=vscode:vscode ./packages/cloud/package.json ./packages/cloud/ COPY --chown=vscode:vscode ./packages/config-eslint/package.json ./packages/config-eslint/ @@ -93,17 +91,11 @@ RUN pnpm install # Build the extension COPY --chown=vscode:vscode . ./ -RUN pnpm vsix -- --out ../bin/roo-code.vsix +RUN pnpm build -- --out ../bin/roo-code.vsix # Copy & install extension -RUN code --debug --install-extension ./roo-code.vsix +RUN code --debug --install-extension bin/roo-code.vsix -# Copy env variables -# COPY --chown=vscode:vscode packages/evals/.env ./ +ENV DATABASE_URL=postgresql://postgres:password@db:5432/evals_development -# Push database schema -# RUN pnpm --filter @roo-code/evals db:push --force - -# EXPOSE 3000 -# CMD ["pnpm", "web"] CMD ["bash"] diff --git a/packages/evals/Dockerfile.web b/packages/evals/Dockerfile.web new file mode 100644 index 0000000000..1ea772100d --- /dev/null +++ b/packages/evals/Dockerfile.web @@ -0,0 +1,72 @@ +# docker build -f packages/evals/Dockerfile.web -t evals-web . 
+# docker run -it evals-web + +FROM node:20-slim AS base + +# Install pnpm +ENV PNPM_HOME="/pnpm" +ENV PATH="$PNPM_HOME:$PATH" +RUN corepack enable +RUN npm install -g npm@latest +RUN npm install -g npm-run-all + +# Install system packages +RUN apt update && apt install -y sudo curl git vim jq + +# Install Docker CLI +RUN apt install -y apt-transport-https ca-certificates gnupg lsb-release +RUN curl -fsSL https://download.docker.com/linux/debian/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg +RUN echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null +RUN apt update && apt install -y docker-ce-cli + +# Create a `vscode` user and add to docker group +RUN useradd -m vscode -s /bin/bash && \ + echo "vscode ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/vscode && \ + chmod 0440 /etc/sudoers.d/vscode + +WORKDIR /home/vscode +USER vscode + +# Copy evals +RUN git clone https://github.com/RooCodeInc/Roo-Code-Evals.git evals + +WORKDIR /home/vscode/repo + +# Install npm packages +RUN mkdir -p \ + scripts \ + apps/web-evals \ + packages/build \ + packages/cloud \ + packages/config-eslint \ + packages/config-typescript \ + packages/evals \ + packages/ipc \ + packages/telemetry \ + packages/types \ + src \ + webview-ui + +COPY --chown=vscode:vscode ./package.json ./ +COPY --chown=vscode:vscode ./pnpm-lock.yaml ./ +COPY --chown=vscode:vscode ./pnpm-workspace.yaml ./ +COPY --chown=vscode:vscode ./scripts/bootstrap.mjs ./scripts/ +COPY --chown=vscode:vscode ./apps/web-evals/package.json ./apps/web-evals/ +COPY --chown=vscode:vscode ./packages/build/package.json ./packages/build/ +COPY --chown=vscode:vscode ./packages/cloud/package.json ./packages/cloud/ +COPY --chown=vscode:vscode ./packages/config-eslint/package.json ./packages/config-eslint/ +COPY --chown=vscode:vscode 
./packages/config-typescript/package.json ./packages/config-typescript/ +COPY --chown=vscode:vscode ./packages/evals/package.json ./packages/evals/ +COPY --chown=vscode:vscode ./packages/ipc/package.json ./packages/ipc/ +COPY --chown=vscode:vscode ./packages/telemetry/package.json ./packages/telemetry/ +COPY --chown=vscode:vscode ./packages/types/package.json ./packages/types/ +COPY --chown=vscode:vscode ./src/package.json ./src/ +COPY --chown=vscode:vscode ./webview-ui/package.json ./webview-ui/ +RUN pnpm install + +COPY --chown=vscode:vscode . ./ + +ENV DATABASE_URL=postgresql://postgres:password@db:5432/evals_development + +EXPOSE 3000 +CMD ["pnpm", "--filter", "@roo-code/web-evals", "dev"] diff --git a/packages/evals/docker-compose.yml b/packages/evals/docker-compose.yml index da2940a16f..e5e2601f47 100644 --- a/packages/evals/docker-compose.yml +++ b/packages/evals/docker-compose.yml @@ -1,20 +1,48 @@ -# docker compose up --build --no-start +# docker compose build web runner + +# docker compose up db web +# docker compose up runner +# docker compose up runner --no-start +# docker run -it --rm --network evals_default evals-runner bash +# docker run --rm --network evals_default evals-runner sh -c "pnpm --filter @roo-code/evals cli 3" services: - postgres: - container_name: postgres-evals - image: postgres:15.4 - ports: - - 5432:5432 - volumes: - - ./.docker/postgres:/var/lib/postgresql/data - - ./.docker/scripts/postgres:/docker-entrypoint-initdb.d - environment: - - POSTGRES_USER=postgres - - POSTGRES_PASSWORD=password - - POSTGRES_DATABASES=evals_development,evals_test - runner: - container_name: runner-evals - build: - context: ../../ - dockerfile: packages/evals/Dockerfile + db: + container_name: evals-db + image: postgres:15.4 + ports: + - 5432:5432 + volumes: + - ./.docker/postgres:/var/lib/postgresql/data + - ./.docker/scripts/postgres:/docker-entrypoint-initdb.d + environment: + - POSTGRES_USER=postgres + - POSTGRES_PASSWORD=password + - 
POSTGRES_DATABASES=evals_development,evals_test + profiles: + - db + + web: + container_name: evals-web + build: + context: ../../ + dockerfile: packages/evals/Dockerfile.web + ports: + - "3000:3000" + environment: + - HOST_EXECUTION_METHOD=docker + volumes: + - /var/run/docker.sock:/var/run/docker.sock + user: "0:0" # Run as root to access docker socket. + profiles: + - web + + runner: + container_name: evals-runner + build: + context: ../../ + dockerfile: packages/evals/Dockerfile.runner + stdin_open: true + tty: true + profiles: + - runner From f2a2367951bbf4b1ca8af49f8a8ea8b19635d442 Mon Sep 17 00:00:00 2001 From: cte Date: Wed, 4 Jun 2025 12:02:38 -0700 Subject: [PATCH 3/9] More progress --- packages/evals/.docker/entrypoints/web.sh | 69 +++++++++++++++++++++++ packages/evals/.gitignore | 1 + packages/evals/Dockerfile.web | 7 ++- packages/evals/docker-compose.yml | 34 +++++++++-- 4 files changed, 103 insertions(+), 8 deletions(-) create mode 100644 packages/evals/.docker/entrypoints/web.sh diff --git a/packages/evals/.docker/entrypoints/web.sh b/packages/evals/.docker/entrypoints/web.sh new file mode 100644 index 0000000000..c105e3ec9d --- /dev/null +++ b/packages/evals/.docker/entrypoints/web.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +set -e + +echo "๐Ÿš€ Starting evals web service..." + +wait_for_db() { + echo "โณ Waiting for database..." + + # postgresql://user:password@host:port/database + DB_HOST=$(echo $DATABASE_URL | sed -n 's/.*@\([^:]*\):.*/\1/p') + DB_PORT=$(echo $DATABASE_URL | sed -n 's/.*:\([0-9]*\)\/.*/\1/p') + DB_USER=$(echo $DATABASE_URL | sed -n 's/.*\/\/\([^:]*\):.*/\1/p') + DB_NAME=$(echo $DATABASE_URL | sed -n 's/.*\/\([^?]*\).*/\1/p') + + DB_HOST=${DB_HOST:-db} + DB_PORT=${DB_PORT:-5432} + DB_USER=${DB_USER:-postgres} + DB_NAME=${DB_NAME:-evals_development} + + until pg_isready -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" -d "$DB_NAME" > /dev/null 2>&1; do + echo "โณ Database not ready yet, waiting 2 seconds..." 
+ sleep 2 + done + + echo "โœ… Database is ready" +} + +run_migrations() { + echo "๐Ÿ”„ Running database migrations..." + + if pnpm --filter @roo-code/evals db:migrate; then + echo "โœ… Database migrations completed successfully!" + else + echo "โŒ Database migration failed!" + exit 1 + fi +} + +start_web_service() { + echo "๐ŸŒ Starting web service..." + exec "$@" +} + +main() { + if [ $# -eq 0 ]; then + set -- pnpm --filter @roo-code/web-evals dev + fi + + if [ "$SKIP_MIGRATION" = "true" ]; then + echo "โญ๏ธ Skipping migration (SKIP_MIGRATION=true)" + start_web_service "$@" + return + fi + + if [ "$MIGRATION_ONLY" = "true" ]; then + echo "๐Ÿ”„ Running migration only (MIGRATION_ONLY=true)" + wait_for_db + run_migrations + echo "โœ… Migration completed, exiting..." + exit 0 + fi + + wait_for_db + run_migrations + start_web_service "$@" +} + +main "$@" diff --git a/packages/evals/.gitignore b/packages/evals/.gitignore index a8d28bb918..fe6126c41d 100644 --- a/packages/evals/.gitignore +++ b/packages/evals/.gitignore @@ -5,3 +5,4 @@ # docker .docker/* !.docker/scripts +!.docker/entrypoints diff --git a/packages/evals/Dockerfile.web b/packages/evals/Dockerfile.web index 1ea772100d..7a5bb25f2c 100644 --- a/packages/evals/Dockerfile.web +++ b/packages/evals/Dockerfile.web @@ -11,7 +11,7 @@ RUN npm install -g npm@latest RUN npm install -g npm-run-all # Install system packages -RUN apt update && apt install -y sudo curl git vim jq +RUN apt update && apt install -y sudo curl git vim jq postgresql-client # Install Docker CLI RUN apt install -y apt-transport-https ca-certificates gnupg lsb-release @@ -66,7 +66,10 @@ RUN pnpm install COPY --chown=vscode:vscode . 
./ +COPY --chown=vscode:vscode packages/evals/.docker/entrypoints/web.sh /usr/local/bin/entrypoint.sh +RUN sudo chmod +x /usr/local/bin/entrypoint.sh + ENV DATABASE_URL=postgresql://postgres:password@db:5432/evals_development EXPOSE 3000 -CMD ["pnpm", "--filter", "@roo-code/web-evals", "dev"] +ENTRYPOINT ["/usr/local/bin/entrypoint.sh"] diff --git a/packages/evals/docker-compose.yml b/packages/evals/docker-compose.yml index e5e2601f47..d1db8da81a 100644 --- a/packages/evals/docker-compose.yml +++ b/packages/evals/docker-compose.yml @@ -1,7 +1,9 @@ -# docker compose build web runner +# Server: +# docker compose build web +# docker compose --profile server up -# docker compose up db web -# docker compose up runner +# Client: +# docker compose build runner # docker compose up runner --no-start # docker run -it --rm --network evals_default evals-runner bash # docker run --rm --network evals_default evals-runner sh -c "pnpm --filter @roo-code/evals cli 3" @@ -19,8 +21,25 @@ services: - POSTGRES_USER=postgres - POSTGRES_PASSWORD=password - POSTGRES_DATABASES=evals_development,evals_test + healthcheck: + test: ["CMD-SHELL", "pg_isready -U postgres -d evals_development"] + interval: 5s + timeout: 5s + retries: 5 + start_period: 30s profiles: - - db + - server + + redis: + container_name: evals-redis + image: redis:7-alpine + ports: + - "6379:6379" + volumes: + - ./.docker/redis:/data + command: redis-server --appendonly yes + profiles: + - server web: container_name: evals-web @@ -34,8 +53,11 @@ services: volumes: - /var/run/docker.sock:/var/run/docker.sock user: "0:0" # Run as root to access docker socket. 
+ depends_on: + db: + condition: service_healthy profiles: - - web + - server runner: container_name: evals-runner @@ -45,4 +67,4 @@ services: stdin_open: true tty: true profiles: - - runner + - client From f72bb1d943d10d234b6f523f4bbe1030f13ec70a Mon Sep 17 00:00:00 2001 From: cte Date: Thu, 5 Jun 2025 11:02:51 -0700 Subject: [PATCH 4/9] More progress --- .dockerignore | 60 +++ apps/web-evals/next.config.ts | 5 +- apps/web-evals/package.json | 8 +- apps/web-evals/scripts/check-services.sh | 20 + .../src/app/api/runs/[id]/stream/route.ts | 60 ++- apps/web-evals/src/app/api/runs/route.ts | 12 - apps/web-evals/src/app/api/tasks/route.ts | 12 - .../src/app/runs/[id]/connection-status.tsx | 35 +- apps/web-evals/src/app/runs/[id]/run.tsx | 2 +- apps/web-evals/src/app/runs/new/defaults.ts | 66 --- apps/web-evals/src/app/runs/new/new-run.tsx | 74 +-- apps/web-evals/src/hooks/use-event-source.ts | 66 ++- apps/web-evals/src/hooks/use-process-tree.ts | 11 - apps/web-evals/src/hooks/use-runners.ts | 10 + apps/web-evals/src/lib/schemas.ts | 4 +- .../lib/server/__tests__/sse-stream.spec.ts | 111 +++++ apps/web-evals/src/lib/server/processes.ts | 55 --- apps/web-evals/src/lib/server/redis.ts | 13 + apps/web-evals/src/lib/server/runners.ts | 8 + apps/web-evals/src/lib/server/runs.ts | 82 ++-- apps/web-evals/src/lib/server/sse-stream.ts | 29 +- apps/web-evals/tsconfig.json | 1 + apps/web-evals/vitest.config.ts | 7 + packages/evals/.docker/entrypoints/runner.sh | 7 + packages/evals/.docker/entrypoints/web.sh | 27 +- packages/evals/ARCHITECTURE.md | 282 +++++++++++ packages/evals/Dockerfile.runner | 165 ++++--- packages/evals/Dockerfile.web | 63 +-- packages/evals/README.md | 31 +- packages/evals/docker-compose.yml | 24 +- packages/evals/package.json | 10 +- packages/evals/scripts/setup.sh | 163 ++++--- packages/evals/src/cli/index.ts | 443 ++---------------- packages/evals/src/cli/processTask.ts | 56 +++ packages/evals/src/cli/redis.ts | 53 +++ packages/evals/src/cli/runEvals.ts | 
56 +++ packages/evals/src/cli/runTask.ts | 253 ++++++++++ packages/evals/src/cli/runUnitTest.ts | 84 ++++ packages/evals/src/cli/utils.ts | 16 + packages/evals/src/db/queries/runs.ts | 9 + packages/evals/src/db/queries/tasks.ts | 8 +- packages/evals/src/db/schema.ts | 2 +- packages/types/src/global-settings.ts | 74 ++- pnpm-lock.yaml | 276 ++++++++++- 44 files changed, 1879 insertions(+), 974 deletions(-) create mode 100755 apps/web-evals/scripts/check-services.sh delete mode 100644 apps/web-evals/src/app/api/runs/route.ts delete mode 100644 apps/web-evals/src/app/api/tasks/route.ts delete mode 100644 apps/web-evals/src/app/runs/new/defaults.ts delete mode 100644 apps/web-evals/src/hooks/use-process-tree.ts create mode 100644 apps/web-evals/src/hooks/use-runners.ts create mode 100644 apps/web-evals/src/lib/server/__tests__/sse-stream.spec.ts delete mode 100644 apps/web-evals/src/lib/server/processes.ts create mode 100644 apps/web-evals/src/lib/server/redis.ts create mode 100644 apps/web-evals/src/lib/server/runners.ts create mode 100644 apps/web-evals/vitest.config.ts create mode 100644 packages/evals/.docker/entrypoints/runner.sh create mode 100644 packages/evals/ARCHITECTURE.md create mode 100644 packages/evals/src/cli/processTask.ts create mode 100644 packages/evals/src/cli/redis.ts create mode 100644 packages/evals/src/cli/runEvals.ts create mode 100644 packages/evals/src/cli/runTask.ts create mode 100644 packages/evals/src/cli/runUnitTest.ts create mode 100644 packages/evals/src/cli/utils.ts diff --git a/.dockerignore b/.dockerignore index 2b4c07e517..11d03a1c54 100644 --- a/.dockerignore +++ b/.dockerignore @@ -29,3 +29,63 @@ knip.json # next.js **/.next/ .vercel + +# Ignore common development files +node_modules +.git +.gitignore +.dockerignore +.env* +.vscode +.idea + +# Ignore build artifacts +dist +build +*.log +*.tmp +.cache +coverage + +# Ignore OS files +.DS_Store +Thumbs.db + +# Ignore test files +__tests__ +*.test.js +*.spec.js +*.test.ts +*.spec.ts 
+ +# Ignore development config files +.eslintrc* +.prettierrc* +jest.config* + +# Ignore most directories except what we need for the build +apps/ +evals/ +webview-ui/node_modules +src/node_modules + +# Keep essential files for the build +!README.md +!CHANGELOG.md +!package.json +!pnpm-lock.yaml +!pnpm-workspace.yaml +!scripts/bootstrap.mjs +!apps/web-evals/ +!src/ +!webview-ui/ +!packages/evals/.docker/entrypoints/runner.sh +!packages/build/ +!packages/cloud/ +!packages/config-eslint/ +!packages/config-typescript/ +!packages/evals/ +!packages/ipc/ +!packages/telemetry/ +!packages/types/ +!locales/ diff --git a/apps/web-evals/next.config.ts b/apps/web-evals/next.config.ts index 9da1646d2a..08ed853fc3 100644 --- a/apps/web-evals/next.config.ts +++ b/apps/web-evals/next.config.ts @@ -1,7 +1,10 @@ import type { NextConfig } from "next" const nextConfig: NextConfig = { - /* config options here */ + webpack: (config) => { + config.resolve.extensionAlias = { ".js": [".ts", ".tsx", ".js", ".jsx"] } + return config + }, } export default nextConfig diff --git a/apps/web-evals/package.json b/apps/web-evals/package.json index 36d7facc28..fe53708e5a 100644 --- a/apps/web-evals/package.json +++ b/apps/web-evals/package.json @@ -4,7 +4,7 @@ "scripts": { "lint": "next lint", "check-types": "tsc -b", - "dev": "next dev --turbopack", + "dev": "scripts/check-services.sh && next dev --turbopack", "format": "prettier --write src", "build": "next build", "start": "next start" @@ -24,7 +24,6 @@ "@radix-ui/react-tabs": "^1.1.3", "@radix-ui/react-tooltip": "^1.1.8", "@roo-code/evals": "workspace:^", - "@roo-code/ipc": "workspace:^", "@roo-code/types": "workspace:^", "@tanstack/react-query": "^5.69.0", "class-variance-authority": "^0.7.1", @@ -35,11 +34,11 @@ "next": "^15.2.5", "next-themes": "^0.4.6", "p-map": "^7.0.3", - "ps-tree": "^1.2.0", "react": "^18.3.1", "react-dom": "^18.3.1", "react-hook-form": "^7.57.0", "react-use": "^17.6.0", + "redis": "^5.5.5", "sonner": "^2.0.5", 
"tailwind-merge": "^3.3.0", "tailwindcss-animate": "^1.0.7", @@ -53,6 +52,7 @@ "@types/ps-tree": "^1.1.6", "@types/react": "^18.3.23", "@types/react-dom": "^18.3.5", - "tailwindcss": "^4" + "tailwindcss": "^4", + "vitest": "^3.2.1" } } diff --git a/apps/web-evals/scripts/check-services.sh b/apps/web-evals/scripts/check-services.sh new file mode 100755 index 0000000000..fd1e74997c --- /dev/null +++ b/apps/web-evals/scripts/check-services.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +if ! docker info &> /dev/null; then + echo "โŒ Docker is not running. Please start Docker Desktop and try again." + exit 1 +fi + +if ! nc -z localhost 5432 2>/dev/null; then + echo "โŒ PostgreSQL is not running on port 5432" + echo "๐Ÿ’ก Start it with: pnpm --filter @roo-code/evals db:start" + exit 1 +fi + +if ! nc -z localhost 6379 2>/dev/null; then + echo "โŒ Redis is not running on port 6379" + echo "๐Ÿ’ก Start it with: pnpm --filter @roo-code/evals redis:start" + exit 1 +fi + +echo "โœ… All required services are running" diff --git a/apps/web-evals/src/app/api/runs/[id]/stream/route.ts b/apps/web-evals/src/app/api/runs/[id]/stream/route.ts index 5b1de60710..3168974ecd 100644 --- a/apps/web-evals/src/app/api/runs/[id]/stream/route.ts +++ b/apps/web-evals/src/app/api/runs/[id]/stream/route.ts @@ -1,10 +1,10 @@ import type { NextRequest } from "next/server" +import { taskEventSchema } from "@roo-code/types" import { findRun } from "@roo-code/evals" -import { IpcClient } from "@roo-code/ipc" -import { IpcMessageType } from "@roo-code/types" import { SSEStream } from "@/lib/server/sse-stream" +import { redisClient } from "@/lib/server/redis" export const dynamic = "force-dynamic" @@ -13,26 +13,58 @@ export async function GET(request: NextRequest, { params }: { params: Promise<{ const requestId = crypto.randomUUID() const stream = new SSEStream() const run = await findRun(Number(id)) - const client = new IpcClient(run.socketPath, () => {}) + const redis = await redisClient() - const write = async 
(data: string | object) => { - // console.log(`[stream#${requestId}] write`, data) - const success = await stream.write(data) + let isStreamClosed = false + const channelName = `evals:${run.id}` - if (!success) { - client.disconnect() + const onMessage = async (data: string) => { + if (isStreamClosed || stream.isClosed) { + return + } + + try { + const taskEvent = taskEventSchema.parse(JSON.parse(data)) + // console.log(`[stream#${requestId}] task event -> ${taskEvent.eventName}`) + const writeSuccess = await stream.write(JSON.stringify(taskEvent)) + + if (!writeSuccess) { + await disconnect() + } + } catch (_error) { + console.error(`[stream#${requestId}] invalid task event:`, data) + } + } + + const disconnect = async () => { + if (isStreamClosed) { + return + } + + isStreamClosed = true + + try { + await redis.unsubscribe(channelName) + console.log(`[stream#${requestId}] unsubscribed from ${channelName}`) + } catch (error) { + console.error(`[stream#${requestId}] error unsubscribing:`, error) + } + + try { + await stream.close() + } catch (error) { + console.error(`[stream#${requestId}] error closing stream:`, error) } } - console.log(`[stream#${requestId}] connect`) - client.on(IpcMessageType.Connect, () => write("connect")) - client.on(IpcMessageType.Disconnect, () => write("disconnect")) - client.on(IpcMessageType.TaskEvent, write) + await redis.subscribe(channelName, onMessage) request.signal.addEventListener("abort", () => { console.log(`[stream#${requestId}] abort`) - client.disconnect() - stream.close().catch(() => {}) + + disconnect().catch((error) => { + console.error(`[stream#${requestId}] cleanup error:`, error) + }) }) return stream.getResponse() diff --git a/apps/web-evals/src/app/api/runs/route.ts b/apps/web-evals/src/app/api/runs/route.ts deleted file mode 100644 index b21bb3b655..0000000000 --- a/apps/web-evals/src/app/api/runs/route.ts +++ /dev/null @@ -1,12 +0,0 @@ -import { NextResponse } from "next/server" - -import { createRun } from 
"@roo-code/evals" - -export async function POST(request: Request) { - try { - const run = await createRun(await request.json()) - return NextResponse.json({ run }, { status: 201 }) - } catch (error) { - return NextResponse.json({ error: (error as Error).message }, { status: 500 }) - } -} diff --git a/apps/web-evals/src/app/api/tasks/route.ts b/apps/web-evals/src/app/api/tasks/route.ts deleted file mode 100644 index 843c078b9b..0000000000 --- a/apps/web-evals/src/app/api/tasks/route.ts +++ /dev/null @@ -1,12 +0,0 @@ -import { NextResponse } from "next/server" - -import { createTask } from "@roo-code/evals" - -export async function POST(request: Request) { - try { - const task = await createTask(await request.json()) - return NextResponse.json({ task }, { status: 201 }) - } catch (error) { - return NextResponse.json({ error: (error as Error).message }, { status: 500 }) - } -} diff --git a/apps/web-evals/src/app/runs/[id]/connection-status.tsx b/apps/web-evals/src/app/runs/[id]/connection-status.tsx index 60d6141a53..1505050b2d 100644 --- a/apps/web-evals/src/app/runs/[id]/connection-status.tsx +++ b/apps/web-evals/src/app/runs/[id]/connection-status.tsx @@ -1,29 +1,17 @@ "use client" -import { useCallback } from "react" -import { Skull } from "lucide-react" - -import { killProcessTree } from "@/lib/server/processes" -import { EventSourceStatus } from "@/hooks/use-event-source" -import { useProcessList } from "@/hooks/use-process-tree" +import type { EventSourceStatus } from "@/hooks/use-event-source" +import { useRunners } from "@/hooks/use-runners" import { cn } from "@/lib/utils" -import { Button } from "@/components/ui" type ConnectionStatusProps = { status: EventSourceStatus - pid: number | null + runId: number } export const ConnectionStatus = (connectionStatus: ConnectionStatusProps) => { - const { data: pids, isLoading } = useProcessList(connectionStatus.pid) - const status = isLoading ? "loading" : pids === null ? 
"dead" : connectionStatus.status - - const onKill = useCallback(async () => { - if (connectionStatus.pid) { - await killProcessTree(connectionStatus.pid) - window.location.reload() - } - }, [connectionStatus.pid]) + const { data: runners, isLoading } = useRunners(connectionStatus.runId) + const status = isLoading ? "loading" : runners === null ? "dead" : connectionStatus.status return (
@@ -52,16 +40,9 @@ export const ConnectionStatus = (connectionStatus: ConnectionStatusProps) => {
-
PIDs:
-
{connectionStatus.pid}
- {status === "connected" && ( - <> -
{pids?.join(" ")}
- - +
Runners:
+ {runners && runners.length > 0 && ( +
{runners?.join(", ")}
)}
diff --git a/apps/web-evals/src/app/runs/[id]/run.tsx b/apps/web-evals/src/app/runs/[id]/run.tsx index 576fb12a2e..ba93b2940a 100644 --- a/apps/web-evals/src/app/runs/[id]/run.tsx +++ b/apps/web-evals/src/app/runs/[id]/run.tsx @@ -48,7 +48,7 @@ export function Run({ run }: { run: Run }) {
{run.model}
{run.description &&
{run.description}
} - {!run.taskMetricsId && } + {!run.taskMetricsId && } {!tasks ? ( diff --git a/apps/web-evals/src/app/runs/new/defaults.ts b/apps/web-evals/src/app/runs/new/defaults.ts deleted file mode 100644 index b77460d36a..0000000000 --- a/apps/web-evals/src/app/runs/new/defaults.ts +++ /dev/null @@ -1,66 +0,0 @@ -import { RooCodeSettings } from "@roo-code/types" - -export const rooCodeDefaults: RooCodeSettings = { - apiProvider: "openrouter", - openRouterUseMiddleOutTransform: false, - - lastShownAnnouncementId: "may-21-2025-3-18", - - pinnedApiConfigs: {}, - - autoApprovalEnabled: true, - alwaysAllowReadOnly: true, - alwaysAllowReadOnlyOutsideWorkspace: false, - alwaysAllowWrite: true, - alwaysAllowWriteOutsideWorkspace: false, - writeDelayMs: 1000, - alwaysAllowBrowser: true, - alwaysApproveResubmit: true, - requestDelaySeconds: 10, - alwaysAllowMcp: true, - alwaysAllowModeSwitch: true, - alwaysAllowSubtasks: true, - alwaysAllowExecute: true, - allowedCommands: ["*"], - - browserToolEnabled: false, - browserViewportSize: "900x600", - screenshotQuality: 75, - remoteBrowserEnabled: false, - - ttsEnabled: false, - ttsSpeed: 1, - soundEnabled: false, - soundVolume: 0.5, - - terminalOutputLineLimit: 500, - terminalShellIntegrationTimeout: 30000, - terminalCommandDelay: 0, - terminalPowershellCounter: false, - terminalZshOhMy: true, - terminalZshClearEolMark: true, - terminalZshP10k: false, - terminalZdotdir: true, - terminalCompressProgressBar: true, - terminalShellIntegrationDisabled: false, - - diffEnabled: true, - fuzzyMatchThreshold: 1, - - enableCheckpoints: false, - - rateLimitSeconds: 0, - maxOpenTabsContext: 20, - maxWorkspaceFiles: 200, - showRooIgnoredFiles: true, - maxReadFileLine: -1, // -1 to enable full file reading. 
- - language: "en", - telemetrySetting: "enabled", - - mcpEnabled: false, - - mode: "code", - - customModes: [], -} diff --git a/apps/web-evals/src/app/runs/new/new-run.tsx b/apps/web-evals/src/app/runs/new/new-run.tsx index 535094fcd5..43190ca6d6 100644 --- a/apps/web-evals/src/app/runs/new/new-run.tsx +++ b/apps/web-evals/src/app/runs/new/new-run.tsx @@ -9,12 +9,13 @@ import fuzzysort from "fuzzysort" import { toast } from "sonner" import { X, Rocket, Check, ChevronsUpDown, SlidersHorizontal, Book, CircleCheck } from "lucide-react" -import { globalSettingsSchema, providerSettingsSchema } from "@roo-code/types" +import { globalSettingsSchema, providerSettingsSchema, EVALS_SETTINGS, getModelId } from "@roo-code/types" import { createRun } from "@/lib/server/runs" import { createRunSchema as formSchema, type CreateRun as FormValues, + MODEL_DEFAULT, CONCURRENCY_MIN, CONCURRENCY_MAX, CONCURRENCY_DEFAULT, @@ -51,26 +52,25 @@ import { DialogFooter, } from "@/components/ui" -import { rooCodeDefaults } from "./defaults" import { SettingsDiff } from "./settings-diff" export function NewRun() { const router = useRouter() const [mode, setMode] = useState<"openrouter" | "settings">("openrouter") - const [modelSearchValue, setModelSearchValue] = useState("") const [modelPopoverOpen, setModelPopoverOpen] = useState(false) + const modelSearchResultsRef = useRef>(new Map()) const modelSearchValueRef = useRef("") - const models = useOpenRouterModels() + const models = useOpenRouterModels() const exercises = useExercises() const form = useForm({ resolver: zodResolver(formSchema), defaultValues: { - model: "", + model: MODEL_DEFAULT, description: "", suite: "full", exercises: [], @@ -96,14 +96,7 @@ export function NewRun() { async (values: FormValues) => { try { if (mode === "openrouter") { - const openRouterModel = models.data?.find(({ id }) => id === model) - - if (!openRouterModel) { - throw new Error("Model not found.") - } - - const openRouterModelId = openRouterModel.id - 
values.settings = { ...(values.settings || {}), openRouterModelId } + values.settings = { ...(values.settings || {}), openRouterModelId: model } } const { id } = await createRun({ ...values, systemPrompt }) @@ -112,7 +105,7 @@ export function NewRun() { toast.error(e instanceof Error ? e.message : "An unknown error occurred.") } }, - [mode, model, models.data, router, systemPrompt], + [mode, model, router, systemPrompt], ) const onFilterModels = useCallback( @@ -167,55 +160,8 @@ export function NewRun() { const providerSettings = providerProfiles.apiConfigs[providerProfiles.currentApiConfigName] ?? {} - const { - apiProvider, - apiModelId, - openRouterModelId, - glamaModelId, - requestyModelId, - unboundModelId, - ollamaModelId, - lmStudioModelId, - openAiModelId, - } = providerSettings - - switch (apiProvider) { - case "anthropic": - case "bedrock": - case "deepseek": - case "gemini": - case "mistral": - case "openai-native": - case "xai": - case "vertex": - setValue("model", apiModelId ?? "") - break - case "openrouter": - setValue("model", openRouterModelId ?? "") - break - case "glama": - setValue("model", glamaModelId ?? "") - break - case "requesty": - setValue("model", requestyModelId ?? "") - break - case "unbound": - setValue("model", unboundModelId ?? "") - break - case "openai": - setValue("model", openAiModelId ?? "") - break - case "ollama": - setValue("model", ollamaModelId ?? "") - break - case "lmstudio": - setValue("model", lmStudioModelId ?? "") - break - default: - throw new Error(`Unsupported API provider: ${apiProvider}`) - } - - setValue("settings", { ...rooCodeDefaults, ...providerSettings, ...globalSettings }) + setValue("model", getModelId(providerSettings) ?? "") + setValue("settings", { ...EVALS_SETTINGS, ...providerSettings, ...globalSettings }) setMode("settings") event.target.value = "" @@ -316,7 +262,7 @@ export function NewRun() { settings. 
- + )} diff --git a/apps/web-evals/src/hooks/use-event-source.ts b/apps/web-evals/src/hooks/use-event-source.ts index d076e68a5a..545232ecf2 100644 --- a/apps/web-evals/src/hooks/use-event-source.ts +++ b/apps/web-evals/src/hooks/use-event-source.ts @@ -14,44 +14,88 @@ export function useEventSource({ url, withCredentials, onMessage }: UseEventSour const sourceRef = useRef(null) const statusRef = useRef("waiting") const [status, setStatus] = useState("waiting") + const reconnectTimeoutRef = useRef(null) + const isUnmountedRef = useRef(false) const handleMessage = useCallback((event: MessageEvent) => onMessage(event), [onMessage]) + const cleanup = useCallback(() => { + if (reconnectTimeoutRef.current) { + clearTimeout(reconnectTimeoutRef.current) + reconnectTimeoutRef.current = null + } + + if (sourceRef.current) { + sourceRef.current.close() + sourceRef.current = null + } + }, []) + const createEventSource = useCallback(() => { + if (isUnmountedRef.current) { + return + } + + cleanup() + + statusRef.current = "waiting" + setStatus("waiting") + sourceRef.current = new EventSource(url, { withCredentials }) sourceRef.current.onopen = () => { + if (isUnmountedRef.current) { + return + } + statusRef.current = "connected" setStatus("connected") } sourceRef.current.onmessage = (event) => { + if (isUnmountedRef.current) { + return + } + handleMessage(event) } sourceRef.current.onerror = () => { + if (isUnmountedRef.current) { + return + } + statusRef.current = "error" setStatus("error") - // sourceRef.current?.close() - // sourceRef.current = null + + // Clean up current connection. + cleanup() + + // Attempt to reconnect after a delay. 
+ reconnectTimeoutRef.current = setTimeout(() => { + if (!isUnmountedRef.current) { + createEventSource() + } + }, 1000) } - }, [url, withCredentials, handleMessage]) + }, [url, withCredentials, handleMessage, cleanup]) useEffect(() => { + isUnmountedRef.current = false createEventSource() - setTimeout(() => { - if (statusRef.current === "waiting") { - sourceRef.current?.close() - sourceRef.current = null + // Initial connection timeout. + const initialTimeout = setTimeout(() => { + if (statusRef.current === "waiting" && !isUnmountedRef.current) { createEventSource() } - }, 100) + }, 5000) return () => { - sourceRef.current?.close() - sourceRef.current = null + isUnmountedRef.current = true + clearTimeout(initialTimeout) + cleanup() } - }, [createEventSource]) + }, [createEventSource, cleanup]) return status } diff --git a/apps/web-evals/src/hooks/use-process-tree.ts b/apps/web-evals/src/hooks/use-process-tree.ts deleted file mode 100644 index 35d7e7ce04..0000000000 --- a/apps/web-evals/src/hooks/use-process-tree.ts +++ /dev/null @@ -1,11 +0,0 @@ -import { useQuery } from "@tanstack/react-query" - -import { getProcessList } from "@/lib/server/processes" - -export const useProcessList = (pid: number | null) => - useQuery({ - queryKey: ["process-tree", pid], - queryFn: () => (pid ? 
getProcessList(pid) : []), - enabled: !!pid, - refetchInterval: 30_000, - }) diff --git a/apps/web-evals/src/hooks/use-runners.ts b/apps/web-evals/src/hooks/use-runners.ts new file mode 100644 index 0000000000..8fc10aeb8b --- /dev/null +++ b/apps/web-evals/src/hooks/use-runners.ts @@ -0,0 +1,10 @@ +import { useQuery } from "@tanstack/react-query" + +import { getRunners } from "@/lib/server/runners" + +export const useRunners = (runId: number) => + useQuery({ + queryKey: ["runners", runId], + queryFn: () => getRunners(runId), + refetchInterval: 10_000, + }) diff --git a/apps/web-evals/src/lib/schemas.ts b/apps/web-evals/src/lib/schemas.ts index 485bb19fcd..4609820aee 100644 --- a/apps/web-evals/src/lib/schemas.ts +++ b/apps/web-evals/src/lib/schemas.ts @@ -6,9 +6,11 @@ import { rooCodeSettingsSchema } from "@roo-code/types" * CreateRun */ +export const MODEL_DEFAULT = "anthropic/claude-sonnet-4" + export const CONCURRENCY_MIN = 1 export const CONCURRENCY_MAX = 25 -export const CONCURRENCY_DEFAULT = 2 +export const CONCURRENCY_DEFAULT = 1 export const createRunSchema = z .object({ diff --git a/apps/web-evals/src/lib/server/__tests__/sse-stream.spec.ts b/apps/web-evals/src/lib/server/__tests__/sse-stream.spec.ts new file mode 100644 index 0000000000..ab052f3c8c --- /dev/null +++ b/apps/web-evals/src/lib/server/__tests__/sse-stream.spec.ts @@ -0,0 +1,111 @@ +// npx vitest run src/lib/server/__tests__/sse-stream.spec.ts + +import { SSEStream } from "../sse-stream" + +describe("SSEStream", () => { + let stream: SSEStream + + beforeEach(() => { + stream = new SSEStream() + }) + + it("should create a new SSEStream instance", () => { + expect(stream).toBeInstanceOf(SSEStream) + expect(stream.isClosed).toBe(false) + }) + + it("should write string data successfully when stream is open", async () => { + const response = stream.getResponse() + const reader = response.body?.getReader() + + const writePromise = stream.write("test message") + + if (reader) { + await reader.read() 
+ reader.releaseLock() + } + + const result = await writePromise + expect(result).toBe(true) + expect(stream.isClosed).toBe(false) + }) + + it("should write object data successfully when stream is open", async () => { + const testData = { message: "test", id: 123 } + + const response = stream.getResponse() + const reader = response.body?.getReader() + + const writePromise = stream.write(testData) + + if (reader) { + await reader.read() + reader.releaseLock() + } + + const result = await writePromise + expect(result).toBe(true) + expect(stream.isClosed).toBe(false) + }) + + it("should return false when writing to closed stream", async () => { + await stream.close() + expect(stream.isClosed).toBe(true) + + const result = await stream.write("test message") + expect(result).toBe(false) + }) + + it("should handle multiple close calls gracefully", async () => { + await stream.close() + expect(stream.isClosed).toBe(true) + + // Second close should not throw. + await expect(stream.close()).resolves.toBeUndefined() + expect(stream.isClosed).toBe(true) + }) + + it("should create response with correct headers", () => { + const response = stream.getResponse() + expect(response).toBeInstanceOf(Response) + expect(response.headers.get("Content-Type")).toBe("text/event-stream") + expect(response.headers.get("Connection")).toBe("keep-alive") + expect(response.headers.get("Cache-Control")).toBe("no-cache, no-transform") + expect(response.headers.get("Access-Control-Allow-Origin")).toBe("*") + }) + + it("should format data correctly for SSE", async () => { + const response = stream.getResponse() + const reader = response.body?.getReader() + const decoder = new TextDecoder() + + const writePromise = stream.write("hello world") + + if (reader) { + const { value } = await reader.read() + const text = decoder.decode(value) + expect(text).toBe("data: hello world\n\n") + reader.releaseLock() + } + + await writePromise + }) + + it("should format JSON data correctly for SSE", async () => { + 
const response = stream.getResponse() + const reader = response.body?.getReader() + const decoder = new TextDecoder() + + const testData = { type: "test", message: "hello" } + const writePromise = stream.write(testData) + + if (reader) { + const { value } = await reader.read() + const text = decoder.decode(value) + expect(text).toBe(`data: ${JSON.stringify(testData)}\n\n`) + reader.releaseLock() + } + + await writePromise + }) +}) diff --git a/apps/web-evals/src/lib/server/processes.ts b/apps/web-evals/src/lib/server/processes.ts deleted file mode 100644 index fdf2f22b37..0000000000 --- a/apps/web-evals/src/lib/server/processes.ts +++ /dev/null @@ -1,55 +0,0 @@ -"use server" - -import psTree from "ps-tree" -import { exec } from "child_process" - -const asyncExec = (command: string): Promise<{ stdout: string; stderr: string }> => - new Promise((resolve, reject) => { - exec(command, (error, stdout, stderr) => { - if (error) { - reject(error) - } else { - resolve({ stdout, stderr }) - } - }) - }) - -export const getProcessList = async (pid: number) => { - try { - await asyncExec(`ps -p ${pid} -o pid=`) - - return new Promise((resolve, reject) => { - psTree(pid, (err, children) => { - if (err) { - reject(err) - } - - resolve(children.map((p) => parseInt(p.PID))) - }) - }) - } catch (_) { - return null - } -} - -export const killProcessTree = async (pid: number) => { - const descendants = await getProcessList(pid) - - if (descendants === null) { - return - } - - if (descendants.length > 0) { - try { - await asyncExec(`kill -9 ${descendants.join(" ")}`) - } catch (error) { - console.error("Error killing descendant processes:", error) - } - } - - try { - await asyncExec(`kill -9 ${pid}`) - } catch (error) { - console.error("Error killing main process:", error) - } -} diff --git a/apps/web-evals/src/lib/server/redis.ts b/apps/web-evals/src/lib/server/redis.ts new file mode 100644 index 0000000000..a35597490c --- /dev/null +++ b/apps/web-evals/src/lib/server/redis.ts @@ 
-0,0 +1,13 @@ +import { type RedisClientType, createClient } from "redis" + +let redis: RedisClientType | null = null + +export async function redisClient() { + if (!redis) { + redis = createClient({ url: process.env.REDIS_URL || "redis://localhost:6379" }) + redis.on("error", (error) => console.error("Redis error:", error)) + await redis.connect() + } + + return redis +} diff --git a/apps/web-evals/src/lib/server/runners.ts b/apps/web-evals/src/lib/server/runners.ts new file mode 100644 index 0000000000..324fdbae12 --- /dev/null +++ b/apps/web-evals/src/lib/server/runners.ts @@ -0,0 +1,8 @@ +"use server" + +import { redisClient } from "./redis" + +export const getRunners = async (runId: number) => { + const redis = await redisClient() + return redis.sMembers(`runners:${runId}`) +} diff --git a/apps/web-evals/src/lib/server/runs.ts b/apps/web-evals/src/lib/server/runs.ts index f4e77fe758..831e5cfe11 100644 --- a/apps/web-evals/src/lib/server/runs.ts +++ b/apps/web-evals/src/lib/server/runs.ts @@ -1,10 +1,6 @@ -/* eslint-disable @typescript-eslint/no-unused-vars */ - "use server" import { spawn } from "child_process" -import path from "path" -import os from "os" import fs from "fs" import { revalidatePath } from "next/cache" @@ -14,7 +10,6 @@ import { type ExerciseLanguage, exerciseLanguages, createRun as _createRun, - updateRun as _updateRun, deleteRun as _deleteRun, createTask, } from "@roo-code/evals" @@ -23,10 +18,11 @@ import { CreateRun } from "@/lib/schemas" import { getExercisesForLanguage } from "./exercises" +// eslint-disable-next-line @typescript-eslint/no-unused-vars export async function createRun({ suite, exercises = [], systemPrompt, ...values }: CreateRun) { const run = await _createRun({ ...values, - socketPath: path.join(os.tmpdir(), `roo-code-evals-${crypto.randomUUID()}.sock`), + socketPath: "", // TODO: Get rid of this. 
}) if (suite === "partial") { @@ -51,60 +47,44 @@ export async function createRun({ suite, exercises = [], systemPrompt, ...values revalidatePath("/runs") - // try { - // const logFile = fs.openSync(`/tmp/roo-code-evals-${run.id}.log`, "a") - - // const env: NodeJS.ProcessEnv = systemPrompt - // ? { ...process.env, FOOTGUN_SYSTEM_PROMPT: systemPrompt } - // : process.env - - // const childProcess = spawn("pnpm", ["--filter", "@roo-code/evals", "cli", run.id.toString()], { - // detached: true, - // stdio: ["ignore", logFile, logFile], - // env, - // }) - - // childProcess.unref() - // await _updateRun(run.id, { pid: childProcess.pid }) - // } catch (error) { - // console.error(error) - // } + try { + const isRunningInDocker = true // fs.existsSync("/.dockerenv") - // try { - // const logFile = `/tmp/roo-code-evals-${run.id}.log` + const dockerArgs = [ + `--name evals-controller-${run.id}`, + "--rm", + "--network evals_default", + "-v /var/run/docker.sock:/var/run/docker.sock", + "-e HOST_EXECUTION_METHOD=docker", + ] - // const envVars = systemPrompt ? { ...process.env, FOOTGUN_SYSTEM_PROMPT: systemPrompt } : process.env + const cliCommand = `pnpm --filter @roo-code/evals cli --runId ${run.id}` - // // Requires a docker socket mounted and host container running. - // const runOnHost = async () => { - // // Create and start a new runner container connected to the compose network - // const command = `docker run --rm --network evals_default evals-runner sh -c "pnpm --filter @roo-code/evals cli ${run.id}"` + const command = isRunningInDocker + ? 
`docker run ${dockerArgs.join(" ")} evals-runner sh -c "${cliCommand}"` + : cliCommand - // const childProcess = spawn("sh", ["-c", command], { - // detached: true, - // stdio: ["ignore", "pipe", "pipe"], - // }) + console.log("spawn ->", command) - // // Redirect output to log file - // const logStream = fs.createWriteStream(logFile, { flags: "a" }) + const childProcess = spawn("sh", ["-c", command], { + detached: true, + stdio: ["ignore", "pipe", "pipe"], + }) - // if (childProcess.stdout) { - // childProcess.stdout.pipe(logStream) - // } + const logStream = fs.createWriteStream("/tmp/roo-code-evals.log", { flags: "a" }) - // if (childProcess.stderr) { - // childProcess.stderr.pipe(logStream) - // } + if (childProcess.stdout) { + childProcess.stdout.pipe(logStream) + } - // return childProcess - // } + if (childProcess.stderr) { + childProcess.stderr.pipe(logStream) + } - // const childProcess = await runOnHost() - // childProcess.unref() - // await _updateRun(run.id, { pid: childProcess.pid }) - // } catch (error) { - // console.error(error) - // } + childProcess.unref() + } catch (error) { + console.error(error) + } return run } diff --git a/apps/web-evals/src/lib/server/sse-stream.ts b/apps/web-evals/src/lib/server/sse-stream.ts index acfc425fa2..47ab8bb6ca 100644 --- a/apps/web-evals/src/lib/server/sse-stream.ts +++ b/apps/web-evals/src/lib/server/sse-stream.ts @@ -2,6 +2,7 @@ export class SSEStream { private readonly _stream: TransformStream private readonly _writer: WritableStreamDefaultWriter private readonly _encoder: TextEncoder + private _isClosed: boolean = false constructor() { this._stream = new TransformStream() @@ -9,20 +10,40 @@ export class SSEStream { this._encoder = new TextEncoder() } - public async write(data: string | object) { + public async write(data: string | object): Promise { + if (this._isClosed) { + return false + } + try { const buffer = typeof data === "object" ? 
JSON.stringify(data) : data await this._writer.write(this._encoder.encode(`data: ${buffer}\n\n`)) return true } catch (error) { console.error("[SSEStream#write]", error) + this._isClosed = true this.close().catch(() => {}) return false } } - public close() { - return this._writer.close() + public async close(): Promise { + if (this._isClosed) { + return + } + + this._isClosed = true + + try { + await this._writer.close() + } catch (error) { + // Writer might already be closed, ignore the error. + console.debug("[SSEStream#close] Writer already closed:", error) + } + } + + public get isClosed(): boolean { + return this._isClosed } public getResponse() { @@ -31,6 +52,8 @@ export class SSEStream { "Content-Type": "text/event-stream", Connection: "keep-alive", "Cache-Control": "no-cache, no-transform", + "Access-Control-Allow-Origin": "*", + "Access-Control-Allow-Headers": "Cache-Control", }, }) } diff --git a/apps/web-evals/tsconfig.json b/apps/web-evals/tsconfig.json index 31c94ac77c..546d73aef9 100644 --- a/apps/web-evals/tsconfig.json +++ b/apps/web-evals/tsconfig.json @@ -1,6 +1,7 @@ { "extends": "@roo-code/config-typescript/nextjs.json", "compilerOptions": { + "types": ["vitest/globals"], "plugins": [{ "name": "next" }], "paths": { "@/*": ["./src/*"] } }, diff --git a/apps/web-evals/vitest.config.ts b/apps/web-evals/vitest.config.ts new file mode 100644 index 0000000000..aa04bc59b7 --- /dev/null +++ b/apps/web-evals/vitest.config.ts @@ -0,0 +1,7 @@ +import { defineConfig } from "vitest/config" + +export default defineConfig({ + test: { + globals: true, + }, +}) diff --git a/packages/evals/.docker/entrypoints/runner.sh b/packages/evals/.docker/entrypoints/runner.sh new file mode 100644 index 0000000000..5445bf335e --- /dev/null +++ b/packages/evals/.docker/entrypoints/runner.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +if [ $# -eq 0 ]; then + exec bash +else + exec "$@" +fi diff --git a/packages/evals/.docker/entrypoints/web.sh b/packages/evals/.docker/entrypoints/web.sh 
index c105e3ec9d..43272ee93f 100644 --- a/packages/evals/.docker/entrypoints/web.sh +++ b/packages/evals/.docker/entrypoints/web.sh @@ -37,33 +37,12 @@ run_migrations() { fi } -start_web_service() { - echo "🌐 Starting web service..." - exec "$@" -} - main() { - if [ $# -eq 0 ]; then - set -- pnpm --filter @roo-code/web-evals dev - fi - - if [ "$SKIP_MIGRATION" = "true" ]; then - echo "⏭️ Skipping migration (SKIP_MIGRATION=true)" - start_web_service "$@" - return - fi - - if [ "$MIGRATION_ONLY" = "true" ]; then - echo "🔄 Running migration only (MIGRATION_ONLY=true)" - wait_for_db - run_migrations - echo "✅ Migration completed, exiting..." - exit 0 - fi - wait_for_db run_migrations - start_web_service "$@" + + echo "🌐 Starting web service..." + pnpm --filter @roo-code/web-evals start } main "$@" diff --git a/packages/evals/ARCHITECTURE.md b/packages/evals/ARCHITECTURE.md new file mode 100644 index 0000000000..f7b3a836cd --- /dev/null +++ b/packages/evals/ARCHITECTURE.md @@ -0,0 +1,282 @@ +# Evals System Architecture + +## Overview + +The evals system is a distributed evaluation platform that runs AI coding tasks in isolated VS Code environments. It solves two critical problems in AI evaluation: + +1. **Dependency Management**: Eliminates the complexity of setting up multiple programming language environments by packaging everything into pre-configured containers +2. **Resource Isolation**: Prevents memory exhaustion and state contamination by running each task in a fresh, isolated container instead of sequentially in a single VS Code instance + +The architecture consists of three main components: a Next.js web application for management, a controller container that orchestrates evaluation runs, and multiple runner containers that execute individual tasks.
+ +## Problems Solved + +### Simplified Setup and Deployment + +Traditional AI evaluation setups require complex dependency management across multiple programming languages, development tools, and VS Code extensions. The evals system eliminates this friction by: + +- **One-Command Deployment**: Single `docker compose up` command starts the entire evaluation infrastructure +- **Pre-configured Environments**: Runner containers include all necessary language runtimes, tools, and VS Code extensions +- **Dependency Isolation**: No host system contamination or version conflicts between different language requirements +- **Reproducible Environments**: Identical evaluation conditions across different machines and deployments + +### Resource Management and Isolation + +Running multiple AI evaluation tasks sequentially in a single VS Code instance creates several problems: + +- **Memory Accumulation**: VS Code instances gradually consume more memory with each task, eventually leading to crashes +- **State Contamination**: Previous tasks can leave behind files, settings, or processes that affect subsequent evaluations +- **Resource Contention**: Multiple tasks competing for the same VS Code instance create bottlenecks and inconsistent performance +- **Failure Propagation**: A single problematic task can crash the entire evaluation session + +The containerized approach solves these issues by: + +- **Fresh Environments**: Each task starts with a clean VS Code instance and workspace +- **Memory Reset**: Container termination automatically reclaims all memory and resources +- **Parallel Execution**: Multiple tasks can run simultaneously without interference +- **Fault Isolation**: Individual task failures don't affect other running evaluations + +## Architecture Components + +```mermaid +graph TB + Web[Admin Web App] <--> Redis[(Redis
PubSub & Registration)] + Web <--> DB[(PostgreSQL
Runs & Tasks)] + Web --> Controller[Run Controller / PQueue] + + Controller <--> DB + Controller --> Runner1[Task Runner 1] + Controller --> Runner2[...] + Controller --> RunnerN[Task Runner N] + + Runner1 <--> Redis + Runner2 <--> Redis + RunnerN <--> Redis + + Redis <--> Web +``` + +### Core Components + +#### Next.js Web Application + +The web application serves as the primary interface for creating and monitoring evaluation runs. It provides: + +- **Run Management**: Create evaluation runs with configurable parameters (model, concurrency, exercise selection) +- **Real-time Monitoring**: Live progress tracking via Server-Sent Events +- **Results Dashboard**: View task completion status, metrics, and outcomes +- **Container Orchestration**: Spawns controller containers for new runs + +#### Controller Container + +A specialized instance of the `evals-runner` container that acts as the run orchestrator. The controller: + +- **In-Memory Task Queue**: Uses the `p-queue` npm package to manage task distribution with configurable concurrency limits +- **Git Workspace Setup**: Prepares exercise repositories and manages version control +- **Runner Coordination**: Spawns and monitors individual task runner containers +- **Heartbeat Monitoring**: Maintains Redis heartbeat to track controller health +- **Result Aggregation**: Collects task results and finalizes run metrics + +#### Runner Containers + +Individual containers that execute single evaluation tasks. 
Each runner: + +- **Isolated Environment**: Fresh VS Code instance with pre-installed language tools and extensions +- **Task Execution**: Runs AI agent with evaluation prompt in VS Code environment +- **IPC Communication**: Connects to VS Code via Unix socket for real-time interaction +- **Unit Testing**: Validates task completion using language-specific test suites +- **Metrics Collection**: Tracks token usage, costs, tool usage, and execution time + +#### Supporting Infrastructure + +- **Redis**: Provides pub/sub messaging for real-time events and runner registration tracking (not used for task queuing) +- **PostgreSQL**: Stores run configurations, task definitions, execution metrics, and results +- **Docker**: Container orchestration for isolation and scalability + +## Execution Flow + +### 1. Run Initialization + +The web application creates an evaluation run with specified parameters: + +- **Suite Type**: Full evaluation (all exercises) or partial (selected exercises) +- **Model Configuration**: AI model selection and settings +- **Concurrency**: Number of parallel task executions (1-25) +- **Exercise Selection**: Programming language and specific coding challenges + +### 2. Controller Deployment + +The web application spawns a controller container that: + +- **Loads Run Configuration**: Retrieves run parameters and associated tasks from database +- **Prepares Workspace**: Sets up git repository with exercise code and test suites +- **Establishes Monitoring**: Starts Redis heartbeat and event publishing +- **Creates Task Queue**: Initializes concurrent task processing with specified limits + +### 3. 
Task Distribution + +The controller distributes tasks across runner containers using an in-memory queue: + +- **p-queue Management**: Uses the `p-queue` npm package to manage task concurrency in memory +- **Container Spawning**: Creates isolated runner containers for each task +- **Resource Management**: Enforces concurrency limits to prevent resource exhaustion +- **Task Assignment**: Each runner receives a single task with full context +- **Progress Tracking**: Monitors runner registration and task status via Redis pub/sub + +### 4. Task Execution + +Individual runners execute evaluation tasks: + +- **Environment Setup**: Launches VS Code with Roo extension in isolated container +- **Prompt Delivery**: Sends evaluation prompt to AI agent via IPC +- **Code Generation**: AI agent writes code using available tools and context +- **Real-time Events**: Publishes progress updates, token usage, and completion status +- **Validation**: Runs language-specific unit tests to verify task completion + +### 5. Result Collection + +The system aggregates and reports results: + +- **Event Streaming**: Real-time progress updates flow from runners through Redis to web interface +- **Metrics Aggregation**: Controller collects execution metrics, costs, and success rates +- **Run Completion**: Final results stored in database with comprehensive analytics +- **Cleanup**: Containers terminated and resources released + +## Technical Implementation + +### CLI System + +The evaluation system is driven by a command-line interface that can operate in two modes: + +- **Run Mode**: Orchestrates complete evaluation runs with multiple tasks +- **Task Mode**: Executes individual tasks within runner containers + +The CLI automatically detects its execution environment and adapts behavior accordingly, using containerized task execution when running within Docker. 
+ +### Container Architecture + +Both controller and runner containers use the same base image but serve different purposes: + +#### Runner Container Features + +- **Multi-language Support**: Pre-installed runtimes for Go, Java, JavaScript, Python, and Rust +- **Development Tools**: VS Code with language-specific extensions and Roo Code extension +- **Containerization**: Docker-in-Docker capability for nested container execution +- **Exercise Repository**: Git clone of evaluation exercises with test suites + +#### Container Isolation + +Each task executes in complete isolation with: + +- **Fresh VS Code Instance**: Clean environment with no shared state +- **Dedicated Workspace**: Task-specific directory with relevant exercise files +- **Resource Limits**: Controlled CPU and memory allocation +- **Network Isolation**: Containers communicate only through Redis pub/sub + +### Communication Architecture + +The system uses multiple communication channels: + +#### IPC (Inter-Process Communication) + +- **Unix Sockets**: Direct communication between CLI and VS Code extension +- **Event Streaming**: Real-time task progress and AI agent interactions +- **Command Interface**: Task lifecycle management (start, cancel, close) + +#### Redis Pub/Sub + +- **Event Broadcasting**: Task events published to run-specific channels +- **Runner Registration**: Active runner tracking per evaluation run +- **Heartbeat Monitoring**: Controller health and availability status +- **Not Used for Queuing**: Task queue management is handled in-memory by the controller using `p-queue` + +#### HTTP/SSE + +- **Web Interface**: REST API for run management and configuration +- **Real-time Updates**: Server-Sent Events for live progress monitoring +- **Result Retrieval**: Task metrics and completion status + +### Task Lifecycle Management + +Each evaluation task follows a structured lifecycle: + +1. **Initialization**: Container startup and VS Code launch +2. 
**Connection**: IPC socket establishment and extension activation +3. **Prompt Delivery**: Evaluation challenge sent to AI agent +4. **Execution**: AI agent writes code using available tools +5. **Validation**: Unit test execution to verify correctness +6. **Cleanup**: Container termination and resource cleanup + +### Error Handling and Timeouts + +The system implements comprehensive error handling: + +- **Task Timeouts**: 30-minute maximum execution time per task +- **Process Cleanup**: Automatic termination of hung processes +- **Container Recovery**: Failed containers are cleaned up and resources released +- **Graceful Degradation**: Individual task failures don't affect other tasks in the run + +### Metrics and Monitoring + +Comprehensive tracking of evaluation performance: + +- **Token Usage**: Input/output tokens and context size tracking +- **Cost Analysis**: API costs per task and aggregated run costs +- **Tool Usage**: Frequency and success rates of different AI tools +- **Execution Time**: Task duration and queue wait times +- **Success Rates**: Pass/fail statistics across languages and exercises + +## Configuration and Customization + +### Run Configuration + +Evaluation runs support extensive customization: + +- **Model Selection**: Choose from available AI models via OpenRouter integration +- **Concurrency Control**: 1-25 parallel task executions based on resource availability +- **Exercise Selection**: Full suite (all exercises) or partial (selected exercises) +- **Custom Settings**: Override default AI agent configuration and behavior +- **System Prompts**: Optional custom prompts for specialized evaluation scenarios + +### Exercise Management + +The system uses a separate Git repository containing: + +- **Language-specific Exercises**: Coding challenges organized by programming language +- **Test Suites**: Automated validation for each exercise +- **Prompt Templates**: Standardized evaluation instructions per language +- **Workspace Configuration**: 
Language-specific development environment setup + +### Scalability Considerations + +The architecture supports horizontal scaling: + +- **Container Orchestration**: Multiple controller instances can run simultaneously +- **Resource Management**: Configurable concurrency prevents resource exhaustion +- **Database Optimization**: Efficient task querying and result storage +- **Redis Clustering**: Pub/sub system can scale with message volume + +## Operational Characteristics + +### Performance + +- **Task Isolation**: Complete environment isolation prevents interference between tasks +- **Parallel Execution**: Configurable concurrency maximizes resource utilization +- **Efficient Communication**: Unix sockets and Redis provide low-latency messaging +- **Resource Cleanup**: Automatic container termination prevents resource leaks + +### Reliability + +- **Fault Tolerance**: Individual task failures don't impact other tasks +- **Timeout Management**: Prevents hung tasks from consuming resources indefinitely +- **Health Monitoring**: Controller heartbeat and runner registration tracking +- **Graceful Shutdown**: Proper cleanup of containers and database connections + +### Observability + +- **Real-time Monitoring**: Live progress tracking through web interface +- **Comprehensive Logging**: Detailed execution logs for debugging and analysis +- **Metrics Collection**: Performance and cost analytics for optimization +- **Event Auditing**: Complete task lifecycle tracking for accountability + +This architecture provides a robust, scalable platform for evaluating AI coding capabilities across multiple programming languages while maintaining strict isolation and comprehensive monitoring. diff --git a/packages/evals/Dockerfile.runner b/packages/evals/Dockerfile.runner index 140d795e6d..d6d141b27a 100644 --- a/packages/evals/Dockerfile.runner +++ b/packages/evals/Dockerfile.runner @@ -1,63 +1,83 @@ -# docker build -f packages/evals/Dockerfile.runner -t evals-runner . 
-# docker run -it evals-runner - FROM node:20-slim AS base # Install pnpm ENV PNPM_HOME="/pnpm" ENV PATH="$PNPM_HOME:$PATH" RUN corepack enable -RUN npm install -g npm@latest -RUN npm install -g npm-run-all +RUN npm install -g npm@latest npm-run-all # Install system packages -RUN apt update && apt install -y sudo curl git vim jq - -# Create a `vscode` user -RUN useradd -m vscode -s /bin/bash && \ - echo "vscode ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/vscode && \ - chmod 0440 /etc/sudoers.d/vscode +RUN apt update && \ + apt install -y \ + curl \ + git \ + vim \ + jq \ + apt-transport-https \ + ca-certificates \ + gnupg \ + lsb-release \ + wget \ + gpg \ + xvfb \ + cmake \ + golang-go \ + default-jre \ + python3 \ + python3-venv \ + python3-dev \ + python3-pip \ + && rm -rf /var/lib/apt/lists/* + +# Install Docker cli +RUN curl -fsSL https://download.docker.com/linux/debian/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \ + && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null \ + && apt update && apt install -y docker-ce-cli \ + && rm -rf /var/lib/apt/lists/* # Install VS Code -# https://code.visualstudio.com/docs/setup/linux -RUN apt install -y wget gpg apt-transport-https -RUN wget -qO- https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor > packages.microsoft.gpg -RUN install -D -o root -g root -m 644 packages.microsoft.gpg /etc/apt/keyrings/packages.microsoft.gpg -RUN echo "deb [arch=amd64,arm64,armhf signed-by=/etc/apt/keyrings/packages.microsoft.gpg] https://packages.microsoft.com/repos/code stable main" | tee /etc/apt/sources.list.d/vscode.list > /dev/null -RUN rm -f packages.microsoft.gpg -RUN apt update && apt install -y code - -# Install Xvfb -RUN apt install -y xvfb - -# [cpp] Install cmake 3.28.3 -RUN apt install -y cmake - -# [go] Install Go 
1.22.2 -RUN apt install -y golang-go - -# [java] Install Java 21 -RUN apt install -y default-jre - -# [python] Install Python 3.12.3 and uv 0.6.6 -RUN apt install -y python3 python3-venv python3-dev python3-pip - -WORKDIR /home/vscode -USER vscode - -# [rust] Install Rust 1.85 -RUN curl https://sh.rustup.rs -sSf | bash -s -- -y -RUN echo 'source $HOME/.cargo/env' >> $HOME/.bashrc +RUN wget -qO- https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor > packages.microsoft.gpg \ + && install -D -o root -g root -m 644 packages.microsoft.gpg /etc/apt/keyrings/packages.microsoft.gpg \ + && echo "deb [arch=amd64,arm64,armhf signed-by=/etc/apt/keyrings/packages.microsoft.gpg] https://packages.microsoft.com/repos/code stable main" | tee /etc/apt/sources.list.d/vscode.list > /dev/null \ + && rm -f packages.microsoft.gpg \ + && apt update && apt install -y code \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /roo + +# Install rust +ARG RUST_VERSION=1.87.0 +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain ${RUST_VERSION} \ + && echo 'source $HOME/.cargo/env' >> $HOME/.bashrc + +# Install VS Code extensions +ARG GOLANG_EXT_VERSION=0.46.1 +ARG ESLINT_EXT_VERSION=3.0.10 +ARG JAVA_EXT_VERSION=1.42.0 +ARG PYTHON_EXT_VERSION=2025.6.1 +ARG RUST_EXT_VERSION=0.3.2482 + +RUN mkdir -p /roo/.vscode-template \ + && code --no-sandbox --user-data-dir /roo/.vscode-template --install-extension golang.go@${GOLANG_EXT_VERSION} \ + && code --no-sandbox --user-data-dir /roo/.vscode-template --install-extension dbaeumer.vscode-eslint@${ESLINT_EXT_VERSION} \ + && code --no-sandbox --user-data-dir /roo/.vscode-template --install-extension redhat.java@${JAVA_EXT_VERSION} \ + && code --no-sandbox --user-data-dir /roo/.vscode-template --install-extension ms-python.python@${PYTHON_EXT_VERSION} \ + && code --no-sandbox --user-data-dir /roo/.vscode-template --install-extension rust-lang.rust-analyzer@${RUST_EXT_VERSION} # Copy evals -RUN git clone 
https://github.com/RooCodeInc/Roo-Code-Evals.git evals +ARG EVALS_COMMIT=main +ARG EVALS_REPO_URL=https://github.com/RooCodeInc/Roo-Code-Evals.git +RUN git clone ${EVALS_REPO_URL} evals \ + && cd evals \ + && git checkout ${EVALS_COMMIT} -# Prepare evals -WORKDIR /home/vscode/evals/python -RUN curl -LsSf https://astral.sh/uv/install.sh | sh -RUN /home/vscode/.local/bin/uv sync +# Install uv and sync python dependencies +ARG UV_VERSION=0.7.11 +WORKDIR /roo/evals/python +RUN curl -LsSf https://github.com/astral-sh/uv/releases/download/${UV_VERSION}/uv-installer.sh | sh \ + && /root/.local/bin/uv sync -WORKDIR /home/vscode/repo +WORKDIR /roo/repo # Install npm packages RUN mkdir -p \ @@ -73,29 +93,40 @@ RUN mkdir -p \ src \ webview-ui -COPY --chown=vscode:vscode ./package.json ./ -COPY --chown=vscode:vscode ./pnpm-lock.yaml ./ -COPY --chown=vscode:vscode ./pnpm-workspace.yaml ./ -COPY --chown=vscode:vscode ./scripts/bootstrap.mjs ./scripts/ -COPY --chown=vscode:vscode ./packages/build/package.json ./packages/build/ -COPY --chown=vscode:vscode ./packages/cloud/package.json ./packages/cloud/ -COPY --chown=vscode:vscode ./packages/config-eslint/package.json ./packages/config-eslint/ -COPY --chown=vscode:vscode ./packages/config-typescript/package.json ./packages/config-typescript/ -COPY --chown=vscode:vscode ./packages/evals/package.json ./packages/evals/ -COPY --chown=vscode:vscode ./packages/ipc/package.json ./packages/ipc/ -COPY --chown=vscode:vscode ./packages/telemetry/package.json ./packages/telemetry/ -COPY --chown=vscode:vscode ./packages/types/package.json ./packages/types/ -COPY --chown=vscode:vscode ./src/package.json ./src/ -COPY --chown=vscode:vscode ./webview-ui/package.json ./webview-ui/ +COPY ./package.json ./ +COPY ./pnpm-lock.yaml ./ +COPY ./pnpm-workspace.yaml ./ +COPY ./scripts/bootstrap.mjs ./scripts/ +COPY ./packages/build/package.json ./packages/build/ +COPY ./packages/cloud/package.json ./packages/cloud/ +COPY ./packages/config-eslint/package.json 
./packages/config-eslint/ +COPY ./packages/config-typescript/package.json ./packages/config-typescript/ +COPY ./packages/evals/package.json ./packages/evals/ +COPY ./packages/ipc/package.json ./packages/ipc/ +COPY ./packages/telemetry/package.json ./packages/telemetry/ +COPY ./packages/types/package.json ./packages/types/ +COPY ./src/package.json ./src/ +COPY ./webview-ui/package.json ./webview-ui/ + RUN pnpm install -# Build the extension -COPY --chown=vscode:vscode . ./ -RUN pnpm build -- --out ../bin/roo-code.vsix +# Copy source code +COPY . ./ -# Copy & install extension -RUN code --debug --install-extension bin/roo-code.vsix +# Copy ENV secrets +COPY packages/evals/.env.local ./packages/evals/ -ENV DATABASE_URL=postgresql://postgres:password@db:5432/evals_development +# Copy the pre-installed VS Code extensions +RUN cp -r /roo/.vscode-template /roo/.vscode -CMD ["bash"] +# Build the Roo Code extension +RUN pnpm build -- --out ../bin/roo-code.vsix \ + && code --no-sandbox --user-data-dir /roo/.vscode --install-extension bin/roo-code.vsix + +# Copy entrypoint script +COPY packages/evals/.docker/entrypoints/runner.sh /usr/local/bin/entrypoint.sh +RUN chmod +x /usr/local/bin/entrypoint.sh + +ENV DATABASE_URL=postgresql://postgres:password@db:5432/evals_development +ENV REDIS_URL=redis://redis:6379 +ENTRYPOINT ["/usr/local/bin/entrypoint.sh"] diff --git a/packages/evals/Dockerfile.web b/packages/evals/Dockerfile.web index 7a5bb25f2c..55e8b5a298 100644 --- a/packages/evals/Dockerfile.web +++ b/packages/evals/Dockerfile.web @@ -1,6 +1,3 @@ -# docker build -f packages/evals/Dockerfile.web -t evals-web . 
-# docker run -it evals-web - FROM node:20-slim AS base # Install pnpm @@ -11,65 +8,55 @@ RUN npm install -g npm@latest RUN npm install -g npm-run-all # Install system packages -RUN apt update && apt install -y sudo curl git vim jq postgresql-client +RUN apt update && apt install -y curl git vim jq postgresql-client -# Install Docker CLI +# Install Docker cli RUN apt install -y apt-transport-https ca-certificates gnupg lsb-release RUN curl -fsSL https://download.docker.com/linux/debian/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg RUN echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null RUN apt update && apt install -y docker-ce-cli -# Create a `vscode` user and add to docker group -RUN useradd -m vscode -s /bin/bash && \ - echo "vscode ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/vscode && \ - chmod 0440 /etc/sudoers.d/vscode - -WORKDIR /home/vscode -USER vscode +WORKDIR /roo # Copy evals RUN git clone https://github.com/RooCodeInc/Roo-Code-Evals.git evals -WORKDIR /home/vscode/repo +WORKDIR /roo/repo # Install npm packages RUN mkdir -p \ scripts \ apps/web-evals \ - packages/build \ - packages/cloud \ packages/config-eslint \ packages/config-typescript \ packages/evals \ packages/ipc \ - packages/telemetry \ - packages/types \ - src \ - webview-ui + packages/types + +COPY ./package.json ./ +COPY ./pnpm-lock.yaml ./ +COPY ./pnpm-workspace.yaml ./ +COPY ./scripts/bootstrap.mjs ./scripts/ +COPY ./apps/web-evals/package.json ./apps/web-evals/ +COPY ./packages/config-eslint/package.json ./packages/config-eslint/ +COPY ./packages/config-typescript/package.json ./packages/config-typescript/ +COPY ./packages/evals/package.json ./packages/evals/ +COPY ./packages/ipc/package.json ./packages/ipc/ +COPY ./packages/types/package.json ./packages/types/ -COPY --chown=vscode:vscode 
./package.json ./ -COPY --chown=vscode:vscode ./pnpm-lock.yaml ./ -COPY --chown=vscode:vscode ./pnpm-workspace.yaml ./ -COPY --chown=vscode:vscode ./scripts/bootstrap.mjs ./scripts/ -COPY --chown=vscode:vscode ./apps/web-evals/package.json ./apps/web-evals/ -COPY --chown=vscode:vscode ./packages/build/package.json ./packages/build/ -COPY --chown=vscode:vscode ./packages/cloud/package.json ./packages/cloud/ -COPY --chown=vscode:vscode ./packages/config-eslint/package.json ./packages/config-eslint/ -COPY --chown=vscode:vscode ./packages/config-typescript/package.json ./packages/config-typescript/ -COPY --chown=vscode:vscode ./packages/evals/package.json ./packages/evals/ -COPY --chown=vscode:vscode ./packages/ipc/package.json ./packages/ipc/ -COPY --chown=vscode:vscode ./packages/telemetry/package.json ./packages/telemetry/ -COPY --chown=vscode:vscode ./packages/types/package.json ./packages/types/ -COPY --chown=vscode:vscode ./src/package.json ./src/ -COPY --chown=vscode:vscode ./webview-ui/package.json ./webview-ui/ RUN pnpm install -COPY --chown=vscode:vscode . ./ +# Copy source code +COPY . ./ -COPY --chown=vscode:vscode packages/evals/.docker/entrypoints/web.sh /usr/local/bin/entrypoint.sh -RUN sudo chmod +x /usr/local/bin/entrypoint.sh +# Build the web-evals app +RUN pnpm --filter @roo-code/web-evals build -ENV DATABASE_URL=postgresql://postgres:password@db:5432/evals_development +# Copy entrypoint script +COPY packages/evals/.docker/entrypoints/web.sh /usr/local/bin/entrypoint.sh +RUN chmod +x /usr/local/bin/entrypoint.sh +ENV DATABASE_URL=postgresql://postgres:password@db:5432/evals_development +ENV REDIS_URL=redis://redis:6379 EXPOSE 3000 ENTRYPOINT ["/usr/local/bin/entrypoint.sh"] diff --git a/packages/evals/README.md b/packages/evals/README.md index 0aef77f863..3cec8c7e45 100644 --- a/packages/evals/README.md +++ b/packages/evals/README.md @@ -2,7 +2,13 @@ ## Get Started -NOTE: This is MacOS only for now! 
+### Prerequisites + +- [Docker Desktop](https://docs.docker.com/desktop/) +- [git](https://git-scm.com/) +- That's it! + +### Setup Clone the Roo Code repo: @@ -11,27 +17,28 @@ git clone https://github.com/RooCodeInc/Roo-Code.git cd Roo-Code ``` -Run the setup script: +Add your OpenRouter API key: ```sh -cd packages/evals -./scripts/setup.sh +echo "OPENROUTER_API_KEY=sk-or-v1-[...]" > packages/evals/.env.local ``` -Navigate to [localhost:3000](http://localhost:3000/) in your browser. - -## Running Migrations +### Run -Update `src/schema.ts` as needed, and then run: +Start the evals service: ```sh -pnpm db:generate +cd packages/evals && docker compose --profile server up ``` -Inspect the sql in the migration file added to `drizzle/`. +Navigate to [localhost:3000](http://localhost:3000/) in your browser. + +## Advanced Usage / Debugging + +The evals system runs VS Code headlessly in Docker containers for consistent, reproducible environments. While this design ensures reliability, it can make debugging more challenging. For debugging purposes, you can run the system locally on macOS, though this approach is less reliable due to hardware and environment variability. 
+ -If it looks okay, then run: +To configure your macOS system to run evals locally, execute the setup script: ```sh -pnpm db:migrate +cd packages/evals && ./scripts/setup.sh ``` diff --git a/packages/evals/docker-compose.yml b/packages/evals/docker-compose.yml index d1db8da81a..cea7f418f9 100644 --- a/packages/evals/docker-compose.yml +++ b/packages/evals/docker-compose.yml @@ -1,12 +1,17 @@ -# Server: -# docker compose build web +# Build the web and runner images: +# docker compose build web runner +# +# Start all "server" services (db, redis, web): # docker compose --profile server up - -# Client: -# docker compose build runner -# docker compose up runner --no-start +# +# Start a shell in the runner container: +# docker compose run --rm runner bash +# +# Or using the docker cli: # docker run -it --rm --network evals_default evals-runner bash -# docker run --rm --network evals_default evals-runner sh -c "pnpm --filter @roo-code/evals cli 3" +# +# To enable docker execution, run: +# docker run -it --rm --network evals_default -v /var/run/docker.sock:/var/run/docker.sock -e HOST_EXECUTION_METHOD=docker evals-runner bash services: db: @@ -52,7 +57,6 @@ services: - HOST_EXECUTION_METHOD=docker volumes: - /var/run/docker.sock:/var/run/docker.sock - user: "0:0" # Run as root to access docker socket.
depends_on: db: condition: service_healthy @@ -64,6 +68,10 @@ services: build: context: ../../ dockerfile: packages/evals/Dockerfile.runner + environment: + - HOST_EXECUTION_METHOD=docker + volumes: + - /var/run/docker.sock:/var/run/docker.sock stdin_open: true tty: true profiles: diff --git a/packages/evals/package.json b/packages/evals/package.json index eb166a1ee2..554356e5b1 100644 --- a/packages/evals/package.json +++ b/packages/evals/package.json @@ -12,14 +12,16 @@ "cli": "dotenvx run -f .env.development .env.local -- tsx src/cli/index.ts", "drizzle-kit": "dotenvx run -f .env.development -- tsx node_modules/drizzle-kit/bin.cjs", "drizzle-kit:test": "dotenvx run -f .env.test -- tsx node_modules/drizzle-kit/bin.cjs", - "db:start": "docker compose up -d", - "db:stop": "docker compose down", "db:generate": "pnpm drizzle-kit generate", "db:migrate": "pnpm drizzle-kit migrate", "db:push": "pnpm drizzle-kit push", "db:check": "pnpm drizzle-kit check", "db:test:push": "pnpm drizzle-kit:test push", - "db:test:check": "pnpm drizzle-kit:test check" + "db:test:check": "pnpm drizzle-kit:test check", + "db:start": "docker compose up -d db", + "db:stop": "docker compose down db", + "redis:start": "docker compose up -d redis", + "redis:stop": "docker compose down redis" }, "dependencies": { "@roo-code/ipc": "workspace:^", @@ -30,9 +32,11 @@ "execa": "^9.6.0", "node-ipc": "^12.0.0", "p-map": "^7.0.3", + "p-queue": "^8.1.0", "p-wait-for": "^5.0.2", "postgres": "^3.4.7", "ps-tree": "^1.2.0", + "redis": "^5.5.5", "zod": "^3.24.2" }, "devDependencies": { diff --git a/packages/evals/scripts/setup.sh b/packages/evals/scripts/setup.sh index 464abc3c0c..abf499b2c5 100755 --- a/packages/evals/scripts/setup.sh +++ b/packages/evals/scripts/setup.sh @@ -1,21 +1,5 @@ #!/bin/bash -menu() { - echo -e "\n๐Ÿ“‹ Which eval types would you like to support?\n" - - for i in ${!options[@]}; do - printf " %d) %-6s [%s]" $((i + 1)) "${options[i]}" "${choices[i]:- }" - - if [[ $i == 0 ]]; then - 
printf " (required)" - fi - - printf "\n" - done - - echo -e " q) quit\n" -} - has_asdf_plugin() { local plugin="$1" case "$plugin" in @@ -26,52 +10,106 @@ has_asdf_plugin() { build_extension() { echo "๐Ÿ”จ Building the Roo Code extension..." - cd .. - mkdir -p bin - pnpm vsix -- --out ../bin/roo-code-$(git rev-parse --short HEAD).vsix || exit 1 - code --install-extension bin/roo-code-$(git rev-parse --short HEAD).vsix || exit 1 + pnpm -w build -- --out ../bin/roo-code-$(git rev-parse --short HEAD).vsix || exit 1 + code --install-extension ../../bin/roo-code-$(git rev-parse --short HEAD).vsix || exit 1 cd evals } -if [[ "$(uname -s)" != "Darwin" ]]; then - echo "โš ๏ธ Only macOS is currently supported." - exit 1 -fi - -options=("nodejs" "python" "golang" "rust" "java") -binaries=("node" "python" "go" "rustc" "javac") - -for i in "${!options[@]}"; do - choices[i]="*" -done +check_docker_services() { + echo "๐Ÿณ Checking Docker services..." -prompt="Type 1-5 to select, 'q' to quit, โŽ to continue: " - -while menu && read -rp "$prompt" num && [[ "$num" ]]; do - [[ "$num" == "q" ]] && exit 0 + if ! command -v docker &> /dev/null; then + echo "โŒ Docker is not installed. Please install Docker Desktop and try again." + exit 1 + fi - [[ "$num" != *[![:digit:]]* ]] && - ((num > 1 && num <= ${#options[@]})) || - { - continue - } + if ! docker info &> /dev/null; then + echo "โŒ Docker is not running. Please start Docker Desktop and try again." + exit 1 + fi - ((num--)) - [[ "${choices[num]}" ]] && choices[num]="" || choices[num]="*" -done + if ! docker compose version &> /dev/null; then + echo "โŒ Docker Compose is not available. Please ensure Docker Desktop is properly installed." + exit 1 + fi + + local services_to_start=() -empty=true + if ! 
nc -z localhost 5432 2>/dev/null; then + echo "๐Ÿ“ฆ PostgreSQL not running on port 5432" + services_to_start+=("db") + else + echo "โœ… PostgreSQL is running" + fi -for i in ${!options[@]}; do - [[ "${choices[i]}" ]] && { - empty=false - break - } -done + if ! nc -z localhost 6379 2>/dev/null; then + echo "๐Ÿ“ฆ Redis not running on port 6379" + services_to_start+=("redis") + else + echo "โœ… Redis is running" + fi -[[ "$empty" == true ]] && exit 0 + if [ ${#services_to_start[@]} -gt 0 ]; then + echo "๐Ÿš€ Starting Docker services: ${services_to_start[*]}" + + echo "๐Ÿงน Cleaning up stale Docker state..." + docker compose down --remove-orphans &>/dev/null || true + docker network prune -f &>/dev/null || true + + if docker compose --profile server up -d "${services_to_start[@]}"; then + echo "โœ… Docker services started successfully" + + echo "โณ Waiting for services to be ready..." + local timeout=30 + local elapsed=0 + local all_ready=false + + while [ $elapsed -lt $timeout ]; do + all_ready=true + + for service in "${services_to_start[@]}"; do + if [[ "$service" == "db" ]] && ! nc -z localhost 5432 2>/dev/null; then + all_ready=false + break + elif [[ "$service" == "redis" ]] && ! nc -z localhost 6379 2>/dev/null; then + all_ready=false + break + fi + done + + if [ "$all_ready" = true ]; then + echo "โœ… All services are ready" + break + fi + + sleep 1 + elapsed=$((elapsed + 1)) + + if [ $((elapsed % 5)) -eq 0 ]; then + echo " Still waiting... (${elapsed}s/${timeout}s)" + fi + done + + if [ "$all_ready" = false ]; then + echo "โŒ Timeout: Services failed to start within ${timeout} seconds" + echo " Please check Docker logs: docker compose logs" + exit 1 + fi + else + echo "โŒ Failed to start Docker services even after cleanup. Please check your docker-compose.yml file." 
+ exit 1 + fi + else + echo "✅ All required Docker services are already running" + fi +} -printf "\n" +if [[ "$(uname -s)" != "Darwin" ]]; then + echo "⚠️ Only macOS is currently supported." + echo "The Roo Code evals system can also be run with Docker on any platform." + echo "See https://github.com/RooCodeInc/Roo-Code/blob/main/packages/evals/README.md for instructions." + exit 1 +fi if ! command -v brew &>/dev/null; then if [[ -f "/opt/homebrew/bin/brew" ]]; then @@ -159,9 +197,10 @@ else echo "✅ gh is installed ($GH_VERSION)" fi -for i in "${!options[@]}"; do - [[ "${choices[i]}" ]] || continue +options=("nodejs" "python" "golang" "rust" "java") +binaries=("node" "python" "go" "rustc" "javac") +for i in "${!options[@]}"; do plugin="${options[$i]}" binary="${binaries[$i]}" @@ -282,7 +321,6 @@ fi # To reset VSCode: # rm -rvf ~/.vscode && rm -rvf ~/Library/Application\ Support/Code - echo -n "🔌 Installing Visual Studio Code extensions... " code --install-extension golang.go &>/dev/null || exit 1 code --install-extension dbaeumer.vscode-eslint&>/dev/null || exit 1 @@ -296,20 +334,14 @@ fi echo "✅ Done" -if [[ ! -d "../../evals" ]]; then +if [[ ! -d "../../../evals" ]]; then echo -n "🔗 Cloning evals repository... " - - if gh auth status &>/dev/null; then - gh repo clone cte/evals ../../evals || exit 1 - else - git clone https://github.com/cte/evals.git ../../evals || exit 1 - fi - + git clone https://github.com/RooCodeInc/Roo-Code-Evals.git ../../../evals || exit 1 echo "✅ Done" else echo -n "🔄 Updating evals repository... " - (cd ../../evals && \ + (cd ../../../evals && \ git checkout -f &>/dev/null && \ git clean -f -d &>/dev/null && \ git checkout main &>/dev/null && \ @@ -322,6 +354,9 @@ if [[ ! -s .env.local ]]; then touch .env.local || exit 1 fi +# Check and start Docker services before database operations +check_docker_services + echo -n "🗄️ Syncing Roo Code evals database...
" pnpm --filter @roo-code/evals db:push --force &>/dev/null || exit 1 echo "โœ… Done" diff --git a/packages/evals/src/cli/index.ts b/packages/evals/src/cli/index.ts index 86a8ebfcad..b50c72dd14 100644 --- a/packages/evals/src/cli/index.ts +++ b/packages/evals/src/cli/index.ts @@ -1,432 +1,44 @@ import * as fs from "fs" -import * as path from "path" -import pWaitFor from "p-wait-for" -import { execa, parseCommandString } from "execa" -import { command, run, number, positional } from "cmd-ts" -import psTree from "ps-tree" +import { command, run, number, option } from "cmd-ts" -import { RooCodeEventName, IpcOrigin, IpcMessageType, TaskCommandName } from "@roo-code/types" -import { IpcServer, IpcClient } from "@roo-code/ipc" +import { exercisesPath } from "../exercises/index.js" -import { - type Run, - type Task, - findRun, - finishRun, - getTasks, - updateTask, - createTaskMetrics, - updateTaskMetrics, - createToolError, -} from "../db/index.js" -import { type ExerciseLanguage, exercisesPath } from "../exercises/index.js" +import { runEvals } from "./runEvals.js" +import { processTask } from "./processTask.js" -type TaskResult = { success: boolean } -type TaskPromise = Promise - -const TASK_START_DELAY = 10 * 1_000 -const TASK_TIMEOUT = 5 * 60 * 1_000 -const UNIT_TEST_TIMEOUT = 2 * 60 * 1_000 - -const testCommands: Record = { - go: { commands: ["go test"] }, // timeout 15s bash -c "cd '$dir' && go test > /dev/null 2>&1" - java: { commands: ["./gradlew test"] }, // timeout --foreground 15s bash -c "cd '$dir' && ./gradlew test > /dev/null 2>&1" - javascript: { commands: ["pnpm install", "pnpm test"] }, // timeout 15s bash -c "cd '$dir' && pnpm install >/dev/null 2>&1 && pnpm test >/dev/null 2>&1" - python: { commands: ["uv run python3 -m pytest -o markers=task *_test.py"] }, // timeout 15s bash -c "cd '$dir' && uv run python3 -m pytest -o markers=task *_test.py" - rust: { commands: ["cargo test"] }, // timeout 15s bash -c "cd '$dir' && cargo test > /dev/null 2>&1" -} - 
-const runEvals = async (id: number) => { - const run = await findRun(id) - const tasks = await getTasks(run.id) - - if (!tasks[0]) { - throw new Error("No tasks found.") - } - - await execa({ cwd: exercisesPath })`git config user.name "Roo Code"` - await execa({ cwd: exercisesPath })`git config user.email "support@roocode.com"` - await execa({ cwd: exercisesPath })`git checkout -f` - await execa({ cwd: exercisesPath })`git clean -fd` - await execa({ cwd: exercisesPath })`git checkout -b runs/${run.id}-${crypto.randomUUID().slice(0, 8)} main` - - const server = new IpcServer(run.socketPath, () => {}) - server.listen() - - const runningPromises: TaskPromise[] = [] - - const processTask = async (task: Task, delay = 0) => { - if (task.finishedAt === null) { - await new Promise((resolve) => setTimeout(resolve, delay)) - await runExercise({ run, task, server }) - } - - if (task.passed === null) { - const passed = await runUnitTest({ task }) - await updateTask(task.id, { passed }) - - server.broadcast({ - type: IpcMessageType.TaskEvent, - origin: IpcOrigin.Server, - data: { eventName: passed ? 
RooCodeEventName.EvalPass : RooCodeEventName.EvalFail, taskId: task.id }, - }) - - return { success: passed } - } else { - return { success: task.passed } - } - } - - const processTaskResult = async (task: Task, promise: TaskPromise) => { - const index = runningPromises.indexOf(promise) - - if (index > -1) { - runningPromises.splice(index, 1) - } - } - - let delay = TASK_START_DELAY - - for (const task of tasks) { - const promise = processTask(task, delay) - delay = delay + TASK_START_DELAY - runningPromises.push(promise) - promise.then(() => processTaskResult(task, promise)) - - if (runningPromises.length >= run.concurrency) { - delay = 0 - await Promise.race(runningPromises) - } - } - - await Promise.all(runningPromises) - - const result = await finishRun(run.id) - console.log(`${Date.now()} [cli#run]`, result) - - await execa({ cwd: exercisesPath })`git add .` - await execa({ cwd: exercisesPath })`git commit -m ${`Run #${run.id}`} --no-verify` -} - -const runExercise = async ({ run, task, server }: { run: Run; task: Task; server: IpcServer }): TaskPromise => { - const { language, exercise } = task - const prompt = fs.readFileSync(path.resolve(exercisesPath, `prompts/${language}.md`), "utf-8") - const dirname = path.dirname(run.socketPath) - const workspacePath = path.resolve(exercisesPath, language, exercise) - const taskSocketPath = path.resolve(dirname, `${dirname}/task-${task.id}.sock`) - - // Inject foot gun system prompt if present - if (process.env.FOOTGUN_SYSTEM_PROMPT) { - const rooDir = path.join(workspacePath, ".roo") - if (!fs.existsSync(rooDir)) { - fs.mkdirSync(rooDir, { recursive: true }) - } - fs.writeFileSync(path.join(rooDir, "system-prompt-code"), process.env.FOOTGUN_SYSTEM_PROMPT) - } - - // If debugging: - // Use --wait --log trace or --verbose. - // Don't await execa and store result as subprocess. 
- // subprocess.stdout.pipe(process.stdout) - - console.log(`${Date.now()} [cli#runExercise] Opening new VS Code window at ${workspacePath}`) - - const controller = new AbortController() - const cancelSignal = controller.signal - - // If debugging: - // Use --wait --log trace or --verbose. - let codeCommand = `code --disable-workspace-trust` - const isDocker = fs.existsSync("/.dockerenv") - - if (isDocker) { - if (run.concurrency > 1) { - throw new Error("Cannot run multiple tasks in parallel in Docker. Please set concurrency to 1.") - } - codeCommand = `xvfb-run --auto-servernum --server-num=1 ${codeCommand} --wait --log trace --disable-gpu --password-store="basic"` - } - - const subprocess = execa({ - env: { - ROO_CODE_IPC_SOCKET_PATH: taskSocketPath, - }, - shell: "/bin/bash", - cancelSignal, - })`${codeCommand} -n ${workspacePath}` - - // If debugging: - // subprocess.stdout.pipe(process.stdout) - - // Give VSCode some time to spawn before connecting to its unix socket. - await new Promise((resolve) => setTimeout(resolve, 3_000)) - console.log(`${Date.now()} [cli#runExercise] Connecting to ${taskSocketPath}`) - const client = new IpcClient(taskSocketPath) - - try { - await pWaitFor(() => client.isReady, { interval: 250, timeout: 5_000 }) - // eslint-disable-next-line @typescript-eslint/no-unused-vars - } catch (error) { - console.log(`${Date.now()} [cli#runExercise | ${language} / ${exercise}] unable to connect`) - client.disconnect() - return { success: false } - } - - let taskStartedAt = Date.now() - let taskFinishedAt: number | undefined - let taskMetricsId: number | undefined - let rooTaskId: string | undefined - let isClientDisconnected = false - - const ignoreEvents: Record<"broadcast" | "log", RooCodeEventName[]> = { - broadcast: [RooCodeEventName.Message], - log: [RooCodeEventName.Message, RooCodeEventName.TaskTokenUsageUpdated, RooCodeEventName.TaskAskResponded], - } - - client.on(IpcMessageType.TaskEvent, async (taskEvent) => { - const { eventName, 
payload } = taskEvent - - if (!ignoreEvents.broadcast.includes(eventName)) { - server.broadcast({ - type: IpcMessageType.TaskEvent, - origin: IpcOrigin.Server, - relayClientId: client.clientId!, - data: { ...taskEvent, taskId: task.id }, - }) - } - - if (!ignoreEvents.log.includes(eventName)) { - console.log( - `${Date.now()} [cli#runExercise | ${language} / ${exercise}] taskEvent -> ${eventName}`, - payload, - ) - } - - if (eventName === RooCodeEventName.TaskStarted) { - taskStartedAt = Date.now() - - const taskMetrics = await createTaskMetrics({ - cost: 0, - tokensIn: 0, - tokensOut: 0, - tokensContext: 0, - duration: 0, - cacheWrites: 0, - cacheReads: 0, - }) - - await updateTask(task.id, { taskMetricsId: taskMetrics.id, startedAt: new Date() }) - - taskStartedAt = Date.now() - taskMetricsId = taskMetrics.id - rooTaskId = payload[0] - } - - if (eventName === RooCodeEventName.TaskToolFailed) { - const [_taskId, toolName, error] = payload - await createToolError({ taskId: task.id, toolName, error }) - } - - if ( - (eventName === RooCodeEventName.TaskTokenUsageUpdated || eventName === RooCodeEventName.TaskCompleted) && - taskMetricsId - ) { - const duration = Date.now() - taskStartedAt - - const { totalCost, totalTokensIn, totalTokensOut, contextTokens, totalCacheWrites, totalCacheReads } = - payload[1] - - await updateTaskMetrics(taskMetricsId, { - cost: totalCost, - tokensIn: totalTokensIn, - tokensOut: totalTokensOut, - tokensContext: contextTokens, - duration, - cacheWrites: totalCacheWrites ?? 0, - cacheReads: totalCacheReads ?? 
0, - }) - } - - if (eventName === RooCodeEventName.TaskCompleted && taskMetricsId) { - const toolUsage = payload[2] - await updateTaskMetrics(taskMetricsId, { toolUsage }) - } - - if (eventName === RooCodeEventName.TaskAborted || eventName === RooCodeEventName.TaskCompleted) { - taskFinishedAt = Date.now() - await updateTask(task.id, { finishedAt: new Date() }) - } - }) - - client.on(IpcMessageType.Disconnect, async () => { - console.log(`${Date.now()} [cli#runExercise | ${language} / ${exercise}] disconnect`) - isClientDisconnected = true - }) - - console.log(`${Date.now()} [cli#runExercise | ${language} / ${exercise}] starting task`) - - if (client.isReady) { - client.sendMessage({ - type: IpcMessageType.TaskCommand, - origin: IpcOrigin.Client, - clientId: client.clientId!, - data: { - commandName: TaskCommandName.StartNewTask, - data: { - configuration: { - openRouterApiKey: process.env.OPENROUTER_API_KEY!, - ...run.settings, - }, - text: prompt, - newTab: true, - }, +const main = async () => { + const result = await run( + command({ + name: "cli", + description: "Execute an eval run.", + version: "0.0.0", + args: { + runId: option({ type: number, long: "runId", short: "r", defaultValue: () => -1 }), + taskId: option({ type: number, long: "taskId", short: "t", defaultValue: () => -1 }), }, - }) - } else { - console.log(`[cli#runExercise | ${language} / ${exercise}] unable to connect`) - client.disconnect() - taskFinishedAt = Date.now() - isClientDisconnected = true - } - - try { - await pWaitFor(() => !!taskFinishedAt || isClientDisconnected, { interval: 1_000, timeout: TASK_TIMEOUT }) - // eslint-disable-next-line @typescript-eslint/no-unused-vars - } catch (error) { - console.log(`${Date.now()} [cli#runExercise | ${language} / ${exercise}] time limit reached`) - - // Cancel the task. 
- if (rooTaskId && !isClientDisconnected) { - client.sendMessage({ - type: IpcMessageType.TaskCommand, - origin: IpcOrigin.Client, - clientId: client.clientId!, - data: { commandName: TaskCommandName.CancelTask, data: rooTaskId }, - }) - - // Allow some time for the task to cancel. - await new Promise((resolve) => setTimeout(resolve, 5_000)) - } - - await updateTask(task.id, { finishedAt: new Date() }) - } - - if (!isClientDisconnected) { - if (rooTaskId) { - client.sendMessage({ - type: IpcMessageType.TaskCommand, - origin: IpcOrigin.Client, - clientId: client.clientId!, - data: { commandName: TaskCommandName.CloseTask, data: rooTaskId }, - }) - - // Allow some time for the window to close. - await new Promise((resolve) => setTimeout(resolve, 2_000)) - } - - client.disconnect() - } - - controller.abort() - await subprocess - - return { success: !!taskFinishedAt } -} - -const runUnitTest = async ({ task }: { task: Task }) => { - const cmd = testCommands[task.language] - const exercisePath = path.resolve(exercisesPath, task.language, task.exercise) - const cwd = cmd.cwd ? 
path.resolve(exercisePath, cmd.cwd) : exercisePath - const commands = cmd.commands.map((cs) => parseCommandString(cs)) - - let passed = true - - for (const command of commands) { - try { - console.log( - `${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}] running "${command.join(" ")}"`, - ) + handler: async (args) => { + const { runId, taskId } = args - const subprocess = execa({ cwd, shell: true, reject: false })`${command}` - - const timeout = setTimeout(async () => { - const descendants = await new Promise((resolve, reject) => { - psTree(subprocess.pid!, (err, children) => { - if (err) { - reject(err) - } - - resolve(children.map((p) => parseInt(p.PID))) - }) - }) - - console.log( - `${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}] "${command.join(" ")}": unit tests timed out, killing ${subprocess.pid} + ${JSON.stringify(descendants)}`, - ) - - if (descendants.length > 0) { - for (const descendant of descendants) { - try { - console.log( - `${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}] killing ${descendant}`, - ) - - await execa`kill -9 ${descendant}` - } catch (error) { - console.error( - `${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}] Error killing descendant processes:`, - error, - ) - } - } + if (runId === -1 && taskId === -1) { + throw new Error("Either runId or taskId must be provided.") } - console.log( - `${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}] killing ${subprocess.pid}`, - ) + if (runId !== -1 && taskId !== -1) { + throw new Error("Only one of runId or taskId must be provided.") + } try { - await execa`kill -9 ${subprocess.pid!}` + if (runId !== -1) { + await runEvals(runId) + } else { + await processTask(taskId) + } } catch (error) { - console.error( - `${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}] Error killing process:`, - error, - ) + console.error(error) + process.exit(1) } - }, UNIT_TEST_TIMEOUT) - - const 
result = await subprocess - - console.log( - `${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}] "${command.join(" ")}" result -> ${JSON.stringify(result)}`, - ) - - clearTimeout(timeout) - - if (result.failed) { - passed = false - break - } - } catch (error) { - console.log(`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}]`, error) - passed = false - break - } - } - - return passed -} - -const main = async () => { - const result = await run( - command({ - name: "cli", - description: "Execute an eval run.", - version: "0.0.0", - args: { - runId: positional({ type: number, displayName: "runId" }), }, - handler: (args) => runEvals(args.runId), }), process.argv.slice(2), ) @@ -439,6 +51,7 @@ if (!fs.existsSync(exercisesPath)) { console.error( `Exercises do not exist at ${exercisesPath}. Please run "git clone https://github.com/RooCodeInc/Roo-Code-Evals.git evals".`, ) + process.exit(1) } diff --git a/packages/evals/src/cli/processTask.ts b/packages/evals/src/cli/processTask.ts new file mode 100644 index 0000000000..7ccf512b59 --- /dev/null +++ b/packages/evals/src/cli/processTask.ts @@ -0,0 +1,56 @@ +import { RooCodeEventName, type TaskEvent } from "@roo-code/types" + +import { findTask, updateTask, findRun } from "../db/index.js" + +import { getTag } from "./utils.js" +import { redisClient, getPubSubKey, registerRunner, deregisterRunner } from "./redis.js" +import { runTask } from "./runTask.js" +import { runUnitTest } from "./runUnitTest.js" +import { execa } from "execa" + +export const processTask = async (taskId: number) => { + const task = await findTask(taskId) + const run = await findRun(task.runId) + await registerRunner({ runId: run.id, taskId }) + + try { + const tag = getTag("processTask", { run, task }) + + const publish = async (e: TaskEvent) => { + const redis = await redisClient() + await redis.publish(getPubSubKey(run.id), JSON.stringify(e)) + } + + console.log(`[${Date.now()} | ${tag}] running task ${task.id} 
(${task.language}/${task.exercise})...`) + await runTask({ run, task, publish }) + + console.log(`[${Date.now()} | ${tag}] testing task ${task.id} (${task.language}/${task.exercise})...`) + const passed = await runUnitTest({ task }) + + console.log(`[${Date.now()} | ${tag}] task ${task.id} (${task.language}/${task.exercise}) -> ${passed}`) + await updateTask(task.id, { passed }) + + await publish({ + eventName: passed ? RooCodeEventName.EvalPass : RooCodeEventName.EvalFail, + taskId: task.id, + }) + } finally { + await deregisterRunner({ runId: run.id, taskId }) + } +} + +export const processTaskInContainer = async (taskId: number) => { + const args = [ + `--name evals-task-${taskId}`, + "--rm", + "--network evals_default", + "-v /var/run/docker.sock:/var/run/docker.sock", + "-e HOST_EXECUTION_METHOD=docker", + ] + + const command = `pnpm --filter @roo-code/evals cli --taskId ${taskId}` + const subprocess = execa(`docker run ${args.join(" ")} evals-runner sh -c "${command}"`, { shell: true }) + // subprocess.stdout?.on("data", (data) => console.log(data.toString())) + // subprocess.stderr?.on("data", (data) => console.error(data.toString())) + await subprocess +} diff --git a/packages/evals/src/cli/redis.ts b/packages/evals/src/cli/redis.ts new file mode 100644 index 0000000000..07e4ae7e08 --- /dev/null +++ b/packages/evals/src/cli/redis.ts @@ -0,0 +1,53 @@ +import { createClient, type RedisClientType } from "redis" + +let redis: RedisClientType | undefined + +export const redisClient = async () => { + if (!redis) { + redis = createClient({ url: process.env.REDIS_URL || "redis://localhost:6379" }) + redis.on("error", (error) => console.error("redis error:", error)) + await redis.connect() + } + + return redis +} + +export const getPubSubKey = (runId: number) => `evals:${runId}` +export const getRunnersKey = (runId: number) => `runners:${runId}` +export const getHeartbeatKey = (runId: number) => `heartbeat:${runId}` + +export const registerRunner = async ({ runId, 
taskId }: { runId: number; taskId: number }) => { + const redis = await redisClient() + await redis.sAdd(getRunnersKey(runId), `task-${taskId}:${process.env.HOSTNAME}`) +} + +export const deregisterRunner = async ({ runId, taskId }: { runId: number; taskId: number }) => { + const redis = await redisClient() + await redis.sRem(getRunnersKey(runId), `task-${taskId}:${process.env.HOSTNAME}`) +} + +export const startHeartbeat = async (runId: number, interval: number = 10) => { + const pid = process.pid.toString() + const redis = await redisClient() + const heartbeatKey = getHeartbeatKey(runId) + await redis.setEx(heartbeatKey, interval, pid) + + return setInterval( + () => + redis.expire(heartbeatKey, interval).catch((error) => { + console.error("heartbeat error:", error) + }), + (interval * 1_000) / 2, + ) +} + +export const stopHeartbeat = async (runId: number, heartbeat: NodeJS.Timeout) => { + clearInterval(heartbeat) + + try { + const redis = await redisClient() + await redis.del(getHeartbeatKey(runId)) + } catch (error) { + console.error("redis.del failed:", error) + } +} diff --git a/packages/evals/src/cli/runEvals.ts b/packages/evals/src/cli/runEvals.ts new file mode 100644 index 0000000000..aaeaf036da --- /dev/null +++ b/packages/evals/src/cli/runEvals.ts @@ -0,0 +1,56 @@ +import { execa } from "execa" +import PQueue from "p-queue" + +import { findRun, finishRun, getTasks } from "../db/index.js" +import { exercisesPath } from "../exercises/index.js" + +import { getTag, isDockerContainer } from "./utils.js" +import { processTask, processTaskInContainer } from "./processTask.js" +import { startHeartbeat, stopHeartbeat } from "./redis.js" + +export const runEvals = async (runId: number) => { + const run = await findRun(runId) + + if (run.taskMetricsId) { + throw new Error(`Run ${run.id} already finished.`) + } + + const tasks = await getTasks(runId) + + if (tasks.length === 0) { + throw new Error(`Run ${run.id} has no tasks.`) + } + + const tag = 
getTag("runEvals", { run }) + console.log(`[${Date.now()} | ${tag}] running ${tasks.length} task(s)`) + + const cwd = exercisesPath + await execa({ cwd })`git config user.name "Roo Code"` + await execa({ cwd })`git config user.email "support@roocode.com"` + await execa({ cwd })`git checkout -f` + await execa({ cwd })`git clean -fd` + await execa({ cwd })`git checkout -b runs/${run.id}-${crypto.randomUUID().slice(0, 8)} main` + + const heartbeat = await startHeartbeat(run.id) + const queue = new PQueue({ concurrency: run.concurrency }) + + try { + const containerize = isDockerContainer() + + await queue.addAll( + tasks + .filter((task) => task.finishedAt === null) + .map((task) => () => (containerize ? processTaskInContainer(task.id) : processTask(task.id))), + ) + + console.log(`[${Date.now()} | ${tag}] finishRun`) + const result = await finishRun(run.id) + console.log(`[${Date.now()} | ${tag}] result ->`, result) + + await execa({ cwd: exercisesPath })`git add .` + await execa({ cwd: exercisesPath })`git commit -m ${`Run #${run.id}`} --no-verify` + } finally { + console.log(`[${Date.now()} | ${tag}] cleaning up`) + stopHeartbeat(run.id, heartbeat) + } +} diff --git a/packages/evals/src/cli/runTask.ts b/packages/evals/src/cli/runTask.ts new file mode 100644 index 0000000000..fd5729704b --- /dev/null +++ b/packages/evals/src/cli/runTask.ts @@ -0,0 +1,253 @@ +import * as fs from "fs" +import * as path from "path" +import * as os from "node:os" + +import pWaitFor from "p-wait-for" +import { execa } from "execa" + +import { + RooCodeEventName, + IpcOrigin, + IpcMessageType, + TaskCommandName, + type TaskEvent, + EVALS_SETTINGS, + EVALS_TIMEOUT, +} from "@roo-code/types" +import { IpcClient } from "@roo-code/ipc" + +import { type Run, type Task, updateTask, createTaskMetrics, updateTaskMetrics, createToolError } from "../db/index.js" +import { exercisesPath } from "../exercises/index.js" + +import { getTag, isDockerContainer } from "./utils.js" + +type RunTaskOptions = 
{ + run: Run + task: Task + publish: (taskEvent: TaskEvent) => Promise +} + +export const runTask = async ({ run, task, publish }: RunTaskOptions): Promise<{ success: boolean }> => { + const { language, exercise } = task + const tag = getTag("runTask", { run, task }) + + const prompt = fs.readFileSync(path.resolve(exercisesPath, `prompts/${language}.md`), "utf-8") + const workspacePath = path.resolve(exercisesPath, language, exercise) + const taskSocketPath = path.resolve(os.tmpdir(), `evals-${run.id}-${task.id}.sock`) + + // Inject foot gun system prompt if present. + if (process.env.FOOTGUN_SYSTEM_PROMPT) { + const rooDir = path.join(workspacePath, ".roo") + + if (!fs.existsSync(rooDir)) { + fs.mkdirSync(rooDir, { recursive: true }) + } + + fs.writeFileSync(path.join(rooDir, "system-prompt-code"), process.env.FOOTGUN_SYSTEM_PROMPT) + } + + console.log(`[${Date.now()} | ${tag}] Opening new VS Code window at ${workspacePath}`) + + const controller = new AbortController() + const cancelSignal = controller.signal + + const codeCommand = isDockerContainer() + ? `xvfb-run --auto-servernum --server-num=1 code --wait --log trace --disable-workspace-trust --disable-gpu --disable-lcd-text --no-sandbox --user-data-dir /roo/.vscode --password-store="basic"` + : `code --disable-workspace-trust` + + console.log(`[${Date.now()} | ${tag}] ${codeCommand}`) + + // Sleep for a random amount of time between 5 and 10 seconds. + await new Promise((resolve) => setTimeout(resolve, Math.random() * 5_000 + 5_000)) + + const subprocess = execa({ + env: { + ROO_CODE_IPC_SOCKET_PATH: taskSocketPath, + }, + shell: "/bin/bash", + cancelSignal, + })`${codeCommand} -n ${workspacePath}` + + // If debugging: + subprocess.stdout.pipe(process.stdout) + + // Give VSCode some time to spawn before connecting to its unix socket. 
+ await new Promise((resolve) => setTimeout(resolve, 3_000)) + let client: IpcClient | undefined = undefined + let attempts = 5 + + while (true) { + try { + console.log(`[${Date.now()} | ${tag}] connecting to ${taskSocketPath}`) + client = new IpcClient(taskSocketPath) + await pWaitFor(() => client!.isReady, { interval: 250, timeout: 1_000 }) + break + } catch (_error) { + if (client) { + client.disconnect() + } + + attempts-- + + if (attempts <= 0) { + console.error(`[${Date.now()} | ${tag}] unable to connect`) + return { success: false } + } + } + } + + console.log(`[${Date.now()} | ${tag}] connected to ${taskSocketPath}`) + + let taskStartedAt = Date.now() + let taskFinishedAt: number | undefined + let taskMetricsId: number | undefined + let rooTaskId: string | undefined + let isClientDisconnected = false + + const ignoreEvents: Record<"broadcast" | "log", RooCodeEventName[]> = { + broadcast: [RooCodeEventName.Message], + log: [RooCodeEventName.TaskTokenUsageUpdated], // [RooCodeEventName.Message, RooCodeEventName.TaskAskResponded], + } + + client.on(IpcMessageType.TaskEvent, async (taskEvent) => { + const { eventName, payload } = taskEvent + + if (!ignoreEvents.broadcast.includes(eventName)) { + await publish({ ...taskEvent, taskId: task.id }) + } + + if (!ignoreEvents.log.includes(eventName)) { + console.log(`[${Date.now()} | ${tag}] ${eventName} ->`, payload) + } + + if (eventName === RooCodeEventName.TaskStarted) { + taskStartedAt = Date.now() + + const taskMetrics = await createTaskMetrics({ + cost: 0, + tokensIn: 0, + tokensOut: 0, + tokensContext: 0, + duration: 0, + cacheWrites: 0, + cacheReads: 0, + }) + + await updateTask(task.id, { taskMetricsId: taskMetrics.id, startedAt: new Date() }) + + taskStartedAt = Date.now() + taskMetricsId = taskMetrics.id + rooTaskId = payload[0] + } + + if (eventName === RooCodeEventName.TaskToolFailed) { + const [_taskId, toolName, error] = payload + await createToolError({ taskId: task.id, toolName, error }) + } + + if ( 
+ (eventName === RooCodeEventName.TaskTokenUsageUpdated || eventName === RooCodeEventName.TaskCompleted) && + taskMetricsId + ) { + const duration = Date.now() - taskStartedAt + + const { totalCost, totalTokensIn, totalTokensOut, contextTokens, totalCacheWrites, totalCacheReads } = + payload[1] + + await updateTaskMetrics(taskMetricsId, { + cost: totalCost, + tokensIn: totalTokensIn, + tokensOut: totalTokensOut, + tokensContext: contextTokens, + duration, + cacheWrites: totalCacheWrites ?? 0, + cacheReads: totalCacheReads ?? 0, + }) + } + + if (eventName === RooCodeEventName.TaskCompleted && taskMetricsId) { + const toolUsage = payload[2] + await updateTaskMetrics(taskMetricsId, { toolUsage }) + } + + if (eventName === RooCodeEventName.TaskAborted || eventName === RooCodeEventName.TaskCompleted) { + taskFinishedAt = Date.now() + await updateTask(task.id, { finishedAt: new Date() }) + } + }) + + client.on(IpcMessageType.Disconnect, async () => { + console.log(`[${Date.now()} | ${tag}] disconnect`) + isClientDisconnected = true + }) + + if (client.isReady) { + const configuration = { + ...EVALS_SETTINGS, + ...run.settings, + openRouterApiKey: process.env.OPENROUTER_API_KEY, + } + + client.sendMessage({ + type: IpcMessageType.TaskCommand, + origin: IpcOrigin.Client, + clientId: client.clientId!, + data: { + commandName: TaskCommandName.StartNewTask, + data: { + configuration, + text: prompt, + newTab: true, + }, + }, + }) + } else { + console.error(`[${Date.now()} | ${tag}] unable to connect`) + client.disconnect() + taskFinishedAt = Date.now() + isClientDisconnected = true + } + + try { + await pWaitFor(() => !!taskFinishedAt || isClientDisconnected, { interval: 1_000, timeout: EVALS_TIMEOUT }) + // eslint-disable-next-line @typescript-eslint/no-unused-vars + } catch (error) { + console.log(`[${Date.now()} | ${tag}] time limit reached`) + + // Cancel the task. 
+ if (rooTaskId && !isClientDisconnected) { + client.sendMessage({ + type: IpcMessageType.TaskCommand, + origin: IpcOrigin.Client, + clientId: client.clientId!, + data: { commandName: TaskCommandName.CancelTask, data: rooTaskId }, + }) + + // Allow some time for the task to cancel. + await new Promise((resolve) => setTimeout(resolve, 5_000)) + } + + await updateTask(task.id, { finishedAt: new Date() }) + } + + if (!isClientDisconnected) { + if (rooTaskId) { + client.sendMessage({ + type: IpcMessageType.TaskCommand, + origin: IpcOrigin.Client, + clientId: client.clientId!, + data: { commandName: TaskCommandName.CloseTask, data: rooTaskId }, + }) + + // Allow some time for the window to close. + await new Promise((resolve) => setTimeout(resolve, 2_000)) + } + + client.disconnect() + } + + controller.abort() + await subprocess + + return { success: !!taskFinishedAt } +} diff --git a/packages/evals/src/cli/runUnitTest.ts b/packages/evals/src/cli/runUnitTest.ts new file mode 100644 index 0000000000..c109895939 --- /dev/null +++ b/packages/evals/src/cli/runUnitTest.ts @@ -0,0 +1,84 @@ +import * as path from "path" + +import { execa, parseCommandString } from "execa" +import psTree from "ps-tree" + +import { type Task } from "../db/index.js" +import { type ExerciseLanguage, exercisesPath } from "../exercises/index.js" + +const UNIT_TEST_TIMEOUT = 2 * 60 * 1_000 + +const testCommands: Record = { + go: { commands: ["go test"] }, + java: { commands: ["./gradlew test"] }, + javascript: { commands: ["pnpm install", "pnpm test"] }, + python: { commands: ["uv run python3 -m pytest -o markers=task *_test.py"] }, + rust: { commands: ["cargo test"] }, +} + +export const runUnitTest = async ({ task }: { task: Task }) => { + const cmd = testCommands[task.language] + const exercisePath = path.resolve(exercisesPath, task.language, task.exercise) + const cwd = cmd.cwd ? 
path.resolve(exercisePath, cmd.cwd) : exercisePath + const commands = cmd.commands.map((cs) => parseCommandString(cs)) + const tag = `cli#runUnitTest | ${task.language} / ${task.exercise}` + + let passed = true + + for (const command of commands) { + try { + const subprocess = execa({ cwd, shell: true, reject: false })`${command}` + + const timeout = setTimeout(async () => { + const descendants = await new Promise((resolve, reject) => { + psTree(subprocess.pid!, (err, children) => { + if (err) { + reject(err) + } + + resolve(children.map((p) => parseInt(p.PID))) + }) + }) + + console.log( + `${Date.now()} [${tag}] "${command.join(" ")}": unit tests timed out, killing ${subprocess.pid} + ${JSON.stringify(descendants)}`, + ) + + if (descendants.length > 0) { + for (const descendant of descendants) { + try { + console.log(`${Date.now()} [${tag}] killing ${descendant}`) + + await execa`kill -9 ${descendant}` + } catch (error) { + console.error(`${Date.now()} [${tag}] Error killing descendant processes:`, error) + } + } + } + + console.log(`${Date.now()} [${tag}] killing ${subprocess.pid}`) + + try { + await execa`kill -9 ${subprocess.pid!}` + } catch (error) { + console.error(`${Date.now()} [${tag}] Error killing process:`, error) + } + }, UNIT_TEST_TIMEOUT) + + const result = await subprocess + + clearTimeout(timeout) + + if (result.failed) { + passed = false + break + } + } catch (error) { + console.error(`${Date.now()} [${tag}]`, error) + passed = false + break + } + } + + return passed +} diff --git a/packages/evals/src/cli/utils.ts b/packages/evals/src/cli/utils.ts new file mode 100644 index 0000000000..ed560896a1 --- /dev/null +++ b/packages/evals/src/cli/utils.ts @@ -0,0 +1,16 @@ +import * as fs from "fs" + +import type { Run, Task } from "../db/index.js" + +export const getTag = (caller: string, { run, task }: { run: Run; task?: Task }) => + task + ? 
`${caller} | pid:${process.pid} | run:${run.id} | task:${task.id} | ${task.language}/${task.exercise}` + : `${caller} | pid:${process.pid} | run:${run.id}` + +export const isDockerContainer = () => { + try { + return fs.existsSync("/.dockerenv") + } catch (_error) { + return false + } +} diff --git a/packages/evals/src/db/queries/runs.ts b/packages/evals/src/db/queries/runs.ts index a294054cc6..7866e99811 100644 --- a/packages/evals/src/db/queries/runs.ts +++ b/packages/evals/src/db/queries/runs.ts @@ -114,7 +114,16 @@ export const deleteRun = async (runId: number) => { columns: { id: true, taskMetricsId: true }, }) + await db.delete(schema.toolErrors).where( + inArray( + schema.toolErrors.taskId, + tasks.map(({ id }) => id), + ), + ) + await db.delete(schema.tasks).where(eq(schema.tasks.runId, runId)) + + await db.delete(schema.toolErrors).where(eq(schema.toolErrors.runId, runId)) await db.delete(schema.runs).where(eq(schema.runs.id, runId)) const taskMetricsIds = tasks diff --git a/packages/evals/src/db/queries/tasks.ts b/packages/evals/src/db/queries/tasks.ts index 9090f1f34b..27d3cb54be 100644 --- a/packages/evals/src/db/queries/tasks.ts +++ b/packages/evals/src/db/queries/tasks.ts @@ -1,4 +1,4 @@ -import { and, eq } from "drizzle-orm" +import { and, asc, eq } from "drizzle-orm" import type { ExerciseLanguage } from "../../exercises/index.js" @@ -58,4 +58,8 @@ export const getTask = async ({ runId, language, exercise }: GetTask) => }) export const getTasks = async (runId: number) => - db.query.tasks.findMany({ where: eq(tasks.runId, runId), with: { taskMetrics: true } }) + db.query.tasks.findMany({ + where: eq(tasks.runId, runId), + with: { taskMetrics: true }, + orderBy: asc(tasks.id), + }) diff --git a/packages/evals/src/db/schema.ts b/packages/evals/src/db/schema.ts index 62480392d6..0338b812e2 100644 --- a/packages/evals/src/db/schema.ts +++ b/packages/evals/src/db/schema.ts @@ -117,4 +117,4 @@ export type UpdateToolError = Partial> * schema */ -export 
const schema = { runs, runsRelations, tasks, tasksRelations, taskMetrics } +export const schema = { runs, runsRelations, tasks, tasksRelations, taskMetrics, toolErrors, toolErrorsRelations } diff --git a/packages/types/src/global-settings.ts b/packages/types/src/global-settings.ts index 8220aca3a1..1d0c4f5f4d 100644 --- a/packages/types/src/global-settings.ts +++ b/packages/types/src/global-settings.ts @@ -48,7 +48,7 @@ export const globalSettingsSchema = z.object({ allowedMaxRequests: z.number().nullish(), autoCondenseContext: z.boolean().optional(), autoCondenseContextPercent: z.number().optional(), - maxConcurrentFileReads: z.number().optional(), + maxConcurrentFileReads: z.number().optional(), browserToolEnabled: z.boolean().optional(), browserViewportSize: z.string().optional(), @@ -262,3 +262,75 @@ export const GLOBAL_STATE_KEYS = [...GLOBAL_SETTINGS_KEYS, ...PROVIDER_SETTINGS_ export const isGlobalStateKey = (key: string): key is Keys => GLOBAL_STATE_KEYS.includes(key as Keys) + +/** + * Evals + */ + +// Default settings when running evals (unless overridden). 
+export const EVALS_SETTINGS: RooCodeSettings = { + apiProvider: "openrouter", + openRouterUseMiddleOutTransform: false, + + lastShownAnnouncementId: "may-29-2025-3-19", + + pinnedApiConfigs: {}, + + autoApprovalEnabled: true, + alwaysAllowReadOnly: true, + alwaysAllowReadOnlyOutsideWorkspace: false, + alwaysAllowWrite: true, + alwaysAllowWriteOutsideWorkspace: false, + writeDelayMs: 1000, + alwaysAllowBrowser: true, + alwaysApproveResubmit: true, + requestDelaySeconds: 10, + alwaysAllowMcp: true, + alwaysAllowModeSwitch: true, + alwaysAllowSubtasks: true, + alwaysAllowExecute: true, + allowedCommands: ["*"], + + browserToolEnabled: false, + browserViewportSize: "900x600", + screenshotQuality: 75, + remoteBrowserEnabled: false, + + ttsEnabled: false, + ttsSpeed: 1, + soundEnabled: false, + soundVolume: 0.5, + + terminalOutputLineLimit: 500, + terminalShellIntegrationTimeout: 30000, + terminalCommandDelay: 0, + terminalPowershellCounter: false, + terminalZshOhMy: true, + terminalZshClearEolMark: true, + terminalZshP10k: false, + terminalZdotdir: true, + terminalCompressProgressBar: true, + terminalShellIntegrationDisabled: true, + + diffEnabled: true, + fuzzyMatchThreshold: 1, + + enableCheckpoints: false, + + rateLimitSeconds: 0, + maxOpenTabsContext: 20, + maxWorkspaceFiles: 200, + showRooIgnoredFiles: true, + maxReadFileLine: -1, // -1 to enable full file reading. 
+ + language: "en", + telemetrySetting: "enabled", + + mcpEnabled: false, + + mode: "code", + + customModes: [], +} + +export const EVALS_TIMEOUT = 5 * 60 * 1_000 diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 009039766c..bd32858e19 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -143,9 +143,6 @@ importers: '@roo-code/evals': specifier: workspace:^ version: link:../../packages/evals - '@roo-code/ipc': - specifier: workspace:^ - version: link:../../packages/ipc '@roo-code/types': specifier: workspace:^ version: link:../../packages/types @@ -176,9 +173,6 @@ importers: p-map: specifier: ^7.0.3 version: 7.0.3 - ps-tree: - specifier: ^1.2.0 - version: 1.2.0 react: specifier: ^18.3.1 version: 18.3.1 @@ -191,6 +185,9 @@ importers: react-use: specifier: ^17.6.0 version: 17.6.0(react-dom@18.3.1(react@18.3.1))(react@18.3.1) + redis: + specifier: ^5.5.5 + version: 5.5.5 sonner: specifier: ^2.0.5 version: 2.0.5(react-dom@18.3.1(react@18.3.1))(react@18.3.1) @@ -228,6 +225,9 @@ importers: tailwindcss: specifier: ^4 version: 4.1.6 + vitest: + specifier: ^3.2.1 + version: 3.2.1(@types/debug@4.1.12)(@types/node@22.15.29)(jiti@2.4.2)(jsdom@20.0.3)(lightningcss@1.30.1)(tsx@4.19.4)(yaml@2.8.0) apps/web-roo-code: dependencies: @@ -446,6 +446,9 @@ importers: p-map: specifier: ^7.0.3 version: 7.0.3 + p-queue: + specifier: ^8.1.0 + version: 8.1.0 p-wait-for: specifier: ^5.0.2 version: 5.0.2 @@ -455,6 +458,9 @@ importers: ps-tree: specifier: ^1.2.0 version: 1.2.0 + redis: + specifier: ^5.5.5 + version: 5.5.5 zod: specifier: ^3.24.2 version: 3.24.4 @@ -3338,6 +3344,34 @@ packages: '@radix-ui/rect@1.1.1': resolution: {integrity: sha512-HPwpGIzkl28mWyZqG52jiqDJ12waP11Pa1lGoiyUkIEuMLBP0oeK/C89esbXrxsky5we7dfd8U58nm0SgAWpVw==} + '@redis/bloom@5.5.5': + resolution: {integrity: sha512-M0GDmw8k0EOFoSpmMjhFUADk/apoano97fLSpT81opgmkkDtBB9iB6l6husxnzK5t2qNz/o0+OCVG9g6lEEwKw==} + engines: {node: '>= 18'} + peerDependencies: + '@redis/client': ^5.5.5 + + '@redis/client@5.5.5': + resolution: 
{integrity: sha512-1Dv/CVdMNLw0mlROSnmpp4MQu+6YIJX0YR0h3g2hnPdLvk6L7TcRcrUj7BQFGSeZD2MxklAUO+rp09ITUqE5Og==} + engines: {node: '>= 18'} + + '@redis/json@5.5.5': + resolution: {integrity: sha512-Nq8wHjOhwuhD05YPWFPL9RyT3K1VdT37TKvqbhykZA2MWQgjjhLn5i1/6zZ+1b0Zc/Sr9E0eK9J8txk6YJR6EA==} + engines: {node: '>= 18'} + peerDependencies: + '@redis/client': ^5.5.5 + + '@redis/search@5.5.5': + resolution: {integrity: sha512-xM/DKrRhbsMS2QQF5bBPjR7P/QEjWWZDUr92r+UOwkZjvc/kmy0tp7h8zkxBo2jtSF99vkk2mwMzn6fQ8d60aQ==} + engines: {node: '>= 18'} + peerDependencies: + '@redis/client': ^5.5.5 + + '@redis/time-series@5.5.5': + resolution: {integrity: sha512-2ifwV75Fv/uVX4n0zqvgqIlIInHZtVj+afjcbXPBD2GhG2AeVfkitTz1bMnGnNDA78sWRYooK42OWH9yqujjyQ==} + engines: {node: '>= 18'} + peerDependencies: + '@redis/client': ^5.5.5 + '@rollup/pluginutils@5.1.4': resolution: {integrity: sha512-USm05zrsFxYLPdWWq+K3STlWiT/3ELn3RcV5hJMghpeAIhxfsUIg6mt12CBJBInWMV4VneoV7SfGv8xIwo2qNQ==} engines: {node: '>=14.0.0'} @@ -4554,6 +4588,9 @@ packages: '@vitest/expect@3.2.0': resolution: {integrity: sha512-0v4YVbhDKX3SKoy0PHWXpKhj44w+3zZkIoVES9Ex2pq+u6+Bijijbi2ua5kE+h3qT6LBWFTNZSCOEU37H8Y5sA==} + '@vitest/expect@3.2.1': + resolution: {integrity: sha512-FqS/BnDOzV6+IpxrTg5GQRyLOCtcJqkwMwcS8qGCI2IyRVDwPAtutztaf1CjtPHlZlWtl1yUPCd7HM0cNiDOYw==} + '@vitest/mocker@3.1.3': resolution: {integrity: sha512-PJbLjonJK82uCWHjzgBJZuR7zmAOrSvKk1QBxrennDIgtH4uK0TB1PvYmc0XBCigxxtiAVPfWtAdy4lpz8SQGQ==} peerDependencies: @@ -4576,36 +4613,62 @@ packages: vite: optional: true + '@vitest/mocker@3.2.1': + resolution: {integrity: sha512-OXxMJnx1lkB+Vl65Re5BrsZEHc90s5NMjD23ZQ9NlU7f7nZiETGoX4NeKZSmsKjseuMq2uOYXdLOeoM0pJU+qw==} + peerDependencies: + msw: ^2.4.9 + vite: ^5.0.0 || ^6.0.0 || ^7.0.0-0 + peerDependenciesMeta: + msw: + optional: true + vite: + optional: true + '@vitest/pretty-format@3.1.3': resolution: {integrity: sha512-i6FDiBeJUGLDKADw2Gb01UtUNb12yyXAqC/mmRWuYl+m/U9GS7s8us5ONmGkGpUUo7/iAYzI2ePVfOZTYvUifA==} 
'@vitest/pretty-format@3.2.0': resolution: {integrity: sha512-gUUhaUmPBHFkrqnOokmfMGRBMHhgpICud9nrz/xpNV3/4OXCn35oG+Pl8rYYsKaTNd/FAIrqRHnwpDpmYxCYZw==} + '@vitest/pretty-format@3.2.1': + resolution: {integrity: sha512-xBh1X2GPlOGBupp6E1RcUQWIxw0w/hRLd3XyBS6H+dMdKTAqHDNsIR2AnJwPA3yYe9DFy3VUKTe3VRTrAiQ01g==} + '@vitest/runner@3.1.3': resolution: {integrity: sha512-Tae+ogtlNfFei5DggOsSUvkIaSuVywujMj6HzR97AHK6XK8i3BuVyIifWAm/sE3a15lF5RH9yQIrbXYuo0IFyA==} '@vitest/runner@3.2.0': resolution: {integrity: sha512-bXdmnHxuB7fXJdh+8vvnlwi/m1zvu+I06i1dICVcDQFhyV4iKw2RExC/acavtDn93m/dRuawUObKsrNE1gJacA==} + '@vitest/runner@3.2.1': + resolution: {integrity: sha512-kygXhNTu/wkMYbwYpS3z/9tBe0O8qpdBuC3dD/AW9sWa0LE/DAZEjnHtWA9sIad7lpD4nFW1yQ+zN7mEKNH3yA==} + '@vitest/snapshot@3.1.3': resolution: {integrity: sha512-XVa5OPNTYUsyqG9skuUkFzAeFnEzDp8hQu7kZ0N25B1+6KjGm4hWLtURyBbsIAOekfWQ7Wuz/N/XXzgYO3deWQ==} '@vitest/snapshot@3.2.0': resolution: {integrity: sha512-z7P/EneBRMe7hdvWhcHoXjhA6at0Q4ipcoZo6SqgxLyQQ8KSMMCmvw1cSt7FHib3ozt0wnRHc37ivuUMbxzG/A==} + '@vitest/snapshot@3.2.1': + resolution: {integrity: sha512-5xko/ZpW2Yc65NVK9Gpfg2y4BFvcF+At7yRT5AHUpTg9JvZ4xZoyuRY4ASlmNcBZjMslV08VRLDrBOmUe2YX3g==} + '@vitest/spy@3.1.3': resolution: {integrity: sha512-x6w+ctOEmEXdWaa6TO4ilb7l9DxPR5bwEb6hILKuxfU1NqWT2mpJD9NJN7t3OTfxmVlOMrvtoFJGdgyzZ605lQ==} '@vitest/spy@3.2.0': resolution: {integrity: sha512-s3+TkCNUIEOX99S0JwNDfsHRaZDDZZR/n8F0mop0PmsEbQGKZikCGpTGZ6JRiHuONKew3Fb5//EPwCP+pUX9cw==} + '@vitest/spy@3.2.1': + resolution: {integrity: sha512-Nbfib34Z2rfcJGSetMxjDCznn4pCYPZOtQYox2kzebIJcgH75yheIKd5QYSFmR8DIZf2M8fwOm66qSDIfRFFfQ==} + '@vitest/utils@3.1.3': resolution: {integrity: sha512-2Ltrpht4OmHO9+c/nmHtF09HWiyWdworqnHIwjfvDyWjuwKbdkcS9AnhsDn+8E2RM4x++foD1/tNuLPVvWG1Rg==} '@vitest/utils@3.2.0': resolution: {integrity: sha512-gXXOe7Fj6toCsZKVQouTRLJftJwmvbhH5lKOBR6rlP950zUq9AitTUjnFoXS/CqjBC2aoejAztLPzzuva++XBw==} + '@vitest/utils@3.2.1': + resolution: {integrity: 
sha512-KkHlGhePEKZSub5ViknBcN5KEF+u7dSUr9NW8QsVICusUojrgrOnnY3DEWWO877ax2Pyopuk2qHmt+gkNKnBVw==} + '@vscode/codicons@0.0.36': resolution: {integrity: sha512-wsNOvNMMJ2BY8rC2N2MNBG7yOowV3ov8KlvUE/AiVUlHKTfWsw3OgAOQduX7h0Un6GssKD3aoTVH+TF3DSQwKQ==} @@ -5221,6 +5284,10 @@ packages: resolution: {integrity: sha512-eYm0QWBtUrBWZWG0d386OGAw16Z995PiOVo2B7bjWSbHedGl5e0ZWaq65kOGgUSNesEIDkB9ISbTg/JK9dhCZA==} engines: {node: '>=6'} + cluster-key-slot@1.1.2: + resolution: {integrity: sha512-RMr0FhtfXemyinomL4hrWcYJxmX6deFdCxpJzhDttxgO1+bcCnkk+9drydLVDmAMG7NE6aN/fl4F7ucU/90gAA==} + engines: {node: '>=0.10.0'} + cmd-ts@0.13.0: resolution: {integrity: sha512-nsnxf6wNIM/JAS7T/x/1JmbEsjH0a8tezXqqpaL0O6+eV0/aDEnRxwjxpu0VzDdRcaC1ixGSbRlUuf/IU59I4g==} @@ -8522,6 +8589,10 @@ packages: resolution: {integrity: sha512-VkndIv2fIB99swvQoA65bm+fsmt6UNdGeIB0oxBs+WhAhdh08QA04JXpI7rbB9r08/nkbysKoya9rtDERYOYMA==} engines: {node: '>=18'} + p-queue@8.1.0: + resolution: {integrity: sha512-mxLDbbGIBEXTJL0zEx8JIylaj3xQ7Z/7eEVjcF9fJX4DBiH9oqe+oahYnlKKxm0Ci9TlWTyhSHgygxMxjIB2jw==} + engines: {node: '>=18'} + p-timeout@6.1.4: resolution: {integrity: sha512-MyIV3ZA/PmyBN/ud8vV9XzwTrNtR4jFrObymZYnZqMmW0zA8Z17vnT0rBgFE/TlohB+YCHqXMgZzb3Csp49vqg==} engines: {node: '>=14.16'} @@ -9136,6 +9207,10 @@ packages: resolution: {integrity: sha512-6tDA8g98We0zd0GvVeMT9arEOnTw9qM03L9cJXaCjrip1OO764RDBLBfrB4cwzNGDj5OA5ioymC9GkizgWJDUg==} engines: {node: '>=8'} + redis@5.5.5: + resolution: {integrity: sha512-x7vpciikEY7nptGzQrE5I+/pvwFZJDadPk/uEoyGSg/pZ2m/CX2n5EhSgUh+S5T7Gz3uKM6YzWcXEu3ioAsdFQ==} + engines: {node: '>= 18'} + reflect.getprototypeof@1.0.10: resolution: {integrity: sha512-00o4I+DVrefhv+nX0ulyi3biSHCPDe+yLv5o/p6d/UVlirijB8E16FtfwSAi4g3tcqrQ4lRAqQSoFEZJehYEcw==} engines: {node: '>= 0.4'} @@ -10334,6 +10409,11 @@ packages: engines: {node: ^18.0.0 || ^20.0.0 || >=22.0.0} hasBin: true + vite-node@3.2.1: + resolution: {integrity: 
sha512-V4EyKQPxquurNJPtQJRZo8hKOoKNBRIhxcDbQFPFig0JdoWcUhwRgK8yoCXXrfYVPKS6XwirGHPszLnR8FbjCA==} + engines: {node: ^18.0.0 || ^20.0.0 || >=22.0.0} + hasBin: true + vite@6.3.5: resolution: {integrity: sha512-cZn6NDFE7wdTpINgs++ZJ4N49W2vRp8LCKrn3Ob1kYNtOo21vfDoaV5GzBfLU4MovSAB8uNRm4jgzVQZ+mBzPQ==} engines: {node: ^18.0.0 || ^20.0.0 || >=22.0.0} @@ -10430,6 +10510,34 @@ packages: jsdom: optional: true + vitest@3.2.1: + resolution: {integrity: sha512-VZ40MBnlE1/V5uTgdqY3DmjUgZtIzsYq758JGlyQrv5syIsaYcabkfPkEuWML49Ph0D/SoqpVFd0dyVTr551oA==} + engines: {node: ^18.0.0 || ^20.0.0 || >=22.0.0} + hasBin: true + peerDependencies: + '@edge-runtime/vm': '*' + '@types/debug': ^4.1.12 + '@types/node': ^18.0.0 || ^20.0.0 || >=22.0.0 + '@vitest/browser': 3.2.1 + '@vitest/ui': 3.2.1 + happy-dom: '*' + jsdom: '*' + peerDependenciesMeta: + '@edge-runtime/vm': + optional: true + '@types/debug': + optional: true + '@types/node': + optional: true + '@vitest/browser': + optional: true + '@vitest/ui': + optional: true + happy-dom: + optional: true + jsdom: + optional: true + void-elements@3.1.0: resolution: {integrity: sha512-Dhxzh5HZuiHQhbvTW9AMetFfBHDMYpo23Uo9btPXgdYP+3T5S+p+jgNy7spra+veYhBP2dCSgxR/i2Y02h5/6w==} engines: {node: '>=0.10.0'} @@ -13324,6 +13432,26 @@ snapshots: '@radix-ui/rect@1.1.1': {} + '@redis/bloom@5.5.5(@redis/client@5.5.5)': + dependencies: + '@redis/client': 5.5.5 + + '@redis/client@5.5.5': + dependencies: + cluster-key-slot: 1.1.2 + + '@redis/json@5.5.5(@redis/client@5.5.5)': + dependencies: + '@redis/client': 5.5.5 + + '@redis/search@5.5.5(@redis/client@5.5.5)': + dependencies: + '@redis/client': 5.5.5 + + '@redis/time-series@5.5.5(@redis/client@5.5.5)': + dependencies: + '@redis/client': 5.5.5 + '@rollup/pluginutils@5.1.4(rollup@4.40.2)': dependencies: '@types/estree': 1.0.7 @@ -14765,6 +14893,14 @@ snapshots: chai: 5.2.0 tinyrainbow: 2.0.0 + '@vitest/expect@3.2.1': + dependencies: + '@types/chai': 5.2.2 + '@vitest/spy': 3.2.1 + '@vitest/utils': 3.2.1 + chai: 
5.2.0 + tinyrainbow: 2.0.0 + '@vitest/mocker@3.1.3(vite@6.3.5(@types/node@20.17.50)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.19.4)(yaml@2.8.0))': dependencies: '@vitest/spy': 3.1.3 @@ -14789,6 +14925,14 @@ snapshots: optionalDependencies: vite: 6.3.5(@types/node@22.15.20)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.19.4)(yaml@2.8.0) + '@vitest/mocker@3.2.1(vite@6.3.5(@types/node@22.15.29)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.19.4)(yaml@2.8.0))': + dependencies: + '@vitest/spy': 3.2.1 + estree-walker: 3.0.3 + magic-string: 0.30.17 + optionalDependencies: + vite: 6.3.5(@types/node@22.15.29)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.19.4)(yaml@2.8.0) + '@vitest/pretty-format@3.1.3': dependencies: tinyrainbow: 2.0.0 @@ -14797,6 +14941,10 @@ snapshots: dependencies: tinyrainbow: 2.0.0 + '@vitest/pretty-format@3.2.1': + dependencies: + tinyrainbow: 2.0.0 + '@vitest/runner@3.1.3': dependencies: '@vitest/utils': 3.1.3 @@ -14807,6 +14955,11 @@ snapshots: '@vitest/utils': 3.2.0 pathe: 2.0.3 + '@vitest/runner@3.2.1': + dependencies: + '@vitest/utils': 3.2.1 + pathe: 2.0.3 + '@vitest/snapshot@3.1.3': dependencies: '@vitest/pretty-format': 3.1.3 @@ -14819,6 +14972,12 @@ snapshots: magic-string: 0.30.17 pathe: 2.0.3 + '@vitest/snapshot@3.2.1': + dependencies: + '@vitest/pretty-format': 3.2.1 + magic-string: 0.30.17 + pathe: 2.0.3 + '@vitest/spy@3.1.3': dependencies: tinyspy: 3.0.2 @@ -14827,6 +14986,10 @@ snapshots: dependencies: tinyspy: 4.0.3 + '@vitest/spy@3.2.1': + dependencies: + tinyspy: 4.0.3 + '@vitest/utils@3.1.3': dependencies: '@vitest/pretty-format': 3.1.3 @@ -14839,6 +15002,12 @@ snapshots: loupe: 3.1.3 tinyrainbow: 2.0.0 + '@vitest/utils@3.2.1': + dependencies: + '@vitest/pretty-format': 3.2.1 + loupe: 3.1.3 + tinyrainbow: 2.0.0 + '@vscode/codicons@0.0.36': {} '@vscode/test-cli@0.0.11': @@ -15552,6 +15721,8 @@ snapshots: clsx@2.1.1: {} + cluster-key-slot@1.1.2: {} + cmd-ts@0.13.0: dependencies: chalk: 4.1.2 @@ -19549,6 +19720,11 @@ snapshots: p-map@7.0.3: {} + 
p-queue@8.1.0: + dependencies: + eventemitter3: 5.0.1 + p-timeout: 6.1.4 + p-timeout@6.1.4: {} p-try@2.2.0: {} @@ -20226,6 +20402,14 @@ snapshots: indent-string: 4.0.0 strip-indent: 3.0.0 + redis@5.5.5: + dependencies: + '@redis/bloom': 5.5.5(@redis/client@5.5.5) + '@redis/client': 5.5.5 + '@redis/json': 5.5.5(@redis/client@5.5.5) + '@redis/search': 5.5.5(@redis/client@5.5.5) + '@redis/time-series': 5.5.5(@redis/client@5.5.5) + reflect.getprototypeof@1.0.10: dependencies: call-bind: 1.0.8 @@ -21693,6 +21877,27 @@ snapshots: - tsx - yaml + vite-node@3.2.1(@types/node@22.15.29)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.19.4)(yaml@2.8.0): + dependencies: + cac: 6.7.14 + debug: 4.4.1(supports-color@8.1.1) + es-module-lexer: 1.7.0 + pathe: 2.0.3 + vite: 6.3.5(@types/node@22.15.29)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.19.4)(yaml@2.8.0) + transitivePeerDependencies: + - '@types/node' + - jiti + - less + - lightningcss + - sass + - sass-embedded + - stylus + - sugarss + - supports-color + - terser + - tsx + - yaml + vite@6.3.5(@types/node@18.19.100)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.19.4)(yaml@2.8.0): dependencies: esbuild: 0.25.5 @@ -21741,6 +21946,22 @@ snapshots: tsx: 4.19.4 yaml: 2.8.0 + vite@6.3.5(@types/node@22.15.29)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.19.4)(yaml@2.8.0): + dependencies: + esbuild: 0.25.5 + fdir: 6.4.4(picomatch@4.0.2) + picomatch: 4.0.2 + postcss: 8.5.4 + rollup: 4.40.2 + tinyglobby: 0.2.13 + optionalDependencies: + '@types/node': 22.15.29 + fsevents: 2.3.3 + jiti: 2.4.2 + lightningcss: 1.30.1 + tsx: 4.19.4 + yaml: 2.8.0 + vitest@3.1.3(@types/debug@4.1.12)(@types/node@20.17.50)(jiti@2.4.2)(jsdom@20.0.3)(lightningcss@1.30.1)(tsx@4.19.4)(yaml@2.8.0): dependencies: '@vitest/expect': 3.1.3 @@ -21866,6 +22087,49 @@ snapshots: - tsx - yaml + vitest@3.2.1(@types/debug@4.1.12)(@types/node@22.15.29)(jiti@2.4.2)(jsdom@20.0.3)(lightningcss@1.30.1)(tsx@4.19.4)(yaml@2.8.0): + dependencies: + '@types/chai': 5.2.2 + '@vitest/expect': 3.2.1 + 
'@vitest/mocker': 3.2.1(vite@6.3.5(@types/node@22.15.29)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.19.4)(yaml@2.8.0)) + '@vitest/pretty-format': 3.2.1 + '@vitest/runner': 3.2.1 + '@vitest/snapshot': 3.2.1 + '@vitest/spy': 3.2.1 + '@vitest/utils': 3.2.1 + chai: 5.2.0 + debug: 4.4.1(supports-color@8.1.1) + expect-type: 1.2.1 + magic-string: 0.30.17 + pathe: 2.0.3 + picomatch: 4.0.2 + std-env: 3.9.0 + tinybench: 2.9.0 + tinyexec: 0.3.2 + tinyglobby: 0.2.14 + tinypool: 1.1.0 + tinyrainbow: 2.0.0 + vite: 6.3.5(@types/node@22.15.29)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.19.4)(yaml@2.8.0) + vite-node: 3.2.1(@types/node@22.15.29)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.19.4)(yaml@2.8.0) + why-is-node-running: 2.3.0 + optionalDependencies: + '@types/debug': 4.1.12 + '@types/node': 22.15.29 + jsdom: 20.0.3 + transitivePeerDependencies: + - jiti + - less + - lightningcss + - msw + - sass + - sass-embedded + - stylus + - sugarss + - supports-color + - terser + - tsx + - yaml + void-elements@3.1.0: {} vscode-jsonrpc@8.2.0: {} From 34379bcaa0c555b2b572f915d1707e133c4c4ef8 Mon Sep 17 00:00:00 2001 From: cte Date: Thu, 5 Jun 2025 11:06:38 -0700 Subject: [PATCH 5/9] More progress --- packages/evals/README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/packages/evals/README.md b/packages/evals/README.md index 3cec8c7e45..44dcc7da59 100644 --- a/packages/evals/README.md +++ b/packages/evals/README.md @@ -42,3 +42,14 @@ To configure your MacOS system to run evals locally, execute the setup script: ```sh cd packages/evals && ./scripts/setup.sh ``` + +The setup script does the following: + +- Installs development tools: Homebrew, asdf, GitHub CLI, pnpm +- Installs programming languages: Node.js 20.19.2, Python 3.13.2, Go 1.24.2, Rust 1.85.1, Java 17 +- Sets up VS Code with required extensions +- Configures Docker services (PostgreSQL, Redis) +- Clones/updates the evals repository +- Creates and migrates a Postgres database +- Prompts for an OpenRouter API key to add to 
`.env.local` +- Optionally builds and installs the Roo Code extension from source From 57bba0f7dd37d220ee6193f862dbe1dfcae2e829 Mon Sep 17 00:00:00 2001 From: cte Date: Thu, 5 Jun 2025 11:09:18 -0700 Subject: [PATCH 6/9] Fix `build_extension` --- packages/evals/scripts/setup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/evals/scripts/setup.sh b/packages/evals/scripts/setup.sh index abf499b2c5..279485c852 100755 --- a/packages/evals/scripts/setup.sh +++ b/packages/evals/scripts/setup.sh @@ -10,7 +10,7 @@ has_asdf_plugin() { build_extension() { echo "๐Ÿ”จ Building the Roo Code extension..." - pnpm -w build -- --out ../bin/roo-code-$(git rev-parse --short HEAD).vsix || exit 1 + pnpm -w vsix -- --out ../bin/roo-code-$(git rev-parse --short HEAD).vsix || exit 1 code --install-extension ../../bin/roo-code-$(git rev-parse --short HEAD).vsix || exit 1 cd evals } From 6d8dba35803bf2d066068e9a52b8ae347238f27f Mon Sep 17 00:00:00 2001 From: cte Date: Thu, 5 Jun 2025 11:54:06 -0700 Subject: [PATCH 7/9] More progress --- apps/web-evals/src/lib/server/runs.ts | 2 +- packages/evals/Dockerfile.runner | 8 +++++++- packages/evals/README.md | 2 +- packages/evals/docker-compose.yml | 2 +- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/apps/web-evals/src/lib/server/runs.ts b/apps/web-evals/src/lib/server/runs.ts index 831e5cfe11..6b7f263b62 100644 --- a/apps/web-evals/src/lib/server/runs.ts +++ b/apps/web-evals/src/lib/server/runs.ts @@ -48,7 +48,7 @@ export async function createRun({ suite, exercises = [], systemPrompt, ...values revalidatePath("/runs") try { - const isRunningInDocker = true // fs.existsSync("/.dockerenv") + const isRunningInDocker = fs.existsSync("/.dockerenv") const dockerArgs = [ `--name evals-controller-${run.id}`, diff --git a/packages/evals/Dockerfile.runner b/packages/evals/Dockerfile.runner index d6d141b27a..c68b4f80c0 100644 --- a/packages/evals/Dockerfile.runner +++ b/packages/evals/Dockerfile.runner @@ -113,6 
+113,12 @@ RUN pnpm install # Copy source code COPY . ./ +# Validate that .env.local exists and is not empty +RUN if [ ! -f "packages/evals/.env.local" ] || [ ! -s "packages/evals/.env.local" ]; then \ + echo "ERROR: packages/evals/.env.local is missing or empty. Please create it with your API keys before building."; \ + exit 1; \ +fi + # Copy ENV secrets COPY packages/evals/.env.local ./packages/evals/ @@ -120,7 +126,7 @@ COPY packages/evals/.env.local ./packages/evals/ RUN cp -r /roo/.vscode-template /roo/.vscode # Build the Roo Code extension -RUN pnpm build -- --out ../bin/roo-code.vsix \ +RUN pnpm vsix -- --out ../bin/roo-code.vsix \ && code --no-sandbox --user-data-dir /roo/.vscode --install-extension bin/roo-code.vsix # Copy entrypoint script diff --git a/packages/evals/README.md b/packages/evals/README.md index 44dcc7da59..3ed3f852e7 100644 --- a/packages/evals/README.md +++ b/packages/evals/README.md @@ -28,7 +28,7 @@ echo "OPENROUTER_API_KEY=sk-or-v1-[...]" > packages/evals/.env.local Start the evals service: ```sh -cd packages/evals && docker compose --profile server up +docker compose -f packages/evals/docker-compose.yml --profile server --profile runner up --build --scale runner=0 ``` Navigate to [localhost:3000](http://localhost:3000/) in your browser. 
diff --git a/packages/evals/docker-compose.yml b/packages/evals/docker-compose.yml index cea7f418f9..24b9626f74 100644 --- a/packages/evals/docker-compose.yml +++ b/packages/evals/docker-compose.yml @@ -75,4 +75,4 @@ services: stdin_open: true tty: true profiles: - - client + - runner From 0df1da5a1b261273f348c26a92b7c90ed51a3c88 Mon Sep 17 00:00:00 2001 From: Chris Estreich Date: Thu, 5 Jun 2025 11:56:39 -0700 Subject: [PATCH 8/9] Update README.md --- packages/evals/README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/packages/evals/README.md b/packages/evals/README.md index 3ed3f852e7..3c3962ad88 100644 --- a/packages/evals/README.md +++ b/packages/evals/README.md @@ -1,7 +1,5 @@ # Run Roo Code Evals -## Get Started - ### Prerequisites - [Docker Desktop](https://docs.docker.com/desktop/) From dfb892be9e4b9dbaf437a179ba4a7f1c82f657d1 Mon Sep 17 00:00:00 2001 From: Chris Estreich Date: Thu, 5 Jun 2025 12:15:40 -0700 Subject: [PATCH 9/9] Update README.md --- packages/evals/README.md | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/packages/evals/README.md b/packages/evals/README.md index 3c3962ad88..b085ad98af 100644 --- a/packages/evals/README.md +++ b/packages/evals/README.md @@ -29,7 +29,23 @@ Start the evals service: docker compose -f packages/evals/docker-compose.yml --profile server --profile runner up --build --scale runner=0 ``` -Navigate to [localhost:3000](http://localhost:3000/) in your browser. +The initial build process can take a minute or two. Upon success you should see output indicating that a web service is running on [localhost:3000](http://localhost:3000/): +Screenshot 2025-06-05 at 12 05 38 PM + +Additionally, you'll find in Docker Desktop that database and Redis services are running: +Screenshot 2025-06-05 at 12 07 09 PM + +Navigate to [localhost:3000](http://localhost:3000/) in your browser and click the 🚀 button. 
+ +By default an evals run will run all programming exercises in the [Roo Code Evals](https://github.com/RooCodeInc/Roo-Code-Evals) repository with the Claude Sonnet 4 model and default settings. For basic configuration you can specify the LLM to use and any subset of the exercises you'd like. For advanced configuration you can import a Roo Code settings file which will allow you to run the evals with Roo Code configured any way you'd like (this includes custom modes, a footgun prompt, etc.). + +Screenshot 2025-06-05 at 12 08 06 PM + +After clicking "Launch" you should find that a "controller" container has spawned as well as `N` "task" containers where `N` is the value you chose for concurrency: +Screenshot 2025-06-05 at 12 13 29 PM + +The web app's UI should update in real time with the results of the eval run: +Screenshot 2025-06-05 at 12 14 52 PM ## Advanced Usage / Debugging