From 3a3f7d5e67b7d18338b2b197e99203c428ccc21b Mon Sep 17 00:00:00 2001 From: cte Date: Fri, 11 Jul 2025 12:07:38 -0700 Subject: [PATCH 1/2] Minor fixes for local (non-Docker) evals --- packages/evals/README.md | 16 ++++++++-------- packages/evals/docker-compose.yml | 4 ---- packages/evals/src/cli/runEvals.ts | 6 +++--- packages/evals/src/cli/runTask.ts | 5 +++-- packages/types/src/global-settings.ts | 2 +- 5 files changed, 15 insertions(+), 18 deletions(-) diff --git a/packages/evals/README.md b/packages/evals/README.md index b26945dc8c9..98354fb3b3b 100644 --- a/packages/evals/README.md +++ b/packages/evals/README.md @@ -26,26 +26,26 @@ echo "OPENROUTER_API_KEY=sk-or-v1-[...]" > packages/evals/.env.local Start the evals service: ```sh -docker compose -f packages/evals/docker-compose.yml --profile server --profile runner up --build --scale runner=0 +pnpm evals ``` -The initial build process can take a minute or two. Upon success you should see output indicating that a web service is running on [localhost:3000](http://localhost:3000/): -Screenshot 2025-06-05 at 12 05 38 PM +The initial build process can take a minute or two. Upon success you should see ouput indicating that a web service is running on localhost:3000: + Additionally, you'll find in Docker Desktop that database and redis services are running: -Screenshot 2025-06-05 at 12 07 09 PM + Navigate to [localhost:3446](http://localhost:3446/) in your browser and click the 🚀 button. By default a evals run will run all programming exercises in [Roo Code Evals](https://github.com/RooCodeInc/Roo-Code-Evals) repository with the Claude Sonnet 4 model and default settings. For basic configuration you can specify the LLM to use and any subset of the exercises you'd like. For advanced configuration you can import a Roo Code settings file which will allow you to run the evals with Roo Code configured any way you'd like (this includes custom modes, a footgun prompt, etc). -Screenshot 2025-06-05 at 12 08 06 PM + After clicking "Launch" you should find that a "controller" container has spawned as well as `N` "task" containers where `N` is the value you chose for concurrency: -Screenshot 2025-06-05 at 12 13 29 PM + The web app's UI should update in realtime with the results of the eval run: -Screenshot 2025-06-05 at 12 14 52 PM + ## Resource Usage @@ -60,7 +60,7 @@ CPU Limit = 2 * concurrency The memory and CPU limits can be set from the "Resources" section of the Docker Desktop settings: -Screenshot 2025-06-06 at 8 54 24 AM + ## Stopping diff --git a/packages/evals/docker-compose.yml b/packages/evals/docker-compose.yml index 3b251f8f95c..74c25cf2609 100644 --- a/packages/evals/docker-compose.yml +++ b/packages/evals/docker-compose.yml @@ -17,8 +17,6 @@ services: db: container_name: evals-db image: postgres:15.4 - # expose: - # - 5432 ports: - "${EVALS_DB_PORT:-5432}:5432" volumes: @@ -40,8 +38,6 @@ services: redis: container_name: evals-redis image: redis:7-alpine - # expose: - # - 6379 ports: - "${EVALS_REDIS_PORT:-6379}:6379" volumes: diff --git a/packages/evals/src/cli/runEvals.ts b/packages/evals/src/cli/runEvals.ts index 00199bbb444..6d13abf5a8f 100644 --- a/packages/evals/src/cli/runEvals.ts +++ b/packages/evals/src/cli/runEvals.ts @@ -20,16 +20,16 @@ export const runEvals = async (runId: number) => { throw new Error(`Run ${run.id} has no tasks.`) } + const containerized = isDockerContainer() + const logger = new Logger({ - logDir: `/var/log/evals/runs/${run.id}`, + logDir: containerized ? `/var/log/evals/runs/${run.id}` : `/tmp/evals/runs/${run.id}`, filename: `controller.log`, tag: getTag("runEvals", { run }), }) logger.info(`running ${tasks.length} task(s)`) - const containerized = isDockerContainer() - if (!containerized) { await resetEvalsRepo({ run, cwd: EVALS_REPO_PATH }) } diff --git a/packages/evals/src/cli/runTask.ts b/packages/evals/src/cli/runTask.ts index 0ee37a8a6eb..507d614ea5a 100644 --- a/packages/evals/src/cli/runTask.ts +++ b/packages/evals/src/cli/runTask.ts @@ -44,10 +44,12 @@ export const processTask = async ({ taskId, logger }: { taskId: number; logger?: const run = await findRun(task.runId) await registerRunner({ runId: run.id, taskId }) + const containerized = isDockerContainer() + logger = logger || new Logger({ - logDir: `/var/log/evals/runs/${run.id}`, + logDir: containerized ? `/var/log/evals/runs/${run.id}` : `/tmp/evals/runs/${run.id}`, filename: `${language}-${exercise}.log`, tag: getTag("runTask", { run, task }), }) @@ -298,7 +300,6 @@ export const runTask = async ({ run, task, publish, logger }: RunTaskOptions) => ...run.settings, // Allow the provided settings to override `openRouterApiKey`. }, text: prompt, - newTab: true, }, }) diff --git a/packages/types/src/global-settings.ts b/packages/types/src/global-settings.ts index 79a09ff0175..646c587e8ec 100644 --- a/packages/types/src/global-settings.ts +++ b/packages/types/src/global-settings.ts @@ -177,7 +177,7 @@ export const EVALS_SETTINGS: RooCodeSettings = { apiProvider: "openrouter", openRouterUseMiddleOutTransform: false, - lastShownAnnouncementId: "may-29-2025-3-19", + lastShownAnnouncementId: "jul-09-2025-3-23-0", pinnedApiConfigs: {}, From 5cacfa3ce3e5335e57132445eac64b46b3317f9c Mon Sep 17 00:00:00 2001 From: Chris Estreich Date: Fri, 11 Jul 2025 12:10:55 -0700 Subject: [PATCH 2/2] Update packages/evals/README.md Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com> --- packages/evals/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/evals/README.md b/packages/evals/README.md index 98354fb3b3b..7ef5be139bf 100644 --- a/packages/evals/README.md +++ b/packages/evals/README.md @@ -29,7 +29,7 @@ Start the evals service: pnpm evals ``` -The initial build process can take a minute or two. Upon success you should see ouput indicating that a web service is running on localhost:3000: +The initial build process can take a minute or two. Upon success you should see output indicating that a web service is running on localhost:3000: Additionally, you'll find in Docker Desktop that database and redis services are running: