diff --git a/packages/evals/README.md b/packages/evals/README.md
index b26945dc8c9..7ef5be139bf 100644
--- a/packages/evals/README.md
+++ b/packages/evals/README.md
@@ -26,26 +26,26 @@ echo "OPENROUTER_API_KEY=sk-or-v1-[...]" > packages/evals/.env.local
Start the evals service:
```sh
-docker compose -f packages/evals/docker-compose.yml --profile server --profile runner up --build --scale runner=0
+pnpm evals
```
-The initial build process can take a minute or two. Upon success you should see output indicating that a web service is running on [localhost:3000](http://localhost:3000/):
-
+The initial build process can take a minute or two. Upon success you should see output indicating that a web service is running on localhost:3000:
+
Additionally, you'll find in Docker Desktop that the database and Redis services are running:
-
+
Navigate to [localhost:3446](http://localhost:3446/) in your browser and click the 🚀 button.
By default an evals run will run all programming exercises in the [Roo Code Evals](https://github.com/RooCodeInc/Roo-Code-Evals) repository with the Claude Sonnet 4 model and default settings. For basic configuration you can specify the LLM to use and any subset of the exercises you'd like. For advanced configuration you can import a Roo Code settings file which will allow you to run the evals with Roo Code configured any way you'd like (this includes custom modes, a footgun prompt, etc.).
-
+
After clicking "Launch" you should find that a "controller" container has spawned as well as `N` "task" containers where `N` is the value you chose for concurrency:
-
+
The web app's UI should update in realtime with the results of the eval run:
-
+
## Resource Usage
@@ -60,7 +60,7 @@ CPU Limit = 2 * concurrency
The memory and CPU limits can be set from the "Resources" section of the Docker Desktop settings:
-
+
## Stopping
diff --git a/packages/evals/docker-compose.yml b/packages/evals/docker-compose.yml
index 3b251f8f95c..74c25cf2609 100644
--- a/packages/evals/docker-compose.yml
+++ b/packages/evals/docker-compose.yml
@@ -17,8 +17,6 @@ services:
db:
container_name: evals-db
image: postgres:15.4
- # expose:
- # - 5432
ports:
- "${EVALS_DB_PORT:-5432}:5432"
volumes:
@@ -40,8 +38,6 @@ services:
redis:
container_name: evals-redis
image: redis:7-alpine
- # expose:
- # - 6379
ports:
- "${EVALS_REDIS_PORT:-6379}:6379"
volumes:
diff --git a/packages/evals/src/cli/runEvals.ts b/packages/evals/src/cli/runEvals.ts
index 00199bbb444..6d13abf5a8f 100644
--- a/packages/evals/src/cli/runEvals.ts
+++ b/packages/evals/src/cli/runEvals.ts
@@ -20,16 +20,16 @@ export const runEvals = async (runId: number) => {
throw new Error(`Run ${run.id} has no tasks.`)
}
+ const containerized = isDockerContainer()
+
const logger = new Logger({
- logDir: `/var/log/evals/runs/${run.id}`,
+ logDir: containerized ? `/var/log/evals/runs/${run.id}` : `/tmp/evals/runs/${run.id}`,
filename: `controller.log`,
tag: getTag("runEvals", { run }),
})
logger.info(`running ${tasks.length} task(s)`)
- const containerized = isDockerContainer()
-
if (!containerized) {
await resetEvalsRepo({ run, cwd: EVALS_REPO_PATH })
}
diff --git a/packages/evals/src/cli/runTask.ts b/packages/evals/src/cli/runTask.ts
index 0ee37a8a6eb..507d614ea5a 100644
--- a/packages/evals/src/cli/runTask.ts
+++ b/packages/evals/src/cli/runTask.ts
@@ -44,10 +44,12 @@ export const processTask = async ({ taskId, logger }: { taskId: number; logger?:
const run = await findRun(task.runId)
await registerRunner({ runId: run.id, taskId })
+ const containerized = isDockerContainer()
+
logger =
logger ||
new Logger({
- logDir: `/var/log/evals/runs/${run.id}`,
+ logDir: containerized ? `/var/log/evals/runs/${run.id}` : `/tmp/evals/runs/${run.id}`,
filename: `${language}-${exercise}.log`,
tag: getTag("runTask", { run, task }),
})
@@ -298,7 +300,6 @@ export const runTask = async ({ run, task, publish, logger }: RunTaskOptions) =>
...run.settings, // Allow the provided settings to override `openRouterApiKey`.
},
text: prompt,
- newTab: true,
},
})
diff --git a/packages/types/src/global-settings.ts b/packages/types/src/global-settings.ts
index 79a09ff0175..646c587e8ec 100644
--- a/packages/types/src/global-settings.ts
+++ b/packages/types/src/global-settings.ts
@@ -177,7 +177,7 @@ export const EVALS_SETTINGS: RooCodeSettings = {
apiProvider: "openrouter",
openRouterUseMiddleOutTransform: false,
- lastShownAnnouncementId: "may-29-2025-3-19",
+ lastShownAnnouncementId: "jul-09-2025-3-23-0",
pinnedApiConfigs: {},