Skip to content
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 83 additions & 0 deletions .github/workflows/evals.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Evals workflow.
#
# Builds the evals runner image, starts Postgres and Redis with docker compose,
# smoke-tests connectivity and Docker access from the runner container, and then
# runs the evals CLI in CI mode. Containers and volumes are always torn down.
name: Evals

on:
  pull_request:
    types: [labeled]
  workflow_dispatch:

env:
  DOCKER_BUILDKIT: 1
  COMPOSE_DOCKER_CLI_BUILD: 1

jobs:
  evals:
    # Run if triggered manually, or if the label that was just added to the PR
    # contains 'evals'. Note: `github.event.label` is the label from the
    # `labeled` event (not the PR's full label set) and `contains()` on a
    # string is a substring match.
    if: github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'evals')
    runs-on: blacksmith-16vcpu-ubuntu-2404
    timeout-minutes: 45

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      # The heredoc delimiters are quoted so the shell writes the content
      # verbatim; otherwise a `$`, backtick or backslash inside the substituted
      # secret would be expanded by the shell.
      - name: Create environment
        run: |
          cd packages/evals
          cat > .env.local << 'EOF'
          OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY || 'test-key-for-build' }}
          EOF
          cat > .env.development << 'EOF'
          NODE_ENV=development
          DATABASE_URL=postgresql://postgres:password@db:5432/evals_development
          REDIS_URL=redis://redis:6379
          HOST_EXECUTION_METHOD=docker
          EOF

      # `load: true` makes the built image available to the local Docker daemon
      # so docker compose can use it without pulling.
      - name: Build image
        uses: docker/build-push-action@v5
        with:
          context: .
          file: packages/evals/Dockerfile.runner
          tags: evals-runner:latest
          cache-from: type=gha
          cache-to: type=gha,mode=max
          push: false
          load: true

      - name: Tag image
        run: |
          cd packages/evals
          docker tag evals-runner:latest evals-runner

      # Only `db` and `redis` are started here, so readiness is polled via
      # `exec` on those two running services. Connectivity from an application
      # container is verified in the "Test runner" step using `run`, because
      # `docker compose exec` only works against a container that is already
      # running (the `web` service is never started in this workflow).
      - name: Start containers
        run: |
          cd packages/evals
          docker compose up -d db redis
          timeout 60 bash -c 'until docker compose exec -T db pg_isready -U postgres; do sleep 2; done'
          timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done'

      # Smoke tests: the runner can reach Postgres and Redis over the compose
      # network and can talk to a Docker daemon.
      - name: Test runner
        run: |
          cd packages/evals
          docker compose run --rm runner sh -c 'nc -z db 5432 && echo "✓ Runner -> Database connection successful"'
          docker compose run --rm runner sh -c 'nc -z redis 6379 && echo "✓ Runner -> Redis connection successful"'
          docker compose run --rm runner docker --version
          docker compose run --rm runner docker ps

      - name: Run evals
        run: |
          cd packages/evals
          docker compose run --rm runner pnpm --filter @roo-code/evals cli --ci

      # Always tear down containers, volumes and orphans, even when an earlier
      # step failed.
      - name: Cleanup
        if: always()
        run: |
          cd packages/evals
          docker compose down -v --remove-orphans
23 changes: 4 additions & 19 deletions apps/web-evals/src/actions/exercises.ts
Original file line number Diff line number Diff line change
@@ -1,37 +1,22 @@
"use server"

import * as fs from "fs/promises"
import * as path from "path"
import { fileURLToPath } from "url"

import { type ExerciseLanguage, exerciseLanguages } from "@roo-code/evals"
import { exerciseLanguages, listDirectories } from "@roo-code/evals"

const __dirname = path.dirname(fileURLToPath(import.meta.url)) // <repo>/apps/web-evals/src/actions

const EXERCISES_BASE_PATH = path.resolve(__dirname, "../../../../../evals")

export const listDirectories = async (relativePath: string) => {
try {
const targetPath = path.resolve(__dirname, relativePath)
const entries = await fs.readdir(targetPath, { withFileTypes: true })
return entries.filter((entry) => entry.isDirectory() && !entry.name.startsWith(".")).map((entry) => entry.name)
} catch (error) {
console.error(`Error listing directories at ${relativePath}:`, error)
return []
}
}
const EVALS_REPO_PATH = path.resolve(__dirname, "../../../../../evals")

export const getExercises = async () => {
const result = await Promise.all(
exerciseLanguages.map(async (language) => {
const languagePath = path.join(EXERCISES_BASE_PATH, language)
const exercises = await listDirectories(languagePath)
const languagePath = path.join(EVALS_REPO_PATH, language)
const exercises = await listDirectories(__dirname, languagePath)
return exercises.map((exercise) => `${language}/${exercise}`)
}),
)

return result.flat()
}

export const getExercisesForLanguage = async (language: ExerciseLanguage) =>
listDirectories(path.join(EXERCISES_BASE_PATH, language))
11 changes: 7 additions & 4 deletions apps/web-evals/src/actions/runs.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
"use server"

import { spawn } from "child_process"
import * as path from "path"
import fs from "fs"
import { fileURLToPath } from "url"
import { spawn } from "child_process"

import { revalidatePath } from "next/cache"
import pMap from "p-map"
Expand All @@ -12,11 +14,12 @@ import {
createRun as _createRun,
deleteRun as _deleteRun,
createTask,
getExercisesForLanguage,
} from "@roo-code/evals"

import { CreateRun } from "@/lib/schemas"

import { getExercisesForLanguage } from "./exercises"
const EVALS_REPO_PATH = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "../../../../../evals")

// eslint-disable-next-line @typescript-eslint/no-unused-vars
export async function createRun({ suite, exercises = [], systemPrompt, ...values }: CreateRun) {
Expand All @@ -37,9 +40,9 @@ export async function createRun({ suite, exercises = [], systemPrompt, ...values
}
} else {
for (const language of exerciseLanguages) {
const exercises = await getExercisesForLanguage(language)
const exercises = await getExercisesForLanguage(EVALS_REPO_PATH, language)

await pMap(exercises, (exercise) => createTask({ ...values, runId: run.id, language, exercise }), {
await pMap(exercises, (exercise) => createTask({ runId: run.id, language, exercise }), {
concurrency: 10,
})
}
Expand Down
24 changes: 24 additions & 0 deletions apps/web-evals/src/app/api/health/route.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import { NextResponse } from "next/server"

/**
 * Health-check endpoint.
 *
 * Replies with HTTP 200 and a JSON body holding the status, the current ISO
 * timestamp, the process uptime and NODE_ENV (falling back to "production"
 * when NODE_ENV is unset or empty). If assembling that reply throws, replies
 * with HTTP 503 and the error message instead.
 */
export async function GET() {
	try {
		const healthyBody = {
			status: "healthy",
			timestamp: new Date().toISOString(),
			uptime: process.uptime(),
			environment: process.env.NODE_ENV || "production",
		}

		return NextResponse.json(healthyBody, { status: 200 })
	} catch (error) {
		const unhealthyBody = {
			status: "unhealthy",
			timestamp: new Date().toISOString(),
			// Non-Error throwables carry no reliable message, so use a fixed placeholder.
			error: error instanceof Error ? error.message : "Unknown error",
		}

		return NextResponse.json(unhealthyBody, { status: 503 })
	}
}
1 change: 1 addition & 0 deletions packages/evals/Dockerfile.runner
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ RUN apt update && \
git \
vim \
jq \
netcat-openbsd \
apt-transport-https \
ca-certificates \
gnupg \
Expand Down
2 changes: 1 addition & 1 deletion packages/evals/Dockerfile.web
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ RUN npm install -g npm@latest
RUN npm install -g npm-run-all

# Install system packages
RUN apt update && apt install -y curl git vim jq postgresql-client
RUN apt update && apt install -y curl git vim jq netcat-openbsd postgresql-client

# Install Docker cli
RUN apt install -y apt-transport-https ca-certificates gnupg lsb-release
Expand Down
12 changes: 8 additions & 4 deletions packages/evals/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@ services:
db:
container_name: evals-db
image: postgres:15.4
expose:
- 5432
# expose:
# - 5432
ports:
- "${EVALS_DB_PORT:-5432}:5432"
Copy link
Collaborator Author

@cte cte Jun 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Turns out that we still need this if you want to run the evals natively instead of inside Docker.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, forgot about that case.

volumes:
- ./.docker/postgres:/var/lib/postgresql/data
- ./.docker/scripts/postgres:/docker-entrypoint-initdb.d
Expand All @@ -38,8 +40,10 @@ services:
redis:
container_name: evals-redis
image: redis:7-alpine
expose:
- 6379
# expose:
# - 6379
ports:
- "${EVALS_REDIS_PORT:-6379}:6379"
volumes:
- ./.docker/redis:/data
command: redis-server --appendonly yes
Expand Down
3 changes: 2 additions & 1 deletion packages/evals/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@
"db:start": "docker compose up -d db",
"db:stop": "docker compose down db",
"redis:start": "docker compose up -d redis",
"redis:stop": "docker compose down redis"
"redis:stop": "docker compose down redis",
"services:start": "docker compose up -d db redis"
},
"dependencies": {
"@roo-code/ipc": "workspace:^",
Expand Down
86 changes: 0 additions & 86 deletions packages/evals/src/cli/FileLogger.ts

This file was deleted.

30 changes: 14 additions & 16 deletions packages/evals/src/cli/index.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import * as fs from "fs"

import { command, run, number, option } from "cmd-ts"
import { run, command, option, flag, number, boolean } from "cmd-ts"

import { exercisesPath } from "../exercises/index.js"
import { EVALS_REPO_PATH } from "../exercises/index.js"

import { runCi } from "./runCi.js"
import { runEvals } from "./runEvals.js"
import { processTask } from "./processTask.js"
import { processTask } from "./runTask.js"

const main = async () => {
await run(
Expand All @@ -14,25 +15,22 @@ const main = async () => {
description: "Execute an eval run.",
version: "0.0.0",
args: {
ci: flag({ type: boolean, long: "ci", defaultValue: () => false }),
runId: option({ type: number, long: "runId", short: "r", defaultValue: () => -1 }),
taskId: option({ type: number, long: "taskId", short: "t", defaultValue: () => -1 }),
},
handler: async (args) => {
const { runId, taskId } = args

if (runId === -1 && taskId === -1) {
throw new Error("Either runId or taskId must be provided.")
}

if (runId !== -1 && taskId !== -1) {
throw new Error("Only one of runId or taskId must be provided.")
}
const { runId, taskId, ci } = args

try {
if (runId !== -1) {
if (ci) {
await runCi({ concurrency: 3, exercisesPerLanguage: 5 })
} else if (runId !== -1) {
await runEvals(runId)
} else {
} else if (taskId !== -1) {
await processTask({ taskId })
} else {
throw new Error("Either runId or taskId must be provided.")
}
} catch (error) {
console.error(error)
Expand All @@ -46,9 +44,9 @@ const main = async () => {
process.exit(0)
}

if (!fs.existsSync(exercisesPath)) {
if (!fs.existsSync(EVALS_REPO_PATH)) {
console.error(
`Exercises do not exist at ${exercisesPath}. Please run "git clone https://github.com/RooCodeInc/Roo-Code-Evals.git evals".`,
`Exercises do not exist at ${EVALS_REPO_PATH}. Please run "git clone https://github.com/RooCodeInc/Roo-Code-Evals.git evals".`,
)

process.exit(1)
Expand Down
Loading
Loading