Skip to content
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
139 changes: 139 additions & 0 deletions .github/workflows/evals.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
# Evals workflow: builds the evals Docker images, starts the compose stack,
# smoke-tests connectivity between the containers, and then runs the evals CLI.
name: Evals

# Triggers: a label being added to a pull request, or a manual dispatch.
on:
pull_request:
types: [labeled]
workflow_dispatch:

# Turn on BuildKit for Docker builds, including builds driven by Docker Compose.
env:
DOCKER_BUILDKIT: 1
COMPOSE_DOCKER_CLI_BUILD: 1

jobs:
evals:
# Run when triggered manually, or when the label attached to the triggering `labeled` event
# has a name containing 'evals' (`contains` on a string is a substring match).
if: github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'evals')
runs-on: ubuntu-latest
# Fail the job if it has not finished within 30 minutes.
timeout-minutes: 30

steps:
# Check out the repository; later steps reference files under packages/evals.
- name: Checkout repository
uses: actions/checkout@v4

# Set up Docker Buildx for the image build steps that follow.
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

# Write the env files under packages/evals:
# - .env.local gets OPENROUTER_API_KEY from the repository secret, falling back to the
#   placeholder 'test-key-for-build' when the secret expression evaluates to empty.
# - .env.development points DATABASE_URL and REDIS_URL at the `db` and `redis` hosts
#   and sets HOST_EXECUTION_METHOD=docker.
- name: Create environment
run: |
cd packages/evals

cat > .env.local << EOF
OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY || 'test-key-for-build' }}
EOF

cat > .env.development << EOF
NODE_ENV=development
DATABASE_URL=postgresql://postgres:password@db:5432/evals_development
REDIS_URL=redis://redis:6379
HOST_EXECUTION_METHOD=docker
EOF

# Build the web image from packages/evals/Dockerfile.web using `.` as the build context.
# The result is loaded into the local Docker daemon (load: true) and not pushed;
# layers are cached through the GitHub Actions cache backend (type=gha).
- name: Build web image
uses: docker/build-push-action@v5
with:
context: .
file: packages/evals/Dockerfile.web
tags: evals-web:latest
cache-from: type=gha
cache-to: type=gha,mode=max
push: false
load: true

# Same settings as the web image, for the runner image built from packages/evals/Dockerfile.runner.
- name: Build runner image
uses: docker/build-push-action@v5
with:
context: .
file: packages/evals/Dockerfile.runner
tags: evals-runner:latest
cache-from: type=gha
cache-to: type=gha,mode=max
push: false
load: true

# NOTE(review): an image name without a tag defaults to ':latest', so these commands appear to
# re-apply the tags the build steps already set — confirm whether this step is still needed.
- name: Tag images
run: |
cd packages/evals
docker tag evals-web:latest evals-web
docker tag evals-runner:latest evals-runner

# Start the compose stack with the `server` profile enabled, in detached mode.
- name: Start containers
run: |
cd packages/evals
docker compose --profile server up -d

# Block until the stack is usable: poll PostgreSQL, Redis and the web health endpoint
# (each bounded by a 60-second timeout, retrying every 2 seconds), and check that the
# web container can open TCP connections to db:5432 and redis:6379.
- name: Wait for containers
run: |
cd packages/evals

# Wait for services
echo "Waiting for PostgreSQL..."
timeout 60 bash -c 'until docker compose exec -T db pg_isready -U postgres; do sleep 2; done'

echo "Waiting for Redis..."
timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done'

echo "Testing database connection from web container..."
docker compose exec -T web sh -c 'nc -z db 5432 && echo "✓ Database connection successful"'

echo "Testing Redis connection from web container..."
docker compose exec -T web sh -c 'nc -z redis 6379 && echo "✓ Redis connection successful"'

echo "Testing web service startup..."
timeout 60 bash -c 'until curl -f http://localhost:3000/api/health 2>/dev/null; do echo "Waiting for web service..."; sleep 2; done'

echo "✓ Web service is healthy"
echo "Health check response:"
curl -s http://localhost:3000/api/health | jq 2>/dev/null || curl -s http://localhost:3000/api/health

# From one-off runner containers (removed afterwards via --rm), check TCP reachability
# of db:5432, redis:6379 and web:3000.
- name: Test runner container networking
run: |
cd packages/evals

echo "Testing runner container can connect to services..."
docker compose run --rm runner sh -c 'nc -z db 5432 && echo "✓ Runner -> Database connection successful"'
docker compose run --rm runner sh -c 'nc -z redis 6379 && echo "✓ Runner -> Redis connection successful"'
docker compose run --rm runner sh -c 'nc -z web 3000 && echo "✓ Runner -> Web service connection successful"'

# Check that the docker CLI exists in the runner image and that `docker ps` succeeds there.
# NOTE(review): presumably this relies on the Docker socket being mounted into the runner
# service by docker-compose.yml — verify against the compose file.
- name: Test Docker socket access
run: |
cd packages/evals

echo "Testing Docker socket access in runner..."
docker compose run --rm runner docker --version
docker compose run --rm runner docker ps

# Diagnostics; `if: always()` makes this step run even when an earlier step failed.
# NOTE(review): the `docker compose exec` calls below look like they would fail this step
# if the db or redis container is not running — confirm that is acceptable for a diagnostic.
- name: Show service status
if: always()
run: |
cd packages/evals

echo "=== Service Status ==="
docker compose ps

echo "=== Network Information ==="
docker network ls | grep evals || echo "No evals network found"

echo "=== Container Information ==="
docker compose exec -T db sh -c 'echo "Database container hostname: $(hostname)"'
docker compose exec -T redis sh -c 'echo "Redis container hostname: $(hostname)"'

# Run the @roo-code/evals CLI with the --ci flag inside a one-off runner container.
- name: Run evals
run: |
cd packages/evals
docker compose run --rm runner pnpm --filter @roo-code/evals cli --ci

# Always tear the stack down, removing volumes (-v) and orphaned containers.
- name: Cleanup
if: always()
run: |
cd packages/evals
docker compose down -v --remove-orphans
23 changes: 4 additions & 19 deletions apps/web-evals/src/actions/exercises.ts
Original file line number Diff line number Diff line change
@@ -1,37 +1,22 @@
"use server"

import * as fs from "fs/promises"
import * as path from "path"
import { fileURLToPath } from "url"

import { type ExerciseLanguage, exerciseLanguages } from "@roo-code/evals"
import { exerciseLanguages, listDirectories } from "@roo-code/evals"

const __dirname = path.dirname(fileURLToPath(import.meta.url)) // <repo>/apps/web-evals/src/actions

const EXERCISES_BASE_PATH = path.resolve(__dirname, "../../../../../evals")

/**
 * Lists the names of the non-hidden subdirectories of a directory.
 *
 * @param relativePath - Directory to inspect, resolved against this module's `__dirname`.
 * @returns The names of entries that are directories and do not start with ".".
 * If anything fails, the error is logged and an empty array is returned instead.
 */
export const listDirectories = async (relativePath: string) => {
	try {
		const absolutePath = path.resolve(__dirname, relativePath)
		const dirents = await fs.readdir(absolutePath, { withFileTypes: true })
		const names: string[] = []

		for (const dirent of dirents) {
			// Skip plain files and dot-prefixed (hidden) directories.
			if (!dirent.isDirectory() || dirent.name.startsWith(".")) {
				continue
			}

			names.push(dirent.name)
		}

		return names
	} catch (error) {
		// Best-effort listing: report the problem and fall back to "no directories".
		console.error(`Error listing directories at ${relativePath}:`, error)
		return []
	}
}
const EVALS_REPO_PATH = path.resolve(__dirname, "../../../../../evals")

export const getExercises = async () => {
const result = await Promise.all(
exerciseLanguages.map(async (language) => {
const languagePath = path.join(EXERCISES_BASE_PATH, language)
const exercises = await listDirectories(languagePath)
const languagePath = path.join(EVALS_REPO_PATH, language)
const exercises = await listDirectories(__dirname, languagePath)
return exercises.map((exercise) => `${language}/${exercise}`)
}),
)

return result.flat()
}

export const getExercisesForLanguage = async (language: ExerciseLanguage) =>
listDirectories(path.join(EXERCISES_BASE_PATH, language))
11 changes: 7 additions & 4 deletions apps/web-evals/src/actions/runs.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
"use server"

import { spawn } from "child_process"
import * as path from "path"
import fs from "fs"
import { fileURLToPath } from "url"
import { spawn } from "child_process"

import { revalidatePath } from "next/cache"
import pMap from "p-map"
Expand All @@ -12,11 +14,12 @@ import {
createRun as _createRun,
deleteRun as _deleteRun,
createTask,
getExercisesForLanguage,
} from "@roo-code/evals"

import { CreateRun } from "@/lib/schemas"

import { getExercisesForLanguage } from "./exercises"
const EVALS_REPO_PATH = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "../../../../../evals")

// eslint-disable-next-line @typescript-eslint/no-unused-vars
export async function createRun({ suite, exercises = [], systemPrompt, ...values }: CreateRun) {
Expand All @@ -37,9 +40,9 @@ export async function createRun({ suite, exercises = [], systemPrompt, ...values
}
} else {
for (const language of exerciseLanguages) {
const exercises = await getExercisesForLanguage(language)
const exercises = await getExercisesForLanguage(EVALS_REPO_PATH, language)

await pMap(exercises, (exercise) => createTask({ ...values, runId: run.id, language, exercise }), {
await pMap(exercises, (exercise) => createTask({ runId: run.id, language, exercise }), {
concurrency: 10,
})
}
Expand Down
24 changes: 24 additions & 0 deletions apps/web-evals/src/app/api/health/route.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import { NextResponse } from "next/server"

/**
 * Health-check endpoint.
 *
 * @returns A 200 JSON response with `status: "healthy"`, the current ISO timestamp,
 * the process uptime and the value of NODE_ENV (defaulting to "production").
 * If building that response throws, a 503 JSON response with `status: "unhealthy"`,
 * the current ISO timestamp and the error message is returned instead.
 */
export async function GET() {
	try {
		const healthyBody = {
			status: "healthy",
			timestamp: new Date().toISOString(),
			uptime: process.uptime(),
			environment: process.env.NODE_ENV || "production",
		}

		return NextResponse.json(healthyBody, { status: 200 })
	} catch (error) {
		const failedAt = new Date().toISOString()
		// Only Error instances carry a usable message; anything else is reported generically.
		const reason = error instanceof Error ? error.message : "Unknown error"

		return NextResponse.json({ status: "unhealthy", timestamp: failedAt, error: reason }, { status: 503 })
	}
}
1 change: 1 addition & 0 deletions packages/evals/Dockerfile.runner
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ RUN apt update && \
git \
vim \
jq \
netcat-openbsd \
apt-transport-https \
ca-certificates \
gnupg \
Expand Down
2 changes: 1 addition & 1 deletion packages/evals/Dockerfile.web
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ RUN npm install -g npm@latest
RUN npm install -g npm-run-all

# Install system packages
RUN apt update && apt install -y curl git vim jq postgresql-client
RUN apt update && apt install -y curl git vim jq netcat-openbsd postgresql-client

# Install Docker cli
RUN apt install -y apt-transport-https ca-certificates gnupg lsb-release
Expand Down
12 changes: 8 additions & 4 deletions packages/evals/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@ services:
db:
container_name: evals-db
image: postgres:15.4
expose:
- 5432
# expose:
# - 5432
ports:
- "${EVALS_DB_PORT:-5432}:5432"
Copy link
Collaborator Author

@cte cte Jun 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Turns out that we still need this if you want to run the evals natively instead of inside Docker.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, forgot about that case.

volumes:
- ./.docker/postgres:/var/lib/postgresql/data
- ./.docker/scripts/postgres:/docker-entrypoint-initdb.d
Expand All @@ -38,8 +40,10 @@ services:
redis:
container_name: evals-redis
image: redis:7-alpine
expose:
- 6379
# expose:
# - 6379
ports:
- "${EVALS_REDIS_PORT:-6379}:6379"
volumes:
- ./.docker/redis:/data
command: redis-server --appendonly yes
Expand Down
3 changes: 2 additions & 1 deletion packages/evals/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@
"db:start": "docker compose up -d db",
"db:stop": "docker compose down db",
"redis:start": "docker compose up -d redis",
"redis:stop": "docker compose down redis"
"redis:stop": "docker compose down redis",
"services:start": "docker compose up -d db redis"
},
"dependencies": {
"@roo-code/ipc": "workspace:^",
Expand Down
86 changes: 0 additions & 86 deletions packages/evals/src/cli/FileLogger.ts

This file was deleted.

Loading