Skip to content

Commit 8d5dab3

Browse files
authored
GHA evals (#4472)
1 parent bb08fc8 commit 8d5dab3

File tree

17 files changed, with 407 additions and 287 deletions.

.github/workflows/evals.yml

Lines changed: 74 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,74 @@
1+
name: Evals
2+
3+
on:
4+
pull_request:
5+
types: [labeled]
6+
workflow_dispatch:
7+
8+
env:
9+
DOCKER_BUILDKIT: 1
10+
COMPOSE_DOCKER_CLI_BUILD: 1
11+
12+
jobs:
13+
evals:
14+
# Run if triggered manually or if PR has 'evals' label.
15+
if: github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'evals')
16+
runs-on: blacksmith-16vcpu-ubuntu-2404
17+
timeout-minutes: 45
18+
19+
defaults:
20+
run:
21+
working-directory: packages/evals
22+
23+
steps:
24+
- name: Checkout repository
25+
uses: actions/checkout@v4
26+
27+
- name: Set up Docker Buildx
28+
uses: docker/setup-buildx-action@v3
29+
30+
- name: Create environment
31+
run: |
32+
cat > .env.local << EOF
33+
OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY || 'test-key-for-build' }}
34+
EOF
35+
36+
cat > .env.development << EOF
37+
NODE_ENV=development
38+
DATABASE_URL=postgresql://postgres:password@db:5432/evals_development
39+
REDIS_URL=redis://redis:6379
40+
HOST_EXECUTION_METHOD=docker
41+
EOF
42+
43+
- name: Build image
44+
uses: docker/build-push-action@v5
45+
with:
46+
context: .
47+
file: packages/evals/Dockerfile.runner
48+
tags: evals-runner:latest
49+
cache-from: type=gha
50+
cache-to: type=gha,mode=max
51+
push: false
52+
load: true
53+
54+
- name: Tag image
55+
run: docker tag evals-runner:latest evals-runner
56+
57+
- name: Start containers
58+
run: |
59+
docker compose up -d db redis
60+
timeout 60 bash -c 'until docker compose exec -T db pg_isready -U postgres; do sleep 2; done'
61+
timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done'
62+
docker compose run --rm runner sh -c 'nc -z db 5432 && echo "✓ Runner -> Database connection successful"'
63+
docker compose run --rm runner sh -c 'nc -z redis 6379 && echo "✓ Runner -> Redis connection successful"'
64+
docker compose run --rm runner docker ps
65+
66+
- name: Run database migrations
67+
run: docker compose run --rm runner pnpm --filter @roo-code/evals db:migrate
68+
69+
- name: Run evals
70+
run: docker compose run --rm runner pnpm --filter @roo-code/evals cli --ci
71+
72+
- name: Cleanup
73+
if: always()
74+
run: docker compose down -v --remove-orphans
apps/web-evals/src/actions/exercises.ts

Lines changed: 4 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -1,37 +1,22 @@
11
"use server"
22

3-
import * as fs from "fs/promises"
43
import * as path from "path"
54
import { fileURLToPath } from "url"
65

7-
import { type ExerciseLanguage, exerciseLanguages } from "@roo-code/evals"
6+
import { exerciseLanguages, listDirectories } from "@roo-code/evals"
87

98
const __dirname = path.dirname(fileURLToPath(import.meta.url)) // <repo>/apps/web-evals/src/actions
109

11-
const EXERCISES_BASE_PATH = path.resolve(__dirname, "../../../../../evals")
12-
13-
export const listDirectories = async (relativePath: string) => {
14-
try {
15-
const targetPath = path.resolve(__dirname, relativePath)
16-
const entries = await fs.readdir(targetPath, { withFileTypes: true })
17-
return entries.filter((entry) => entry.isDirectory() && !entry.name.startsWith(".")).map((entry) => entry.name)
18-
} catch (error) {
19-
console.error(`Error listing directories at ${relativePath}:`, error)
20-
return []
21-
}
22-
}
10+
const EVALS_REPO_PATH = path.resolve(__dirname, "../../../../../evals")
2311

2412
export const getExercises = async () => {
2513
const result = await Promise.all(
2614
exerciseLanguages.map(async (language) => {
27-
const languagePath = path.join(EXERCISES_BASE_PATH, language)
28-
const exercises = await listDirectories(languagePath)
15+
const languagePath = path.join(EVALS_REPO_PATH, language)
16+
const exercises = await listDirectories(__dirname, languagePath)
2917
return exercises.map((exercise) => `${language}/${exercise}`)
3018
}),
3119
)
3220

3321
return result.flat()
3422
}
35-
36-
export const getExercisesForLanguage = async (language: ExerciseLanguage) =>
37-
listDirectories(path.join(EXERCISES_BASE_PATH, language))

apps/web-evals/src/actions/runs.ts

Lines changed: 7 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,9 @@
11
"use server"
22

3-
import { spawn } from "child_process"
3+
import * as path from "path"
44
import fs from "fs"
5+
import { fileURLToPath } from "url"
6+
import { spawn } from "child_process"
57

68
import { revalidatePath } from "next/cache"
79
import pMap from "p-map"
@@ -12,11 +14,12 @@ import {
1214
createRun as _createRun,
1315
deleteRun as _deleteRun,
1416
createTask,
17+
getExercisesForLanguage,
1518
} from "@roo-code/evals"
1619

1720
import { CreateRun } from "@/lib/schemas"
1821

19-
import { getExercisesForLanguage } from "./exercises"
22+
const EVALS_REPO_PATH = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "../../../../../evals")
2023

2124
// eslint-disable-next-line @typescript-eslint/no-unused-vars
2225
export async function createRun({ suite, exercises = [], systemPrompt, ...values }: CreateRun) {
@@ -37,9 +40,9 @@ export async function createRun({ suite, exercises = [], systemPrompt, ...values
3740
}
3841
} else {
3942
for (const language of exerciseLanguages) {
40-
const exercises = await getExercisesForLanguage(language)
43+
const exercises = await getExercisesForLanguage(EVALS_REPO_PATH, language)
4144

42-
await pMap(exercises, (exercise) => createTask({ ...values, runId: run.id, language, exercise }), {
45+
await pMap(exercises, (exercise) => createTask({ runId: run.id, language, exercise }), {
4346
concurrency: 10,
4447
})
4548
}
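(File path not captured in this extract. The diff below adds a Next.js route handler exporting `GET` that returns a health-status JSON response — presumably a health-check route in apps/web-evals; confirm against the commit.)

Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change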
@@ -0,0 +1,24 @@
1+
import { NextResponse } from "next/server"
2+
3+
export async function GET() {
4+
try {
5+
return NextResponse.json(
6+
{
7+
status: "healthy",
8+
timestamp: new Date().toISOString(),
9+
uptime: process.uptime(),
10+
environment: process.env.NODE_ENV || "production",
11+
},
12+
{ status: 200 },
13+
)
14+
} catch (error) {
15+
return NextResponse.json(
16+
{
17+
status: "unhealthy",
18+
timestamp: new Date().toISOString(),
19+
error: error instanceof Error ? error.message : "Unknown error",
20+
},
21+
{ status: 503 },
22+
)
23+
}
24+
}

packages/evals/Dockerfile.runner

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@ RUN apt update && \
1313
git \
1414
vim \
1515
jq \
16+
netcat-openbsd \
1617
apt-transport-https \
1718
ca-certificates \
1819
gnupg \

packages/evals/Dockerfile.web

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@ RUN npm install -g npm@latest
88
RUN npm install -g npm-run-all
99

1010
# Install system packages
11-
RUN apt update && apt install -y curl git vim jq postgresql-client
11+
RUN apt update && apt install -y curl git vim jq netcat-openbsd postgresql-client
1212

1313
# Install Docker cli
1414
RUN apt install -y apt-transport-https ca-certificates gnupg lsb-release

packages/evals/docker-compose.yml

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -17,8 +17,10 @@ services:
1717
db:
1818
container_name: evals-db
1919
image: postgres:15.4
20-
expose:
21-
- 5432
20+
# expose:
21+
# - 5432
22+
ports:
23+
- "${EVALS_DB_PORT:-5432}:5432"
2224
volumes:
2325
- ./.docker/postgres:/var/lib/postgresql/data
2426
- ./.docker/scripts/postgres:/docker-entrypoint-initdb.d
@@ -38,8 +40,10 @@ services:
3840
redis:
3941
container_name: evals-redis
4042
image: redis:7-alpine
41-
expose:
42-
- 6379
43+
# expose:
44+
# - 6379
45+
ports:
46+
- "${EVALS_REDIS_PORT:-6379}:6379"
4347
volumes:
4448
- ./.docker/redis:/data
4549
command: redis-server --appendonly yes

packages/evals/package.json

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,8 @@
2121
"db:start": "docker compose up -d db",
2222
"db:stop": "docker compose down db",
2323
"redis:start": "docker compose up -d redis",
24-
"redis:stop": "docker compose down redis"
24+
"redis:stop": "docker compose down redis",
25+
"services:start": "docker compose up -d db redis"
2526
},
2627
"dependencies": {
2728
"@roo-code/ipc": "workspace:^",

packages/evals/src/cli/FileLogger.ts

Lines changed: 0 additions & 86 deletions
This file was deleted.

packages/evals/src/cli/index.ts

Lines changed: 14 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,12 @@
11
import * as fs from "fs"
22

3-
import { command, run, number, option } from "cmd-ts"
3+
import { run, command, option, flag, number, boolean } from "cmd-ts"
44

5-
import { exercisesPath } from "../exercises/index.js"
5+
import { EVALS_REPO_PATH } from "../exercises/index.js"
66

7+
import { runCi } from "./runCi.js"
78
import { runEvals } from "./runEvals.js"
8-
import { processTask } from "./processTask.js"
9+
import { processTask } from "./runTask.js"
910

1011
const main = async () => {
1112
await run(
@@ -14,25 +15,22 @@ const main = async () => {
1415
description: "Execute an eval run.",
1516
version: "0.0.0",
1617
args: {
18+
ci: flag({ type: boolean, long: "ci", defaultValue: () => false }),
1719
runId: option({ type: number, long: "runId", short: "r", defaultValue: () => -1 }),
1820
taskId: option({ type: number, long: "taskId", short: "t", defaultValue: () => -1 }),
1921
},
2022
handler: async (args) => {
21-
const { runId, taskId } = args
22-
23-
if (runId === -1 && taskId === -1) {
24-
throw new Error("Either runId or taskId must be provided.")
25-
}
26-
27-
if (runId !== -1 && taskId !== -1) {
28-
throw new Error("Only one of runId or taskId must be provided.")
29-
}
23+
const { runId, taskId, ci } = args
3024

3125
try {
32-
if (runId !== -1) {
26+
if (ci) {
27+
await runCi({ concurrency: 3, exercisesPerLanguage: 5 })
28+
} else if (runId !== -1) {
3329
await runEvals(runId)
34-
} else {
30+
} else if (taskId !== -1) {
3531
await processTask({ taskId })
32+
} else {
33+
throw new Error("Either runId or taskId must be provided.")
3634
}
3735
} catch (error) {
3836
console.error(error)
@@ -46,9 +44,9 @@ const main = async () => {
4644
process.exit(0)
4745
}
4846

49-
if (!fs.existsSync(exercisesPath)) {
47+
if (!fs.existsSync(EVALS_REPO_PATH)) {
5048
console.error(
51-
`Exercises do not exist at ${exercisesPath}. Please run "git clone https://github.com/RooCodeInc/Roo-Code-Evals.git evals".`,
49+
`Exercises do not exist at ${EVALS_REPO_PATH}. Please run "git clone https://github.com/RooCodeInc/Roo-Code-Evals.git evals".`,
5250
)
5351

5452
process.exit(1)

0 commit comments

Comments
 (0)