-
Notifications
You must be signed in to change notification settings - Fork 2.6k
GHA evals #4472
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
GHA evals #4472
Changes from 19 commits
91ad6ac
f5d3852
1465297
1b02cce
dd568b1
e369005
92e70b1
70a940f
1c77f66
88f93ca
2bc65ca
6adb2a6
43a4110
bacb17d
8453c4a
1d305ab
1ffe324
d0e8f2b
e2ced71
e1132c7
41df128
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,83 @@ | ||
| name: Evals | ||
|
|
||
| on: | ||
| pull_request: | ||
| types: [labeled] | ||
| workflow_dispatch: | ||
|
|
||
| env: | ||
| DOCKER_BUILDKIT: 1 | ||
| COMPOSE_DOCKER_CLI_BUILD: 1 | ||
|
|
||
| jobs: | ||
| evals: | ||
| # Run if triggered manually or if PR has 'evals' label. | ||
| if: github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'evals') | ||
| runs-on: blacksmith-16vcpu-ubuntu-2404 | ||
| timeout-minutes: 45 | ||
|
|
||
| steps: | ||
| - name: Checkout repository | ||
| uses: actions/checkout@v4 | ||
|
|
||
| - name: Set up Docker Buildx | ||
| uses: docker/setup-buildx-action@v3 | ||
|
|
||
| - name: Create environment | ||
| run: | | ||
| cd packages/evals | ||
| cat > .env.local << EOF | ||
| OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY || 'test-key-for-build' }} | ||
| EOF | ||
| cat > .env.development << EOF | ||
| NODE_ENV=development | ||
| DATABASE_URL=postgresql://postgres:password@db:5432/evals_development | ||
| REDIS_URL=redis://redis:6379 | ||
| HOST_EXECUTION_METHOD=docker | ||
| EOF | ||
| - name: Build image | ||
| uses: docker/build-push-action@v5 | ||
| with: | ||
| context: . | ||
| file: packages/evals/Dockerfile.runner | ||
| tags: evals-runner:latest | ||
| cache-from: type=gha | ||
| cache-to: type=gha,mode=max | ||
| push: false | ||
| load: true | ||
|
|
||
| - name: Tag image | ||
| run: | | ||
| cd packages/evals | ||
| docker tag evals-runner:latest evals-runner | ||
| - name: Start containers | ||
| run: | | ||
| cd packages/evals | ||
| docker compose up -d db redis | ||
| timeout 60 bash -c 'until docker compose exec -T db pg_isready -U postgres; do sleep 2; done' | ||
| timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done' | ||
| docker compose exec -T web sh -c 'nc -z db 5432 && echo "✓ Database connection successful"' | ||
| docker compose exec -T web sh -c 'nc -z redis 6379 && echo "✓ Redis connection successful"' | ||
| - name: Test runner | ||
| run: | | ||
| cd packages/evals | ||
| docker compose run --rm runner sh -c 'nc -z db 5432 && echo "✓ Runner -> Database connection successful"' | ||
| docker compose run --rm runner sh -c 'nc -z redis 6379 && echo "✓ Runner -> Redis connection successful"' | ||
| docker compose run --rm runner docker --version | ||
| docker compose run --rm runner docker ps | ||
| - name: Run evals | ||
| run: | | ||
| cd packages/evals | ||
| docker compose run --rm runner pnpm --filter @roo-code/evals cli --ci | ||
| - name: Cleanup | ||
| if: always() | ||
| run: | | ||
| cd packages/evals | ||
| docker compose down -v --remove-orphans | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,37 +1,22 @@ | ||
| "use server" | ||
|
|
||
| import * as fs from "fs/promises" | ||
| import * as path from "path" | ||
| import { fileURLToPath } from "url" | ||
|
|
||
| import { type ExerciseLanguage, exerciseLanguages } from "@roo-code/evals" | ||
| import { exerciseLanguages, listDirectories } from "@roo-code/evals" | ||
|
|
||
| const __dirname = path.dirname(fileURLToPath(import.meta.url)) // <repo>/apps/web-evals/src/actions | ||
|
|
||
| const EXERCISES_BASE_PATH = path.resolve(__dirname, "../../../../../evals") | ||
|
|
||
| export const listDirectories = async (relativePath: string) => { | ||
| try { | ||
| const targetPath = path.resolve(__dirname, relativePath) | ||
| const entries = await fs.readdir(targetPath, { withFileTypes: true }) | ||
| return entries.filter((entry) => entry.isDirectory() && !entry.name.startsWith(".")).map((entry) => entry.name) | ||
| } catch (error) { | ||
| console.error(`Error listing directories at ${relativePath}:`, error) | ||
| return [] | ||
| } | ||
| } | ||
| const EVALS_REPO_PATH = path.resolve(__dirname, "../../../../../evals") | ||
|
|
||
| export const getExercises = async () => { | ||
| const result = await Promise.all( | ||
| exerciseLanguages.map(async (language) => { | ||
| const languagePath = path.join(EXERCISES_BASE_PATH, language) | ||
| const exercises = await listDirectories(languagePath) | ||
| const languagePath = path.join(EVALS_REPO_PATH, language) | ||
| const exercises = await listDirectories(__dirname, languagePath) | ||
| return exercises.map((exercise) => `${language}/${exercise}`) | ||
| }), | ||
| ) | ||
|
|
||
| return result.flat() | ||
| } | ||
|
|
||
| export const getExercisesForLanguage = async (language: ExerciseLanguage) => | ||
| listDirectories(path.join(EXERCISES_BASE_PATH, language)) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,24 @@ | ||
| import { NextResponse } from "next/server" | ||
|
|
||
| export async function GET() { | ||
| try { | ||
| return NextResponse.json( | ||
| { | ||
| status: "healthy", | ||
| timestamp: new Date().toISOString(), | ||
| uptime: process.uptime(), | ||
| environment: process.env.NODE_ENV || "production", | ||
| }, | ||
| { status: 200 }, | ||
| ) | ||
| } catch (error) { | ||
| return NextResponse.json( | ||
| { | ||
| status: "unhealthy", | ||
| timestamp: new Date().toISOString(), | ||
| error: error instanceof Error ? error.message : "Unknown error", | ||
| }, | ||
| { status: 503 }, | ||
| ) | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,8 +17,10 @@ services: | |
| db: | ||
| container_name: evals-db | ||
| image: postgres:15.4 | ||
| expose: | ||
| - 5432 | ||
| # expose: | ||
| # - 5432 | ||
| ports: | ||
| - "${EVALS_DB_PORT:-5432}:5432" | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Turns out that we still need this if you want to run the evals natively instead of inside Docker.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Right, forgot about that case. |
||
| volumes: | ||
| - ./.docker/postgres:/var/lib/postgresql/data | ||
| - ./.docker/scripts/postgres:/docker-entrypoint-initdb.d | ||
|
|
@@ -38,8 +40,10 @@ services: | |
| redis: | ||
| container_name: evals-redis | ||
| image: redis:7-alpine | ||
| expose: | ||
| - 6379 | ||
| # expose: | ||
| # - 6379 | ||
| ports: | ||
| - "${EVALS_REDIS_PORT:-6379}:6379" | ||
| volumes: | ||
| - ./.docker/redis:/data | ||
| command: redis-server --appendonly yes | ||
|
|
||
This file was deleted.
Uh oh!
There was an error while loading. Please reload this page.