From 91ad6acdfbd401e2f54c3d4de95a3044f8452a3f Mon Sep 17 00:00:00 2001 From: cte Date: Mon, 9 Jun 2025 10:58:34 -0700 Subject: [PATCH 01/20] GHA evals --- .github/workflows/evals-quick-test.yml | 148 ++++++++++++++++ .github/workflows/evals.yml | 235 +++++++++++++++++++++++++ packages/evals/GITHUB_ACTIONS.md | 201 +++++++++++++++++++++ 3 files changed, 584 insertions(+) create mode 100644 .github/workflows/evals-quick-test.yml create mode 100644 .github/workflows/evals.yml create mode 100644 packages/evals/GITHUB_ACTIONS.md diff --git a/.github/workflows/evals-quick-test.yml b/.github/workflows/evals-quick-test.yml new file mode 100644 index 0000000000..e8f92f6212 --- /dev/null +++ b/.github/workflows/evals-quick-test.yml @@ -0,0 +1,148 @@ +name: Evals Quick Test + +on: + workflow_dispatch: + +env: + DOCKER_BUILDKIT: 1 + COMPOSE_DOCKER_CLI_BUILD: 1 + +jobs: + test-docker-compose: + name: Test Docker Compose Networking + runs-on: ubuntu-latest + timeout-minutes: 15 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Create test environment + run: | + cd packages/evals + + # Create minimal test environment + cat > .env.test << EOF + NODE_ENV=test + DATABASE_URL=postgresql://postgres:password@db:5432/evals_test + REDIS_URL=redis://redis:6379 + HOST_EXECUTION_METHOD=docker + EOF + + - name: Build images + run: | + cd packages/evals + docker compose build web runner + + - name: Start server services + run: | + cd packages/evals + docker compose --profile server up -d + + - name: Test service connectivity + run: | + cd packages/evals + + # Wait for services + echo "Waiting for PostgreSQL..." + timeout 60 bash -c 'until docker compose exec -T db pg_isready -U postgres; do sleep 2; done' + + echo "Waiting for Redis..." + timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done' + + # Test inter-container networking + echo "Testing database connection from web container..." + docker compose exec -T web sh -c 'nc -z db 5432 && echo "✓ Database connection successful"' + + echo "Testing Redis connection from web container..." + docker compose exec -T web sh -c 'nc -z redis 6379 && echo "✓ Redis connection successful"' + + # Test that web service can start (basic health check) + echo "Testing web service startup..." + timeout 30 bash -c 'until curl -f http://localhost:3000 2>/dev/null || curl -f http://localhost:3000/health 2>/dev/null; do sleep 2; done' || echo "Web service may not have health endpoint, continuing..." + + - name: Test runner container networking + run: | + cd packages/evals + + echo "Testing runner container can connect to services..." + docker compose run --rm runner sh -c 'nc -z db 5432 && echo "✓ Runner -> Database connection successful"' + docker compose run --rm runner sh -c 'nc -z redis 6379 && echo "✓ Runner -> Redis connection successful"' + docker compose run --rm runner sh -c 'nc -z web 3000 && echo "✓ Runner -> Web service connection successful"' + + - name: Verify Docker socket access + run: | + cd packages/evals + + echo "Testing Docker socket access in runner..." + docker compose run --rm runner docker --version + docker compose run --rm runner docker ps + + - name: Show service status + if: always() + run: | + cd packages/evals + echo "=== Service Status ===" + docker compose ps + + echo "=== Network Information ===" + docker network ls | grep evals || echo "No evals network found" + + echo "=== Container Information ===" + docker compose exec -T db sh -c 'echo "Database container hostname: $(hostname)"' + docker compose exec -T redis sh -c 'echo "Redis container hostname: $(hostname)"' + + - name: Cleanup + if: always() + run: | + cd packages/evals + docker compose down -v --remove-orphans + + validate-compose-file: + name: Validate Compose Configuration + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Validate Docker Compose syntax + run: | + cd packages/evals + docker compose config --quiet + + - name: Check service definitions + run: | + cd packages/evals + + # Verify all expected services are defined + services=$(docker compose config --services | sort) + expected_services="db redis runner web" + + echo "Defined services: $services" + echo "Expected services: $expected_services" + + for service in $expected_services; do + if ! echo "$services" | grep -q "^$service$"; then + echo "ERROR: Service '$service' not found" + exit 1 + fi + done + + echo "✓ All expected services found" + + - name: Check profiles + run: | + cd packages/evals + + # Test profile configurations + echo "Testing server profile..." + docker compose --profile server config --services | sort + + echo "Testing runner profile..." + docker compose --profile runner config --services | sort + + echo "✓ Profiles validated" \ No newline at end of file diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml new file mode 100644 index 0000000000..b20633fd2e --- /dev/null +++ b/.github/workflows/evals.yml @@ -0,0 +1,235 @@ +name: Evals Docker Compose + +on: + workflow_dispatch: + inputs: + run_full_evals: + description: 'Run full evaluation suite' + required: false + default: 'false' + type: boolean + concurrency: + description: 'Evaluation concurrency level' + required: false + default: '2' + type: string + +env: + DOCKER_BUILDKIT: 1 + COMPOSE_DOCKER_CLI_BUILD: 1 + +jobs: + build-and-test: + name: Build and Test Evals + runs-on: ubuntu-latest + timeout-minutes: 30 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Create environment files + run: | + cd packages/evals + + # Create .env.test for testing + cat > .env.test << EOF + NODE_ENV=test + DATABASE_URL=postgresql://postgres:password@db:5432/evals_test + REDIS_URL=redis://redis:6379 + HOST_EXECUTION_METHOD=docker + EOF + + # Create .env.development for development + cat > .env.development << EOF + NODE_ENV=development + DATABASE_URL=postgresql://postgres:password@db:5432/evals_development + REDIS_URL=redis://redis:6379 + HOST_EXECUTION_METHOD=docker + EOF + + - name: Build Docker images + run: | + cd packages/evals + docker compose build web runner + + - name: Start server services + run: | + cd packages/evals + docker compose --profile server up -d + + - name: Wait for services to be ready + run: | + cd packages/evals + + # Wait for PostgreSQL to be ready + echo "Waiting for PostgreSQL..." + timeout 60 bash -c 'until docker compose exec -T db pg_isready -U postgres -d evals_development; do sleep 2; done' + + # Wait for Redis to be ready + echo "Waiting for Redis..." + timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done' + + # Wait for web service to be ready + echo "Waiting for web service..." + timeout 60 bash -c 'until curl -f http://localhost:3000/health 2>/dev/null || curl -f http://localhost:3000 2>/dev/null; do sleep 2; done' + + - name: Run database migrations + run: | + cd packages/evals + docker compose exec -T web pnpm db:push + + - name: Run tests + run: | + cd packages/evals + docker compose run --rm runner pnpm _test + + - name: Check service logs on failure + if: failure() + run: | + cd packages/evals + echo "=== Database logs ===" + docker compose logs db + echo "=== Redis logs ===" + docker compose logs redis + echo "=== Web service logs ===" + docker compose logs web + + - name: Cleanup + if: always() + run: | + cd packages/evals + docker compose down -v --remove-orphans + + run-sample-evals: + name: Run Sample Evaluations + runs-on: ubuntu-latest + needs: build-and-test + if: github.event.inputs.run_full_evals == 'true' || github.event_name == 'workflow_dispatch' + timeout-minutes: 60 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Create environment files + run: | + cd packages/evals + + cat > .env.local << EOF + OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY }} + EOF + + cat > .env.development << EOF + NODE_ENV=development + DATABASE_URL=postgresql://postgres:password@db:5432/evals_development + REDIS_URL=redis://redis:6379 + HOST_EXECUTION_METHOD=docker + EOF + + - name: Build and start services + run: | + cd packages/evals + docker compose --profile server --profile runner up --build -d --scale runner=0 + + - name: Wait for services + run: | + cd packages/evals + timeout 120 bash -c 'until docker compose exec -T db pg_isready -U postgres -d evals_development; do sleep 2; done' + timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done' + timeout 60 bash -c 'until curl -f http://localhost:3000 2>/dev/null; do sleep 2; done' + + - name: Run database setup + run: | + cd packages/evals + docker compose exec -T web pnpm db:push + + - name: Run sample evaluation + env: + CONCURRENCY: ${{ github.event.inputs.concurrency || '2' }} + run: | + cd packages/evals + + # Run a limited set of evaluations for CI + docker compose run --rm runner pnpm cli run \ + --concurrency $CONCURRENCY \ + --timeout 300 \ + --max-exercises 3 \ + --model "anthropic/claude-3-5-sonnet-20241022" + + - name: Upload evaluation results + if: always() + uses: actions/upload-artifact@v4 + with: + name: evaluation-results + path: | + packages/evals/results/ + packages/evals/logs/ + retention-days: 7 + + - name: Cleanup + if: always() + run: | + cd packages/evals + docker compose down -v --remove-orphans + + security-scan: + name: Security Scan + runs-on: ubuntu-latest + needs: build-and-test + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@master + with: + scan-type: 'fs' + scan-ref: 'packages/evals' + format: 'sarif' + output: 'trivy-results.sarif' + + - name: Upload Trivy scan results + uses: github/codeql-action/upload-sarif@v3 + if: always() + with: + sarif_file: 'trivy-results.sarif' + + docker-compose-validate: + name: Validate Docker Compose + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Validate Docker Compose file + run: | + cd packages/evals + docker compose config --quiet + + - name: Check Docker Compose services + run: | + cd packages/evals + docker compose config --services | sort > services.txt + echo "Available services:" + cat services.txt + + # Verify expected services exist + for service in db redis web runner; do + if ! grep -q "^$service$" services.txt; then + echo "ERROR: Service '$service' not found in docker-compose.yml" + exit 1 + fi + done + + echo "All expected services found ✓" \ No newline at end of file diff --git a/packages/evals/GITHUB_ACTIONS.md b/packages/evals/GITHUB_ACTIONS.md new file mode 100644 index 0000000000..454c42ec87 --- /dev/null +++ b/packages/evals/GITHUB_ACTIONS.md @@ -0,0 +1,201 @@ +# GitHub Actions for Evals + +This document describes the GitHub Actions workflows available for the Roo Code Evals system. + +## Workflows + +### 1. `evals.yml` - Full Evaluation Workflow + +**Purpose**: Comprehensive testing and evaluation workflow that builds, tests, and optionally runs full evaluations. + +**Triggers**: + +- Push to `main` or `develop` branches (when evals files change) +- Pull requests to `main` or `develop` branches (when evals files change) +- Manual dispatch with options + +**Jobs**: + +#### `build-and-test` + +- Builds Docker images for web and runner services +- Starts PostgreSQL, Redis, and web services +- Waits for all services to be ready +- Runs database migrations +- Executes test suite +- Provides detailed logging on failure + +#### `run-sample-evals` (conditional) + +- Only runs when manually triggered with `run_full_evals: true` +- Requires `OPENROUTER_API_KEY` secret to be configured +- Runs a limited set of evaluations for CI testing +- Uploads evaluation results as artifacts +- Configurable concurrency level + +#### `security-scan` + +- Runs Trivy vulnerability scanner on the evals package +- Uploads results to GitHub Security tab + +#### `docker-compose-validate` + +- Validates Docker Compose file syntax +- Verifies all expected services are defined + +**Required Secrets**: + +- `OPENROUTER_API_KEY` (only for full evaluation runs) + +### 2. `evals-quick-test.yml` - Quick Networking Test + +**Purpose**: Fast validation of Docker Compose networking and basic functionality. + +**Triggers**: + +- Push to `main` or `develop` branches (when evals files change) +- Pull requests to `main` or `develop` branches (when evals files change) + +**Jobs**: + +#### `test-docker-compose` + +- Tests inter-container networking between all services +- Verifies database and Redis connectivity +- Tests Docker socket access in runner container +- Validates service startup and health + +#### `validate-compose-file` + +- Validates Docker Compose syntax +- Checks service definitions and profiles + +## Usage Examples + +### Manual Workflow Dispatch + +To run full evaluations manually: + +1. Go to Actions tab in GitHub +2. Select "Evals Docker Compose" workflow +3. Click "Run workflow" +4. Configure options: + - `run_full_evals`: Set to `true` to run actual evaluations + - `concurrency`: Set evaluation concurrency (default: 2) + +### Setting Up Secrets + +For full evaluation runs, add the OpenRouter API key: + +1. Go to repository Settings → Secrets and variables → Actions +2. Add new repository secret: + - Name: `OPENROUTER_API_KEY` + - Value: Your OpenRouter API key (e.g., `sk-or-v1-...`) + +## Docker Compose Networking in GitHub Actions + +The workflows demonstrate that Docker Compose networking works seamlessly in GitHub Actions: + +### Service Communication + +- Services communicate using service names as hostnames +- Database: `postgresql://postgres:password@db:5432/evals_development` +- Redis: `redis://redis:6379` +- Web service: `http://web:3000` + +### Network Features Tested + +- ✅ Container-to-container communication +- ✅ Service discovery via service names +- ✅ Port mapping and internal networking +- ✅ Health checks and service dependencies +- ✅ Docker socket mounting for Docker-in-Docker +- ✅ Volume mounts for data persistence +- ✅ Profile-based service grouping + +### Networking Validation + +The workflows include comprehensive networking tests: + +```bash +# Test database connectivity +docker compose exec -T web sh -c 'nc -z db 5432' + +# Test Redis connectivity +docker compose exec -T web sh -c 'nc -z redis 6379' + +# Test cross-service communication +docker compose run --rm runner sh -c 'nc -z web 3000' +``` + +## Resource Considerations + +GitHub Actions runners have the following limits: + +- **Memory**: 7 GB RAM +- **CPU**: 2-core CPU +- **Disk**: 14 GB SSD space +- **Time**: 6 hours maximum job runtime + +For the evals system: + +- Quick tests typically complete in 5-10 minutes +- Full evaluation runs may take 30-60 minutes depending on scope +- Resource usage scales with concurrency settings + +## Troubleshooting + +### Common Issues + +1. **Service startup timeouts** + + - Increase timeout values in workflow + - Check service health check configurations + - Review service logs in workflow output + +2. **Networking failures** + + - Verify service names match docker-compose.yml + - Check port configurations + - Ensure services are in the same Docker network + +3. **Docker socket access issues** + - Verify `/var/run/docker.sock` mount in docker-compose.yml + - Check Docker-in-Docker permissions + +### Debugging + +The workflows include comprehensive logging: + +- Service status and health checks +- Network information and container details +- Service logs on failure +- Artifact uploads for evaluation results + +To debug locally, you can run the same commands used in the workflows: + +```bash +cd packages/evals + +# Build and start services +docker compose --profile server up -d + +# Test connectivity +docker compose exec -T web sh -c 'nc -z db 5432' +docker compose exec -T redis redis-cli ping + +# View logs +docker compose logs db +docker compose logs redis +docker compose logs web +``` + +## Performance Optimization + +For faster CI runs: + +- Use Docker layer caching with `docker/setup-buildx-action` +- Minimize Docker image sizes +- Use health checks to avoid unnecessary wait times +- Run tests in parallel where possible +- Cache dependencies between workflow runs From f5d385208edc6e8f7f2d8ec3085f783c89989174 Mon Sep 17 00:00:00 2001 From: cte Date: Mon, 9 Jun 2025 11:09:14 -0700 Subject: [PATCH 02/20] Add GitHub Actions workflows for evals Docker Compose testing - Add comprehensive evals.yml workflow for full testing and evaluation runs - Add evals-quick-test.yml for fast Docker Compose networking validation - Include documentation in GITHUB_ACTIONS.md - Workflows trigger on PRs and support manual dispatch for testing --- .github/workflows/evals-quick-test.yml | 5 +++++ .github/workflows/evals.yml | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/.github/workflows/evals-quick-test.yml b/.github/workflows/evals-quick-test.yml index e8f92f6212..99cfa6fae7 100644 --- a/.github/workflows/evals-quick-test.yml +++ b/.github/workflows/evals-quick-test.yml @@ -1,6 +1,11 @@ name: Evals Quick Test on: + pull_request: + branches: [main, develop] + paths: + - 'packages/evals/**' + - '.github/workflows/evals*.yml' workflow_dispatch: env: diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index b20633fd2e..0ef2093dcf 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -1,6 +1,11 @@ name: Evals Docker Compose on: + pull_request: + branches: [main, develop] + paths: + - 'packages/evals/**' + - '.github/workflows/evals*.yml' workflow_dispatch: inputs: run_full_evals: From 1465297c8920de63148f916fa0ee8791a1dbebbd Mon Sep 17 00:00:00 2001 From: cte Date: Mon, 9 Jun 2025 11:11:27 -0700 Subject: [PATCH 03/20] Revert this --- .github/workflows/evals.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index 0ef2093dcf..b20633fd2e 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -1,11 +1,6 @@ name: Evals Docker Compose on: - pull_request: - branches: [main, develop] - paths: - - 'packages/evals/**' - - '.github/workflows/evals*.yml' workflow_dispatch: inputs: run_full_evals: From 1b02cceb98cecd246a2f1e789d12b1e7c708ad88 Mon Sep 17 00:00:00 2001 From: cte Date: Mon, 9 Jun 2025 11:14:31 -0700 Subject: [PATCH 04/20] Remove "validate-compose-file" --- .github/workflows/evals-quick-test.yml | 46 -------------------------- 1 file changed, 46 deletions(-) diff --git a/.github/workflows/evals-quick-test.yml b/.github/workflows/evals-quick-test.yml index 99cfa6fae7..ae64c847fd 100644 --- a/.github/workflows/evals-quick-test.yml +++ b/.github/workflows/evals-quick-test.yml @@ -105,49 +105,3 @@ jobs: run: | cd packages/evals docker compose down -v --remove-orphans - - validate-compose-file: - name: Validate Compose Configuration - runs-on: ubuntu-latest - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Validate Docker Compose syntax - run: | - cd packages/evals - docker compose config --quiet - - - name: Check service definitions - run: | - cd packages/evals - - # Verify all expected services are defined - services=$(docker compose config --services | sort) - expected_services="db redis runner web" - - echo "Defined services: $services" - echo "Expected services: $expected_services" - - for service in $expected_services; do - if ! echo "$services" | grep -q "^$service$"; then - echo "ERROR: Service '$service' not found" - exit 1 - fi - done - - echo "✓ All expected services found" - - - name: Check profiles - run: | - cd packages/evals - - # Test profile configurations - echo "Testing server profile..." - docker compose --profile server config --services | sort - - echo "Testing runner profile..." - docker compose --profile runner config --services | sort - - echo "✓ Profiles validated" \ No newline at end of file From dd568b1b970397144f2f95f6c6d33d275178d4a1 Mon Sep 17 00:00:00 2001 From: cte Date: Mon, 9 Jun 2025 11:18:56 -0700 Subject: [PATCH 05/20] Add .env.local file --- .github/workflows/evals-quick-test.yml | 205 +++++------ .github/workflows/evals.yml | 454 ++++++++++++------------- 2 files changed, 331 insertions(+), 328 deletions(-) diff --git a/.github/workflows/evals-quick-test.yml b/.github/workflows/evals-quick-test.yml index ae64c847fd..b4452e692d 100644 --- a/.github/workflows/evals-quick-test.yml +++ b/.github/workflows/evals-quick-test.yml @@ -1,107 +1,112 @@ name: Evals Quick Test on: - pull_request: - branches: [main, develop] - paths: - - 'packages/evals/**' - - '.github/workflows/evals*.yml' - workflow_dispatch: + pull_request: + branches: [main, develop] + paths: + - "packages/evals/**" + - ".github/workflows/evals*.yml" + workflow_dispatch: env: - DOCKER_BUILDKIT: 1 - COMPOSE_DOCKER_CLI_BUILD: 1 + DOCKER_BUILDKIT: 1 + COMPOSE_DOCKER_CLI_BUILD: 1 jobs: - test-docker-compose: - name: Test Docker Compose Networking - runs-on: ubuntu-latest - timeout-minutes: 15 - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Create test environment - run: | - cd packages/evals - - # Create minimal test environment - cat > .env.test << EOF - NODE_ENV=test - DATABASE_URL=postgresql://postgres:password@db:5432/evals_test - REDIS_URL=redis://redis:6379 - HOST_EXECUTION_METHOD=docker - EOF - - - name: Build images - run: | - cd packages/evals - docker compose build web runner - - - name: Start server services - run: | - cd packages/evals - docker compose --profile server up -d - - - name: Test service connectivity - run: | - cd packages/evals - - # Wait for services - echo "Waiting for PostgreSQL..." - timeout 60 bash -c 'until docker compose exec -T db pg_isready -U postgres; do sleep 2; done' - - echo "Waiting for Redis..." - timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done' - - # Test inter-container networking - echo "Testing database connection from web container..." - docker compose exec -T web sh -c 'nc -z db 5432 && echo "✓ Database connection successful"' - - echo "Testing Redis connection from web container..." - docker compose exec -T web sh -c 'nc -z redis 6379 && echo "✓ Redis connection successful"' - - # Test that web service can start (basic health check) - echo "Testing web service startup..." - timeout 30 bash -c 'until curl -f http://localhost:3000 2>/dev/null || curl -f http://localhost:3000/health 2>/dev/null; do sleep 2; done' || echo "Web service may not have health endpoint, continuing..." - - - name: Test runner container networking - run: | - cd packages/evals - - echo "Testing runner container can connect to services..." - docker compose run --rm runner sh -c 'nc -z db 5432 && echo "✓ Runner -> Database connection successful"' - docker compose run --rm runner sh -c 'nc -z redis 6379 && echo "✓ Runner -> Redis connection successful"' - docker compose run --rm runner sh -c 'nc -z web 3000 && echo "✓ Runner -> Web service connection successful"' - - - name: Verify Docker socket access - run: | - cd packages/evals - - echo "Testing Docker socket access in runner..." - docker compose run --rm runner docker --version - docker compose run --rm runner docker ps - - - name: Show service status - if: always() - run: | - cd packages/evals - echo "=== Service Status ===" - docker compose ps - - echo "=== Network Information ===" - docker network ls | grep evals || echo "No evals network found" - - echo "=== Container Information ===" - docker compose exec -T db sh -c 'echo "Database container hostname: $(hostname)"' - docker compose exec -T redis sh -c 'echo "Redis container hostname: $(hostname)"' - - - name: Cleanup - if: always() - run: | - cd packages/evals - docker compose down -v --remove-orphans + test-docker-compose: + name: Test Docker Compose Networking + runs-on: ubuntu-latest + timeout-minutes: 15 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Create test environment + run: | + cd packages/evals + + # Create .env.local (required for Docker build) + cat > .env.local << EOF + OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY || 'test-key-for-build' }} + EOF + + # Create development environment + cat > .env.development << EOF + NODE_ENV=development + DATABASE_URL=postgresql://postgres:password@db:5432/evals_development + REDIS_URL=redis://redis:6379 + HOST_EXECUTION_METHOD=docker + EOF + + - name: Build images + run: | + cd packages/evals + docker compose build web runner + + - name: Start server services + run: | + cd packages/evals + docker compose --profile server up -d + + - name: Test service connectivity + run: | + cd packages/evals + + # Wait for services + echo "Waiting for PostgreSQL..." + timeout 60 bash -c 'until docker compose exec -T db pg_isready -U postgres; do sleep 2; done' + + echo "Waiting for Redis..." + timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done' + + # Test inter-container networking + echo "Testing database connection from web container..." + docker compose exec -T web sh -c 'nc -z db 5432 && echo "✓ Database connection successful"' + + echo "Testing Redis connection from web container..." + docker compose exec -T web sh -c 'nc -z redis 6379 && echo "✓ Redis connection successful"' + + # Test that web service can start (basic health check) + echo "Testing web service startup..." + timeout 30 bash -c 'until curl -f http://localhost:3000 2>/dev/null || curl -f http://localhost:3000/health 2>/dev/null; do sleep 2; done' || echo "Web service may not have health endpoint, continuing..." + + - name: Test runner container networking + run: | + cd packages/evals + + echo "Testing runner container can connect to services..." + docker compose run --rm runner sh -c 'nc -z db 5432 && echo "✓ Runner -> Database connection successful"' + docker compose run --rm runner sh -c 'nc -z redis 6379 && echo "✓ Runner -> Redis connection successful"' + docker compose run --rm runner sh -c 'nc -z web 3000 && echo "✓ Runner -> Web service connection successful"' + + - name: Verify Docker socket access + run: | + cd packages/evals + + echo "Testing Docker socket access in runner..." + docker compose run --rm runner docker --version + docker compose run --rm runner docker ps + + - name: Show service status + if: always() + run: | + cd packages/evals + echo "=== Service Status ===" + docker compose ps + + echo "=== Network Information ===" + docker network ls | grep evals || echo "No evals network found" + + echo "=== Container Information ===" + docker compose exec -T db sh -c 'echo "Database container hostname: $(hostname)"' + docker compose exec -T redis sh -c 'echo "Redis container hostname: $(hostname)"' + + - name: Cleanup + if: always() + run: | + cd packages/evals + docker compose down -v --remove-orphans diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index b20633fd2e..3148a023c6 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -1,235 +1,233 @@ name: Evals Docker Compose on: - workflow_dispatch: - inputs: - run_full_evals: - description: 'Run full evaluation suite' - required: false - default: 'false' - type: boolean - concurrency: - description: 'Evaluation concurrency level' - required: false - default: '2' - type: string + workflow_dispatch: + inputs: + run_full_evals: + description: "Run full evaluation suite" + required: false + default: "false" + type: boolean + concurrency: + description: "Evaluation concurrency level" + required: false + default: "2" + type: string env: - DOCKER_BUILDKIT: 1 - COMPOSE_DOCKER_CLI_BUILD: 1 + DOCKER_BUILDKIT: 1 + COMPOSE_DOCKER_CLI_BUILD: 1 jobs: - build-and-test: - name: Build and Test Evals - runs-on: ubuntu-latest - timeout-minutes: 30 - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Create environment files - run: | - cd packages/evals - - # Create .env.test for testing - cat > .env.test << EOF - NODE_ENV=test - DATABASE_URL=postgresql://postgres:password@db:5432/evals_test - REDIS_URL=redis://redis:6379 - HOST_EXECUTION_METHOD=docker - EOF - - # Create .env.development for development - cat > .env.development << EOF - NODE_ENV=development - DATABASE_URL=postgresql://postgres:password@db:5432/evals_development - REDIS_URL=redis://redis:6379 - HOST_EXECUTION_METHOD=docker - EOF - - - name: Build Docker images - run: | - cd packages/evals - docker compose build web runner - - - name: Start server services - run: | - cd packages/evals - docker compose --profile server up -d - - - name: Wait for services to be ready - run: | - cd packages/evals - - # Wait for PostgreSQL to be ready - echo "Waiting for PostgreSQL..." - timeout 60 bash -c 'until docker compose exec -T db pg_isready -U postgres -d evals_development; do sleep 2; done' - - # Wait for Redis to be ready - echo "Waiting for Redis..." - timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done' - - # Wait for web service to be ready - echo "Waiting for web service..." - timeout 60 bash -c 'until curl -f http://localhost:3000/health 2>/dev/null || curl -f http://localhost:3000 2>/dev/null; do sleep 2; done' - - - name: Run database migrations - run: | - cd packages/evals - docker compose exec -T web pnpm db:push - - - name: Run tests - run: | - cd packages/evals - docker compose run --rm runner pnpm _test - - - name: Check service logs on failure - if: failure() - run: | - cd packages/evals - echo "=== Database logs ===" - docker compose logs db - echo "=== Redis logs ===" - docker compose logs redis - echo "=== Web service logs ===" - docker compose logs web - - - name: Cleanup - if: always() - run: | - cd packages/evals - docker compose down -v --remove-orphans - - run-sample-evals: - name: Run Sample Evaluations - runs-on: ubuntu-latest - needs: build-and-test - if: github.event.inputs.run_full_evals == 'true' || github.event_name == 'workflow_dispatch' - timeout-minutes: 60 - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Create environment files - run: | - cd packages/evals - - cat > .env.local << EOF - OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY }} - EOF - - cat > .env.development << EOF - NODE_ENV=development - DATABASE_URL=postgresql://postgres:password@db:5432/evals_development - REDIS_URL=redis://redis:6379 - HOST_EXECUTION_METHOD=docker - EOF - - - name: Build and start services - run: | - cd packages/evals - docker compose --profile server --profile runner up --build -d --scale runner=0 - - - name: Wait for services - run: | - cd packages/evals - timeout 120 bash -c 'until docker compose exec -T db pg_isready -U postgres -d evals_development; do sleep 2; done' - timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done' - timeout 60 bash -c 'until curl -f http://localhost:3000 2>/dev/null; do sleep 2; done' - - - name: Run database setup - run: | - cd packages/evals - docker compose exec -T web pnpm db:push - - - name: Run sample evaluation - env: - CONCURRENCY: ${{ github.event.inputs.concurrency || '2' }} - run: | - cd packages/evals - - # Run a limited set of evaluations for CI - docker compose run --rm runner pnpm cli run \ - --concurrency $CONCURRENCY \ - --timeout 300 \ - --max-exercises 3 \ - --model "anthropic/claude-3-5-sonnet-20241022" - - - name: Upload evaluation results - if: always() - uses: actions/upload-artifact@v4 - with: - name: evaluation-results - path: | - packages/evals/results/ - packages/evals/logs/ - retention-days: 7 - - - name: Cleanup - if: always() - run: | - cd packages/evals - docker compose down -v --remove-orphans - - security-scan: - name: Security Scan - runs-on: ubuntu-latest - needs: build-and-test - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Run Trivy vulnerability scanner - uses: aquasecurity/trivy-action@master - with: - scan-type: 'fs' - scan-ref: 'packages/evals' - format: 'sarif' - output: 'trivy-results.sarif' - - - name: Upload Trivy scan results - uses: github/codeql-action/upload-sarif@v3 - if: always() - with: - sarif_file: 'trivy-results.sarif' - - docker-compose-validate: - name: Validate Docker Compose - runs-on: ubuntu-latest - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Validate Docker Compose file - run: | - cd packages/evals - docker compose config --quiet - - - name: Check Docker Compose services - run: | - cd packages/evals - docker compose config --services | sort > services.txt - echo "Available services:" - cat services.txt - - # Verify expected services exist - for service in db redis web runner; do - if ! grep -q "^$service$" services.txt; then - echo "ERROR: Service '$service' not found in docker-compose.yml" - exit 1 - fi - done - - echo "All expected services found ✓" \ No newline at end of file + build-and-test: + name: Build and Test Evals + runs-on: ubuntu-latest + timeout-minutes: 30 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Create environment files + run: | + cd packages/evals + + # Create .env.local (required for Docker build) + cat > .env.local << EOF + OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY || 'test-key-for-build' }} + EOF + + # Create .env.development for development + cat > .env.development << EOF + NODE_ENV=development + DATABASE_URL=postgresql://postgres:password@db:5432/evals_development + REDIS_URL=redis://redis:6379 + HOST_EXECUTION_METHOD=docker + EOF + + - name: Build Docker images + run: | + cd packages/evals + docker compose build web runner + + - name: Start server services + run: | + cd packages/evals + docker compose --profile server up -d + + - name: Wait for services to be ready + run: | + cd packages/evals + + # Wait for PostgreSQL to be ready + echo "Waiting for PostgreSQL..." + timeout 60 bash -c 'until docker compose exec -T db pg_isready -U postgres -d evals_development; do sleep 2; done' + + # Wait for Redis to be ready + echo "Waiting for Redis..." + timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done' + + # Wait for web service to be ready + echo "Waiting for web service..." + timeout 60 bash -c 'until curl -f http://localhost:3000/health 2>/dev/null || curl -f http://localhost:3000 2>/dev/null; do sleep 2; done' + + - name: Run database migrations + run: | + cd packages/evals + docker compose exec -T web pnpm db:push + + - name: Run tests + run: | + cd packages/evals + docker compose run --rm runner pnpm _test + + - name: Check service logs on failure + if: failure() + run: | + cd packages/evals + echo "=== Database logs ===" + docker compose logs db + echo "=== Redis logs ===" + docker compose logs redis + echo "=== Web service logs ===" + docker compose logs web + + - name: Cleanup + if: always() + run: | + cd packages/evals + docker compose down -v --remove-orphans + + run-sample-evals: + name: Run Sample Evaluations + runs-on: ubuntu-latest + needs: build-and-test + if: github.event.inputs.run_full_evals == 'true' || github.event_name == 'workflow_dispatch' + timeout-minutes: 60 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Create environment files + run: | + cd packages/evals + + # Create .env.local with actual API key for evaluations + cat > .env.local << EOF + OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY || 'test-key-for-build' }} + EOF + + cat > .env.development << EOF + NODE_ENV=development + DATABASE_URL=postgresql://postgres:password@db:5432/evals_development + REDIS_URL=redis://redis:6379 + HOST_EXECUTION_METHOD=docker + EOF + + - name: Build and start services + run: | + cd packages/evals + docker compose --profile server --profile runner up --build -d --scale runner=0 + + - name: Wait for services + run: | + cd packages/evals + timeout 120 bash -c 'until docker compose exec -T db pg_isready -U postgres -d evals_development; do sleep 2; done' + timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done' + timeout 60 bash -c 'until curl -f http://localhost:3000 2>/dev/null; do sleep 2; done' + + - name: Run database setup + run: | + cd packages/evals + docker compose exec -T web pnpm db:push + + - name: Run sample evaluation + env: + CONCURRENCY: ${{ github.event.inputs.concurrency || '2' }} + run: | + cd packages/evals + + # Run a limited set of evaluations for CI + docker compose run --rm runner pnpm cli run \ + --concurrency $CONCURRENCY \ + --timeout 300 \ + --max-exercises 3 \ + --model "anthropic/claude-3-5-sonnet-20241022" + + - name: Upload evaluation results + if: always() + uses: actions/upload-artifact@v4 + with: + name: evaluation-results + path: | + packages/evals/results/ + packages/evals/logs/ + retention-days: 7 + + - name: Cleanup + if: always() + run: | + cd packages/evals + docker compose down -v --remove-orphans + + security-scan: + name: Security Scan + runs-on: ubuntu-latest + needs: build-and-test + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@master + with: + scan-type: "fs" + scan-ref: "packages/evals" + format: "sarif" + output: "trivy-results.sarif" + + - name: Upload Trivy scan results + uses: github/codeql-action/upload-sarif@v3 + if: always() + with: + sarif_file: "trivy-results.sarif" + + docker-compose-validate: + name: Validate Docker Compose + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Validate Docker Compose file + run: | + cd packages/evals + docker compose config --quiet + + - name: Check Docker Compose services + run: | + cd packages/evals + docker compose config --services | sort > services.txt + echo "Available services:" + cat services.txt + + # Verify expected services exist + for service in db redis web runner; do + if ! grep -q "^$service$" services.txt; then + echo "ERROR: Service '$service' not found in docker-compose.yml" + exit 1 + fi + done + + echo "All expected services found ✓" From e369005131406ca165cedd370103b3c5822b2273 Mon Sep 17 00:00:00 2001 From: cte Date: Mon, 9 Jun 2025 11:37:07 -0700 Subject: [PATCH 06/20] Install nc --- packages/evals/Dockerfile.runner | 1 + packages/evals/Dockerfile.web | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/packages/evals/Dockerfile.runner b/packages/evals/Dockerfile.runner index c68b4f80c0..ec3277461c 100644 --- a/packages/evals/Dockerfile.runner +++ b/packages/evals/Dockerfile.runner @@ -13,6 +13,7 @@ RUN apt update && \ git \ vim \ jq \ + netcat-openbsd \ apt-transport-https \ ca-certificates \ gnupg \ diff --git a/packages/evals/Dockerfile.web b/packages/evals/Dockerfile.web index 55e8b5a298..b8713f69b9 100644 --- a/packages/evals/Dockerfile.web +++ b/packages/evals/Dockerfile.web @@ -8,7 +8,7 @@ RUN npm install -g npm@latest RUN npm install -g npm-run-all # Install system packages -RUN apt update && apt install -y curl git vim jq postgresql-client +RUN apt update && apt install -y curl git vim jq netcat-openbsd postgresql-client # Install Docker cli RUN apt install -y apt-transport-https ca-certificates gnupg lsb-release From 92e70b1803807833d831dceecaa9591c6c6e3c06 Mon Sep 17 00:00:00 2001 From: cte Date: Mon, 9 Jun 2025 11:42:52 -0700 Subject: [PATCH 07/20] Add Docker layer caching --- .github/workflows/evals-quick-test.yml | 29 +++++++++++-- .github/workflows/evals.yml | 59 +++++++++++++++++++++++--- 2 files changed, 79 insertions(+), 9 deletions(-) diff --git a/.github/workflows/evals-quick-test.yml b/.github/workflows/evals-quick-test.yml index b4452e692d..9dfaff9257 100644 --- a/.github/workflows/evals-quick-test.yml +++ b/.github/workflows/evals-quick-test.yml @@ -42,10 +42,33 @@ jobs: HOST_EXECUTION_METHOD=docker EOF - - name: Build images + - name: Build images with cache + uses: docker/build-push-action@v5 + with: + context: . + file: packages/evals/Dockerfile.web + tags: evals-web:latest + cache-from: type=gha + cache-to: type=gha,mode=max + push: false + load: true + + - name: Build runner image with cache + uses: docker/build-push-action@v5 + with: + context: . + file: packages/evals/Dockerfile.runner + tags: evals-runner:latest + cache-from: type=gha + cache-to: type=gha,mode=max + push: false + load: true + + - name: Tag images for docker-compose run: | - cd packages/evals - docker compose build web runner + cd packages/evals + docker tag evals-web:latest evals-web + docker tag evals-runner:latest evals-runner - name: Start server services run: | diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index 3148a023c6..ce2505db22 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -50,10 +50,33 @@ jobs: HOST_EXECUTION_METHOD=docker EOF - - name: Build Docker images + - name: Build web image with cache + uses: docker/build-push-action@v5 + with: + context: . + file: packages/evals/Dockerfile.web + tags: evals-web:latest + cache-from: type=gha + cache-to: type=gha,mode=max + push: false + load: true + + - name: Build runner image with cache + uses: docker/build-push-action@v5 + with: + context: . + file: packages/evals/Dockerfile.runner + tags: evals-runner:latest + cache-from: type=gha + cache-to: type=gha,mode=max + push: false + load: true + + - name: Tag images for docker-compose run: | - cd packages/evals - docker compose build web runner + cd packages/evals + docker tag evals-web:latest evals-web + docker tag evals-runner:latest evals-runner - name: Start server services run: | @@ -133,10 +156,34 @@ jobs: HOST_EXECUTION_METHOD=docker EOF - - name: Build and start services + - name: Build web image with cache + uses: docker/build-push-action@v5 + with: + context: . + file: packages/evals/Dockerfile.web + tags: evals-web:latest + cache-from: type=gha + cache-to: type=gha,mode=max + push: false + load: true + + - name: Build runner image with cache + uses: docker/build-push-action@v5 + with: + context: . + file: packages/evals/Dockerfile.runner + tags: evals-runner:latest + cache-from: type=gha + cache-to: type=gha,mode=max + push: false + load: true + + - name: Tag images and start services run: | - cd packages/evals - docker compose --profile server --profile runner up --build -d --scale runner=0 + cd packages/evals + docker tag evals-web:latest evals-web + docker tag evals-runner:latest evals-runner + docker compose --profile server --profile runner up -d --scale runner=0 - name: Wait for services run: | From 1c77f66710bae09772e555d99b4b3c9e0257ecd1 Mon Sep 17 00:00:00 2001 From: cte Date: Tue, 10 Jun 2025 06:28:21 -0700 Subject: [PATCH 08/20] Retry logic fix --- packages/evals/src/cli/runTask.ts | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/packages/evals/src/cli/runTask.ts b/packages/evals/src/cli/runTask.ts index 3b1ba61104..14e73dc59c 100644 --- a/packages/evals/src/cli/runTask.ts +++ b/packages/evals/src/cli/runTask.ts @@ -87,6 +87,7 @@ export const runTask = async ({ run, task, publish, logger }: RunTaskOptions) => let taskStartedAt = Date.now() let taskFinishedAt: number | undefined let taskAbortedAt: number | undefined + let taskTimedOut: boolean = false let taskMetricsId: number | undefined let rooTaskId: string | undefined let isClientDisconnected = false @@ -196,6 +197,7 @@ export const runTask = async ({ run, task, publish, logger }: RunTaskOptions) => timeout: EVALS_TIMEOUT, }) } catch (_error) { + taskTimedOut = true logger.error("time limit reached") if (rooTaskId && !isClientDisconnected) { @@ -207,16 +209,16 @@ export const runTask = async ({ run, task, publish, logger }: RunTaskOptions) => taskFinishedAt = Date.now() } - if (taskFinishedAt) { - logger.info("setting task finished at") - await updateTask(task.id, { finishedAt: new Date() }) - } - - if (!taskFinishedAt && isClientDisconnected) { + if (!taskFinishedAt && !taskTimedOut) { logger.error("client disconnected before task finished") throw new Error("Client disconnected before task completion.") } + // If the task was aborted unexpectedly or the client disconnected + // unexpectedly, then throw to trigger a retry. + logger.info("setting task finished at") + await updateTask(task.id, { finishedAt: new Date() }) + if (rooTaskId && !isClientDisconnected) { logger.info("closing task") client.sendCommand({ commandName: TaskCommandName.CloseTask, data: rooTaskId }) From 88f93ca07baa5f73f4a99337eb9305b5d456780f Mon Sep 17 00:00:00 2001 From: cte Date: Tue, 10 Jun 2025 07:50:29 -0700 Subject: [PATCH 09/20] Add ci mode to evals cli --- .github/workflows/evals-quick-test.yml | 135 ------------ .github/workflows/evals.yml | 263 ++++++------------------ apps/web-evals/src/actions/exercises.ts | 23 +-- apps/web-evals/src/actions/runs.ts | 11 +- packages/evals/src/cli/FileLogger.ts | 86 -------- packages/evals/src/cli/index.ts | 30 ++- packages/evals/src/cli/processTask.ts | 112 ---------- packages/evals/src/cli/runCi.ts | 25 +++ packages/evals/src/cli/runEvals.ts | 13 +- packages/evals/src/cli/runTask.ts | 127 +++++++++++- packages/evals/src/cli/runUnitTest.ts | 37 ++-- packages/evals/src/cli/utils.ts | 85 ++++++++ packages/evals/src/exercises/index.ts | 10 +- 13 files changed, 346 insertions(+), 611 deletions(-) delete mode 100644 .github/workflows/evals-quick-test.yml delete mode 100644 packages/evals/src/cli/FileLogger.ts delete mode 100644 packages/evals/src/cli/processTask.ts create mode 100644 packages/evals/src/cli/runCi.ts diff --git a/.github/workflows/evals-quick-test.yml b/.github/workflows/evals-quick-test.yml deleted file mode 100644 index 9dfaff9257..0000000000 --- a/.github/workflows/evals-quick-test.yml +++ /dev/null @@ -1,135 +0,0 @@ -name: Evals Quick Test - -on: - pull_request: - branches: [main, develop] - paths: - - "packages/evals/**" - - ".github/workflows/evals*.yml" - workflow_dispatch: - -env: - DOCKER_BUILDKIT: 1 - COMPOSE_DOCKER_CLI_BUILD: 1 - -jobs: - test-docker-compose: - name: Test Docker Compose Networking - runs-on: ubuntu-latest - timeout-minutes: 15 - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Create test environment - run: | - cd packages/evals - - # Create .env.local (required for Docker build) - cat > .env.local << EOF - OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY || 'test-key-for-build' }} - EOF - - # Create development environment - cat > .env.development << EOF - NODE_ENV=development - DATABASE_URL=postgresql://postgres:password@db:5432/evals_development - REDIS_URL=redis://redis:6379 - HOST_EXECUTION_METHOD=docker - EOF - - - name: Build images with cache - uses: docker/build-push-action@v5 - with: - context: . - file: packages/evals/Dockerfile.web - tags: evals-web:latest - cache-from: type=gha - cache-to: type=gha,mode=max - push: false - load: true - - - name: Build runner image with cache - uses: docker/build-push-action@v5 - with: - context: . - file: packages/evals/Dockerfile.runner - tags: evals-runner:latest - cache-from: type=gha - cache-to: type=gha,mode=max - push: false - load: true - - - name: Tag images for docker-compose - run: | - cd packages/evals - docker tag evals-web:latest evals-web - docker tag evals-runner:latest evals-runner - - - name: Start server services - run: | - cd packages/evals - docker compose --profile server up -d - - - name: Test service connectivity - run: | - cd packages/evals - - # Wait for services - echo "Waiting for PostgreSQL..." - timeout 60 bash -c 'until docker compose exec -T db pg_isready -U postgres; do sleep 2; done' - - echo "Waiting for Redis..." - timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done' - - # Test inter-container networking - echo "Testing database connection from web container..." - docker compose exec -T web sh -c 'nc -z db 5432 && echo "✓ Database connection successful"' - - echo "Testing Redis connection from web container..." - docker compose exec -T web sh -c 'nc -z redis 6379 && echo "✓ Redis connection successful"' - - # Test that web service can start (basic health check) - echo "Testing web service startup..." - timeout 30 bash -c 'until curl -f http://localhost:3000 2>/dev/null || curl -f http://localhost:3000/health 2>/dev/null; do sleep 2; done' || echo "Web service may not have health endpoint, continuing..." - - - name: Test runner container networking - run: | - cd packages/evals - - echo "Testing runner container can connect to services..." - docker compose run --rm runner sh -c 'nc -z db 5432 && echo "✓ Runner -> Database connection successful"' - docker compose run --rm runner sh -c 'nc -z redis 6379 && echo "✓ Runner -> Redis connection successful"' - docker compose run --rm runner sh -c 'nc -z web 3000 && echo "✓ Runner -> Web service connection successful"' - - - name: Verify Docker socket access - run: | - cd packages/evals - - echo "Testing Docker socket access in runner..." - docker compose run --rm runner docker --version - docker compose run --rm runner docker ps - - - name: Show service status - if: always() - run: | - cd packages/evals - echo "=== Service Status ===" - docker compose ps - - echo "=== Network Information ===" - docker network ls | grep evals || echo "No evals network found" - - echo "=== Container Information ===" - docker compose exec -T db sh -c 'echo "Database container hostname: $(hostname)"' - docker compose exec -T redis sh -c 'echo "Redis container hostname: $(hostname)"' - - - name: Cleanup - if: always() - run: | - cd packages/evals - docker compose down -v --remove-orphans diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index ce2505db22..af039e5662 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -1,39 +1,31 @@ -name: Evals Docker Compose +name: Evals Quick Test on: + pull_request: + branches: [main, develop] + paths: + - "packages/evals/**" + - ".github/workflows/evals*.yml" workflow_dispatch: - inputs: - run_full_evals: - description: "Run full evaluation suite" - required: false - default: "false" - type: boolean - concurrency: - description: "Evaluation concurrency level" - required: false - default: "2" - type: string env: DOCKER_BUILDKIT: 1 COMPOSE_DOCKER_CLI_BUILD: 1 jobs: - build-and-test: - name: Build and Test Evals + test-docker-compose: + name: Test Docker Compose Networking runs-on: ubuntu-latest - timeout-minutes: 30 + timeout-minutes: 15 steps: - name: Checkout repository uses: actions/checkout@v4 - with: - fetch-depth: 0 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - - name: Create environment files + - name: Create test environment run: | cd packages/evals @@ -42,7 +34,7 @@ jobs: OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY || 'test-key-for-build' }} EOF - # Create .env.development for development + # Create development environment cat > .env.development << EOF NODE_ENV=development DATABASE_URL=postgresql://postgres:password@db:5432/evals_development @@ -50,231 +42,100 @@ jobs: HOST_EXECUTION_METHOD=docker EOF - - name: Build web image with cache + - name: Build images with cache uses: docker/build-push-action@v5 with: - context: . - file: packages/evals/Dockerfile.web - tags: evals-web:latest - cache-from: type=gha - cache-to: type=gha,mode=max - push: false - load: true - + context: . + file: packages/evals/Dockerfile.web + tags: evals-web:latest + cache-from: type=gha + cache-to: type=gha,mode=max + push: false + load: true + - name: Build runner image with cache uses: docker/build-push-action@v5 with: - context: . - file: packages/evals/Dockerfile.runner - tags: evals-runner:latest - cache-from: type=gha - cache-to: type=gha,mode=max - push: false - load: true - + context: . + file: packages/evals/Dockerfile.runner + tags: evals-runner:latest + cache-from: type=gha + cache-to: type=gha,mode=max + push: false + load: true + - name: Tag images for docker-compose run: | - cd packages/evals - docker tag evals-web:latest evals-web - docker tag evals-runner:latest evals-runner + cd packages/evals + docker tag evals-web:latest evals-web + docker tag evals-runner:latest evals-runner - name: Start server services run: | cd packages/evals docker compose --profile server up -d - - name: Wait for services to be ready + - name: Test service connectivity run: | cd packages/evals - # Wait for PostgreSQL to be ready + # Wait for services echo "Waiting for PostgreSQL..." - timeout 60 bash -c 'until docker compose exec -T db pg_isready -U postgres -d evals_development; do sleep 2; done' + timeout 60 bash -c 'until docker compose exec -T db pg_isready -U postgres; do sleep 2; done' - # Wait for Redis to be ready echo "Waiting for Redis..." timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done' - # Wait for web service to be ready - echo "Waiting for web service..." - timeout 60 bash -c 'until curl -f http://localhost:3000/health 2>/dev/null || curl -f http://localhost:3000 2>/dev/null; do sleep 2; done' + # Test inter-container networking + echo "Testing database connection from web container..." + docker compose exec -T web sh -c 'nc -z db 5432 && echo "✓ Database connection successful"' - - name: Run database migrations - run: | - cd packages/evals - docker compose exec -T web pnpm db:push + echo "Testing Redis connection from web container..." + docker compose exec -T web sh -c 'nc -z redis 6379 && echo "✓ Redis connection successful"' - - name: Run tests - run: | - cd packages/evals - docker compose run --rm runner pnpm _test + # Test that web service can start (basic health check) + echo "Testing web service startup..." + timeout 30 bash -c 'until curl -f http://localhost:3000 2>/dev/null || curl -f http://localhost:3000/health 2>/dev/null; do sleep 2; done' || echo "Web service may not have health endpoint, continuing..." - - name: Check service logs on failure - if: failure() + - name: Test runner container networking run: | cd packages/evals - echo "=== Database logs ===" - docker compose logs db - echo "=== Redis logs ===" - docker compose logs redis - echo "=== Web service logs ===" - docker compose logs web - - name: Cleanup - if: always() + echo "Testing runner container can connect to services..." + docker compose run --rm runner sh -c 'nc -z db 5432 && echo "✓ Runner -> Database connection successful"' + docker compose run --rm runner sh -c 'nc -z redis 6379 && echo "✓ Runner -> Redis connection successful"' + docker compose run --rm runner sh -c 'nc -z web 3000 && echo "✓ Runner -> Web service connection successful"' + + - name: Verify Docker socket access run: | cd packages/evals - docker compose down -v --remove-orphans - - run-sample-evals: - name: Run Sample Evaluations - runs-on: ubuntu-latest - needs: build-and-test - if: github.event.inputs.run_full_evals == 'true' || github.event_name == 'workflow_dispatch' - timeout-minutes: 60 - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 + echo "Testing Docker socket access in runner..." + docker compose run --rm runner docker --version + docker compose run --rm runner docker ps - - name: Create environment files + - name: Show service status + if: always() run: | cd packages/evals - # Create .env.local with actual API key for evaluations - cat > .env.local << EOF - OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY || 'test-key-for-build' }} - EOF - - cat > .env.development << EOF - NODE_ENV=development - DATABASE_URL=postgresql://postgres:password@db:5432/evals_development - REDIS_URL=redis://redis:6379 - HOST_EXECUTION_METHOD=docker - EOF - - - name: Build web image with cache - uses: docker/build-push-action@v5 - with: - context: . - file: packages/evals/Dockerfile.web - tags: evals-web:latest - cache-from: type=gha - cache-to: type=gha,mode=max - push: false - load: true - - - name: Build runner image with cache - uses: docker/build-push-action@v5 - with: - context: . - file: packages/evals/Dockerfile.runner - tags: evals-runner:latest - cache-from: type=gha - cache-to: type=gha,mode=max - push: false - load: true - - - name: Tag images and start services - run: | - cd packages/evals - docker tag evals-web:latest evals-web - docker tag evals-runner:latest evals-runner - docker compose --profile server --profile runner up -d --scale runner=0 + echo "=== Service Status ===" + docker compose ps - - name: Wait for services - run: | - cd packages/evals - timeout 120 bash -c 'until docker compose exec -T db pg_isready -U postgres -d evals_development; do sleep 2; done' - timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done' - timeout 60 bash -c 'until curl -f http://localhost:3000 2>/dev/null; do sleep 2; done' + echo "=== Network Information ===" + docker network ls | grep evals || echo "No evals network found" - - name: Run database setup - run: | - cd packages/evals - docker compose exec -T web pnpm db:push + echo "=== Container Information ===" + docker compose exec -T db sh -c 'echo "Database container hostname: $(hostname)"' + docker compose exec -T redis sh -c 'echo "Redis container hostname: $(hostname)"' - name: Run sample evaluation - env: - CONCURRENCY: ${{ github.event.inputs.concurrency || '2' }} run: | cd packages/evals - - # Run a limited set of evaluations for CI - docker compose run --rm runner pnpm cli run \ - --concurrency $CONCURRENCY \ - --timeout 300 \ - --max-exercises 3 \ - --model "anthropic/claude-3-5-sonnet-20241022" - - - name: Upload evaluation results - if: always() - uses: actions/upload-artifact@v4 - with: - name: evaluation-results - path: | - packages/evals/results/ - packages/evals/logs/ - retention-days: 7 + docker compose run --rm runner pnpm --filter @roo-code/evals cli --ci - name: Cleanup if: always() run: | cd packages/evals docker compose down -v --remove-orphans - - security-scan: - name: Security Scan - runs-on: ubuntu-latest - needs: build-and-test - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Run Trivy vulnerability scanner - uses: aquasecurity/trivy-action@master - with: - scan-type: "fs" - scan-ref: "packages/evals" - format: "sarif" - output: "trivy-results.sarif" - - - name: Upload Trivy scan results - uses: github/codeql-action/upload-sarif@v3 - if: always() - with: - sarif_file: "trivy-results.sarif" - - docker-compose-validate: - name: Validate Docker Compose - runs-on: ubuntu-latest - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Validate Docker Compose file - run: | - cd packages/evals - docker compose config --quiet - - - name: Check Docker Compose services - run: | - cd packages/evals - docker compose config --services | sort > services.txt - echo "Available services:" - cat services.txt - - # Verify expected services exist - for service in db redis web runner; do - if ! grep -q "^$service$" services.txt; then - echo "ERROR: Service '$service' not found in docker-compose.yml" - exit 1 - fi - done - - echo "All expected services found ✓" diff --git a/apps/web-evals/src/actions/exercises.ts b/apps/web-evals/src/actions/exercises.ts index 8cffa40ba3..17eb1ff085 100644 --- a/apps/web-evals/src/actions/exercises.ts +++ b/apps/web-evals/src/actions/exercises.ts @@ -1,37 +1,22 @@ "use server" -import * as fs from "fs/promises" import * as path from "path" import { fileURLToPath } from "url" -import { type ExerciseLanguage, exerciseLanguages } from "@roo-code/evals" +import { exerciseLanguages, listDirectories } from "@roo-code/evals" const __dirname = path.dirname(fileURLToPath(import.meta.url)) // /apps/web-evals/src/actions -const EXERCISES_BASE_PATH = path.resolve(__dirname, "../../../../../evals") - -export const listDirectories = async (relativePath: string) => { - try { - const targetPath = path.resolve(__dirname, relativePath) - const entries = await fs.readdir(targetPath, { withFileTypes: true }) - return entries.filter((entry) => entry.isDirectory() && !entry.name.startsWith(".")).map((entry) => entry.name) - } catch (error) { - console.error(`Error listing directories at ${relativePath}:`, error) - return [] - } -} +const EVALS_REPO_PATH = path.resolve(__dirname, "../../../../../evals") export const getExercises = async () => { const result = await Promise.all( exerciseLanguages.map(async (language) => { - const languagePath = path.join(EXERCISES_BASE_PATH, language) - const exercises = await listDirectories(languagePath) + const languagePath = path.join(EVALS_REPO_PATH, language) + const exercises = await listDirectories(__dirname, languagePath) return exercises.map((exercise) => `${language}/${exercise}`) }), ) return result.flat() } - -export const getExercisesForLanguage = async (language: ExerciseLanguage) => - listDirectories(path.join(EXERCISES_BASE_PATH, language)) diff --git a/apps/web-evals/src/actions/runs.ts b/apps/web-evals/src/actions/runs.ts index 80a4659567..90387d3257 100644 --- a/apps/web-evals/src/actions/runs.ts +++ b/apps/web-evals/src/actions/runs.ts @@ -1,7 +1,9 @@ "use server" -import { spawn } from "child_process" +import * as path from "path" import fs from "fs" +import { fileURLToPath } from "url" +import { spawn } from "child_process" import { revalidatePath } from "next/cache" import pMap from "p-map" @@ -12,11 +14,12 @@ import { createRun as _createRun, deleteRun as _deleteRun, createTask, + getExercisesForLanguage, } from "@roo-code/evals" import { CreateRun } from "@/lib/schemas" -import { getExercisesForLanguage } from "./exercises" +const EVALS_REPO_PATH = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "../../../../../evals") // eslint-disable-next-line @typescript-eslint/no-unused-vars export async function createRun({ suite, exercises = [], systemPrompt, ...values }: CreateRun) { @@ -37,9 +40,9 @@ export async function createRun({ suite, exercises = [], systemPrompt, ...values } } else { for (const language of exerciseLanguages) { - const exercises = await getExercisesForLanguage(language) + const exercises = await getExercisesForLanguage(EVALS_REPO_PATH, language) - await pMap(exercises, (exercise) => createTask({ ...values, runId: run.id, language, exercise }), { + await pMap(exercises, (exercise) => createTask({ runId: run.id, language, exercise }), { concurrency: 10, }) } diff --git a/packages/evals/src/cli/FileLogger.ts b/packages/evals/src/cli/FileLogger.ts deleted file mode 100644 index 443c1d2c53..0000000000 --- a/packages/evals/src/cli/FileLogger.ts +++ /dev/null @@ -1,86 +0,0 @@ -import * as fs from "fs" -import * as path from "path" - -export enum LogLevel { - INFO = "INFO", - ERROR = "ERROR", - WARN = "WARN", - DEBUG = "DEBUG", -} - -export interface LoggerOptions { - logDir: string - filename: string - tag: string -} - -export class FileLogger { - private logStream: fs.WriteStream | undefined - private logFilePath: string - private tag: string - - constructor({ logDir, filename, tag }: LoggerOptions) { - this.tag = tag - this.logFilePath = path.join(logDir, filename) - this.initializeLogger(logDir) - } - - private initializeLogger(logDir: string): void { - try { - fs.mkdirSync(logDir, { recursive: true }) - } catch (error) { - console.error(`Failed to create log directory ${logDir}:`, error) - } - - try { - this.logStream = fs.createWriteStream(this.logFilePath, { flags: "a" }) - } catch (error) { - console.error(`Failed to create log file ${this.logFilePath}:`, error) - } - } - - private writeToLog(level: LogLevel, message: string, ...args: unknown[]) { - try { - const timestamp = new Date().toISOString() - - const logLine = `[${timestamp} | ${level} | ${this.tag}] ${message} ${ - args.length > 0 ? JSON.stringify(args) : "" - }\n` - - console.log(logLine.trim()) - - if (this.logStream) { - this.logStream.write(logLine) - } - } catch (error) { - console.error(`Failed to write to log file ${this.logFilePath}:`, error) - } - } - - public info(message: string, ...args: unknown[]): void { - this.writeToLog(LogLevel.INFO, message, ...args) - } - - public error(message: string, ...args: unknown[]): void { - this.writeToLog(LogLevel.ERROR, message, ...args) - } - - public warn(message: string, ...args: unknown[]): void { - this.writeToLog(LogLevel.WARN, message, ...args) - } - - public debug(message: string, ...args: unknown[]): void { - this.writeToLog(LogLevel.DEBUG, message, ...args) - } - - public log(message: string, ...args: unknown[]): void { - this.info(message, ...args) - } - - public close(): void { - if (this.logStream) { - this.logStream.end() - this.logStream = undefined - } - } -} diff --git a/packages/evals/src/cli/index.ts b/packages/evals/src/cli/index.ts index 35c9203093..0a0be28390 100644 --- a/packages/evals/src/cli/index.ts +++ b/packages/evals/src/cli/index.ts @@ -1,11 +1,12 @@ import * as fs from "fs" -import { command, run, number, option } from "cmd-ts" +import { run, command, option, flag, number, boolean } from "cmd-ts" -import { exercisesPath } from "../exercises/index.js" +import { EVALS_REPO_PATH } from "../exercises/index.js" +import { runCi } from "./runCi.js" import { runEvals } from "./runEvals.js" -import { processTask } from "./processTask.js" +import { processTask } from "./runTask.js" const main = async () => { await run( @@ -14,25 +15,22 @@ const main = async () => { description: "Execute an eval run.", version: "0.0.0", args: { + ci: flag({ type: boolean, long: "ci", defaultValue: () => false }), runId: option({ type: number, long: "runId", short: "r", defaultValue: () => -1 }), taskId: option({ type: number, long: "taskId", short: "t", defaultValue: () => -1 }), }, handler: async (args) => { - const { runId, taskId } = args - - if (runId === -1 && taskId === -1) { - throw new Error("Either runId or taskId must be provided.") - } - - if (runId !== -1 && taskId !== -1) { - throw new Error("Only one of runId or taskId must be provided.") - } + const { runId, taskId, ci } = args try { - if (runId !== -1) { + if (ci) { + await runCi() + } else if (runId !== -1) { await runEvals(runId) - } else { + } else if (taskId !== -1) { await processTask({ taskId }) + } else { + throw new Error("Either runId or taskId must be provided.") } } catch (error) { console.error(error) @@ -46,9 +44,9 @@ const main = async () => { process.exit(0) } -if (!fs.existsSync(exercisesPath)) { +if (!fs.existsSync(EVALS_REPO_PATH)) { console.error( - `Exercises do not exist at ${exercisesPath}. Please run "git clone https://github.com/RooCodeInc/Roo-Code-Evals.git evals".`, + `Exercises do not exist at ${EVALS_REPO_PATH}. Please run "git clone https://github.com/RooCodeInc/Roo-Code-Evals.git evals".`, ) process.exit(1) diff --git a/packages/evals/src/cli/processTask.ts b/packages/evals/src/cli/processTask.ts deleted file mode 100644 index 2b70013864..0000000000 --- a/packages/evals/src/cli/processTask.ts +++ /dev/null @@ -1,112 +0,0 @@ -import { execa } from "execa" - -import { RooCodeEventName, type TaskEvent } from "@roo-code/types" - -import { findTask, updateTask, findRun } from "../db/index.js" - -import { getTag } from "./utils.js" -import { FileLogger } from "./FileLogger.js" -import { redisClient, getPubSubKey, registerRunner, deregisterRunner } from "./redis.js" -import { runTask } from "./runTask.js" -import { runUnitTest } from "./runUnitTest.js" - -export const processTask = async ({ taskId, logger }: { taskId: number; logger?: FileLogger }) => { - const task = await findTask(taskId) - const { language, exercise } = task - const run = await findRun(task.runId) - await registerRunner({ runId: run.id, taskId }) - - logger = - logger || - new FileLogger({ - logDir: `/var/log/evals/runs/${run.id}`, - filename: `${language}-${exercise}.log`, - tag: getTag("runTask", { run, task }), - }) - - try { - const publish = async (e: TaskEvent) => { - const redis = await redisClient() - await redis.publish(getPubSubKey(run.id), JSON.stringify(e)) - } - - logger.info(`running task ${task.id} (${language}/${exercise})...`) - await runTask({ run, task, publish, logger }) - - logger.info(`testing task ${task.id} (${language}/${exercise})...`) - const passed = await runUnitTest({ run, task }) - - logger.info(`task ${task.id} (${language}/${exercise}) -> ${passed}`) - await updateTask(task.id, { passed }) - - await publish({ - eventName: passed ? RooCodeEventName.EvalPass : RooCodeEventName.EvalFail, - taskId: task.id, - }) - } finally { - await deregisterRunner({ runId: run.id, taskId }) - } -} - -export const processTaskInContainer = async ({ - taskId, - logger, - maxRetries = 10, -}: { - taskId: number - logger: FileLogger - maxRetries?: number -}) => { - const baseArgs = [ - "--rm", - "--network evals_default", - "-v /var/run/docker.sock:/var/run/docker.sock", - "-v /tmp/evals:/var/log/evals", - "-e HOST_EXECUTION_METHOD=docker", - ] - - const command = `pnpm --filter @roo-code/evals cli --taskId ${taskId}` - logger.info(command) - - for (let attempt = 0; attempt <= maxRetries; attempt++) { - const containerName = `evals-task-${taskId}.${attempt}` - const args = [`--name ${containerName}`, ...baseArgs] - const isRetry = attempt > 0 - - if (isRetry) { - const delayMs = Math.pow(2, attempt - 1) * 1000 * (0.5 + Math.random()) - logger.info(`retrying in ${delayMs}ms (attempt ${attempt + 1}/${maxRetries + 1})`) - await new Promise((resolve) => setTimeout(resolve, delayMs)) - } - - logger.info( - `${isRetry ? "retrying" : "executing"} container command (attempt ${attempt + 1}/${maxRetries + 1})`, - ) - - const subprocess = execa(`docker run ${args.join(" ")} evals-runner sh -c "${command}"`, { shell: true }) - // subprocess.stdout?.on("data", (data) => console.log(data.toString())) - // subprocess.stderr?.on("data", (data) => console.error(data.toString())) - - try { - const result = await subprocess - logger.info(`container process completed with exit code: ${result.exitCode}`) - return - } catch (error) { - if (error && typeof error === "object" && "exitCode" in error) { - logger.error( - `container process failed with exit code: ${error.exitCode} (attempt ${attempt + 1}/${maxRetries + 1})`, - ) - } else { - logger.error(`container process failed with error: ${error} (attempt ${attempt + 1}/${maxRetries + 1})`) - } - - if (attempt === maxRetries) { - break - } - } - } - - logger.error(`all ${maxRetries + 1} attempts failed, giving up`) - - // TODO: Mark task as failed. -} diff --git a/packages/evals/src/cli/runCi.ts b/packages/evals/src/cli/runCi.ts new file mode 100644 index 0000000000..da9fbca92f --- /dev/null +++ b/packages/evals/src/cli/runCi.ts @@ -0,0 +1,25 @@ +import pMap from "p-map" + +import { EVALS_REPO_PATH, exerciseLanguages, getExercisesForLanguage } from "../exercises/index.js" +import { createRun, createTask } from "../db/index.js" + +import { runEvals } from "./runEvals.js" + +export const runCi = async ({ + concurrency = 3, + exercisesPerLanguage = 3, +}: { + concurrency?: number + exercisesPerLanguage?: number +} = {}) => { + console.log("Running evals in CI mode.") + + const run = await createRun({ model: "anthropic/claude-sonnet-4", socketPath: "", concurrency }) + + for (const language of exerciseLanguages) { + const exercises = (await getExercisesForLanguage(EVALS_REPO_PATH, language)).slice(0, exercisesPerLanguage) + await pMap(exercises, (exercise) => createTask({ runId: run.id, language, exercise })) + } + + await runEvals(run.id) +} diff --git a/packages/evals/src/cli/runEvals.ts b/packages/evals/src/cli/runEvals.ts index 56bc6ce222..00199bbb44 100644 --- a/packages/evals/src/cli/runEvals.ts +++ b/packages/evals/src/cli/runEvals.ts @@ -1,12 +1,11 @@ import PQueue from "p-queue" import { findRun, finishRun, getTasks } from "../db/index.js" -import { exercisesPath } from "../exercises/index.js" +import { EVALS_REPO_PATH } from "../exercises/index.js" -import { getTag, isDockerContainer, resetEvalsRepo, commitEvalsRepoChanges } from "./utils.js" -import { processTask, processTaskInContainer } from "./processTask.js" +import { Logger, getTag, isDockerContainer, resetEvalsRepo, commitEvalsRepoChanges } from "./utils.js" import { startHeartbeat, stopHeartbeat } from "./redis.js" -import { FileLogger } from "./FileLogger.js" +import { processTask, processTaskInContainer } from "./runTask.js" export const runEvals = async (runId: number) => { const run = await findRun(runId) @@ -21,7 +20,7 @@ export const runEvals = async (runId: number) => { throw new Error(`Run ${run.id} has no tasks.`) } - const logger = new FileLogger({ + const logger = new Logger({ logDir: `/var/log/evals/runs/${run.id}`, filename: `controller.log`, tag: getTag("runEvals", { run }), @@ -32,7 +31,7 @@ export const runEvals = async (runId: number) => { const containerized = isDockerContainer() if (!containerized) { - await resetEvalsRepo({ run, cwd: exercisesPath }) + await resetEvalsRepo({ run, cwd: EVALS_REPO_PATH }) } const heartbeat = await startHeartbeat(run.id) @@ -63,7 +62,7 @@ export const runEvals = async (runId: number) => { // will lost when the container is destroyed. I think we should // store the diffs in the database instead. if (!containerized) { - await commitEvalsRepoChanges({ run, cwd: exercisesPath }) + await commitEvalsRepoChanges({ run, cwd: EVALS_REPO_PATH }) } } finally { logger.info("cleaning up") diff --git a/packages/evals/src/cli/runTask.ts b/packages/evals/src/cli/runTask.ts index 14e73dc59c..14028b493a 100644 --- a/packages/evals/src/cli/runTask.ts +++ b/packages/evals/src/cli/runTask.ts @@ -15,11 +15,21 @@ import { } from "@roo-code/types" import { IpcClient } from "@roo-code/ipc" -import { type Run, type Task, updateTask, createTaskMetrics, updateTaskMetrics, createToolError } from "../db/index.js" -import { exercisesPath } from "../exercises/index.js" - -import { isDockerContainer } from "./utils.js" -import { FileLogger } from "./FileLogger.js" +import { + type Run, + type Task, + findRun, + findTask, + updateTask, + createTaskMetrics, + updateTaskMetrics, + createToolError, +} from "../db/index.js" +import { EVALS_REPO_PATH } from "../exercises/index.js" + +import { Logger, getTag, isDockerContainer } from "./utils.js" +import { redisClient, getPubSubKey, registerRunner, deregisterRunner } from "./redis.js" +import { runUnitTest } from "./runUnitTest.js" class SubprocessTimeoutError extends Error { constructor(timeout: number) { @@ -28,17 +38,118 @@ class SubprocessTimeoutError extends Error { } } +export const processTask = async ({ taskId, logger }: { taskId: number; logger?: Logger }) => { + const task = await findTask(taskId) + const { language, exercise } = task + const run = await findRun(task.runId) + await registerRunner({ runId: run.id, taskId }) + + logger = + logger || + new Logger({ + logDir: `/var/log/evals/runs/${run.id}`, + filename: `${language}-${exercise}.log`, + tag: getTag("runTask", { run, task }), + }) + + try { + const publish = async (e: TaskEvent) => { + const redis = await redisClient() + await redis.publish(getPubSubKey(run.id), JSON.stringify(e)) + } + + logger.info(`running task ${task.id} (${language}/${exercise})...`) + await runTask({ run, task, publish, logger }) + + logger.info(`testing task ${task.id} (${language}/${exercise})...`) + const passed = await runUnitTest({ task, logger }) + + logger.info(`task ${task.id} (${language}/${exercise}) -> ${passed}`) + await updateTask(task.id, { passed }) + + await publish({ + eventName: passed ? RooCodeEventName.EvalPass : RooCodeEventName.EvalFail, + taskId: task.id, + }) + } finally { + await deregisterRunner({ runId: run.id, taskId }) + } +} + +export const processTaskInContainer = async ({ + taskId, + logger, + maxRetries = 10, +}: { + taskId: number + logger: Logger + maxRetries?: number +}) => { + const baseArgs = [ + "--rm", + "--network evals_default", + "-v /var/run/docker.sock:/var/run/docker.sock", + "-v /tmp/evals:/var/log/evals", + "-e HOST_EXECUTION_METHOD=docker", + ] + + const command = `pnpm --filter @roo-code/evals cli --taskId ${taskId}` + logger.info(command) + + for (let attempt = 0; attempt <= maxRetries; attempt++) { + const containerName = `evals-task-${taskId}.${attempt}` + const args = [`--name ${containerName}`, ...baseArgs] + const isRetry = attempt > 0 + + if (isRetry) { + const delayMs = Math.pow(2, attempt - 1) * 1000 * (0.5 + Math.random()) + logger.info(`retrying in ${delayMs}ms (attempt ${attempt + 1}/${maxRetries + 1})`) + await new Promise((resolve) => setTimeout(resolve, delayMs)) + } + + logger.info( + `${isRetry ? "retrying" : "executing"} container command (attempt ${attempt + 1}/${maxRetries + 1})`, + ) + + const subprocess = execa(`docker run ${args.join(" ")} evals-runner sh -c "${command}"`, { shell: true }) + // subprocess.stdout?.on("data", (data) => console.log(data.toString())) + // subprocess.stderr?.on("data", (data) => console.error(data.toString())) + + try { + const result = await subprocess + logger.info(`container process completed with exit code: ${result.exitCode}`) + return + } catch (error) { + if (error && typeof error === "object" && "exitCode" in error) { + logger.error( + `container process failed with exit code: ${error.exitCode} (attempt ${attempt + 1}/${maxRetries + 1})`, + ) + } else { + logger.error(`container process failed with error: ${error} (attempt ${attempt + 1}/${maxRetries + 1})`) + } + + if (attempt === maxRetries) { + break + } + } + } + + logger.error(`all ${maxRetries + 1} attempts failed, giving up`) + + // TODO: Mark task as failed. +} + type RunTaskOptions = { run: Run task: Task publish: (taskEvent: TaskEvent) => Promise - logger: FileLogger + logger: Logger } export const runTask = async ({ run, task, publish, logger }: RunTaskOptions) => { const { language, exercise } = task - const prompt = fs.readFileSync(path.resolve(exercisesPath, `prompts/${language}.md`), "utf-8") - const workspacePath = path.resolve(exercisesPath, language, exercise) + const prompt = fs.readFileSync(path.resolve(EVALS_REPO_PATH, `prompts/${language}.md`), "utf-8") + const workspacePath = path.resolve(EVALS_REPO_PATH, language, exercise) const ipcSocketPath = path.resolve(os.tmpdir(), `evals-${run.id}-${task.id}.sock`) const env = { ROO_CODE_IPC_SOCKET_PATH: ipcSocketPath } const controller = new AbortController() diff --git a/packages/evals/src/cli/runUnitTest.ts b/packages/evals/src/cli/runUnitTest.ts index 7785312e76..6f8fbac619 100644 --- a/packages/evals/src/cli/runUnitTest.ts +++ b/packages/evals/src/cli/runUnitTest.ts @@ -3,14 +3,14 @@ import * as path from "path" import { execa, parseCommandString } from "execa" import psTree from "ps-tree" -import type { Run, Task } from "../db/index.js" -import { type ExerciseLanguage, exercisesPath } from "../exercises/index.js" +import type { Task } from "../db/index.js" +import { type ExerciseLanguage, EVALS_REPO_PATH } from "../exercises/index.js" -import { getTag } from "./utils.js" +import { Logger } from "./utils.js" const UNIT_TEST_TIMEOUT = 2 * 60 * 1_000 -const testCommands: Record = { +const testCommands: Record = { go: { commands: ["go test"] }, java: { commands: ["./gradlew test"] }, javascript: { commands: ["pnpm install", "pnpm test"] }, @@ -18,22 +18,21 @@ const testCommands: Record { - const tag = getTag("runUnitTest", { run, task }) - const log = (message: string, ...args: unknown[]) => console.log(`[${Date.now()} | ${tag}] ${message}`, ...args) - const logError = (message: string, ...args: unknown[]) => - console.error(`[${Date.now()} | ${tag}] ${message}`, ...args) +type RunUnitTestOptions = { + task: Task + logger: Logger +} +export const runUnitTest = async ({ task, logger }: RunUnitTestOptions) => { const cmd = testCommands[task.language] - const exercisePath = path.resolve(exercisesPath, task.language, task.exercise) - const cwd = cmd.cwd ? path.resolve(exercisePath, cmd.cwd) : exercisePath + const cwd = path.resolve(EVALS_REPO_PATH, task.language, task.exercise) const commands = cmd.commands.map((cs) => parseCommandString(cs)) let passed = true for (const command of commands) { try { - log(`running "${command.join(" ")}"`) + logger.info(`running "${command.join(" ")}"`) const subprocess = execa({ cwd, shell: "/bin/bash", reject: false })`${command}` subprocess.stdout.pipe(process.stdout) subprocess.stderr.pipe(process.stderr) @@ -49,25 +48,27 @@ export const runUnitTest = async ({ run, task }: { run: Run; task: Task }) => { }) }) - log(`"${command.join(" ")}" timed out, killing ${subprocess.pid} + ${JSON.stringify(descendants)}`) + logger.info( + `"${command.join(" ")}" timed out, killing ${subprocess.pid} + ${JSON.stringify(descendants)}`, + ) if (descendants.length > 0) { for (const descendant of descendants) { try { - log(`killing descendant process ${descendant}`) + logger.info(`killing descendant process ${descendant}`) await execa`kill -9 ${descendant}` } catch (error) { - logError(`failed to kill descendant process ${descendant}:`, error) + logger.error(`failed to kill descendant process ${descendant}:`, error) } } } - log(`killing main process ${subprocess.pid}`) + logger.info(`killing main process ${subprocess.pid}`) try { await execa`kill -9 ${subprocess.pid!}` } catch (error) { - logError(`failed to kill main process ${subprocess.pid}:`, error) + logger.error(`failed to kill main process ${subprocess.pid}:`, error) } }, UNIT_TEST_TIMEOUT) @@ -80,7 +81,7 @@ export const runUnitTest = async ({ run, task }: { run: Run; task: Task }) => { break } } catch (error) { - logError(`unexpected error:`, error) + logger.error(`unexpected error:`, error) passed = false break } diff --git a/packages/evals/src/cli/utils.ts b/packages/evals/src/cli/utils.ts index cbabb451b9..bf1489d09b 100644 --- a/packages/evals/src/cli/utils.ts +++ b/packages/evals/src/cli/utils.ts @@ -1,4 +1,5 @@ import * as fs from "fs" +import * as path from "path" import { execa } from "execa" @@ -29,3 +30,87 @@ export const commitEvalsRepoChanges = async ({ run, cwd }: { run: Run; cwd: stri await execa({ cwd })`git add .` await execa({ cwd })`git commit -m ${`Run #${run.id}`} --no-verify` } + +enum LogLevel { + INFO = "INFO", + ERROR = "ERROR", + WARN = "WARN", + DEBUG = "DEBUG", +} + +interface LoggerOptions { + logDir: string + filename: string + tag: string +} + +export class Logger { + private logStream: fs.WriteStream | undefined + private logFilePath: string + private tag: string + + constructor({ logDir, filename, tag }: LoggerOptions) { + this.tag = tag + this.logFilePath = path.join(logDir, filename) + this.initializeLogger(logDir) + } + + private initializeLogger(logDir: string): void { + try { + fs.mkdirSync(logDir, { recursive: true }) + } catch (error) { + console.error(`Failed to create log directory ${logDir}:`, error) + } + + try { + this.logStream = fs.createWriteStream(this.logFilePath, { flags: "a" }) + } catch (error) { + console.error(`Failed to create log file ${this.logFilePath}:`, error) + } + } + + private writeToLog(level: LogLevel, message: string, ...args: unknown[]) { + try { + const timestamp = new Date().toISOString() + + const logLine = `[${timestamp} | ${level} | ${this.tag}] ${message} ${ + args.length > 0 ? JSON.stringify(args) : "" + }\n` + + console.log(logLine.trim()) + + if (this.logStream) { + this.logStream.write(logLine) + } + } catch (error) { + console.error(`Failed to write to log file ${this.logFilePath}:`, error) + } + } + + public info(message: string, ...args: unknown[]): void { + this.writeToLog(LogLevel.INFO, message, ...args) + } + + public error(message: string, ...args: unknown[]): void { + this.writeToLog(LogLevel.ERROR, message, ...args) + } + + public warn(message: string, ...args: unknown[]): void { + this.writeToLog(LogLevel.WARN, message, ...args) + } + + public debug(message: string, ...args: unknown[]): void { + this.writeToLog(LogLevel.DEBUG, message, ...args) + } + + public log(message: string, ...args: unknown[]): void { + this.info(message, ...args) + } + + public close(): void { + if (this.logStream) { + this.logStream.end() + this.logStream = undefined + } + } +} diff --git a/packages/evals/src/exercises/index.ts b/packages/evals/src/exercises/index.ts index 17e339f21a..7ba34f2a2b 100644 --- a/packages/evals/src/exercises/index.ts +++ b/packages/evals/src/exercises/index.ts @@ -4,15 +4,15 @@ import { fileURLToPath } from "url" const __dirname = path.dirname(fileURLToPath(import.meta.url)) -export const exercisesPath = path.resolve(__dirname, "..", "..", "..", "..", "..", "evals") +export const EVALS_REPO_PATH = path.resolve(__dirname, "..", "..", "..", "..", "..", "evals") export const exerciseLanguages = ["go", "java", "javascript", "python", "rust"] as const export type ExerciseLanguage = (typeof exerciseLanguages)[number] -const listDirectories = async (relativePath: string) => { +export const listDirectories = async (basePath: string, relativePath: string) => { try { - const targetPath = path.resolve(__dirname, relativePath) + const targetPath = path.resolve(basePath, relativePath) const entries = await fs.readdir(targetPath, { withFileTypes: true }) return entries.filter((entry) => entry.isDirectory() && !entry.name.startsWith(".")).map((entry) => entry.name) } catch (error) { @@ -21,5 +21,5 @@ const listDirectories = async (relativePath: string) => { } } -export const getExercisesForLanguage = async (language: ExerciseLanguage) => - listDirectories(path.join(exercisesPath, language)) +export const getExercisesForLanguage = async (basePath: string, language: ExerciseLanguage) => + listDirectories(__dirname, path.join(basePath, language)) From 2bc65ca8f0fb9b6736b056422fc52329ca495bd9 Mon Sep 17 00:00:00 2001 From: cte Date: Tue, 10 Jun 2025 08:07:43 -0700 Subject: [PATCH 10/20] Increase timeout --- .github/workflows/evals.yml | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index af039e5662..ff315b4194 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -1,4 +1,4 @@ -name: Evals Quick Test +name: Evals on: pull_request: @@ -13,10 +13,9 @@ env: COMPOSE_DOCKER_CLI_BUILD: 1 jobs: - test-docker-compose: - name: Test Docker Compose Networking + evals: runs-on: ubuntu-latest - timeout-minutes: 15 + timeout-minutes: 30 steps: - name: Checkout repository @@ -25,16 +24,14 @@ jobs: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - - name: Create test environment + - name: Create environment run: | cd packages/evals - # Create .env.local (required for Docker build) cat > .env.local << EOF OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY || 'test-key-for-build' }} EOF - # Create development environment cat > .env.development << EOF NODE_ENV=development DATABASE_URL=postgresql://postgres:password@db:5432/evals_development @@ -42,7 +39,7 @@ jobs: HOST_EXECUTION_METHOD=docker EOF - - name: Build images with cache + - name: Build web image uses: docker/build-push-action@v5 with: context: . @@ -53,7 +50,7 @@ jobs: push: false load: true - - name: Build runner image with cache + - name: Build runner image uses: docker/build-push-action@v5 with: context: . @@ -64,18 +61,18 @@ jobs: push: false load: true - - name: Tag images for docker-compose + - name: Tag images run: | cd packages/evals docker tag evals-web:latest evals-web docker tag evals-runner:latest evals-runner - - name: Start server services + - name: Start containers run: | cd packages/evals docker compose --profile server up -d - - name: Test service connectivity + - name: Wait for containers run: | cd packages/evals @@ -86,14 +83,12 @@ jobs: echo "Waiting for Redis..." timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done' - # Test inter-container networking echo "Testing database connection from web container..." docker compose exec -T web sh -c 'nc -z db 5432 && echo "✓ Database connection successful"' echo "Testing Redis connection from web container..." docker compose exec -T web sh -c 'nc -z redis 6379 && echo "✓ Redis connection successful"' - # Test that web service can start (basic health check) echo "Testing web service startup..." timeout 30 bash -c 'until curl -f http://localhost:3000 2>/dev/null || curl -f http://localhost:3000/health 2>/dev/null; do sleep 2; done' || echo "Web service may not have health endpoint, continuing..." @@ -106,7 +101,7 @@ jobs: docker compose run --rm runner sh -c 'nc -z redis 6379 && echo "✓ Runner -> Redis connection successful"' docker compose run --rm runner sh -c 'nc -z web 3000 && echo "✓ Runner -> Web service connection successful"' - - name: Verify Docker socket access + - name: Test Docker socket access run: | cd packages/evals @@ -129,7 +124,7 @@ jobs: docker compose exec -T db sh -c 'echo "Database container hostname: $(hostname)"' docker compose exec -T redis sh -c 'echo "Redis container hostname: $(hostname)"' - - name: Run sample evaluation + - name: Run evals run: | cd packages/evals docker compose run --rm runner pnpm --filter @roo-code/evals cli --ci From 6adb2a6adbaca22ade09b2735f00013525155ab0 Mon Sep 17 00:00:00 2001 From: cte Date: Tue, 10 Jun 2025 08:35:11 -0700 Subject: [PATCH 11/20] Add web health check --- .github/workflows/evals.yml | 6 +++++- apps/web-evals/src/app/api/health/route.ts | 24 ++++++++++++++++++++++ packages/evals/package.json | 3 ++- packages/evals/src/cli/runCi.ts | 4 ++-- 4 files changed, 33 insertions(+), 4 deletions(-) create mode 100644 apps/web-evals/src/app/api/health/route.ts diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index ff315b4194..c3d9983417 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -90,7 +90,11 @@ jobs: docker compose exec -T web sh -c 'nc -z redis 6379 && echo "✓ Redis connection successful"' echo "Testing web service startup..." - timeout 30 bash -c 'until curl -f http://localhost:3000 2>/dev/null || curl -f http://localhost:3000/health 2>/dev/null; do sleep 2; done' || echo "Web service may not have health endpoint, continuing..." + timeout 60 bash -c 'until curl -f http://localhost:3000/api/health 2>/dev/null; do echo "Waiting for web service..."; sleep 2; done' + + echo "✓ Web service is healthy" + echo "Health check response:" + curl -s http://localhost:3000/api/health | jq 2>/dev/null || curl -s http://localhost:3000/api/health - name: Test runner container networking run: | diff --git a/apps/web-evals/src/app/api/health/route.ts b/apps/web-evals/src/app/api/health/route.ts new file mode 100644 index 0000000000..ca8a833942 --- /dev/null +++ b/apps/web-evals/src/app/api/health/route.ts @@ -0,0 +1,24 @@ +import { NextResponse } from "next/server" + +export async function GET() { + try { + return NextResponse.json( + { + status: "healthy", + timestamp: new Date().toISOString(), + uptime: process.uptime(), + environment: process.env.NODE_ENV || "production", + }, + { status: 200 }, + ) + } catch (error) { + return NextResponse.json( + { + status: "unhealthy", + timestamp: new Date().toISOString(), + error: error instanceof Error ? error.message : "Unknown error", + }, + { status: 503 }, + ) + } +} diff --git a/packages/evals/package.json b/packages/evals/package.json index 554356e5b1..88195b134b 100644 --- a/packages/evals/package.json +++ b/packages/evals/package.json @@ -21,7 +21,8 @@ "db:start": "docker compose up -d db", "db:stop": "docker compose down db", "redis:start": "docker compose up -d redis", - "redis:stop": "docker compose down redis" + "redis:stop": "docker compose down redis", + "services:start": "docker compose up -d db redis" }, "dependencies": { "@roo-code/ipc": "workspace:^", diff --git a/packages/evals/src/cli/runCi.ts b/packages/evals/src/cli/runCi.ts index da9fbca92f..1b16d52aee 100644 --- a/packages/evals/src/cli/runCi.ts +++ b/packages/evals/src/cli/runCi.ts @@ -6,8 +6,8 @@ import { createRun, createTask } from "../db/index.js" import { runEvals } from "./runEvals.js" export const runCi = async ({ - concurrency = 3, - exercisesPerLanguage = 3, + concurrency = 1, + exercisesPerLanguage = 1, }: { concurrency?: number exercisesPerLanguage?: number From 43a411012f8bf1be2a6b8edb61ab8ac0da35a74a Mon Sep 17 00:00:00 2001 From: cte Date: Tue, 10 Jun 2025 08:36:27 -0700 Subject: [PATCH 12/20] Forward ports for running locally --- packages/evals/docker-compose.yml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/packages/evals/docker-compose.yml b/packages/evals/docker-compose.yml index 37d95dbb59..93e643e44b 100644 --- a/packages/evals/docker-compose.yml +++ b/packages/evals/docker-compose.yml @@ -17,8 +17,10 @@ services: db: container_name: evals-db image: postgres:15.4 - expose: - - 5432 + # expose: + # - 5432 + ports: + - "${EVALS_DB_PORT:-5432}:5432" volumes: - ./.docker/postgres:/var/lib/postgresql/data - ./.docker/scripts/postgres:/docker-entrypoint-initdb.d @@ -38,8 +40,10 @@ services: redis: container_name: evals-redis image: redis:7-alpine - expose: - - 6379 + # expose: + # - 6379 + ports: + - "${EVALS_REDIS_PORT:-6379}:6379" volumes: - ./.docker/redis:/data command: redis-server --appendonly yes From bacb17d5e8f49be9b5ff97241fe01478962afeaf Mon Sep 17 00:00:00 2001 From: cte Date: Tue, 10 Jun 2025 08:44:38 -0700 Subject: [PATCH 13/20] Trigger on label --- .github/workflows/evals.yml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index c3d9983417..e96b414c6c 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -2,10 +2,7 @@ name: Evals on: pull_request: - branches: [main, develop] - paths: - - "packages/evals/**" - - ".github/workflows/evals*.yml" + types: [labeled] workflow_dispatch: env: @@ -14,6 +11,8 @@ env: jobs: evals: + # Run if triggered manually or if PR has 'evals' label. + if: github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'evals') runs-on: ubuntu-latest timeout-minutes: 30 @@ -91,7 +90,7 @@ jobs: echo "Testing web service startup..." timeout 60 bash -c 'until curl -f http://localhost:3000/api/health 2>/dev/null; do echo "Waiting for web service..."; sleep 2; done' - + echo "✓ Web service is healthy" echo "Health check response:" curl -s http://localhost:3000/api/health | jq 2>/dev/null || curl -s http://localhost:3000/api/health From 8453c4a85a8785e31a60aeb47facfd691548ac92 Mon Sep 17 00:00:00 2001 From: cte Date: Tue, 10 Jun 2025 08:49:24 -0700 Subject: [PATCH 14/20] Remove this --- packages/evals/GITHUB_ACTIONS.md | 201 ------------------------------- 1 file changed, 201 deletions(-) delete mode 100644 packages/evals/GITHUB_ACTIONS.md diff --git a/packages/evals/GITHUB_ACTIONS.md b/packages/evals/GITHUB_ACTIONS.md deleted file mode 100644 index 454c42ec87..0000000000 --- a/packages/evals/GITHUB_ACTIONS.md +++ /dev/null @@ -1,201 +0,0 @@ -# GitHub Actions for Evals - -This document describes the GitHub Actions workflows available for the Roo Code Evals system. - -## Workflows - -### 1. `evals.yml` - Full Evaluation Workflow - -**Purpose**: Comprehensive testing and evaluation workflow that builds, tests, and optionally runs full evaluations. - -**Triggers**: - -- Push to `main` or `develop` branches (when evals files change) -- Pull requests to `main` or `develop` branches (when evals files change) -- Manual dispatch with options - -**Jobs**: - -#### `build-and-test` - -- Builds Docker images for web and runner services -- Starts PostgreSQL, Redis, and web services -- Waits for all services to be ready -- Runs database migrations -- Executes test suite -- Provides detailed logging on failure - -#### `run-sample-evals` (conditional) - -- Only runs when manually triggered with `run_full_evals: true` -- Requires `OPENROUTER_API_KEY` secret to be configured -- Runs a limited set of evaluations for CI testing -- Uploads evaluation results as artifacts -- Configurable concurrency level - -#### `security-scan` - -- Runs Trivy vulnerability scanner on the evals package -- Uploads results to GitHub Security tab - -#### `docker-compose-validate` - -- Validates Docker Compose file syntax -- Verifies all expected services are defined - -**Required Secrets**: - -- `OPENROUTER_API_KEY` (only for full evaluation runs) - -### 2. `evals-quick-test.yml` - Quick Networking Test - -**Purpose**: Fast validation of Docker Compose networking and basic functionality. - -**Triggers**: - -- Push to `main` or `develop` branches (when evals files change) -- Pull requests to `main` or `develop` branches (when evals files change) - -**Jobs**: - -#### `test-docker-compose` - -- Tests inter-container networking between all services -- Verifies database and Redis connectivity -- Tests Docker socket access in runner container -- Validates service startup and health - -#### `validate-compose-file` - -- Validates Docker Compose syntax -- Checks service definitions and profiles - -## Usage Examples - -### Manual Workflow Dispatch - -To run full evaluations manually: - -1. Go to Actions tab in GitHub -2. Select "Evals Docker Compose" workflow -3. Click "Run workflow" -4. Configure options: - - `run_full_evals`: Set to `true` to run actual evaluations - - `concurrency`: Set evaluation concurrency (default: 2) - -### Setting Up Secrets - -For full evaluation runs, add the OpenRouter API key: - -1. Go to repository Settings → Secrets and variables → Actions -2. Add new repository secret: - - Name: `OPENROUTER_API_KEY` - - Value: Your OpenRouter API key (e.g., `sk-or-v1-...`) - -## Docker Compose Networking in GitHub Actions - -The workflows demonstrate that Docker Compose networking works seamlessly in GitHub Actions: - -### Service Communication - -- Services communicate using service names as hostnames -- Database: `postgresql://postgres:password@db:5432/evals_development` -- Redis: `redis://redis:6379` -- Web service: `http://web:3000` - -### Network Features Tested - -- ✅ Container-to-container communication -- ✅ Service discovery via service names -- ✅ Port mapping and internal networking -- ✅ Health checks and service dependencies -- ✅ Docker socket mounting for Docker-in-Docker -- ✅ Volume mounts for data persistence -- ✅ Profile-based service grouping - -### Networking Validation - -The workflows include comprehensive networking tests: - -```bash -# Test database connectivity -docker compose exec -T web sh -c 'nc -z db 5432' - -# Test Redis connectivity -docker compose exec -T web sh -c 'nc -z redis 6379' - -# Test cross-service communication -docker compose run --rm runner sh -c 'nc -z web 3000' -``` - -## Resource Considerations - -GitHub Actions runners have the following limits: - -- **Memory**: 7 GB RAM -- **CPU**: 2-core CPU -- **Disk**: 14 GB SSD space -- **Time**: 6 hours maximum job runtime - -For the evals system: - -- Quick tests typically complete in 5-10 minutes -- Full evaluation runs may take 30-60 minutes depending on scope -- Resource usage scales with concurrency settings - -## Troubleshooting - -### Common Issues - -1. **Service startup timeouts** - - - Increase timeout values in workflow - - Check service health check configurations - - Review service logs in workflow output - -2. **Networking failures** - - - Verify service names match docker-compose.yml - - Check port configurations - - Ensure services are in the same Docker network - -3. **Docker socket access issues** - - Verify `/var/run/docker.sock` mount in docker-compose.yml - - Check Docker-in-Docker permissions - -### Debugging - -The workflows include comprehensive logging: - -- Service status and health checks -- Network information and container details -- Service logs on failure -- Artifact uploads for evaluation results - -To debug locally, you can run the same commands used in the workflows: - -```bash -cd packages/evals - -# Build and start services -docker compose --profile server up -d - -# Test connectivity -docker compose exec -T web sh -c 'nc -z db 5432' -docker compose exec -T redis redis-cli ping - -# View logs -docker compose logs db -docker compose logs redis -docker compose logs web -``` - -## Performance Optimization - -For faster CI runs: - -- Use Docker layer caching with `docker/setup-buildx-action` -- Minimize Docker image sizes -- Use health checks to avoid unnecessary wait times -- Run tests in parallel where possible -- Cache dependencies between workflow runs From 1d305ab371c448f53d047eaf0106b11aad043470 Mon Sep 17 00:00:00 2001 From: cte Date: Tue, 10 Jun 2025 09:37:45 -0700 Subject: [PATCH 15/20] More powerful runner --- .github/workflows/evals.yml | 2 +- packages/evals/src/cli/index.ts | 2 +- packages/evals/src/cli/runCi.ts | 11 ++++++++--- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index e96b414c6c..fd115df964 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -13,7 +13,7 @@ jobs: evals: # Run if triggered manually or if PR has 'evals' label. if: github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'evals') - runs-on: ubuntu-latest + runs-on: blacksmith-16vcpu-ubuntu-2404 timeout-minutes: 30 steps: diff --git a/packages/evals/src/cli/index.ts b/packages/evals/src/cli/index.ts index 0a0be28390..de62be8ae0 100644 --- a/packages/evals/src/cli/index.ts +++ b/packages/evals/src/cli/index.ts @@ -24,7 +24,7 @@ const main = async () => { try { if (ci) { - await runCi() + await runCi({ concurrency: 3, exercisesPerLanguage: 5 }) } else if (runId !== -1) { await runEvals(runId) } else if (taskId !== -1) { diff --git a/packages/evals/src/cli/runCi.ts b/packages/evals/src/cli/runCi.ts index 1b16d52aee..ca8a88e0e0 100644 --- a/packages/evals/src/cli/runCi.ts +++ b/packages/evals/src/cli/runCi.ts @@ -7,7 +7,7 @@ import { runEvals } from "./runEvals.js" export const runCi = async ({ concurrency = 1, - exercisesPerLanguage = 1, + exercisesPerLanguage, }: { concurrency?: number exercisesPerLanguage?: number @@ -17,8 +17,13 @@ export const runCi = async ({ const run = await createRun({ model: "anthropic/claude-sonnet-4", socketPath: "", concurrency }) for (const language of exerciseLanguages) { - const exercises = (await getExercisesForLanguage(EVALS_REPO_PATH, language)).slice(0, exercisesPerLanguage) - await pMap(exercises, (exercise) => createTask({ runId: run.id, language, exercise })) + let exercises = await getExercisesForLanguage(EVALS_REPO_PATH, language) + + if (exercisesPerLanguage) { + exercises = exercises.slice(0, exercisesPerLanguage) + } + + await pMap(exercises, (exercise) => createTask({ runId: run.id, language, exercise }), { concurrency }) } await runEvals(run.id) From 1ffe32462bb5af9a8a90adafb4956a4befcdd984 Mon Sep 17 00:00:00 2001 From: cte Date: Tue, 10 Jun 2025 09:44:03 -0700 Subject: [PATCH 16/20] Remove web container --- .github/workflows/evals.yml | 58 ++----------------------------------- 1 file changed, 3 insertions(+), 55 deletions(-) diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index fd115df964..39267edb92 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -38,18 +38,7 @@ jobs: HOST_EXECUTION_METHOD=docker EOF - - name: Build web image - uses: docker/build-push-action@v5 - with: - context: . - file: packages/evals/Dockerfile.web - tags: evals-web:latest - cache-from: type=gha - cache-to: type=gha,mode=max - push: false - load: true - - - name: Build runner image + - name: Build image uses: docker/build-push-action@v5 with: context: . @@ -60,10 +49,9 @@ jobs: push: false load: true - - name: Tag images + - name: Tag image run: | cd packages/evals - docker tag evals-web:latest evals-web docker tag evals-runner:latest evals-runner - name: Start containers @@ -74,59 +62,19 @@ jobs: - name: Wait for containers run: | cd packages/evals - - # Wait for services - echo "Waiting for PostgreSQL..." timeout 60 bash -c 'until docker compose exec -T db pg_isready -U postgres; do sleep 2; done' - - echo "Waiting for Redis..." timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done' - - echo "Testing database connection from web container..." docker compose exec -T web sh -c 'nc -z db 5432 && echo "✓ Database connection successful"' - - echo "Testing Redis connection from web container..." docker compose exec -T web sh -c 'nc -z redis 6379 && echo "✓ Redis connection successful"' - echo "Testing web service startup..." - timeout 60 bash -c 'until curl -f http://localhost:3000/api/health 2>/dev/null; do echo "Waiting for web service..."; sleep 2; done' - - echo "✓ Web service is healthy" - echo "Health check response:" - curl -s http://localhost:3000/api/health | jq 2>/dev/null || curl -s http://localhost:3000/api/health - - - name: Test runner container networking + - name: Test runner run: | cd packages/evals - - echo "Testing runner container can connect to services..." docker compose run --rm runner sh -c 'nc -z db 5432 && echo "✓ Runner -> Database connection successful"' docker compose run --rm runner sh -c 'nc -z redis 6379 && echo "✓ Runner -> Redis connection successful"' - docker compose run --rm runner sh -c 'nc -z web 3000 && echo "✓ Runner -> Web service connection successful"' - - - name: Test Docker socket access - run: | - cd packages/evals - - echo "Testing Docker socket access in runner..." docker compose run --rm runner docker --version docker compose run --rm runner docker ps - - name: Show service status - if: always() - run: | - cd packages/evals - - echo "=== Service Status ===" - docker compose ps - - echo "=== Network Information ===" - docker network ls | grep evals || echo "No evals network found" - - echo "=== Container Information ===" - docker compose exec -T db sh -c 'echo "Database container hostname: $(hostname)"' - docker compose exec -T redis sh -c 'echo "Redis container hostname: $(hostname)"' - - name: Run evals run: | cd packages/evals From d0e8f2bcf98a9b8380c9fa73c060d281c013462a Mon Sep 17 00:00:00 2001 From: cte Date: Tue, 10 Jun 2025 09:44:26 -0700 Subject: [PATCH 17/20] Increase timeout --- .github/workflows/evals.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index 39267edb92..c472296f9d 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -14,7 +14,7 @@ jobs: # Run if triggered manually or if PR has 'evals' label. if: github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'evals') runs-on: blacksmith-16vcpu-ubuntu-2404 - timeout-minutes: 30 + timeout-minutes: 45 steps: - name: Checkout repository From e2ced71d43652f186022256145c743555e17d999 Mon Sep 17 00:00:00 2001 From: cte Date: Tue, 10 Jun 2025 09:56:59 -0700 Subject: [PATCH 18/20] Don't start the web container --- .github/workflows/evals.yml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index c472296f9d..4c3e4d09f0 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -57,11 +57,7 @@ jobs: - name: Start containers run: | cd packages/evals - docker compose --profile server up -d - - - name: Wait for containers - run: | - cd packages/evals + docker compose up -d db redis timeout 60 bash -c 'until docker compose exec -T db pg_isready -U postgres; do sleep 2; done' timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done' docker compose exec -T web sh -c 'nc -z db 5432 && echo "✓ Database connection successful"' From e1132c7dcbdb5428776981a5d72eb0d55295bc74 Mon Sep 17 00:00:00 2001 From: cte Date: Tue, 10 Jun 2025 10:08:17 -0700 Subject: [PATCH 19/20] Oops --- .github/workflows/evals.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index 4c3e4d09f0..d12dcd4eac 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -60,8 +60,6 @@ jobs: docker compose up -d db redis timeout 60 bash -c 'until docker compose exec -T db pg_isready -U postgres; do sleep 2; done' timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done' - docker compose exec -T web sh -c 'nc -z db 5432 && echo "✓ Database connection successful"' - docker compose exec -T web sh -c 'nc -z redis 6379 && echo "✓ Redis connection successful"' - name: Test runner run: | From 41df1286b36f14bf0182673a780ce6c16a9f1b56 Mon Sep 17 00:00:00 2001 From: cte Date: Tue, 10 Jun 2025 10:22:34 -0700 Subject: [PATCH 20/20] More cleanup --- .github/workflows/evals.yml | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index d12dcd4eac..9d8f9fb49b 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -16,6 +16,10 @@ jobs: runs-on: blacksmith-16vcpu-ubuntu-2404 timeout-minutes: 45 + defaults: + run: + working-directory: packages/evals + steps: - name: Checkout repository uses: actions/checkout@v4 @@ -25,8 +29,6 @@ jobs: - name: Create environment run: | - cd packages/evals - cat > .env.local << EOF OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY || 'test-key-for-build' }} EOF @@ -50,32 +52,23 @@ jobs: load: true - name: Tag image - run: | - cd packages/evals - docker tag evals-runner:latest evals-runner + run: docker tag evals-runner:latest evals-runner - name: Start containers run: | - cd packages/evals docker compose up -d db redis timeout 60 bash -c 'until docker compose exec -T db pg_isready -U postgres; do sleep 2; done' timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done' - - - name: Test runner - run: | - cd packages/evals docker compose run --rm runner sh -c 'nc -z db 5432 && echo "✓ Runner -> Database connection successful"' docker compose run --rm runner sh -c 'nc -z redis 6379 && echo "✓ Runner -> Redis connection successful"' - docker compose run --rm runner docker --version docker compose run --rm runner docker ps + - name: Run database migrations + run: docker compose run --rm runner pnpm --filter @roo-code/evals db:migrate + - name: Run evals - run: | - cd packages/evals - docker compose run --rm runner pnpm --filter @roo-code/evals cli --ci + run: docker compose run --rm runner pnpm --filter @roo-code/evals cli --ci - name: Cleanup if: always() - run: | - cd packages/evals - docker compose down -v --remove-orphans + run: docker compose down -v --remove-orphans