From 91ad6acdfbd401e2f54c3d4de95a3044f8452a3f Mon Sep 17 00:00:00 2001
From: cte <cestreich@gmail.com>
Date: Mon, 9 Jun 2025 10:58:34 -0700
Subject: [PATCH 01/20] GHA evals

---
 .github/workflows/evals-quick-test.yml | 148 ++++++++++++++++
 .github/workflows/evals.yml            | 235 +++++++++++++++++++++++++
 packages/evals/GITHUB_ACTIONS.md       | 201 +++++++++++++++++++++
 3 files changed, 584 insertions(+)
 create mode 100644 .github/workflows/evals-quick-test.yml
 create mode 100644 .github/workflows/evals.yml
 create mode 100644 packages/evals/GITHUB_ACTIONS.md

diff --git a/.github/workflows/evals-quick-test.yml b/.github/workflows/evals-quick-test.yml
new file mode 100644
index 0000000000..e8f92f6212
--- /dev/null
+++ b/.github/workflows/evals-quick-test.yml
@@ -0,0 +1,148 @@
+name: Evals Quick Test
+
+on:
+  workflow_dispatch:
+
+env:
+  DOCKER_BUILDKIT: 1
+  COMPOSE_DOCKER_CLI_BUILD: 1
+
+jobs:
+  test-docker-compose:
+    name: Test Docker Compose Networking
+    runs-on: ubuntu-latest
+    timeout-minutes: 15
+    
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Create test environment
+        run: |
+          cd packages/evals
+          
+          # Create minimal test environment
+          cat > .env.test << EOF
+          NODE_ENV=test
+          DATABASE_URL=postgresql://postgres:password@db:5432/evals_test
+          REDIS_URL=redis://redis:6379
+          HOST_EXECUTION_METHOD=docker
+          EOF
+
+      - name: Build images
+        run: |
+          cd packages/evals
+          docker compose build web runner
+
+      - name: Start server services
+        run: |
+          cd packages/evals
+          docker compose --profile server up -d
+
+      - name: Test service connectivity
+        run: |
+          cd packages/evals
+          
+          # Wait for services
+          echo "Waiting for PostgreSQL..."
+          timeout 60 bash -c 'until docker compose exec -T db pg_isready -U postgres; do sleep 2; done'
+          
+          echo "Waiting for Redis..."
+          timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done'
+          
+          # Test inter-container networking
+          echo "Testing database connection from web container..."
+          docker compose exec -T web sh -c 'nc -z db 5432 && echo "✓ Database connection successful"'
+          
+          echo "Testing Redis connection from web container..."
+          docker compose exec -T web sh -c 'nc -z redis 6379 && echo "✓ Redis connection successful"'
+          
+          # Test that web service can start (basic health check)
+          echo "Testing web service startup..."
+          timeout 30 bash -c 'until curl -f http://localhost:3000 2>/dev/null || curl -f http://localhost:3000/health 2>/dev/null; do sleep 2; done' || echo "Web service may not have health endpoint, continuing..."
+
+      - name: Test runner container networking
+        run: |
+          cd packages/evals
+          
+          echo "Testing runner container can connect to services..."
+          docker compose run --rm runner sh -c 'nc -z db 5432 && echo "✓ Runner -> Database connection successful"'
+          docker compose run --rm runner sh -c 'nc -z redis 6379 && echo "✓ Runner -> Redis connection successful"'
+          docker compose run --rm runner sh -c 'nc -z web 3000 && echo "✓ Runner -> Web service connection successful"'
+
+      - name: Verify Docker socket access
+        run: |
+          cd packages/evals
+          
+          echo "Testing Docker socket access in runner..."
+          docker compose run --rm runner docker --version
+          docker compose run --rm runner docker ps
+
+      - name: Show service status
+        if: always()
+        run: |
+          cd packages/evals
+          echo "=== Service Status ==="
+          docker compose ps
+          
+          echo "=== Network Information ==="
+          docker network ls | grep evals || echo "No evals network found"
+          
+          echo "=== Container Information ==="
+          docker compose exec -T db sh -c 'echo "Database container hostname: $(hostname)"'
+          docker compose exec -T redis sh -c 'echo "Redis container hostname: $(hostname)"'
+
+      - name: Cleanup
+        if: always()
+        run: |
+          cd packages/evals
+          docker compose down -v --remove-orphans
+
+  validate-compose-file:
+    name: Validate Compose Configuration
+    runs-on: ubuntu-latest
+    
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Validate Docker Compose syntax
+        run: |
+          cd packages/evals
+          docker compose config --quiet
+
+      - name: Check service definitions
+        run: |
+          cd packages/evals
+          
+          # Verify all expected services are defined
+          services=$(docker compose config --services | sort)
+          expected_services="db redis runner web"
+          
+          echo "Defined services: $services"
+          echo "Expected services: $expected_services"
+          
+          for service in $expected_services; do
+            if ! echo "$services" | grep -q "^$service$"; then
+              echo "ERROR: Service '$service' not found"
+              exit 1
+            fi
+          done
+          
+          echo "✓ All expected services found"
+
+      - name: Check profiles
+        run: |
+          cd packages/evals
+          
+          # Test profile configurations
+          echo "Testing server profile..."
+          docker compose --profile server config --services | sort
+          
+          echo "Testing runner profile..."  
+          docker compose --profile runner config --services | sort
+          
+          echo "✓ Profiles validated"
\ No newline at end of file
diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml
new file mode 100644
index 0000000000..b20633fd2e
--- /dev/null
+++ b/.github/workflows/evals.yml
@@ -0,0 +1,235 @@
+name: Evals Docker Compose
+
+on:
+  workflow_dispatch:
+    inputs:
+      run_full_evals:
+        description: 'Run full evaluation suite'
+        required: false
+        default: 'false'
+        type: boolean
+      concurrency:
+        description: 'Evaluation concurrency level'
+        required: false
+        default: '2'
+        type: string
+
+env:
+  DOCKER_BUILDKIT: 1
+  COMPOSE_DOCKER_CLI_BUILD: 1
+
+jobs:
+  build-and-test:
+    name: Build and Test Evals
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Create environment files
+        run: |
+          cd packages/evals
+          
+          # Create .env.test for testing
+          cat > .env.test << EOF
+          NODE_ENV=test
+          DATABASE_URL=postgresql://postgres:password@db:5432/evals_test
+          REDIS_URL=redis://redis:6379
+          HOST_EXECUTION_METHOD=docker
+          EOF
+          
+          # Create .env.development for development
+          cat > .env.development << EOF
+          NODE_ENV=development
+          DATABASE_URL=postgresql://postgres:password@db:5432/evals_development
+          REDIS_URL=redis://redis:6379
+          HOST_EXECUTION_METHOD=docker
+          EOF
+
+      - name: Build Docker images
+        run: |
+          cd packages/evals
+          docker compose build web runner
+
+      - name: Start server services
+        run: |
+          cd packages/evals
+          docker compose --profile server up -d
+
+      - name: Wait for services to be ready
+        run: |
+          cd packages/evals
+          
+          # Wait for PostgreSQL to be ready
+          echo "Waiting for PostgreSQL..."
+          timeout 60 bash -c 'until docker compose exec -T db pg_isready -U postgres -d evals_development; do sleep 2; done'
+          
+          # Wait for Redis to be ready
+          echo "Waiting for Redis..."
+          timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done'
+          
+          # Wait for web service to be ready
+          echo "Waiting for web service..."
+          timeout 60 bash -c 'until curl -f http://localhost:3000/health 2>/dev/null || curl -f http://localhost:3000 2>/dev/null; do sleep 2; done'
+
+      - name: Run database migrations
+        run: |
+          cd packages/evals
+          docker compose exec -T web pnpm db:push
+
+      - name: Run tests
+        run: |
+          cd packages/evals
+          docker compose run --rm runner pnpm _test
+
+      - name: Check service logs on failure
+        if: failure()
+        run: |
+          cd packages/evals
+          echo "=== Database logs ==="
+          docker compose logs db
+          echo "=== Redis logs ==="
+          docker compose logs redis
+          echo "=== Web service logs ==="
+          docker compose logs web
+
+      - name: Cleanup
+        if: always()
+        run: |
+          cd packages/evals
+          docker compose down -v --remove-orphans
+
+  run-sample-evals:
+    name: Run Sample Evaluations
+    runs-on: ubuntu-latest
+    needs: build-and-test
+    if: github.event.inputs.run_full_evals == 'true' || github.event_name == 'workflow_dispatch'
+    timeout-minutes: 60
+    
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Create environment files
+        run: |
+          cd packages/evals
+          
+          cat > .env.local << EOF
+          OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY }}
+          EOF
+          
+          cat > .env.development << EOF
+          NODE_ENV=development
+          DATABASE_URL=postgresql://postgres:password@db:5432/evals_development
+          REDIS_URL=redis://redis:6379
+          HOST_EXECUTION_METHOD=docker
+          EOF
+
+      - name: Build and start services
+        run: |
+          cd packages/evals
+          docker compose --profile server --profile runner up --build -d --scale runner=0
+
+      - name: Wait for services
+        run: |
+          cd packages/evals
+          timeout 120 bash -c 'until docker compose exec -T db pg_isready -U postgres -d evals_development; do sleep 2; done'
+          timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done'
+          timeout 60 bash -c 'until curl -f http://localhost:3000 2>/dev/null; do sleep 2; done'
+
+      - name: Run database setup
+        run: |
+          cd packages/evals
+          docker compose exec -T web pnpm db:push
+
+      - name: Run sample evaluation
+        env:
+          CONCURRENCY: ${{ github.event.inputs.concurrency || '2' }}
+        run: |
+          cd packages/evals
+          
+          # Run a limited set of evaluations for CI
+          docker compose run --rm runner pnpm cli run \
+            --concurrency $CONCURRENCY \
+            --timeout 300 \
+            --max-exercises 3 \
+            --model "anthropic/claude-3-5-sonnet-20241022"
+
+      - name: Upload evaluation results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: evaluation-results
+          path: |
+            packages/evals/results/
+            packages/evals/logs/
+          retention-days: 7
+
+      - name: Cleanup
+        if: always()
+        run: |
+          cd packages/evals
+          docker compose down -v --remove-orphans
+
+  security-scan:
+    name: Security Scan
+    runs-on: ubuntu-latest
+    needs: build-and-test
+    
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Run Trivy vulnerability scanner
+        uses: aquasecurity/trivy-action@master
+        with:
+          scan-type: 'fs'
+          scan-ref: 'packages/evals'
+          format: 'sarif'
+          output: 'trivy-results.sarif'
+
+      - name: Upload Trivy scan results
+        uses: github/codeql-action/upload-sarif@v3
+        if: always()
+        with:
+          sarif_file: 'trivy-results.sarif'
+
+  docker-compose-validate:
+    name: Validate Docker Compose
+    runs-on: ubuntu-latest
+    
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Validate Docker Compose file
+        run: |
+          cd packages/evals
+          docker compose config --quiet
+
+      - name: Check Docker Compose services
+        run: |
+          cd packages/evals
+          docker compose config --services | sort > services.txt
+          echo "Available services:"
+          cat services.txt
+          
+          # Verify expected services exist
+          for service in db redis web runner; do
+            if ! grep -q "^$service$" services.txt; then
+              echo "ERROR: Service '$service' not found in docker-compose.yml"
+              exit 1
+            fi
+          done
+          
+          echo "All expected services found ✓"
\ No newline at end of file
diff --git a/packages/evals/GITHUB_ACTIONS.md b/packages/evals/GITHUB_ACTIONS.md
new file mode 100644
index 0000000000..454c42ec87
--- /dev/null
+++ b/packages/evals/GITHUB_ACTIONS.md
@@ -0,0 +1,201 @@
+# GitHub Actions for Evals
+
+This document describes the GitHub Actions workflows available for the Roo Code Evals system.
+
+## Workflows
+
+### 1. `evals.yml` - Full Evaluation Workflow
+
+**Purpose**: Comprehensive testing and evaluation workflow that builds, tests, and optionally runs full evaluations.
+
+**Triggers**:
+
+- Manual dispatch only (`workflow_dispatch`), with the `run_full_evals` and `concurrency` inputs
+- No `push` or `pull_request` triggers are defined for this workflow; add them under `on:`
+  if it should also run automatically when evals files change
+
+**Jobs**:
+
+#### `build-and-test`
+
+- Builds Docker images for web and runner services
+- Starts PostgreSQL, Redis, and web services
+- Waits for all services to be ready
+- Runs database migrations
+- Executes test suite
+- Provides detailed logging on failure
+
+#### `run-sample-evals` (conditional)
+
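+- Runs after `build-and-test` on every manual dispatch: the job's `if` condition also matches any `workflow_dispatch` event, so `run_full_evals` does not currently gate it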
+- Requires `OPENROUTER_API_KEY` secret to be configured
+- Runs a limited set of evaluations for CI testing
+- Uploads evaluation results as artifacts
+- Configurable concurrency level
+
+#### `security-scan`
+
+- Runs Trivy vulnerability scanner on the evals package
+- Uploads results to GitHub Security tab
+
+#### `docker-compose-validate`
+
+- Validates Docker Compose file syntax
+- Verifies all expected services are defined
+
+**Required Secrets**:
+
+- `OPENROUTER_API_KEY` (only for full evaluation runs)
+
+### 2. `evals-quick-test.yml` - Quick Networking Test
+
+**Purpose**: Fast validation of Docker Compose networking and basic functionality.
+
+**Triggers**:
+
+- Manual dispatch (`workflow_dispatch`)
+- Pull requests to `main` or `develop` branches (when evals files change)
+
+**Jobs**:
+
+#### `test-docker-compose`
+
+- Tests inter-container networking between all services
+- Verifies database and Redis connectivity
+- Tests Docker socket access in runner container
+- Validates service startup and health
+
+#### `validate-compose-file`
+
+- Validates Docker Compose syntax
+- Checks service definitions and profiles
+
+## Usage Examples
+
+### Manual Workflow Dispatch
+
+To run full evaluations manually:
+
+1. Go to Actions tab in GitHub
+2. Select "Evals Docker Compose" workflow
+3. Click "Run workflow"
+4. Configure options:
+    - `run_full_evals`: Set to `true` to run actual evaluations
+    - `concurrency`: Set evaluation concurrency (default: 2)
+
+### Setting Up Secrets
+
+For full evaluation runs, add the OpenRouter API key:
+
+1. Go to repository Settings → Secrets and variables → Actions
+2. Add new repository secret:
+    - Name: `OPENROUTER_API_KEY`
+    - Value: Your OpenRouter API key (e.g., `sk-or-v1-...`)
+
+## Docker Compose Networking in GitHub Actions
+
+The workflows demonstrate that Docker Compose networking works seamlessly in GitHub Actions:
+
+### Service Communication
+
+- Services communicate using service names as hostnames
+- Database: `postgresql://postgres:password@db:5432/evals_development`
+- Redis: `redis://redis:6379`
+- Web service: `http://web:3000`
+
+### Network Features Tested
+
+- ✅ Container-to-container communication
+- ✅ Service discovery via service names
+- ✅ Port mapping and internal networking
+- ✅ Health checks and service dependencies
+- ✅ Docker socket mounting so the runner container can use the host Docker daemon
+- ✅ Volume mounts for data persistence
+- ✅ Profile-based service grouping
+
+### Networking Validation
+
+The workflows include comprehensive networking tests:
+
+```bash
+# Test database connectivity
+docker compose exec -T web sh -c 'nc -z db 5432'
+
+# Test Redis connectivity
+docker compose exec -T web sh -c 'nc -z redis 6379'
+
+# Test cross-service communication
+docker compose run --rm runner sh -c 'nc -z web 3000'
+```
+
+## Resource Considerations
+
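+Standard GitHub-hosted Linux runners have roughly the following limits (exact CPU and memory depend on repository visibility and plan; check the current GitHub documentation):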
+
+- **Memory**: 7 GB RAM
+- **CPU**: 2-core CPU
+- **Disk**: 14 GB SSD space
+- **Time**: 6 hours maximum job runtime
+
+For the evals system:
+
+- Quick tests typically complete in 5-10 minutes
+- Full evaluation runs may take 30-60 minutes depending on scope
+- Resource usage scales with concurrency settings
+
+## Troubleshooting
+
+### Common Issues
+
+1. **Service startup timeouts**
+
+    - Increase timeout values in workflow
+    - Check service health check configurations
+    - Review service logs in workflow output
+
+2. **Networking failures**
+
+    - Verify service names match docker-compose.yml
+    - Check port configurations
+    - Ensure services are in the same Docker network
+
+3. **Docker socket access issues**
+    - Verify `/var/run/docker.sock` mount in docker-compose.yml
+    - Check that the user inside the container has permission to access the mounted Docker socket
+
+### Debugging
+
+The workflows include comprehensive logging:
+
+- Service status and health checks
+- Network information and container details
+- Service logs on failure
+- Artifact uploads for evaluation results
+
+To debug locally, you can run the same commands used in the workflows:
+
+```bash
+cd packages/evals
+
+# Build and start services
+docker compose --profile server up -d
+
+# Test connectivity
+docker compose exec -T web sh -c 'nc -z db 5432'
+docker compose exec -T redis redis-cli ping
+
+# View logs
+docker compose logs db
+docker compose logs redis
+docker compose logs web
+```
+
+## Performance Optimization
+
+For faster CI runs:
+
+- Enable Docker layer caching (for example, the GitHub Actions cache backend, used with the Buildx builder set up by `docker/setup-buildx-action`)
+- Minimize Docker image sizes
+- Use health checks to avoid unnecessary wait times
+- Run tests in parallel where possible
+- Cache dependencies between workflow runs

From f5d385208edc6e8f7f2d8ec3085f783c89989174 Mon Sep 17 00:00:00 2001
From: cte <cestreich@gmail.com>
Date: Mon, 9 Jun 2025 11:09:14 -0700
Subject: [PATCH 02/20] Add GitHub Actions workflows for evals Docker Compose
 testing

- Add comprehensive evals.yml workflow for full testing and evaluation runs
- Add evals-quick-test.yml for fast Docker Compose networking validation
- Include documentation in GITHUB_ACTIONS.md
- Workflows trigger on PRs and support manual dispatch for testing
---
 .github/workflows/evals-quick-test.yml | 5 +++++
 .github/workflows/evals.yml            | 5 +++++
 2 files changed, 10 insertions(+)

diff --git a/.github/workflows/evals-quick-test.yml b/.github/workflows/evals-quick-test.yml
index e8f92f6212..99cfa6fae7 100644
--- a/.github/workflows/evals-quick-test.yml
+++ b/.github/workflows/evals-quick-test.yml
@@ -1,6 +1,11 @@
 name: Evals Quick Test
 
 on:
+  pull_request:
+    branches: [main, develop]
+    paths:
+      - 'packages/evals/**'
+      - '.github/workflows/evals*.yml'
   workflow_dispatch:
 
 env:
diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml
index b20633fd2e..0ef2093dcf 100644
--- a/.github/workflows/evals.yml
+++ b/.github/workflows/evals.yml
@@ -1,6 +1,11 @@
 name: Evals Docker Compose
 
 on:
+  pull_request:
+    branches: [main, develop]
+    paths:
+      - 'packages/evals/**'
+      - '.github/workflows/evals*.yml'
   workflow_dispatch:
     inputs:
       run_full_evals:

From 1465297c8920de63148f916fa0ee8791a1dbebbd Mon Sep 17 00:00:00 2001
From: cte <cestreich@gmail.com>
Date: Mon, 9 Jun 2025 11:11:27 -0700
Subject: [PATCH 03/20] Revert this

---
 .github/workflows/evals.yml | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml
index 0ef2093dcf..b20633fd2e 100644
--- a/.github/workflows/evals.yml
+++ b/.github/workflows/evals.yml
@@ -1,11 +1,6 @@
 name: Evals Docker Compose
 
 on:
-  pull_request:
-    branches: [main, develop]
-    paths:
-      - 'packages/evals/**'
-      - '.github/workflows/evals*.yml'
   workflow_dispatch:
     inputs:
       run_full_evals:

From 1b02cceb98cecd246a2f1e789d12b1e7c708ad88 Mon Sep 17 00:00:00 2001
From: cte <cestreich@gmail.com>
Date: Mon, 9 Jun 2025 11:14:31 -0700
Subject: [PATCH 04/20] Remove "validate-compose-file"

---
 .github/workflows/evals-quick-test.yml | 46 --------------------------
 1 file changed, 46 deletions(-)

diff --git a/.github/workflows/evals-quick-test.yml b/.github/workflows/evals-quick-test.yml
index 99cfa6fae7..ae64c847fd 100644
--- a/.github/workflows/evals-quick-test.yml
+++ b/.github/workflows/evals-quick-test.yml
@@ -105,49 +105,3 @@ jobs:
         run: |
           cd packages/evals
           docker compose down -v --remove-orphans
-
-  validate-compose-file:
-    name: Validate Compose Configuration
-    runs-on: ubuntu-latest
-    
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-
-      - name: Validate Docker Compose syntax
-        run: |
-          cd packages/evals
-          docker compose config --quiet
-
-      - name: Check service definitions
-        run: |
-          cd packages/evals
-          
-          # Verify all expected services are defined
-          services=$(docker compose config --services | sort)
-          expected_services="db redis runner web"
-          
-          echo "Defined services: $services"
-          echo "Expected services: $expected_services"
-          
-          for service in $expected_services; do
-            if ! echo "$services" | grep -q "^$service$"; then
-              echo "ERROR: Service '$service' not found"
-              exit 1
-            fi
-          done
-          
-          echo "✓ All expected services found"
-
-      - name: Check profiles
-        run: |
-          cd packages/evals
-          
-          # Test profile configurations
-          echo "Testing server profile..."
-          docker compose --profile server config --services | sort
-          
-          echo "Testing runner profile..."  
-          docker compose --profile runner config --services | sort
-          
-          echo "✓ Profiles validated"
\ No newline at end of file

From dd568b1b970397144f2f95f6c6d33d275178d4a1 Mon Sep 17 00:00:00 2001
From: cte <cestreich@gmail.com>
Date: Mon, 9 Jun 2025 11:18:56 -0700
Subject: [PATCH 05/20] Add .env.local file

---
 .github/workflows/evals-quick-test.yml | 205 +++++------
 .github/workflows/evals.yml            | 454 ++++++++++++-------------
 2 files changed, 331 insertions(+), 328 deletions(-)

diff --git a/.github/workflows/evals-quick-test.yml b/.github/workflows/evals-quick-test.yml
index ae64c847fd..b4452e692d 100644
--- a/.github/workflows/evals-quick-test.yml
+++ b/.github/workflows/evals-quick-test.yml
@@ -1,107 +1,112 @@
 name: Evals Quick Test
 
 on:
-  pull_request:
-    branches: [main, develop]
-    paths:
-      - 'packages/evals/**'
-      - '.github/workflows/evals*.yml'
-  workflow_dispatch:
+    pull_request:
+        branches: [main, develop]
+        paths:
+            - "packages/evals/**"
+            - ".github/workflows/evals*.yml"
+    workflow_dispatch:
 
 env:
-  DOCKER_BUILDKIT: 1
-  COMPOSE_DOCKER_CLI_BUILD: 1
+    DOCKER_BUILDKIT: 1
+    COMPOSE_DOCKER_CLI_BUILD: 1
 
 jobs:
-  test-docker-compose:
-    name: Test Docker Compose Networking
-    runs-on: ubuntu-latest
-    timeout-minutes: 15
-    
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-
-      - name: Create test environment
-        run: |
-          cd packages/evals
-          
-          # Create minimal test environment
-          cat > .env.test << EOF
-          NODE_ENV=test
-          DATABASE_URL=postgresql://postgres:password@db:5432/evals_test
-          REDIS_URL=redis://redis:6379
-          HOST_EXECUTION_METHOD=docker
-          EOF
-
-      - name: Build images
-        run: |
-          cd packages/evals
-          docker compose build web runner
-
-      - name: Start server services
-        run: |
-          cd packages/evals
-          docker compose --profile server up -d
-
-      - name: Test service connectivity
-        run: |
-          cd packages/evals
-          
-          # Wait for services
-          echo "Waiting for PostgreSQL..."
-          timeout 60 bash -c 'until docker compose exec -T db pg_isready -U postgres; do sleep 2; done'
-          
-          echo "Waiting for Redis..."
-          timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done'
-          
-          # Test inter-container networking
-          echo "Testing database connection from web container..."
-          docker compose exec -T web sh -c 'nc -z db 5432 && echo "✓ Database connection successful"'
-          
-          echo "Testing Redis connection from web container..."
-          docker compose exec -T web sh -c 'nc -z redis 6379 && echo "✓ Redis connection successful"'
-          
-          # Test that web service can start (basic health check)
-          echo "Testing web service startup..."
-          timeout 30 bash -c 'until curl -f http://localhost:3000 2>/dev/null || curl -f http://localhost:3000/health 2>/dev/null; do sleep 2; done' || echo "Web service may not have health endpoint, continuing..."
-
-      - name: Test runner container networking
-        run: |
-          cd packages/evals
-          
-          echo "Testing runner container can connect to services..."
-          docker compose run --rm runner sh -c 'nc -z db 5432 && echo "✓ Runner -> Database connection successful"'
-          docker compose run --rm runner sh -c 'nc -z redis 6379 && echo "✓ Runner -> Redis connection successful"'
-          docker compose run --rm runner sh -c 'nc -z web 3000 && echo "✓ Runner -> Web service connection successful"'
-
-      - name: Verify Docker socket access
-        run: |
-          cd packages/evals
-          
-          echo "Testing Docker socket access in runner..."
-          docker compose run --rm runner docker --version
-          docker compose run --rm runner docker ps
-
-      - name: Show service status
-        if: always()
-        run: |
-          cd packages/evals
-          echo "=== Service Status ==="
-          docker compose ps
-          
-          echo "=== Network Information ==="
-          docker network ls | grep evals || echo "No evals network found"
-          
-          echo "=== Container Information ==="
-          docker compose exec -T db sh -c 'echo "Database container hostname: $(hostname)"'
-          docker compose exec -T redis sh -c 'echo "Redis container hostname: $(hostname)"'
-
-      - name: Cleanup
-        if: always()
-        run: |
-          cd packages/evals
-          docker compose down -v --remove-orphans
+    test-docker-compose:
+        name: Test Docker Compose Networking
+        runs-on: ubuntu-latest
+        timeout-minutes: 15
+
+        steps:
+            - name: Checkout repository
+              uses: actions/checkout@v4
+
+            - name: Set up Docker Buildx
+              uses: docker/setup-buildx-action@v3
+
+            - name: Create test environment
+              run: |
+                  cd packages/evals
+
+                  # Create .env.local (required for Docker build)
+                  cat > .env.local << EOF
+                  OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY || 'test-key-for-build' }}
+                  EOF
+
+                  # Create development environment
+                  cat > .env.development << EOF
+                  NODE_ENV=development
+                  DATABASE_URL=postgresql://postgres:password@db:5432/evals_development
+                  REDIS_URL=redis://redis:6379
+                  HOST_EXECUTION_METHOD=docker
+                  EOF
+
+            - name: Build images
+              run: |
+                  cd packages/evals
+                  docker compose build web runner
+
+            - name: Start server services
+              run: |
+                  cd packages/evals
+                  docker compose --profile server up -d
+
+            - name: Test service connectivity
+              run: |
+                  cd packages/evals
+
+                  # Wait for services
+                  echo "Waiting for PostgreSQL..."
+                  timeout 60 bash -c 'until docker compose exec -T db pg_isready -U postgres; do sleep 2; done'
+
+                  echo "Waiting for Redis..."
+                  timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done'
+
+                  # Test inter-container networking
+                  echo "Testing database connection from web container..."
+                  docker compose exec -T web sh -c 'nc -z db 5432 && echo "✓ Database connection successful"'
+
+                  echo "Testing Redis connection from web container..."
+                  docker compose exec -T web sh -c 'nc -z redis 6379 && echo "✓ Redis connection successful"'
+
+                  # Test that web service can start (basic health check)
+                  echo "Testing web service startup..."
+                  timeout 30 bash -c 'until curl -f http://localhost:3000 2>/dev/null || curl -f http://localhost:3000/health 2>/dev/null; do sleep 2; done' || echo "Web service may not have health endpoint, continuing..."
+
+            - name: Test runner container networking
+              run: |
+                  cd packages/evals
+
+                  echo "Testing runner container can connect to services..."
+                  docker compose run --rm runner sh -c 'nc -z db 5432 && echo "✓ Runner -> Database connection successful"'
+                  docker compose run --rm runner sh -c 'nc -z redis 6379 && echo "✓ Runner -> Redis connection successful"'
+                  docker compose run --rm runner sh -c 'nc -z web 3000 && echo "✓ Runner -> Web service connection successful"'
+
+            - name: Verify Docker socket access
+              run: |
+                  cd packages/evals
+
+                  echo "Testing Docker socket access in runner..."
+                  docker compose run --rm runner docker --version
+                  docker compose run --rm runner docker ps
+
+            - name: Show service status
+              if: always()
+              run: |
+                  cd packages/evals
+                  echo "=== Service Status ==="
+                  docker compose ps
+
+                  echo "=== Network Information ==="
+                  docker network ls | grep evals || echo "No evals network found"
+
+                  echo "=== Container Information ==="
+                  docker compose exec -T db sh -c 'echo "Database container hostname: $(hostname)"'
+                  docker compose exec -T redis sh -c 'echo "Redis container hostname: $(hostname)"'
+
+            - name: Cleanup
+              if: always()
+              run: |
+                  cd packages/evals
+                  docker compose down -v --remove-orphans
diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml
index b20633fd2e..3148a023c6 100644
--- a/.github/workflows/evals.yml
+++ b/.github/workflows/evals.yml
@@ -1,235 +1,233 @@
 name: Evals Docker Compose
 
 on:
-  workflow_dispatch:
-    inputs:
-      run_full_evals:
-        description: 'Run full evaluation suite'
-        required: false
-        default: 'false'
-        type: boolean
-      concurrency:
-        description: 'Evaluation concurrency level'
-        required: false
-        default: '2'
-        type: string
+    workflow_dispatch:
+        inputs:
+            run_full_evals:
+                description: "Run full evaluation suite"
+                required: false
+                default: "false"
+                type: boolean
+            concurrency:
+                description: "Evaluation concurrency level"
+                required: false
+                default: "2"
+                type: string
 
 env:
-  DOCKER_BUILDKIT: 1
-  COMPOSE_DOCKER_CLI_BUILD: 1
+    DOCKER_BUILDKIT: 1
+    COMPOSE_DOCKER_CLI_BUILD: 1
 
 jobs:
-  build-and-test:
-    name: Build and Test Evals
-    runs-on: ubuntu-latest
-    timeout-minutes: 30
-    
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-
-      - name: Create environment files
-        run: |
-          cd packages/evals
-          
-          # Create .env.test for testing
-          cat > .env.test << EOF
-          NODE_ENV=test
-          DATABASE_URL=postgresql://postgres:password@db:5432/evals_test
-          REDIS_URL=redis://redis:6379
-          HOST_EXECUTION_METHOD=docker
-          EOF
-          
-          # Create .env.development for development
-          cat > .env.development << EOF
-          NODE_ENV=development
-          DATABASE_URL=postgresql://postgres:password@db:5432/evals_development
-          REDIS_URL=redis://redis:6379
-          HOST_EXECUTION_METHOD=docker
-          EOF
-
-      - name: Build Docker images
-        run: |
-          cd packages/evals
-          docker compose build web runner
-
-      - name: Start server services
-        run: |
-          cd packages/evals
-          docker compose --profile server up -d
-
-      - name: Wait for services to be ready
-        run: |
-          cd packages/evals
-          
-          # Wait for PostgreSQL to be ready
-          echo "Waiting for PostgreSQL..."
-          timeout 60 bash -c 'until docker compose exec -T db pg_isready -U postgres -d evals_development; do sleep 2; done'
-          
-          # Wait for Redis to be ready
-          echo "Waiting for Redis..."
-          timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done'
-          
-          # Wait for web service to be ready
-          echo "Waiting for web service..."
-          timeout 60 bash -c 'until curl -f http://localhost:3000/health 2>/dev/null || curl -f http://localhost:3000 2>/dev/null; do sleep 2; done'
-
-      - name: Run database migrations
-        run: |
-          cd packages/evals
-          docker compose exec -T web pnpm db:push
-
-      - name: Run tests
-        run: |
-          cd packages/evals
-          docker compose run --rm runner pnpm _test
-
-      - name: Check service logs on failure
-        if: failure()
-        run: |
-          cd packages/evals
-          echo "=== Database logs ==="
-          docker compose logs db
-          echo "=== Redis logs ==="
-          docker compose logs redis
-          echo "=== Web service logs ==="
-          docker compose logs web
-
-      - name: Cleanup
-        if: always()
-        run: |
-          cd packages/evals
-          docker compose down -v --remove-orphans
-
-  run-sample-evals:
-    name: Run Sample Evaluations
-    runs-on: ubuntu-latest
-    needs: build-and-test
-    if: github.event.inputs.run_full_evals == 'true' || github.event_name == 'workflow_dispatch'
-    timeout-minutes: 60
-    
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-
-      - name: Create environment files
-        run: |
-          cd packages/evals
-          
-          cat > .env.local << EOF
-          OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY }}
-          EOF
-          
-          cat > .env.development << EOF
-          NODE_ENV=development
-          DATABASE_URL=postgresql://postgres:password@db:5432/evals_development
-          REDIS_URL=redis://redis:6379
-          HOST_EXECUTION_METHOD=docker
-          EOF
-
-      - name: Build and start services
-        run: |
-          cd packages/evals
-          docker compose --profile server --profile runner up --build -d --scale runner=0
-
-      - name: Wait for services
-        run: |
-          cd packages/evals
-          timeout 120 bash -c 'until docker compose exec -T db pg_isready -U postgres -d evals_development; do sleep 2; done'
-          timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done'
-          timeout 60 bash -c 'until curl -f http://localhost:3000 2>/dev/null; do sleep 2; done'
-
-      - name: Run database setup
-        run: |
-          cd packages/evals
-          docker compose exec -T web pnpm db:push
-
-      - name: Run sample evaluation
-        env:
-          CONCURRENCY: ${{ github.event.inputs.concurrency || '2' }}
-        run: |
-          cd packages/evals
-          
-          # Run a limited set of evaluations for CI
-          docker compose run --rm runner pnpm cli run \
-            --concurrency $CONCURRENCY \
-            --timeout 300 \
-            --max-exercises 3 \
-            --model "anthropic/claude-3-5-sonnet-20241022"
-
-      - name: Upload evaluation results
-        if: always()
-        uses: actions/upload-artifact@v4
-        with:
-          name: evaluation-results
-          path: |
-            packages/evals/results/
-            packages/evals/logs/
-          retention-days: 7
-
-      - name: Cleanup
-        if: always()
-        run: |
-          cd packages/evals
-          docker compose down -v --remove-orphans
-
-  security-scan:
-    name: Security Scan
-    runs-on: ubuntu-latest
-    needs: build-and-test
-    
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-
-      - name: Run Trivy vulnerability scanner
-        uses: aquasecurity/trivy-action@master
-        with:
-          scan-type: 'fs'
-          scan-ref: 'packages/evals'
-          format: 'sarif'
-          output: 'trivy-results.sarif'
-
-      - name: Upload Trivy scan results
-        uses: github/codeql-action/upload-sarif@v3
-        if: always()
-        with:
-          sarif_file: 'trivy-results.sarif'
-
-  docker-compose-validate:
-    name: Validate Docker Compose
-    runs-on: ubuntu-latest
-    
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-
-      - name: Validate Docker Compose file
-        run: |
-          cd packages/evals
-          docker compose config --quiet
-
-      - name: Check Docker Compose services
-        run: |
-          cd packages/evals
-          docker compose config --services | sort > services.txt
-          echo "Available services:"
-          cat services.txt
-          
-          # Verify expected services exist
-          for service in db redis web runner; do
-            if ! grep -q "^$service$" services.txt; then
-              echo "ERROR: Service '$service' not found in docker-compose.yml"
-              exit 1
-            fi
-          done
-          
-          echo "All expected services found ✓"
\ No newline at end of file
+    build-and-test:
+        name: Build and Test Evals
+        runs-on: ubuntu-latest
+        timeout-minutes: 30
+
+        steps:
+            - name: Checkout repository
+              uses: actions/checkout@v4
+              with:
+                  fetch-depth: 0
+
+            - name: Set up Docker Buildx
+              uses: docker/setup-buildx-action@v3
+
+            - name: Create environment files
+              run: |
+                  cd packages/evals
+
+                  # Create .env.local (required for Docker build)
+                  cat > .env.local << EOF
+                  OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY || 'test-key-for-build' }}
+                  EOF
+
+                  # Create .env.development for development
+                  cat > .env.development << EOF
+                  NODE_ENV=development
+                  DATABASE_URL=postgresql://postgres:password@db:5432/evals_development
+                  REDIS_URL=redis://redis:6379
+                  HOST_EXECUTION_METHOD=docker
+                  EOF
+
+            - name: Build Docker images
+              run: |
+                  cd packages/evals
+                  docker compose build web runner
+
+            - name: Start server services
+              run: |
+                  cd packages/evals
+                  docker compose --profile server up -d
+
+            - name: Wait for services to be ready
+              run: |
+                  cd packages/evals
+
+                  # Wait for PostgreSQL to be ready
+                  echo "Waiting for PostgreSQL..."
+                  timeout 60 bash -c 'until docker compose exec -T db pg_isready -U postgres -d evals_development; do sleep 2; done'
+
+                  # Wait for Redis to be ready
+                  echo "Waiting for Redis..."
+                  timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done'
+
+                  # Wait for web service to be ready
+                  echo "Waiting for web service..."
+                  timeout 60 bash -c 'until curl -f http://localhost:3000/health 2>/dev/null || curl -f http://localhost:3000 2>/dev/null; do sleep 2; done'
+
+            - name: Run database migrations
+              run: |
+                  cd packages/evals
+                  docker compose exec -T web pnpm db:push
+
+            - name: Run tests
+              run: |
+                  cd packages/evals
+                  docker compose run --rm runner pnpm _test
+
+            - name: Check service logs on failure
+              if: failure()
+              run: |
+                  cd packages/evals
+                  echo "=== Database logs ==="
+                  docker compose logs db
+                  echo "=== Redis logs ==="
+                  docker compose logs redis
+                  echo "=== Web service logs ==="
+                  docker compose logs web
+
+            - name: Cleanup
+              if: always()
+              run: |
+                  cd packages/evals
+                  docker compose down -v --remove-orphans
+
+    run-sample-evals:
+        name: Run Sample Evaluations
+        runs-on: ubuntu-latest
+        needs: build-and-test
+        if: github.event.inputs.run_full_evals == 'true' || github.event_name == 'workflow_dispatch'
+        timeout-minutes: 60
+
+        steps:
+            - name: Checkout repository
+              uses: actions/checkout@v4
+
+            - name: Set up Docker Buildx
+              uses: docker/setup-buildx-action@v3
+
+            - name: Create environment files
+              run: |
+                  cd packages/evals
+
+                  # Create .env.local with actual API key for evaluations
+                  cat > .env.local << EOF
+                  OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY || 'test-key-for-build' }}
+                  EOF
+
+                  cat > .env.development << EOF
+                  NODE_ENV=development
+                  DATABASE_URL=postgresql://postgres:password@db:5432/evals_development
+                  REDIS_URL=redis://redis:6379
+                  HOST_EXECUTION_METHOD=docker
+                  EOF
+
+            - name: Build and start services
+              run: |
+                  cd packages/evals
+                  docker compose --profile server --profile runner up --build -d --scale runner=0
+
+            - name: Wait for services
+              run: |
+                  cd packages/evals
+                  timeout 120 bash -c 'until docker compose exec -T db pg_isready -U postgres -d evals_development; do sleep 2; done'
+                  timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done'
+                  timeout 60 bash -c 'until curl -f http://localhost:3000 2>/dev/null; do sleep 2; done'
+
+            - name: Run database setup
+              run: |
+                  cd packages/evals
+                  docker compose exec -T web pnpm db:push
+
+            - name: Run sample evaluation
+              env:
+                  CONCURRENCY: ${{ github.event.inputs.concurrency || '2' }}
+              run: |
+                  cd packages/evals
+
+                  # Run a limited set of evaluations for CI
+                  docker compose run --rm runner pnpm cli run \
+                    --concurrency $CONCURRENCY \
+                    --timeout 300 \
+                    --max-exercises 3 \
+                    --model "anthropic/claude-3-5-sonnet-20241022"
+
+            - name: Upload evaluation results
+              if: always()
+              uses: actions/upload-artifact@v4
+              with:
+                  name: evaluation-results
+                  path: |
+                      packages/evals/results/
+                      packages/evals/logs/
+                  retention-days: 7
+
+            - name: Cleanup
+              if: always()
+              run: |
+                  cd packages/evals
+                  docker compose down -v --remove-orphans
+
+    security-scan:
+        name: Security Scan
+        runs-on: ubuntu-latest
+        needs: build-and-test
+
+        steps:
+            - name: Checkout repository
+              uses: actions/checkout@v4
+
+            - name: Run Trivy vulnerability scanner
+              uses: aquasecurity/trivy-action@master
+              with:
+                  scan-type: "fs"
+                  scan-ref: "packages/evals"
+                  format: "sarif"
+                  output: "trivy-results.sarif"
+
+            - name: Upload Trivy scan results
+              uses: github/codeql-action/upload-sarif@v3
+              if: always()
+              with:
+                  sarif_file: "trivy-results.sarif"
+
+    docker-compose-validate:
+        name: Validate Docker Compose
+        runs-on: ubuntu-latest
+
+        steps:
+            - name: Checkout repository
+              uses: actions/checkout@v4
+
+            - name: Validate Docker Compose file
+              run: |
+                  cd packages/evals
+                  docker compose config --quiet
+
+            - name: Check Docker Compose services
+              run: |
+                  cd packages/evals
+                  docker compose config --services | sort > services.txt
+                  echo "Available services:"
+                  cat services.txt
+
+                  # Verify expected services exist
+                  for service in db redis web runner; do
+                    if ! grep -q "^$service$" services.txt; then
+                      echo "ERROR: Service '$service' not found in docker-compose.yml"
+                      exit 1
+                    fi
+                  done
+
+                  echo "All expected services found ✓"

From e369005131406ca165cedd370103b3c5822b2273 Mon Sep 17 00:00:00 2001
From: cte <cestreich@gmail.com>
Date: Mon, 9 Jun 2025 11:37:07 -0700
Subject: [PATCH 06/20] Install nc

---
 packages/evals/Dockerfile.runner | 1 +
 packages/evals/Dockerfile.web    | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/packages/evals/Dockerfile.runner b/packages/evals/Dockerfile.runner
index c68b4f80c0..ec3277461c 100644
--- a/packages/evals/Dockerfile.runner
+++ b/packages/evals/Dockerfile.runner
@@ -13,6 +13,7 @@ RUN apt update && \
   git \
   vim \
   jq \
+  netcat-openbsd \
   apt-transport-https \
   ca-certificates \
   gnupg \
diff --git a/packages/evals/Dockerfile.web b/packages/evals/Dockerfile.web
index 55e8b5a298..b8713f69b9 100644
--- a/packages/evals/Dockerfile.web
+++ b/packages/evals/Dockerfile.web
@@ -8,7 +8,7 @@ RUN npm install -g npm@latest
 RUN npm install -g npm-run-all
 
 # Install system packages
-RUN apt update && apt install -y curl git vim jq postgresql-client
+RUN apt update && apt install -y curl git vim jq netcat-openbsd postgresql-client
 
 # Install Docker cli
 RUN apt install -y apt-transport-https ca-certificates gnupg lsb-release

From 92e70b1803807833d831dceecaa9591c6c6e3c06 Mon Sep 17 00:00:00 2001
From: cte <cestreich@gmail.com>
Date: Mon, 9 Jun 2025 11:42:52 -0700
Subject: [PATCH 07/20] Add Docker layer caching

---
 .github/workflows/evals-quick-test.yml | 29 +++++++++++--
 .github/workflows/evals.yml            | 59 +++++++++++++++++++++++---
 2 files changed, 79 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/evals-quick-test.yml b/.github/workflows/evals-quick-test.yml
index b4452e692d..9dfaff9257 100644
--- a/.github/workflows/evals-quick-test.yml
+++ b/.github/workflows/evals-quick-test.yml
@@ -42,10 +42,33 @@ jobs:
                   HOST_EXECUTION_METHOD=docker
                   EOF
 
-            - name: Build images
+            - name: Build images with cache
+              uses: docker/build-push-action@v5
+              with:
+                context: .
+                file: packages/evals/Dockerfile.web
+                tags: evals-web:latest
+                cache-from: type=gha
+                cache-to: type=gha,mode=max
+                push: false
+                load: true
+      
+            - name: Build runner image with cache
+              uses: docker/build-push-action@v5
+              with:
+                context: .
+                file: packages/evals/Dockerfile.runner
+                tags: evals-runner:latest
+                cache-from: type=gha
+                cache-to: type=gha,mode=max
+                push: false
+                load: true
+      
+            - name: Tag images for docker-compose
               run: |
-                  cd packages/evals
-                  docker compose build web runner
+                cd packages/evals
+                docker tag evals-web:latest evals-web
+                docker tag evals-runner:latest evals-runner
 
             - name: Start server services
               run: |
diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml
index 3148a023c6..ce2505db22 100644
--- a/.github/workflows/evals.yml
+++ b/.github/workflows/evals.yml
@@ -50,10 +50,33 @@ jobs:
                   HOST_EXECUTION_METHOD=docker
                   EOF
 
-            - name: Build Docker images
+            - name: Build web image with cache
+              uses: docker/build-push-action@v5
+              with:
+                context: .
+                file: packages/evals/Dockerfile.web
+                tags: evals-web:latest
+                cache-from: type=gha
+                cache-to: type=gha,mode=max
+                push: false
+                load: true
+      
+            - name: Build runner image with cache
+              uses: docker/build-push-action@v5
+              with:
+                context: .
+                file: packages/evals/Dockerfile.runner
+                tags: evals-runner:latest
+                cache-from: type=gha
+                cache-to: type=gha,mode=max
+                push: false
+                load: true
+      
+            - name: Tag images for docker-compose
               run: |
-                  cd packages/evals
-                  docker compose build web runner
+                cd packages/evals
+                docker tag evals-web:latest evals-web
+                docker tag evals-runner:latest evals-runner
 
             - name: Start server services
               run: |
@@ -133,10 +156,34 @@ jobs:
                   HOST_EXECUTION_METHOD=docker
                   EOF
 
-            - name: Build and start services
+            - name: Build web image with cache
+              uses: docker/build-push-action@v5
+              with:
+                context: .
+                file: packages/evals/Dockerfile.web
+                tags: evals-web:latest
+                cache-from: type=gha
+                cache-to: type=gha,mode=max
+                push: false
+                load: true
+      
+            - name: Build runner image with cache
+              uses: docker/build-push-action@v5
+              with:
+                context: .
+                file: packages/evals/Dockerfile.runner
+                tags: evals-runner:latest
+                cache-from: type=gha
+                cache-to: type=gha,mode=max
+                push: false
+                load: true
+      
+            - name: Tag images and start services
               run: |
-                  cd packages/evals
-                  docker compose --profile server --profile runner up --build -d --scale runner=0
+                cd packages/evals
+                docker tag evals-web:latest evals-web
+                docker tag evals-runner:latest evals-runner
+                docker compose --profile server --profile runner up -d --scale runner=0
 
             - name: Wait for services
               run: |

From 1c77f66710bae09772e555d99b4b3c9e0257ecd1 Mon Sep 17 00:00:00 2001
From: cte <cestreich@gmail.com>
Date: Tue, 10 Jun 2025 06:28:21 -0700
Subject: [PATCH 08/20] Retry logic fix

---
 packages/evals/src/cli/runTask.ts | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/packages/evals/src/cli/runTask.ts b/packages/evals/src/cli/runTask.ts
index 3b1ba61104..14e73dc59c 100644
--- a/packages/evals/src/cli/runTask.ts
+++ b/packages/evals/src/cli/runTask.ts
@@ -87,6 +87,7 @@ export const runTask = async ({ run, task, publish, logger }: RunTaskOptions) =>
 	let taskStartedAt = Date.now()
 	let taskFinishedAt: number | undefined
 	let taskAbortedAt: number | undefined
+	let taskTimedOut: boolean = false
 	let taskMetricsId: number | undefined
 	let rooTaskId: string | undefined
 	let isClientDisconnected = false
@@ -196,6 +197,7 @@ export const runTask = async ({ run, task, publish, logger }: RunTaskOptions) =>
 			timeout: EVALS_TIMEOUT,
 		})
 	} catch (_error) {
+		taskTimedOut = true
 		logger.error("time limit reached")
 
 		if (rooTaskId && !isClientDisconnected) {
@@ -207,16 +209,16 @@ export const runTask = async ({ run, task, publish, logger }: RunTaskOptions) =>
 		taskFinishedAt = Date.now()
 	}
 
-	if (taskFinishedAt) {
-		logger.info("setting task finished at")
-		await updateTask(task.id, { finishedAt: new Date() })
-	}
-
-	if (!taskFinishedAt && isClientDisconnected) {
+	if (!taskFinishedAt && !taskTimedOut) {
 		logger.error("client disconnected before task finished")
 		throw new Error("Client disconnected before task completion.")
 	}
 
+	// Reaching this point means the task finished or timed out; unexpected
+	// aborts and client disconnects threw above to trigger a retry.
+	logger.info("setting task finished at")
+	await updateTask(task.id, { finishedAt: new Date() })
+
 	if (rooTaskId && !isClientDisconnected) {
 		logger.info("closing task")
 		client.sendCommand({ commandName: TaskCommandName.CloseTask, data: rooTaskId })

From 88f93ca07baa5f73f4a99337eb9305b5d456780f Mon Sep 17 00:00:00 2001
From: cte <cestreich@gmail.com>
Date: Tue, 10 Jun 2025 07:50:29 -0700
Subject: [PATCH 09/20] Add ci mode to evals cli

---
 .github/workflows/evals-quick-test.yml  | 135 ------------
 .github/workflows/evals.yml             | 263 ++++++------------------
 apps/web-evals/src/actions/exercises.ts |  23 +--
 apps/web-evals/src/actions/runs.ts      |  11 +-
 packages/evals/src/cli/FileLogger.ts    |  86 --------
 packages/evals/src/cli/index.ts         |  30 ++-
 packages/evals/src/cli/processTask.ts   | 112 ----------
 packages/evals/src/cli/runCi.ts         |  25 +++
 packages/evals/src/cli/runEvals.ts      |  13 +-
 packages/evals/src/cli/runTask.ts       | 127 +++++++++++-
 packages/evals/src/cli/runUnitTest.ts   |  37 ++--
 packages/evals/src/cli/utils.ts         |  85 ++++++++
 packages/evals/src/exercises/index.ts   |  10 +-
 13 files changed, 346 insertions(+), 611 deletions(-)
 delete mode 100644 .github/workflows/evals-quick-test.yml
 delete mode 100644 packages/evals/src/cli/FileLogger.ts
 delete mode 100644 packages/evals/src/cli/processTask.ts
 create mode 100644 packages/evals/src/cli/runCi.ts

diff --git a/.github/workflows/evals-quick-test.yml b/.github/workflows/evals-quick-test.yml
deleted file mode 100644
index 9dfaff9257..0000000000
--- a/.github/workflows/evals-quick-test.yml
+++ /dev/null
@@ -1,135 +0,0 @@
-name: Evals Quick Test
-
-on:
-    pull_request:
-        branches: [main, develop]
-        paths:
-            - "packages/evals/**"
-            - ".github/workflows/evals*.yml"
-    workflow_dispatch:
-
-env:
-    DOCKER_BUILDKIT: 1
-    COMPOSE_DOCKER_CLI_BUILD: 1
-
-jobs:
-    test-docker-compose:
-        name: Test Docker Compose Networking
-        runs-on: ubuntu-latest
-        timeout-minutes: 15
-
-        steps:
-            - name: Checkout repository
-              uses: actions/checkout@v4
-
-            - name: Set up Docker Buildx
-              uses: docker/setup-buildx-action@v3
-
-            - name: Create test environment
-              run: |
-                  cd packages/evals
-
-                  # Create .env.local (required for Docker build)
-                  cat > .env.local << EOF
-                  OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY || 'test-key-for-build' }}
-                  EOF
-
-                  # Create development environment
-                  cat > .env.development << EOF
-                  NODE_ENV=development
-                  DATABASE_URL=postgresql://postgres:password@db:5432/evals_development
-                  REDIS_URL=redis://redis:6379
-                  HOST_EXECUTION_METHOD=docker
-                  EOF
-
-            - name: Build images with cache
-              uses: docker/build-push-action@v5
-              with:
-                context: .
-                file: packages/evals/Dockerfile.web
-                tags: evals-web:latest
-                cache-from: type=gha
-                cache-to: type=gha,mode=max
-                push: false
-                load: true
-      
-            - name: Build runner image with cache
-              uses: docker/build-push-action@v5
-              with:
-                context: .
-                file: packages/evals/Dockerfile.runner
-                tags: evals-runner:latest
-                cache-from: type=gha
-                cache-to: type=gha,mode=max
-                push: false
-                load: true
-      
-            - name: Tag images for docker-compose
-              run: |
-                cd packages/evals
-                docker tag evals-web:latest evals-web
-                docker tag evals-runner:latest evals-runner
-
-            - name: Start server services
-              run: |
-                  cd packages/evals
-                  docker compose --profile server up -d
-
-            - name: Test service connectivity
-              run: |
-                  cd packages/evals
-
-                  # Wait for services
-                  echo "Waiting for PostgreSQL..."
-                  timeout 60 bash -c 'until docker compose exec -T db pg_isready -U postgres; do sleep 2; done'
-
-                  echo "Waiting for Redis..."
-                  timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done'
-
-                  # Test inter-container networking
-                  echo "Testing database connection from web container..."
-                  docker compose exec -T web sh -c 'nc -z db 5432 && echo "✓ Database connection successful"'
-
-                  echo "Testing Redis connection from web container..."
-                  docker compose exec -T web sh -c 'nc -z redis 6379 && echo "✓ Redis connection successful"'
-
-                  # Test that web service can start (basic health check)
-                  echo "Testing web service startup..."
-                  timeout 30 bash -c 'until curl -f http://localhost:3000 2>/dev/null || curl -f http://localhost:3000/health 2>/dev/null; do sleep 2; done' || echo "Web service may not have health endpoint, continuing..."
-
-            - name: Test runner container networking
-              run: |
-                  cd packages/evals
-
-                  echo "Testing runner container can connect to services..."
-                  docker compose run --rm runner sh -c 'nc -z db 5432 && echo "✓ Runner -> Database connection successful"'
-                  docker compose run --rm runner sh -c 'nc -z redis 6379 && echo "✓ Runner -> Redis connection successful"'
-                  docker compose run --rm runner sh -c 'nc -z web 3000 && echo "✓ Runner -> Web service connection successful"'
-
-            - name: Verify Docker socket access
-              run: |
-                  cd packages/evals
-
-                  echo "Testing Docker socket access in runner..."
-                  docker compose run --rm runner docker --version
-                  docker compose run --rm runner docker ps
-
-            - name: Show service status
-              if: always()
-              run: |
-                  cd packages/evals
-                  echo "=== Service Status ==="
-                  docker compose ps
-
-                  echo "=== Network Information ==="
-                  docker network ls | grep evals || echo "No evals network found"
-
-                  echo "=== Container Information ==="
-                  docker compose exec -T db sh -c 'echo "Database container hostname: $(hostname)"'
-                  docker compose exec -T redis sh -c 'echo "Redis container hostname: $(hostname)"'
-
-            - name: Cleanup
-              if: always()
-              run: |
-                  cd packages/evals
-                  docker compose down -v --remove-orphans
diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml
index ce2505db22..af039e5662 100644
--- a/.github/workflows/evals.yml
+++ b/.github/workflows/evals.yml
@@ -1,39 +1,31 @@
-name: Evals Docker Compose
+name: Evals Quick Test
 
 on:
+    pull_request:
+        branches: [main, develop]
+        paths:
+            - "packages/evals/**"
+            - ".github/workflows/evals*.yml"
     workflow_dispatch:
-        inputs:
-            run_full_evals:
-                description: "Run full evaluation suite"
-                required: false
-                default: "false"
-                type: boolean
-            concurrency:
-                description: "Evaluation concurrency level"
-                required: false
-                default: "2"
-                type: string
 
 env:
     DOCKER_BUILDKIT: 1
     COMPOSE_DOCKER_CLI_BUILD: 1
 
 jobs:
-    build-and-test:
-        name: Build and Test Evals
+    test-docker-compose:
+        name: Test Docker Compose Networking
         runs-on: ubuntu-latest
-        timeout-minutes: 30
+        timeout-minutes: 15
 
         steps:
             - name: Checkout repository
               uses: actions/checkout@v4
-              with:
-                  fetch-depth: 0
 
             - name: Set up Docker Buildx
               uses: docker/setup-buildx-action@v3
 
-            - name: Create environment files
+            - name: Create test environment
               run: |
                   cd packages/evals
 
@@ -42,7 +34,7 @@ jobs:
                   OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY || 'test-key-for-build' }}
                   EOF
 
-                  # Create .env.development for development
+                  # Create development environment
                   cat > .env.development << EOF
                   NODE_ENV=development
                   DATABASE_URL=postgresql://postgres:password@db:5432/evals_development
@@ -50,231 +42,100 @@ jobs:
                   HOST_EXECUTION_METHOD=docker
                   EOF
 
-            - name: Build web image with cache
+            - name: Build images with cache
               uses: docker/build-push-action@v5
               with:
-                context: .
-                file: packages/evals/Dockerfile.web
-                tags: evals-web:latest
-                cache-from: type=gha
-                cache-to: type=gha,mode=max
-                push: false
-                load: true
-      
+                  context: .
+                  file: packages/evals/Dockerfile.web
+                  tags: evals-web:latest
+                  cache-from: type=gha
+                  cache-to: type=gha,mode=max
+                  push: false
+                  load: true
+
             - name: Build runner image with cache
               uses: docker/build-push-action@v5
               with:
-                context: .
-                file: packages/evals/Dockerfile.runner
-                tags: evals-runner:latest
-                cache-from: type=gha
-                cache-to: type=gha,mode=max
-                push: false
-                load: true
-      
+                  context: .
+                  file: packages/evals/Dockerfile.runner
+                  tags: evals-runner:latest
+                  cache-from: type=gha
+                  cache-to: type=gha,mode=max
+                  push: false
+                  load: true
+
             - name: Tag images for docker-compose
               run: |
-                cd packages/evals
-                docker tag evals-web:latest evals-web
-                docker tag evals-runner:latest evals-runner
+                  cd packages/evals
+                  docker tag evals-web:latest evals-web
+                  docker tag evals-runner:latest evals-runner
 
             - name: Start server services
               run: |
                   cd packages/evals
                   docker compose --profile server up -d
 
-            - name: Wait for services to be ready
+            - name: Test service connectivity
               run: |
                   cd packages/evals
 
-                  # Wait for PostgreSQL to be ready
+                  # Wait for services
                   echo "Waiting for PostgreSQL..."
-                  timeout 60 bash -c 'until docker compose exec -T db pg_isready -U postgres -d evals_development; do sleep 2; done'
+                  timeout 60 bash -c 'until docker compose exec -T db pg_isready -U postgres; do sleep 2; done'
 
-                  # Wait for Redis to be ready
                   echo "Waiting for Redis..."
                   timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done'
 
-                  # Wait for web service to be ready
-                  echo "Waiting for web service..."
-                  timeout 60 bash -c 'until curl -f http://localhost:3000/health 2>/dev/null || curl -f http://localhost:3000 2>/dev/null; do sleep 2; done'
+                  # Test inter-container networking
+                  echo "Testing database connection from web container..."
+                  docker compose exec -T web sh -c 'nc -z db 5432 && echo "✓ Database connection successful"'
 
-            - name: Run database migrations
-              run: |
-                  cd packages/evals
-                  docker compose exec -T web pnpm db:push
+                  echo "Testing Redis connection from web container..."
+                  docker compose exec -T web sh -c 'nc -z redis 6379 && echo "✓ Redis connection successful"'
 
-            - name: Run tests
-              run: |
-                  cd packages/evals
-                  docker compose run --rm runner pnpm _test
+                  # Test that web service can start (basic health check)
+                  echo "Testing web service startup..."
+                  timeout 30 bash -c 'until curl -f http://localhost:3000 2>/dev/null || curl -f http://localhost:3000/health 2>/dev/null; do sleep 2; done' || echo "Web service may not have health endpoint, continuing..."
 
-            - name: Check service logs on failure
-              if: failure()
+            - name: Test runner container networking
               run: |
                   cd packages/evals
-                  echo "=== Database logs ==="
-                  docker compose logs db
-                  echo "=== Redis logs ==="
-                  docker compose logs redis
-                  echo "=== Web service logs ==="
-                  docker compose logs web
 
-            - name: Cleanup
-              if: always()
+                  echo "Testing runner container can connect to services..."
+                  docker compose run --rm runner sh -c 'nc -z db 5432 && echo "✓ Runner -> Database connection successful"'
+                  docker compose run --rm runner sh -c 'nc -z redis 6379 && echo "✓ Runner -> Redis connection successful"'
+                  docker compose run --rm runner sh -c 'nc -z web 3000 && echo "✓ Runner -> Web service connection successful"'
+
+            - name: Verify Docker socket access
               run: |
                   cd packages/evals
-                  docker compose down -v --remove-orphans
-
-    run-sample-evals:
-        name: Run Sample Evaluations
-        runs-on: ubuntu-latest
-        needs: build-and-test
-        if: github.event.inputs.run_full_evals == 'true' || github.event_name == 'workflow_dispatch'
-        timeout-minutes: 60
-
-        steps:
-            - name: Checkout repository
-              uses: actions/checkout@v4
 
-            - name: Set up Docker Buildx
-              uses: docker/setup-buildx-action@v3
+                  echo "Testing Docker socket access in runner..."
+                  docker compose run --rm runner docker --version
+                  docker compose run --rm runner docker ps
 
-            - name: Create environment files
+            - name: Show service status
+              if: always()
               run: |
                   cd packages/evals
 
-                  # Create .env.local with actual API key for evaluations
-                  cat > .env.local << EOF
-                  OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY || 'test-key-for-build' }}
-                  EOF
-
-                  cat > .env.development << EOF
-                  NODE_ENV=development
-                  DATABASE_URL=postgresql://postgres:password@db:5432/evals_development
-                  REDIS_URL=redis://redis:6379
-                  HOST_EXECUTION_METHOD=docker
-                  EOF
-
-            - name: Build web image with cache
-              uses: docker/build-push-action@v5
-              with:
-                context: .
-                file: packages/evals/Dockerfile.web
-                tags: evals-web:latest
-                cache-from: type=gha
-                cache-to: type=gha,mode=max
-                push: false
-                load: true
-      
-            - name: Build runner image with cache
-              uses: docker/build-push-action@v5
-              with:
-                context: .
-                file: packages/evals/Dockerfile.runner
-                tags: evals-runner:latest
-                cache-from: type=gha
-                cache-to: type=gha,mode=max
-                push: false
-                load: true
-      
-            - name: Tag images and start services
-              run: |
-                cd packages/evals
-                docker tag evals-web:latest evals-web
-                docker tag evals-runner:latest evals-runner
-                docker compose --profile server --profile runner up -d --scale runner=0
+                  echo "=== Service Status ==="
+                  docker compose ps
 
-            - name: Wait for services
-              run: |
-                  cd packages/evals
-                  timeout 120 bash -c 'until docker compose exec -T db pg_isready -U postgres -d evals_development; do sleep 2; done'
-                  timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done'
-                  timeout 60 bash -c 'until curl -f http://localhost:3000 2>/dev/null; do sleep 2; done'
+                  echo "=== Network Information ==="
+                  docker network ls | grep evals || echo "No evals network found"
 
-            - name: Run database setup
-              run: |
-                  cd packages/evals
-                  docker compose exec -T web pnpm db:push
+                  echo "=== Container Information ==="
+                  docker compose exec -T db sh -c 'echo "Database container hostname: $(hostname)"'
+                  docker compose exec -T redis sh -c 'echo "Redis container hostname: $(hostname)"'
 
             - name: Run sample evaluation
-              env:
-                  CONCURRENCY: ${{ github.event.inputs.concurrency || '2' }}
               run: |
                   cd packages/evals
-
-                  # Run a limited set of evaluations for CI
-                  docker compose run --rm runner pnpm cli run \
-                    --concurrency $CONCURRENCY \
-                    --timeout 300 \
-                    --max-exercises 3 \
-                    --model "anthropic/claude-3-5-sonnet-20241022"
-
-            - name: Upload evaluation results
-              if: always()
-              uses: actions/upload-artifact@v4
-              with:
-                  name: evaluation-results
-                  path: |
-                      packages/evals/results/
-                      packages/evals/logs/
-                  retention-days: 7
+                  docker compose run --rm runner pnpm --filter @roo-code/evals cli --ci
 
             - name: Cleanup
               if: always()
               run: |
                   cd packages/evals
                   docker compose down -v --remove-orphans
-
-    security-scan:
-        name: Security Scan
-        runs-on: ubuntu-latest
-        needs: build-and-test
-
-        steps:
-            - name: Checkout repository
-              uses: actions/checkout@v4
-
-            - name: Run Trivy vulnerability scanner
-              uses: aquasecurity/trivy-action@master
-              with:
-                  scan-type: "fs"
-                  scan-ref: "packages/evals"
-                  format: "sarif"
-                  output: "trivy-results.sarif"
-
-            - name: Upload Trivy scan results
-              uses: github/codeql-action/upload-sarif@v3
-              if: always()
-              with:
-                  sarif_file: "trivy-results.sarif"
-
-    docker-compose-validate:
-        name: Validate Docker Compose
-        runs-on: ubuntu-latest
-
-        steps:
-            - name: Checkout repository
-              uses: actions/checkout@v4
-
-            - name: Validate Docker Compose file
-              run: |
-                  cd packages/evals
-                  docker compose config --quiet
-
-            - name: Check Docker Compose services
-              run: |
-                  cd packages/evals
-                  docker compose config --services | sort > services.txt
-                  echo "Available services:"
-                  cat services.txt
-
-                  # Verify expected services exist
-                  for service in db redis web runner; do
-                    if ! grep -q "^$service$" services.txt; then
-                      echo "ERROR: Service '$service' not found in docker-compose.yml"
-                      exit 1
-                    fi
-                  done
-
-                  echo "All expected services found ✓"
diff --git a/apps/web-evals/src/actions/exercises.ts b/apps/web-evals/src/actions/exercises.ts
index 8cffa40ba3..17eb1ff085 100644
--- a/apps/web-evals/src/actions/exercises.ts
+++ b/apps/web-evals/src/actions/exercises.ts
@@ -1,37 +1,22 @@
 "use server"
 
-import * as fs from "fs/promises"
 import * as path from "path"
 import { fileURLToPath } from "url"
 
-import { type ExerciseLanguage, exerciseLanguages } from "@roo-code/evals"
+import { exerciseLanguages, listDirectories } from "@roo-code/evals"
 
 const __dirname = path.dirname(fileURLToPath(import.meta.url)) // <repo>/apps/web-evals/src/actions
 
-const EXERCISES_BASE_PATH = path.resolve(__dirname, "../../../../../evals")
-
-export const listDirectories = async (relativePath: string) => {
-	try {
-		const targetPath = path.resolve(__dirname, relativePath)
-		const entries = await fs.readdir(targetPath, { withFileTypes: true })
-		return entries.filter((entry) => entry.isDirectory() && !entry.name.startsWith(".")).map((entry) => entry.name)
-	} catch (error) {
-		console.error(`Error listing directories at ${relativePath}:`, error)
-		return []
-	}
-}
+const EVALS_REPO_PATH = path.resolve(__dirname, "../../../../../evals")
 
 export const getExercises = async () => {
 	const result = await Promise.all(
 		exerciseLanguages.map(async (language) => {
-			const languagePath = path.join(EXERCISES_BASE_PATH, language)
-			const exercises = await listDirectories(languagePath)
+			const languagePath = path.join(EVALS_REPO_PATH, language)
+			const exercises = await listDirectories(__dirname, languagePath)
 			return exercises.map((exercise) => `${language}/${exercise}`)
 		}),
 	)
 
 	return result.flat()
 }
-
-export const getExercisesForLanguage = async (language: ExerciseLanguage) =>
-	listDirectories(path.join(EXERCISES_BASE_PATH, language))
diff --git a/apps/web-evals/src/actions/runs.ts b/apps/web-evals/src/actions/runs.ts
index 80a4659567..90387d3257 100644
--- a/apps/web-evals/src/actions/runs.ts
+++ b/apps/web-evals/src/actions/runs.ts
@@ -1,7 +1,9 @@
 "use server"
 
-import { spawn } from "child_process"
+import * as path from "path"
 import fs from "fs"
+import { fileURLToPath } from "url"
+import { spawn } from "child_process"
 
 import { revalidatePath } from "next/cache"
 import pMap from "p-map"
@@ -12,11 +14,12 @@ import {
 	createRun as _createRun,
 	deleteRun as _deleteRun,
 	createTask,
+	getExercisesForLanguage,
 } from "@roo-code/evals"
 
 import { CreateRun } from "@/lib/schemas"
 
-import { getExercisesForLanguage } from "./exercises"
+const EVALS_REPO_PATH = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "../../../../../evals")
 
 // eslint-disable-next-line @typescript-eslint/no-unused-vars
 export async function createRun({ suite, exercises = [], systemPrompt, ...values }: CreateRun) {
@@ -37,9 +40,9 @@ export async function createRun({ suite, exercises = [], systemPrompt, ...values
 		}
 	} else {
 		for (const language of exerciseLanguages) {
-			const exercises = await getExercisesForLanguage(language)
+			const exercises = await getExercisesForLanguage(EVALS_REPO_PATH, language)
 
-			await pMap(exercises, (exercise) => createTask({ ...values, runId: run.id, language, exercise }), {
+			await pMap(exercises, (exercise) => createTask({ runId: run.id, language, exercise }), {
 				concurrency: 10,
 			})
 		}
diff --git a/packages/evals/src/cli/FileLogger.ts b/packages/evals/src/cli/FileLogger.ts
deleted file mode 100644
index 443c1d2c53..0000000000
--- a/packages/evals/src/cli/FileLogger.ts
+++ /dev/null
@@ -1,86 +0,0 @@
-import * as fs from "fs"
-import * as path from "path"
-
-export enum LogLevel {
-	INFO = "INFO",
-	ERROR = "ERROR",
-	WARN = "WARN",
-	DEBUG = "DEBUG",
-}
-
-export interface LoggerOptions {
-	logDir: string
-	filename: string
-	tag: string
-}
-
-export class FileLogger {
-	private logStream: fs.WriteStream | undefined
-	private logFilePath: string
-	private tag: string
-
-	constructor({ logDir, filename, tag }: LoggerOptions) {
-		this.tag = tag
-		this.logFilePath = path.join(logDir, filename)
-		this.initializeLogger(logDir)
-	}
-
-	private initializeLogger(logDir: string): void {
-		try {
-			fs.mkdirSync(logDir, { recursive: true })
-		} catch (error) {
-			console.error(`Failed to create log directory ${logDir}:`, error)
-		}
-
-		try {
-			this.logStream = fs.createWriteStream(this.logFilePath, { flags: "a" })
-		} catch (error) {
-			console.error(`Failed to create log file ${this.logFilePath}:`, error)
-		}
-	}
-
-	private writeToLog(level: LogLevel, message: string, ...args: unknown[]) {
-		try {
-			const timestamp = new Date().toISOString()
-
-			const logLine = `[${timestamp} | ${level} | ${this.tag}] ${message} ${
-				args.length > 0 ? JSON.stringify(args) : ""
-			}\n`
-
-			console.log(logLine.trim())
-
-			if (this.logStream) {
-				this.logStream.write(logLine)
-			}
-		} catch (error) {
-			console.error(`Failed to write to log file ${this.logFilePath}:`, error)
-		}
-	}
-
-	public info(message: string, ...args: unknown[]): void {
-		this.writeToLog(LogLevel.INFO, message, ...args)
-	}
-
-	public error(message: string, ...args: unknown[]): void {
-		this.writeToLog(LogLevel.ERROR, message, ...args)
-	}
-
-	public warn(message: string, ...args: unknown[]): void {
-		this.writeToLog(LogLevel.WARN, message, ...args)
-	}
-
-	public debug(message: string, ...args: unknown[]): void {
-		this.writeToLog(LogLevel.DEBUG, message, ...args)
-	}
-
-	public log(message: string, ...args: unknown[]): void {
-		this.info(message, ...args)
-	}
-
-	public close(): void {
-		if (this.logStream) {
-			this.logStream.end()
-			this.logStream = undefined
-		}
-	}
-}
diff --git a/packages/evals/src/cli/index.ts b/packages/evals/src/cli/index.ts
index 35c9203093..0a0be28390 100644
--- a/packages/evals/src/cli/index.ts
+++ b/packages/evals/src/cli/index.ts
@@ -1,11 +1,12 @@
 import * as fs from "fs"
 
-import { command, run, number, option } from "cmd-ts"
+import { run, command, option, flag, number, boolean } from "cmd-ts"
 
-import { exercisesPath } from "../exercises/index.js"
+import { EVALS_REPO_PATH } from "../exercises/index.js"
 
+import { runCi } from "./runCi.js"
 import { runEvals } from "./runEvals.js"
-import { processTask } from "./processTask.js"
+import { processTask } from "./runTask.js"
 
 const main = async () => {
 	await run(
@@ -14,25 +15,22 @@ const main = async () => {
 			description: "Execute an eval run.",
 			version: "0.0.0",
 			args: {
+				ci: flag({ type: boolean, long: "ci", defaultValue: () => false }),
 				runId: option({ type: number, long: "runId", short: "r", defaultValue: () => -1 }),
 				taskId: option({ type: number, long: "taskId", short: "t", defaultValue: () => -1 }),
 			},
 			handler: async (args) => {
-				const { runId, taskId } = args
-
-				if (runId === -1 && taskId === -1) {
-					throw new Error("Either runId or taskId must be provided.")
-				}
-
-				if (runId !== -1 && taskId !== -1) {
-					throw new Error("Only one of runId or taskId must be provided.")
-				}
+				const { runId, taskId, ci } = args
 
 				try {
-					if (runId !== -1) {
+					if (ci) {
+						await runCi()
+					} else if (runId !== -1) {
 						await runEvals(runId)
-					} else {
+					} else if (taskId !== -1) {
 						await processTask({ taskId })
+					} else {
+						throw new Error("Either runId or taskId must be provided.")
 					}
 				} catch (error) {
 					console.error(error)
@@ -46,9 +44,9 @@ const main = async () => {
 	process.exit(0)
 }
 
-if (!fs.existsSync(exercisesPath)) {
+if (!fs.existsSync(EVALS_REPO_PATH)) {
 	console.error(
-		`Exercises do not exist at ${exercisesPath}. Please run "git clone https://github.com/RooCodeInc/Roo-Code-Evals.git evals".`,
+		`Exercises do not exist at ${EVALS_REPO_PATH}. Please run "git clone https://github.com/RooCodeInc/Roo-Code-Evals.git evals".`,
 	)
 
 	process.exit(1)
diff --git a/packages/evals/src/cli/processTask.ts b/packages/evals/src/cli/processTask.ts
deleted file mode 100644
index 2b70013864..0000000000
--- a/packages/evals/src/cli/processTask.ts
+++ /dev/null
@@ -1,112 +0,0 @@
-import { execa } from "execa"
-
-import { RooCodeEventName, type TaskEvent } from "@roo-code/types"
-
-import { findTask, updateTask, findRun } from "../db/index.js"
-
-import { getTag } from "./utils.js"
-import { FileLogger } from "./FileLogger.js"
-import { redisClient, getPubSubKey, registerRunner, deregisterRunner } from "./redis.js"
-import { runTask } from "./runTask.js"
-import { runUnitTest } from "./runUnitTest.js"
-
-export const processTask = async ({ taskId, logger }: { taskId: number; logger?: FileLogger }) => {
-	const task = await findTask(taskId)
-	const { language, exercise } = task
-	const run = await findRun(task.runId)
-	await registerRunner({ runId: run.id, taskId })
-
-	logger =
-		logger ||
-		new FileLogger({
-			logDir: `/var/log/evals/runs/${run.id}`,
-			filename: `${language}-${exercise}.log`,
-			tag: getTag("runTask", { run, task }),
-		})
-
-	try {
-		const publish = async (e: TaskEvent) => {
-			const redis = await redisClient()
-			await redis.publish(getPubSubKey(run.id), JSON.stringify(e))
-		}
-
-		logger.info(`running task ${task.id} (${language}/${exercise})...`)
-		await runTask({ run, task, publish, logger })
-
-		logger.info(`testing task ${task.id} (${language}/${exercise})...`)
-		const passed = await runUnitTest({ run, task })
-
-		logger.info(`task ${task.id} (${language}/${exercise}) -> ${passed}`)
-		await updateTask(task.id, { passed })
-
-		await publish({
-			eventName: passed ? RooCodeEventName.EvalPass : RooCodeEventName.EvalFail,
-			taskId: task.id,
-		})
-	} finally {
-		await deregisterRunner({ runId: run.id, taskId })
-	}
-}
-
-export const processTaskInContainer = async ({
-	taskId,
-	logger,
-	maxRetries = 10,
-}: {
-	taskId: number
-	logger: FileLogger
-	maxRetries?: number
-}) => {
-	const baseArgs = [
-		"--rm",
-		"--network evals_default",
-		"-v /var/run/docker.sock:/var/run/docker.sock",
-		"-v /tmp/evals:/var/log/evals",
-		"-e HOST_EXECUTION_METHOD=docker",
-	]
-
-	const command = `pnpm --filter @roo-code/evals cli --taskId ${taskId}`
-	logger.info(command)
-
-	for (let attempt = 0; attempt <= maxRetries; attempt++) {
-		const containerName = `evals-task-${taskId}.${attempt}`
-		const args = [`--name ${containerName}`, ...baseArgs]
-		const isRetry = attempt > 0
-
-		if (isRetry) {
-			const delayMs = Math.pow(2, attempt - 1) * 1000 * (0.5 + Math.random())
-			logger.info(`retrying in ${delayMs}ms (attempt ${attempt + 1}/${maxRetries + 1})`)
-			await new Promise((resolve) => setTimeout(resolve, delayMs))
-		}
-
-		logger.info(
-			`${isRetry ? "retrying" : "executing"} container command (attempt ${attempt + 1}/${maxRetries + 1})`,
-		)
-
-		const subprocess = execa(`docker run ${args.join(" ")} evals-runner sh -c "${command}"`, { shell: true })
-		// subprocess.stdout?.on("data", (data) => console.log(data.toString()))
-		// subprocess.stderr?.on("data", (data) => console.error(data.toString()))
-
-		try {
-			const result = await subprocess
-			logger.info(`container process completed with exit code: ${result.exitCode}`)
-			return
-		} catch (error) {
-			if (error && typeof error === "object" && "exitCode" in error) {
-				logger.error(
-					`container process failed with exit code: ${error.exitCode} (attempt ${attempt + 1}/${maxRetries + 1})`,
-				)
-			} else {
-				logger.error(`container process failed with error: ${error} (attempt ${attempt + 1}/${maxRetries + 1})`)
-			}
-
-			if (attempt === maxRetries) {
-				break
-			}
-		}
-	}
-
-	logger.error(`all ${maxRetries + 1} attempts failed, giving up`)
-
-	// TODO: Mark task as failed.
-}
diff --git a/packages/evals/src/cli/runCi.ts b/packages/evals/src/cli/runCi.ts
new file mode 100644
index 0000000000..da9fbca92f
--- /dev/null
+++ b/packages/evals/src/cli/runCi.ts
@@ -0,0 +1,25 @@
+import pMap from "p-map"
+
+import { EVALS_REPO_PATH, exerciseLanguages, getExercisesForLanguage } from "../exercises/index.js"
+import { createRun, createTask } from "../db/index.js"
+
+import { runEvals } from "./runEvals.js"
+
+export const runCi = async ({
+	concurrency = 3,
+	exercisesPerLanguage = 3,
+}: {
+	concurrency?: number
+	exercisesPerLanguage?: number
+} = {}) => {
+	console.log("Running evals in CI mode.")
+
+	const run = await createRun({ model: "anthropic/claude-sonnet-4", socketPath: "", concurrency })
+
+	for (const language of exerciseLanguages) {
+		const exercises = (await getExercisesForLanguage(EVALS_REPO_PATH, language)).slice(0, exercisesPerLanguage)
+		await pMap(exercises, (exercise) => createTask({ runId: run.id, language, exercise }))
+	}
+
+	await runEvals(run.id)
+}
diff --git a/packages/evals/src/cli/runEvals.ts b/packages/evals/src/cli/runEvals.ts
index 56bc6ce222..00199bbb44 100644
--- a/packages/evals/src/cli/runEvals.ts
+++ b/packages/evals/src/cli/runEvals.ts
@@ -1,12 +1,11 @@
 import PQueue from "p-queue"
 
 import { findRun, finishRun, getTasks } from "../db/index.js"
-import { exercisesPath } from "../exercises/index.js"
+import { EVALS_REPO_PATH } from "../exercises/index.js"
 
-import { getTag, isDockerContainer, resetEvalsRepo, commitEvalsRepoChanges } from "./utils.js"
-import { processTask, processTaskInContainer } from "./processTask.js"
+import { Logger, getTag, isDockerContainer, resetEvalsRepo, commitEvalsRepoChanges } from "./utils.js"
 import { startHeartbeat, stopHeartbeat } from "./redis.js"
-import { FileLogger } from "./FileLogger.js"
+import { processTask, processTaskInContainer } from "./runTask.js"
 
 export const runEvals = async (runId: number) => {
 	const run = await findRun(runId)
@@ -21,7 +20,7 @@ export const runEvals = async (runId: number) => {
 		throw new Error(`Run ${run.id} has no tasks.`)
 	}
 
-	const logger = new FileLogger({
+	const logger = new Logger({
 		logDir: `/var/log/evals/runs/${run.id}`,
 		filename: `controller.log`,
 		tag: getTag("runEvals", { run }),
@@ -32,7 +31,7 @@ export const runEvals = async (runId: number) => {
 	const containerized = isDockerContainer()
 
 	if (!containerized) {
-		await resetEvalsRepo({ run, cwd: exercisesPath })
+		await resetEvalsRepo({ run, cwd: EVALS_REPO_PATH })
 	}
 
 	const heartbeat = await startHeartbeat(run.id)
@@ -63,7 +62,7 @@ export const runEvals = async (runId: number) => {
 		// will lost when the container is destroyed. I think we should
 		// store the diffs in the database instead.
 		if (!containerized) {
-			await commitEvalsRepoChanges({ run, cwd: exercisesPath })
+			await commitEvalsRepoChanges({ run, cwd: EVALS_REPO_PATH })
 		}
 	} finally {
 		logger.info("cleaning up")
diff --git a/packages/evals/src/cli/runTask.ts b/packages/evals/src/cli/runTask.ts
index 14e73dc59c..14028b493a 100644
--- a/packages/evals/src/cli/runTask.ts
+++ b/packages/evals/src/cli/runTask.ts
@@ -15,11 +15,21 @@ import {
 } from "@roo-code/types"
 import { IpcClient } from "@roo-code/ipc"
 
-import { type Run, type Task, updateTask, createTaskMetrics, updateTaskMetrics, createToolError } from "../db/index.js"
-import { exercisesPath } from "../exercises/index.js"
-
-import { isDockerContainer } from "./utils.js"
-import { FileLogger } from "./FileLogger.js"
+import {
+	type Run,
+	type Task,
+	findRun,
+	findTask,
+	updateTask,
+	createTaskMetrics,
+	updateTaskMetrics,
+	createToolError,
+} from "../db/index.js"
+import { EVALS_REPO_PATH } from "../exercises/index.js"
+
+import { Logger, getTag, isDockerContainer } from "./utils.js"
+import { redisClient, getPubSubKey, registerRunner, deregisterRunner } from "./redis.js"
+import { runUnitTest } from "./runUnitTest.js"
 
 class SubprocessTimeoutError extends Error {
 	constructor(timeout: number) {
@@ -28,17 +38,118 @@ class SubprocessTimeoutError extends Error {
 	}
 }
 
+export const processTask = async ({ taskId, logger }: { taskId: number; logger?: Logger }) => {
+	const task = await findTask(taskId)
+	const { language, exercise } = task
+	const run = await findRun(task.runId)
+	await registerRunner({ runId: run.id, taskId })
+
+	logger =
+		logger ||
+		new Logger({
+			logDir: `/var/log/evals/runs/${run.id}`,
+			filename: `${language}-${exercise}.log`,
+			tag: getTag("runTask", { run, task }),
+		})
+
+	try {
+		const publish = async (e: TaskEvent) => {
+			const redis = await redisClient()
+			await redis.publish(getPubSubKey(run.id), JSON.stringify(e))
+		}
+
+		logger.info(`running task ${task.id} (${language}/${exercise})...`)
+		await runTask({ run, task, publish, logger })
+
+		logger.info(`testing task ${task.id} (${language}/${exercise})...`)
+		const passed = await runUnitTest({ task, logger })
+
+		logger.info(`task ${task.id} (${language}/${exercise}) -> ${passed}`)
+		await updateTask(task.id, { passed })
+
+		await publish({
+			eventName: passed ? RooCodeEventName.EvalPass : RooCodeEventName.EvalFail,
+			taskId: task.id,
+		})
+	} finally {
+		await deregisterRunner({ runId: run.id, taskId })
+	}
+}
+
+export const processTaskInContainer = async ({
+	taskId,
+	logger,
+	maxRetries = 10,
+}: {
+	taskId: number
+	logger: Logger
+	maxRetries?: number
+}) => {
+	const baseArgs = [
+		"--rm",
+		"--network evals_default",
+		"-v /var/run/docker.sock:/var/run/docker.sock",
+		"-v /tmp/evals:/var/log/evals",
+		"-e HOST_EXECUTION_METHOD=docker",
+	]
+
+	const command = `pnpm --filter @roo-code/evals cli --taskId ${taskId}`
+	logger.info(command)
+
+	for (let attempt = 0; attempt <= maxRetries; attempt++) {
+		const containerName = `evals-task-${taskId}.${attempt}`
+		const args = [`--name ${containerName}`, ...baseArgs]
+		const isRetry = attempt > 0
+
+		if (isRetry) {
+			const delayMs = Math.pow(2, attempt - 1) * 1000 * (0.5 + Math.random())
+			logger.info(`retrying in ${delayMs}ms (attempt ${attempt + 1}/${maxRetries + 1})`)
+			await new Promise((resolve) => setTimeout(resolve, delayMs))
+		}
+
+		logger.info(
+			`${isRetry ? "retrying" : "executing"} container command (attempt ${attempt + 1}/${maxRetries + 1})`,
+		)
+
+		const subprocess = execa(`docker run ${args.join(" ")} evals-runner sh -c "${command}"`, { shell: true })
+		// subprocess.stdout?.on("data", (data) => console.log(data.toString()))
+		// subprocess.stderr?.on("data", (data) => console.error(data.toString()))
+
+		try {
+			const result = await subprocess
+			logger.info(`container process completed with exit code: ${result.exitCode}`)
+			return
+		} catch (error) {
+			if (error && typeof error === "object" && "exitCode" in error) {
+				logger.error(
+					`container process failed with exit code: ${error.exitCode} (attempt ${attempt + 1}/${maxRetries + 1})`,
+				)
+			} else {
+				logger.error(`container process failed with error: ${error} (attempt ${attempt + 1}/${maxRetries + 1})`)
+			}
+
+			if (attempt === maxRetries) {
+				break
+			}
+		}
+	}
+
+	logger.error(`all ${maxRetries + 1} attempts failed, giving up`)
+
+	// TODO: Mark task as failed.
+}
+
 type RunTaskOptions = {
 	run: Run
 	task: Task
 	publish: (taskEvent: TaskEvent) => Promise<void>
-	logger: FileLogger
+	logger: Logger
 }
 
 export const runTask = async ({ run, task, publish, logger }: RunTaskOptions) => {
 	const { language, exercise } = task
-	const prompt = fs.readFileSync(path.resolve(exercisesPath, `prompts/${language}.md`), "utf-8")
-	const workspacePath = path.resolve(exercisesPath, language, exercise)
+	const prompt = fs.readFileSync(path.resolve(EVALS_REPO_PATH, `prompts/${language}.md`), "utf-8")
+	const workspacePath = path.resolve(EVALS_REPO_PATH, language, exercise)
 	const ipcSocketPath = path.resolve(os.tmpdir(), `evals-${run.id}-${task.id}.sock`)
 	const env = { ROO_CODE_IPC_SOCKET_PATH: ipcSocketPath }
 	const controller = new AbortController()
diff --git a/packages/evals/src/cli/runUnitTest.ts b/packages/evals/src/cli/runUnitTest.ts
index 7785312e76..6f8fbac619 100644
--- a/packages/evals/src/cli/runUnitTest.ts
+++ b/packages/evals/src/cli/runUnitTest.ts
@@ -3,14 +3,14 @@ import * as path from "path"
 import { execa, parseCommandString } from "execa"
 import psTree from "ps-tree"
 
-import type { Run, Task } from "../db/index.js"
-import { type ExerciseLanguage, exercisesPath } from "../exercises/index.js"
+import type { Task } from "../db/index.js"
+import { type ExerciseLanguage, EVALS_REPO_PATH } from "../exercises/index.js"
 
-import { getTag } from "./utils.js"
+import { Logger } from "./utils.js"
 
 const UNIT_TEST_TIMEOUT = 2 * 60 * 1_000
 
-const testCommands: Record<ExerciseLanguage, { commands: string[]; timeout?: number; cwd?: string }> = {
+const testCommands: Record<ExerciseLanguage, { commands: string[]; timeout?: number }> = {
 	go: { commands: ["go test"] },
 	java: { commands: ["./gradlew test"] },
 	javascript: { commands: ["pnpm install", "pnpm test"] },
@@ -18,22 +18,21 @@ const testCommands: Record<ExerciseLanguage, { commands: string[]; timeout?: num
 	rust: { commands: ["cargo test"] },
 }
 
-export const runUnitTest = async ({ run, task }: { run: Run; task: Task }) => {
-	const tag = getTag("runUnitTest", { run, task })
-	const log = (message: string, ...args: unknown[]) => console.log(`[${Date.now()} | ${tag}] ${message}`, ...args)
-	const logError = (message: string, ...args: unknown[]) =>
-		console.error(`[${Date.now()} | ${tag}] ${message}`, ...args)
+type RunUnitTestOptions = {
+	task: Task
+	logger: Logger
+}
 
+export const runUnitTest = async ({ task, logger }: RunUnitTestOptions) => {
 	const cmd = testCommands[task.language]
-	const exercisePath = path.resolve(exercisesPath, task.language, task.exercise)
-	const cwd = cmd.cwd ? path.resolve(exercisePath, cmd.cwd) : exercisePath
+	const cwd = path.resolve(EVALS_REPO_PATH, task.language, task.exercise)
 	const commands = cmd.commands.map((cs) => parseCommandString(cs))
 
 	let passed = true
 
 	for (const command of commands) {
 		try {
-			log(`running "${command.join(" ")}"`)
+			logger.info(`running "${command.join(" ")}"`)
 			const subprocess = execa({ cwd, shell: "/bin/bash", reject: false })`${command}`
 			subprocess.stdout.pipe(process.stdout)
 			subprocess.stderr.pipe(process.stderr)
@@ -49,25 +48,27 @@ export const runUnitTest = async ({ run, task }: { run: Run; task: Task }) => {
 					})
 				})
 
-				log(`"${command.join(" ")}" timed out, killing ${subprocess.pid} + ${JSON.stringify(descendants)}`)
+				logger.info(
+					`"${command.join(" ")}" timed out, killing ${subprocess.pid} + ${JSON.stringify(descendants)}`,
+				)
 
 				if (descendants.length > 0) {
 					for (const descendant of descendants) {
 						try {
-							log(`killing descendant process ${descendant}`)
+							logger.info(`killing descendant process ${descendant}`)
 							await execa`kill -9 ${descendant}`
 						} catch (error) {
-							logError(`failed to kill descendant process ${descendant}:`, error)
+							logger.error(`failed to kill descendant process ${descendant}:`, error)
 						}
 					}
 				}
 
-				log(`killing main process ${subprocess.pid}`)
+				logger.info(`killing main process ${subprocess.pid}`)
 
 				try {
 					await execa`kill -9 ${subprocess.pid!}`
 				} catch (error) {
-					logError(`failed to kill main process ${subprocess.pid}:`, error)
+					logger.error(`failed to kill main process ${subprocess.pid}:`, error)
 				}
 			}, UNIT_TEST_TIMEOUT)
 
@@ -80,7 +81,7 @@ export const runUnitTest = async ({ run, task }: { run: Run; task: Task }) => {
 				break
 			}
 		} catch (error) {
-			logError(`unexpected error:`, error)
+			logger.error(`unexpected error:`, error)
 			passed = false
 			break
 		}
diff --git a/packages/evals/src/cli/utils.ts b/packages/evals/src/cli/utils.ts
index cbabb451b9..bf1489d09b 100644
--- a/packages/evals/src/cli/utils.ts
+++ b/packages/evals/src/cli/utils.ts
@@ -1,4 +1,5 @@
 import * as fs from "fs"
+import * as path from "path"
 
 import { execa } from "execa"
 
@@ -29,3 +30,87 @@ export const commitEvalsRepoChanges = async ({ run, cwd }: { run: Run; cwd: stri
 	await execa({ cwd })`git add .`
 	await execa({ cwd })`git commit -m ${`Run #${run.id}`} --no-verify`
 }
+
+enum LogLevel {
+	INFO = "INFO",
+	ERROR = "ERROR",
+	WARN = "WARN",
+	DEBUG = "DEBUG",
+}
+
+interface LoggerOptions {
+	logDir: string
+	filename: string
+	tag: string
+}
+
+/** Tagged logger: echoes each line to the console and appends it to `logDir/filename`. */
+export class Logger {
+	private logStream: fs.WriteStream | undefined
+	private logFilePath: string
+	private tag: string
+
+	constructor({ logDir, filename, tag }: LoggerOptions) {
+		this.tag = tag
+		this.logFilePath = path.join(logDir, filename)
+		this.initializeLogger(logDir)
+	}
+
+	/** Creates `logDir` and opens the append stream; on failure logging continues console-only. */
+	private initializeLogger(logDir: string): void {
+		try {
+			fs.mkdirSync(logDir, { recursive: true })
+		} catch (error) {
+			console.error(`Failed to create log directory ${logDir}:`, error)
+		}
+
+		try {
+			this.logStream = fs.createWriteStream(this.logFilePath, { flags: "a" })
+			// Open/write failures arrive as async "error" events; left unhandled they crash the process.
+			this.logStream.on("error", (error) => {
+				console.error(`Failed to write to log file ${this.logFilePath}:`, error)
+				this.logStream = undefined
+			})
+		} catch (error) {
+			console.error(`Failed to create log file ${this.logFilePath}:`, error)
+		}
+	}
+
+	/** JSON replacer: expands Errors, whose own fields are non-enumerable and would serialize as `{}`. */
+	private static readonly serializeError = (_key: string, value: unknown): unknown =>
+		value instanceof Error ? { name: value.name, message: value.message, stack: value.stack } : value
+
+	/** Formats `[timestamp | level | tag] message [JSON args]`, then writes it to console and file. */
+	private writeToLog(level: LogLevel, message: string, ...args: unknown[]) {
+		try {
+			const timestamp = new Date().toISOString()
+			const extra = args.length > 0 ? JSON.stringify(args, Logger.serializeError) : ""
+			const logLine = `[${timestamp} | ${level} | ${this.tag}] ${message} ${extra}\n`
+			console.log(logLine.trim())
+			this.logStream?.write(logLine)
+		} catch (error) {
+			console.error(`Failed to write to log file ${this.logFilePath}:`, error)
+		}
+	}
+
+	public info(message: string, ...args: unknown[]): void {
+		this.writeToLog(LogLevel.INFO, message, ...args)
+	}
+	public error(message: string, ...args: unknown[]): void {
+		this.writeToLog(LogLevel.ERROR, message, ...args)
+	}
+	public warn(message: string, ...args: unknown[]): void {
+		this.writeToLog(LogLevel.WARN, message, ...args)
+	}
+	public debug(message: string, ...args: unknown[]): void {
+		this.writeToLog(LogLevel.DEBUG, message, ...args)
+	}
+	public log(message: string, ...args: unknown[]): void {
+		this.info(message, ...args)
+	}
+
+	public close(): void {
+		this.logStream?.end()
+		this.logStream = undefined
+	}
+}
diff --git a/packages/evals/src/exercises/index.ts b/packages/evals/src/exercises/index.ts
index 17e339f21a..7ba34f2a2b 100644
--- a/packages/evals/src/exercises/index.ts
+++ b/packages/evals/src/exercises/index.ts
@@ -4,15 +4,15 @@ import { fileURLToPath } from "url"
 
 const __dirname = path.dirname(fileURLToPath(import.meta.url))
 
-export const exercisesPath = path.resolve(__dirname, "..", "..", "..", "..", "..", "evals")
+export const EVALS_REPO_PATH = path.resolve(__dirname, "..", "..", "..", "..", "..", "evals")
 
 export const exerciseLanguages = ["go", "java", "javascript", "python", "rust"] as const
 
 export type ExerciseLanguage = (typeof exerciseLanguages)[number]
 
-const listDirectories = async (relativePath: string) => {
+export const listDirectories = async (basePath: string, relativePath: string) => {
 	try {
-		const targetPath = path.resolve(__dirname, relativePath)
+		const targetPath = path.resolve(basePath, relativePath)
 		const entries = await fs.readdir(targetPath, { withFileTypes: true })
 		return entries.filter((entry) => entry.isDirectory() && !entry.name.startsWith(".")).map((entry) => entry.name)
 	} catch (error) {
@@ -21,5 +21,5 @@ const listDirectories = async (relativePath: string) => {
 	}
 }
 
-export const getExercisesForLanguage = async (language: ExerciseLanguage) =>
-	listDirectories(path.join(exercisesPath, language))
+export const getExercisesForLanguage = async (basePath: string, language: ExerciseLanguage) =>
+	listDirectories(__dirname, path.join(basePath, language))

From 2bc65ca8f0fb9b6736b056422fc52329ca495bd9 Mon Sep 17 00:00:00 2001
From: cte <cestreich@gmail.com>
Date: Tue, 10 Jun 2025 08:07:43 -0700
Subject: [PATCH 10/20] Increase timeout

---
 .github/workflows/evals.yml | 27 +++++++++++----------------
 1 file changed, 11 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml
index af039e5662..ff315b4194 100644
--- a/.github/workflows/evals.yml
+++ b/.github/workflows/evals.yml
@@ -1,4 +1,4 @@
-name: Evals Quick Test
+name: Evals
 
 on:
     pull_request:
@@ -13,10 +13,9 @@ env:
     COMPOSE_DOCKER_CLI_BUILD: 1
 
 jobs:
-    test-docker-compose:
-        name: Test Docker Compose Networking
+    evals:
         runs-on: ubuntu-latest
-        timeout-minutes: 15
+        timeout-minutes: 30
 
         steps:
             - name: Checkout repository
@@ -25,16 +24,14 @@ jobs:
             - name: Set up Docker Buildx
               uses: docker/setup-buildx-action@v3
 
-            - name: Create test environment
+            - name: Create environment
               run: |
                   cd packages/evals
 
-                  # Create .env.local (required for Docker build)
                   cat > .env.local << EOF
                   OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY || 'test-key-for-build' }}
                   EOF
 
-                  # Create development environment
                   cat > .env.development << EOF
                   NODE_ENV=development
                   DATABASE_URL=postgresql://postgres:password@db:5432/evals_development
@@ -42,7 +39,7 @@ jobs:
                   HOST_EXECUTION_METHOD=docker
                   EOF
 
-            - name: Build images with cache
+            - name: Build web image
               uses: docker/build-push-action@v5
               with:
                   context: .
@@ -53,7 +50,7 @@ jobs:
                   push: false
                   load: true
 
-            - name: Build runner image with cache
+            - name: Build runner image
               uses: docker/build-push-action@v5
               with:
                   context: .
@@ -64,18 +61,18 @@ jobs:
                   push: false
                   load: true
 
-            - name: Tag images for docker-compose
+            - name: Tag images
               run: |
                   cd packages/evals
                   docker tag evals-web:latest evals-web
                   docker tag evals-runner:latest evals-runner
 
-            - name: Start server services
+            - name: Start containers
               run: |
                   cd packages/evals
                   docker compose --profile server up -d
 
-            - name: Test service connectivity
+            - name: Wait for containers
               run: |
                   cd packages/evals
 
@@ -86,14 +83,12 @@ jobs:
                   echo "Waiting for Redis..."
                   timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done'
 
-                  # Test inter-container networking
                   echo "Testing database connection from web container..."
                   docker compose exec -T web sh -c 'nc -z db 5432 && echo "✓ Database connection successful"'
 
                   echo "Testing Redis connection from web container..."
                   docker compose exec -T web sh -c 'nc -z redis 6379 && echo "✓ Redis connection successful"'
 
-                  # Test that web service can start (basic health check)
                   echo "Testing web service startup..."
                   timeout 30 bash -c 'until curl -f http://localhost:3000 2>/dev/null || curl -f http://localhost:3000/health 2>/dev/null; do sleep 2; done' || echo "Web service may not have health endpoint, continuing..."
 
@@ -106,7 +101,7 @@ jobs:
                   docker compose run --rm runner sh -c 'nc -z redis 6379 && echo "✓ Runner -> Redis connection successful"'
                   docker compose run --rm runner sh -c 'nc -z web 3000 && echo "✓ Runner -> Web service connection successful"'
 
-            - name: Verify Docker socket access
+            - name: Test Docker socket access
               run: |
                   cd packages/evals
 
@@ -129,7 +124,7 @@ jobs:
                   docker compose exec -T db sh -c 'echo "Database container hostname: $(hostname)"'
                   docker compose exec -T redis sh -c 'echo "Redis container hostname: $(hostname)"'
 
-            - name: Run sample evaluation
+            - name: Run evals
               run: |
                   cd packages/evals
                   docker compose run --rm runner pnpm --filter @roo-code/evals cli --ci

From 6adb2a6adbaca22ade09b2735f00013525155ab0 Mon Sep 17 00:00:00 2001
From: cte <cestreich@gmail.com>
Date: Tue, 10 Jun 2025 08:35:11 -0700
Subject: [PATCH 11/20] Add web health check

---
 .github/workflows/evals.yml                |  6 +++++-
 apps/web-evals/src/app/api/health/route.ts | 24 ++++++++++++++++++++++
 packages/evals/package.json                |  3 ++-
 packages/evals/src/cli/runCi.ts            |  4 ++--
 4 files changed, 33 insertions(+), 4 deletions(-)
 create mode 100644 apps/web-evals/src/app/api/health/route.ts

diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml
index ff315b4194..c3d9983417 100644
--- a/.github/workflows/evals.yml
+++ b/.github/workflows/evals.yml
@@ -90,7 +90,11 @@ jobs:
                   docker compose exec -T web sh -c 'nc -z redis 6379 && echo "✓ Redis connection successful"'
 
                   echo "Testing web service startup..."
-                  timeout 30 bash -c 'until curl -f http://localhost:3000 2>/dev/null || curl -f http://localhost:3000/health 2>/dev/null; do sleep 2; done' || echo "Web service may not have health endpoint, continuing..."
+                  timeout 60 bash -c 'until curl -f http://localhost:3000/api/health 2>/dev/null; do echo "Waiting for web service..."; sleep 2; done'
+                  
+                  echo "✓ Web service is healthy"
+                  echo "Health check response:"
+                  curl -s http://localhost:3000/api/health | jq 2>/dev/null || curl -s http://localhost:3000/api/health
 
             - name: Test runner container networking
               run: |
diff --git a/apps/web-evals/src/app/api/health/route.ts b/apps/web-evals/src/app/api/health/route.ts
new file mode 100644
index 0000000000..ca8a833942
--- /dev/null
+++ b/apps/web-evals/src/app/api/health/route.ts
@@ -0,0 +1,24 @@
+import { NextResponse } from "next/server"
+
+/**
+ * Liveness probe for the web-evals app: 200 with uptime and environment details,
+ * or 503 carrying the error message if assembling the response throws.
+ */
+export async function GET() {
+	try {
+		const payload = {
+			status: "healthy",
+			timestamp: new Date().toISOString(),
+			uptime: process.uptime(),
+			environment: process.env.NODE_ENV || "production",
+		}
+
+		return NextResponse.json(payload, { status: 200 })
+	} catch (error) {
+		// Report the failure to the caller with a 503.
+		const message = error instanceof Error ? error.message : "Unknown error"
+		const payload = { status: "unhealthy", timestamp: new Date().toISOString(), error: message }
+
+		return NextResponse.json(payload, { status: 503 })
+	}
+}
diff --git a/packages/evals/package.json b/packages/evals/package.json
index 554356e5b1..88195b134b 100644
--- a/packages/evals/package.json
+++ b/packages/evals/package.json
@@ -21,7 +21,8 @@
 		"db:start": "docker compose up -d db",
 		"db:stop": "docker compose down db",
 		"redis:start": "docker compose up -d redis",
-		"redis:stop": "docker compose down redis"
+		"redis:stop": "docker compose down redis",
+		"services:start": "docker compose up -d db redis"
 	},
 	"dependencies": {
 		"@roo-code/ipc": "workspace:^",
diff --git a/packages/evals/src/cli/runCi.ts b/packages/evals/src/cli/runCi.ts
index da9fbca92f..1b16d52aee 100644
--- a/packages/evals/src/cli/runCi.ts
+++ b/packages/evals/src/cli/runCi.ts
@@ -6,8 +6,8 @@ import { createRun, createTask } from "../db/index.js"
 import { runEvals } from "./runEvals.js"
 
 export const runCi = async ({
-	concurrency = 3,
-	exercisesPerLanguage = 3,
+	concurrency = 1,
+	exercisesPerLanguage = 1,
 }: {
 	concurrency?: number
 	exercisesPerLanguage?: number

From 43a411012f8bf1be2a6b8edb61ab8ac0da35a74a Mon Sep 17 00:00:00 2001
From: cte <cestreich@gmail.com>
Date: Tue, 10 Jun 2025 08:36:27 -0700
Subject: [PATCH 12/20] Forward ports for running locally

---
 packages/evals/docker-compose.yml | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/packages/evals/docker-compose.yml b/packages/evals/docker-compose.yml
index 37d95dbb59..93e643e44b 100644
--- a/packages/evals/docker-compose.yml
+++ b/packages/evals/docker-compose.yml
@@ -17,8 +17,10 @@ services:
     db:
         container_name: evals-db
         image: postgres:15.4
-        expose:
-            - 5432
+        # expose:
+        #     - 5432
+        ports:
+            - "${EVALS_DB_PORT:-5432}:5432"
         volumes:
             - ./.docker/postgres:/var/lib/postgresql/data
             - ./.docker/scripts/postgres:/docker-entrypoint-initdb.d
@@ -38,8 +40,10 @@ services:
     redis:
         container_name: evals-redis
         image: redis:7-alpine
-        expose:
-            - 6379
+        # expose:
+        #     - 6379
+        ports:
+            - "${EVALS_REDIS_PORT:-6379}:6379"
         volumes:
             - ./.docker/redis:/data
         command: redis-server --appendonly yes

From bacb17d5e8f49be9b5ff97241fe01478962afeaf Mon Sep 17 00:00:00 2001
From: cte <cestreich@gmail.com>
Date: Tue, 10 Jun 2025 08:44:38 -0700
Subject: [PATCH 13/20] Trigger on label

---
 .github/workflows/evals.yml | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml
index c3d9983417..e96b414c6c 100644
--- a/.github/workflows/evals.yml
+++ b/.github/workflows/evals.yml
@@ -2,10 +2,7 @@ name: Evals
 
 on:
     pull_request:
-        branches: [main, develop]
-        paths:
-            - "packages/evals/**"
-            - ".github/workflows/evals*.yml"
+        types: [labeled]
     workflow_dispatch:
 
 env:
@@ -14,6 +11,8 @@ env:
 
 jobs:
     evals:
+        # Run if triggered manually or if PR has 'evals' label.
+        if: github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'evals')
         runs-on: ubuntu-latest
         timeout-minutes: 30
 
@@ -91,7 +90,7 @@ jobs:
 
                   echo "Testing web service startup..."
                   timeout 60 bash -c 'until curl -f http://localhost:3000/api/health 2>/dev/null; do echo "Waiting for web service..."; sleep 2; done'
-                  
+
                   echo "✓ Web service is healthy"
                   echo "Health check response:"
                   curl -s http://localhost:3000/api/health | jq 2>/dev/null || curl -s http://localhost:3000/api/health

From 8453c4a85a8785e31a60aeb47facfd691548ac92 Mon Sep 17 00:00:00 2001
From: cte <cestreich@gmail.com>
Date: Tue, 10 Jun 2025 08:49:24 -0700
Subject: [PATCH 14/20] Remove packages/evals/GITHUB_ACTIONS.md

---
 packages/evals/GITHUB_ACTIONS.md | 201 -------------------------------
 1 file changed, 201 deletions(-)
 delete mode 100644 packages/evals/GITHUB_ACTIONS.md

diff --git a/packages/evals/GITHUB_ACTIONS.md b/packages/evals/GITHUB_ACTIONS.md
deleted file mode 100644
index 454c42ec87..0000000000
--- a/packages/evals/GITHUB_ACTIONS.md
+++ /dev/null
@@ -1,201 +0,0 @@
-# GitHub Actions for Evals
-
-This document describes the GitHub Actions workflows available for the Roo Code Evals system.
-
-## Workflows
-
-### 1. `evals.yml` - Full Evaluation Workflow
-
-**Purpose**: Comprehensive testing and evaluation workflow that builds, tests, and optionally runs full evaluations.
-
-**Triggers**:
-
-- Push to `main` or `develop` branches (when evals files change)
-- Pull requests to `main` or `develop` branches (when evals files change)
-- Manual dispatch with options
-
-**Jobs**:
-
-#### `build-and-test`
-
-- Builds Docker images for web and runner services
-- Starts PostgreSQL, Redis, and web services
-- Waits for all services to be ready
-- Runs database migrations
-- Executes test suite
-- Provides detailed logging on failure
-
-#### `run-sample-evals` (conditional)
-
-- Only runs when manually triggered with `run_full_evals: true`
-- Requires `OPENROUTER_API_KEY` secret to be configured
-- Runs a limited set of evaluations for CI testing
-- Uploads evaluation results as artifacts
-- Configurable concurrency level
-
-#### `security-scan`
-
-- Runs Trivy vulnerability scanner on the evals package
-- Uploads results to GitHub Security tab
-
-#### `docker-compose-validate`
-
-- Validates Docker Compose file syntax
-- Verifies all expected services are defined
-
-**Required Secrets**:
-
-- `OPENROUTER_API_KEY` (only for full evaluation runs)
-
-### 2. `evals-quick-test.yml` - Quick Networking Test
-
-**Purpose**: Fast validation of Docker Compose networking and basic functionality.
-
-**Triggers**:
-
-- Push to `main` or `develop` branches (when evals files change)
-- Pull requests to `main` or `develop` branches (when evals files change)
-
-**Jobs**:
-
-#### `test-docker-compose`
-
-- Tests inter-container networking between all services
-- Verifies database and Redis connectivity
-- Tests Docker socket access in runner container
-- Validates service startup and health
-
-#### `validate-compose-file`
-
-- Validates Docker Compose syntax
-- Checks service definitions and profiles
-
-## Usage Examples
-
-### Manual Workflow Dispatch
-
-To run full evaluations manually:
-
-1. Go to Actions tab in GitHub
-2. Select "Evals Docker Compose" workflow
-3. Click "Run workflow"
-4. Configure options:
-    - `run_full_evals`: Set to `true` to run actual evaluations
-    - `concurrency`: Set evaluation concurrency (default: 2)
-
-### Setting Up Secrets
-
-For full evaluation runs, add the OpenRouter API key:
-
-1. Go to repository Settings → Secrets and variables → Actions
-2. Add new repository secret:
-    - Name: `OPENROUTER_API_KEY`
-    - Value: Your OpenRouter API key (e.g., `sk-or-v1-...`)
-
-## Docker Compose Networking in GitHub Actions
-
-The workflows demonstrate that Docker Compose networking works seamlessly in GitHub Actions:
-
-### Service Communication
-
-- Services communicate using service names as hostnames
-- Database: `postgresql://postgres:password@db:5432/evals_development`
-- Redis: `redis://redis:6379`
-- Web service: `http://web:3000`
-
-### Network Features Tested
-
-- ✅ Container-to-container communication
-- ✅ Service discovery via service names
-- ✅ Port mapping and internal networking
-- ✅ Health checks and service dependencies
-- ✅ Docker socket mounting for Docker-in-Docker
-- ✅ Volume mounts for data persistence
-- ✅ Profile-based service grouping
-
-### Networking Validation
-
-The workflows include comprehensive networking tests:
-
-```bash
-# Test database connectivity
-docker compose exec -T web sh -c 'nc -z db 5432'
-
-# Test Redis connectivity
-docker compose exec -T web sh -c 'nc -z redis 6379'
-
-# Test cross-service communication
-docker compose run --rm runner sh -c 'nc -z web 3000'
-```
-
-## Resource Considerations
-
-GitHub Actions runners have the following limits:
-
-- **Memory**: 7 GB RAM
-- **CPU**: 2-core CPU
-- **Disk**: 14 GB SSD space
-- **Time**: 6 hours maximum job runtime
-
-For the evals system:
-
-- Quick tests typically complete in 5-10 minutes
-- Full evaluation runs may take 30-60 minutes depending on scope
-- Resource usage scales with concurrency settings
-
-## Troubleshooting
-
-### Common Issues
-
-1. **Service startup timeouts**
-
-    - Increase timeout values in workflow
-    - Check service health check configurations
-    - Review service logs in workflow output
-
-2. **Networking failures**
-
-    - Verify service names match docker-compose.yml
-    - Check port configurations
-    - Ensure services are in the same Docker network
-
-3. **Docker socket access issues**
-    - Verify `/var/run/docker.sock` mount in docker-compose.yml
-    - Check Docker-in-Docker permissions
-
-### Debugging
-
-The workflows include comprehensive logging:
-
-- Service status and health checks
-- Network information and container details
-- Service logs on failure
-- Artifact uploads for evaluation results
-
-To debug locally, you can run the same commands used in the workflows:
-
-```bash
-cd packages/evals
-
-# Build and start services
-docker compose --profile server up -d
-
-# Test connectivity
-docker compose exec -T web sh -c 'nc -z db 5432'
-docker compose exec -T redis redis-cli ping
-
-# View logs
-docker compose logs db
-docker compose logs redis
-docker compose logs web
-```
-
-## Performance Optimization
-
-For faster CI runs:
-
-- Use Docker layer caching with `docker/setup-buildx-action`
-- Minimize Docker image sizes
-- Use health checks to avoid unnecessary wait times
-- Run tests in parallel where possible
-- Cache dependencies between workflow runs

From 1d305ab371c448f53d047eaf0106b11aad043470 Mon Sep 17 00:00:00 2001
From: cte <cestreich@gmail.com>
Date: Tue, 10 Jun 2025 09:37:45 -0700
Subject: [PATCH 15/20] More powerful runner

---
 .github/workflows/evals.yml     |  2 +-
 packages/evals/src/cli/index.ts |  2 +-
 packages/evals/src/cli/runCi.ts | 11 ++++++++---
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml
index e96b414c6c..fd115df964 100644
--- a/.github/workflows/evals.yml
+++ b/.github/workflows/evals.yml
@@ -13,7 +13,7 @@ jobs:
     evals:
         # Run if triggered manually or if PR has 'evals' label.
         if: github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'evals')
-        runs-on: ubuntu-latest
+        runs-on: blacksmith-16vcpu-ubuntu-2404
         timeout-minutes: 30
 
         steps:
diff --git a/packages/evals/src/cli/index.ts b/packages/evals/src/cli/index.ts
index 0a0be28390..de62be8ae0 100644
--- a/packages/evals/src/cli/index.ts
+++ b/packages/evals/src/cli/index.ts
@@ -24,7 +24,7 @@ const main = async () => {
 
 				try {
 					if (ci) {
-						await runCi()
+						await runCi({ concurrency: 3, exercisesPerLanguage: 5 })
 					} else if (runId !== -1) {
 						await runEvals(runId)
 					} else if (taskId !== -1) {
diff --git a/packages/evals/src/cli/runCi.ts b/packages/evals/src/cli/runCi.ts
index 1b16d52aee..ca8a88e0e0 100644
--- a/packages/evals/src/cli/runCi.ts
+++ b/packages/evals/src/cli/runCi.ts
@@ -7,7 +7,7 @@ import { runEvals } from "./runEvals.js"
 
 export const runCi = async ({
 	concurrency = 1,
-	exercisesPerLanguage = 1,
+	exercisesPerLanguage,
 }: {
 	concurrency?: number
 	exercisesPerLanguage?: number
@@ -17,8 +17,13 @@ export const runCi = async ({
 	const run = await createRun({ model: "anthropic/claude-sonnet-4", socketPath: "", concurrency })
 
 	for (const language of exerciseLanguages) {
-		const exercises = (await getExercisesForLanguage(EVALS_REPO_PATH, language)).slice(0, exercisesPerLanguage)
-		await pMap(exercises, (exercise) => createTask({ runId: run.id, language, exercise }))
+		let exercises = await getExercisesForLanguage(EVALS_REPO_PATH, language)
+
+		if (exercisesPerLanguage) {
+			exercises = exercises.slice(0, exercisesPerLanguage)
+		}
+
+		await pMap(exercises, (exercise) => createTask({ runId: run.id, language, exercise }), { concurrency })
 	}
 
 	await runEvals(run.id)

From 1ffe32462bb5af9a8a90adafb4956a4befcdd984 Mon Sep 17 00:00:00 2001
From: cte <cestreich@gmail.com>
Date: Tue, 10 Jun 2025 09:44:03 -0700
Subject: [PATCH 16/20] Remove web container

---
 .github/workflows/evals.yml | 58 ++-----------------------------------
 1 file changed, 3 insertions(+), 55 deletions(-)

diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml
index fd115df964..39267edb92 100644
--- a/.github/workflows/evals.yml
+++ b/.github/workflows/evals.yml
@@ -38,18 +38,7 @@ jobs:
                   HOST_EXECUTION_METHOD=docker
                   EOF
 
-            - name: Build web image
-              uses: docker/build-push-action@v5
-              with:
-                  context: .
-                  file: packages/evals/Dockerfile.web
-                  tags: evals-web:latest
-                  cache-from: type=gha
-                  cache-to: type=gha,mode=max
-                  push: false
-                  load: true
-
-            - name: Build runner image
+            - name: Build image
               uses: docker/build-push-action@v5
               with:
                   context: .
@@ -60,10 +49,9 @@ jobs:
                   push: false
                   load: true
 
-            - name: Tag images
+            - name: Tag image
               run: |
                   cd packages/evals
-                  docker tag evals-web:latest evals-web
                   docker tag evals-runner:latest evals-runner
 
             - name: Start containers
@@ -74,59 +62,19 @@ jobs:
             - name: Wait for containers
               run: |
                   cd packages/evals
-
-                  # Wait for services
-                  echo "Waiting for PostgreSQL..."
                   timeout 60 bash -c 'until docker compose exec -T db pg_isready -U postgres; do sleep 2; done'
-
-                  echo "Waiting for Redis..."
                   timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done'
-
-                  echo "Testing database connection from web container..."
                   docker compose exec -T web sh -c 'nc -z db 5432 && echo "✓ Database connection successful"'
-
-                  echo "Testing Redis connection from web container..."
                   docker compose exec -T web sh -c 'nc -z redis 6379 && echo "✓ Redis connection successful"'
 
-                  echo "Testing web service startup..."
-                  timeout 60 bash -c 'until curl -f http://localhost:3000/api/health 2>/dev/null; do echo "Waiting for web service..."; sleep 2; done'
-
-                  echo "✓ Web service is healthy"
-                  echo "Health check response:"
-                  curl -s http://localhost:3000/api/health | jq 2>/dev/null || curl -s http://localhost:3000/api/health
-
-            - name: Test runner container networking
+            - name: Test runner
               run: |
                   cd packages/evals
-
-                  echo "Testing runner container can connect to services..."
                   docker compose run --rm runner sh -c 'nc -z db 5432 && echo "✓ Runner -> Database connection successful"'
                   docker compose run --rm runner sh -c 'nc -z redis 6379 && echo "✓ Runner -> Redis connection successful"'
-                  docker compose run --rm runner sh -c 'nc -z web 3000 && echo "✓ Runner -> Web service connection successful"'
-
-            - name: Test Docker socket access
-              run: |
-                  cd packages/evals
-
-                  echo "Testing Docker socket access in runner..."
                   docker compose run --rm runner docker --version
                   docker compose run --rm runner docker ps
 
-            - name: Show service status
-              if: always()
-              run: |
-                  cd packages/evals
-
-                  echo "=== Service Status ==="
-                  docker compose ps
-
-                  echo "=== Network Information ==="
-                  docker network ls | grep evals || echo "No evals network found"
-
-                  echo "=== Container Information ==="
-                  docker compose exec -T db sh -c 'echo "Database container hostname: $(hostname)"'
-                  docker compose exec -T redis sh -c 'echo "Redis container hostname: $(hostname)"'
-
             - name: Run evals
               run: |
                   cd packages/evals

From d0e8f2bcf98a9b8380c9fa73c060d281c013462a Mon Sep 17 00:00:00 2001
From: cte <cestreich@gmail.com>
Date: Tue, 10 Jun 2025 09:44:26 -0700
Subject: [PATCH 17/20] Increase timeout

---
 .github/workflows/evals.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml
index 39267edb92..c472296f9d 100644
--- a/.github/workflows/evals.yml
+++ b/.github/workflows/evals.yml
@@ -14,7 +14,7 @@ jobs:
         # Run if triggered manually or if PR has 'evals' label.
         if: github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'evals')
         runs-on: blacksmith-16vcpu-ubuntu-2404
-        timeout-minutes: 30
+        timeout-minutes: 45
 
         steps:
             - name: Checkout repository

From e2ced71d43652f186022256145c743555e17d999 Mon Sep 17 00:00:00 2001
From: cte <cestreich@gmail.com>
Date: Tue, 10 Jun 2025 09:56:59 -0700
Subject: [PATCH 18/20] Don't start the web container

---
 .github/workflows/evals.yml | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml
index c472296f9d..4c3e4d09f0 100644
--- a/.github/workflows/evals.yml
+++ b/.github/workflows/evals.yml
@@ -57,11 +57,7 @@ jobs:
             - name: Start containers
               run: |
                   cd packages/evals
-                  docker compose --profile server up -d
-
-            - name: Wait for containers
-              run: |
-                  cd packages/evals
+                  docker compose up -d db redis
                   timeout 60 bash -c 'until docker compose exec -T db pg_isready -U postgres; do sleep 2; done'
                   timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done'
                   docker compose exec -T web sh -c 'nc -z db 5432 && echo "✓ Database connection successful"'

From e1132c7dcbdb5428776981a5d72eb0d55295bc74 Mon Sep 17 00:00:00 2001
From: cte <cestreich@gmail.com>
Date: Tue, 10 Jun 2025 10:08:17 -0700
Subject: [PATCH 19/20] Oops: drop checks that exec into unstarted web container

---
 .github/workflows/evals.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml
index 4c3e4d09f0..d12dcd4eac 100644
--- a/.github/workflows/evals.yml
+++ b/.github/workflows/evals.yml
@@ -60,8 +60,6 @@ jobs:
                   docker compose up -d db redis
                   timeout 60 bash -c 'until docker compose exec -T db pg_isready -U postgres; do sleep 2; done'
                   timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done'
-                  docker compose exec -T web sh -c 'nc -z db 5432 && echo "✓ Database connection successful"'
-                  docker compose exec -T web sh -c 'nc -z redis 6379 && echo "✓ Redis connection successful"'
 
             - name: Test runner
               run: |

From 41df1286b36f14bf0182673a780ce6c16a9f1b56 Mon Sep 17 00:00:00 2001
From: cte <cestreich@gmail.com>
Date: Tue, 10 Jun 2025 10:22:34 -0700
Subject: [PATCH 20/20] More cleanup: default working-directory, run db migrations

---
 .github/workflows/evals.yml | 27 ++++++++++-----------------
 1 file changed, 10 insertions(+), 17 deletions(-)

diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml
index d12dcd4eac..9d8f9fb49b 100644
--- a/.github/workflows/evals.yml
+++ b/.github/workflows/evals.yml
@@ -16,6 +16,10 @@ jobs:
         runs-on: blacksmith-16vcpu-ubuntu-2404
         timeout-minutes: 45
 
+        defaults:
+            run:
+                working-directory: packages/evals
+
         steps:
             - name: Checkout repository
               uses: actions/checkout@v4
@@ -25,8 +29,6 @@ jobs:
 
             - name: Create environment
               run: |
-                  cd packages/evals
-
                   cat > .env.local << EOF
                   OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY || 'test-key-for-build' }}
                   EOF
@@ -50,32 +52,23 @@ jobs:
                   load: true
 
             - name: Tag image
-              run: |
-                  cd packages/evals
-                  docker tag evals-runner:latest evals-runner
+              run: docker tag evals-runner:latest evals-runner
 
             - name: Start containers
               run: |
-                  cd packages/evals
                   docker compose up -d db redis
                   timeout 60 bash -c 'until docker compose exec -T db pg_isready -U postgres; do sleep 2; done'
                   timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done'
-
-            - name: Test runner
-              run: |
-                  cd packages/evals
                   docker compose run --rm runner sh -c 'nc -z db 5432 && echo "✓ Runner -> Database connection successful"'
                   docker compose run --rm runner sh -c 'nc -z redis 6379 && echo "✓ Runner -> Redis connection successful"'
-                  docker compose run --rm runner docker --version
                   docker compose run --rm runner docker ps
 
+            - name: Run database migrations
+              run: docker compose run --rm runner pnpm --filter @roo-code/evals db:migrate
+
             - name: Run evals
-              run: |
-                  cd packages/evals
-                  docker compose run --rm runner pnpm --filter @roo-code/evals cli --ci
+              run: docker compose run --rm runner pnpm --filter @roo-code/evals cli --ci
 
             - name: Cleanup
               if: always()
-              run: |
-                  cd packages/evals
-                  docker compose down -v --remove-orphans
+              run: docker compose down -v --remove-orphans