Skip to content

Commit 20b945f

Browse files
authored
Evaluations cloud run jobs runner (#547)
Setup of evaluations application used for maintaining a configuration of model evaluations by "tier" (eval runs based on frequency). The code builds and deploys a Docker image to Google Cloud Run to run the evaluations on a schedule. The only dependency is the olmo-eval-internal cli tool for running evaluations. See the Readme.md for details. - Docker image can be built and evals run locally with the run_local.py utility - Docker image can be deployed manually using the typical docker push command. - Jobs and scheduling can be deployed manually using terraform apply - Skiff2 is leveraged to run setup/build/deploy on push to main. - ad-hoc evals can be executed with gcloud command line passing `--update-env-vars` TODOs: - switch setup action to shared GitHub actions when publicly available - switch build action to shared GitHub actions when publicly available AND olmo-eval-internal is publicly available - Remove GH Token access to olmo-eval-internal - add in storage configurations when shared database is accessible from GCP closes allenai/playground-issues-repo#994 closes allenai/playground-issues-repo#990
1 parent 1e5346d commit 20b945f

31 files changed

+4278
-2
lines changed
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
# Composite action: refuses to run off `main`, checks out the calling
# repository, authenticates to Google Cloud via Workload Identity Federation,
# installs the Cloud SDK, registers gcloud as a Docker credential helper, and
# sets up Docker Buildx.
name: Setup GCP and Docker
description: Sets up Google Cloud authentication, Cloud SDK, and Docker for GCR
author: Skiff
# This is a temp workaround until skiff2 shared actions are accessible from public repos
# Please remove this file and switch to shared action when available.

inputs:
  workload_identity_provider:
    description: "Workload Identity Provider resource name (e.g. projects/123/locations/global/workloadIdentityPools/my-pool/providers/my-provider)"
    required: true
  service_account:
    description: "Service account email to impersonate"
    required: true
  project_id:
    description: "GCP project ID"
    required: true

runs:
  using: composite
  steps:
    - name: Check branch is main
      shell: bash
      env:
        # Pass the ref through the environment instead of expanding the
        # `github.ref` expression inside the script text: a ref name is
        # user-controlled and would otherwise be parsed by bash as code.
        CURRENT_REF: ${{ github.ref }}
      run: |
        if [ "${CURRENT_REF}" != "refs/heads/main" ]; then
          echo "This action can only run on the main branch. Current ref: ${CURRENT_REF}"
          exit 1
        fi

    - name: Checkout calling repository
      uses: actions/checkout@v4

    - name: Authenticate to Google Cloud
      uses: google-github-actions/auth@v2
      with:
        workload_identity_provider: ${{ inputs.workload_identity_provider }}
        service_account: ${{ inputs.service_account }}

    - name: Set up Cloud SDK
      uses: google-github-actions/setup-gcloud@v2
      with:
        project_id: ${{ inputs.project_id }}

    - name: Configure Docker for GCR
      shell: bash
      run: gcloud auth configure-docker

    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3

branding:
  icon: cloud
  color: blue
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
# Runs the evaluations test suite on pull requests and pushes; on push to
# main (or manual dispatch) it then builds and pushes the evaluations image
# and applies the Terraform configuration under apps/evaluations/terraform.
name: Build and Deploy Evaluations Cloud Run Jobs

on:
  push:
    branches:
      - main
    paths:
      - 'apps/evaluations/**'
      - '.github/workflows/build-and-push-evals.yml'
  pull_request:
    paths:
      - 'apps/evaluations/**'
      - '.github/workflows/build-and-push-evals.yml'
  workflow_dispatch:

permissions:
  contents: read
  # Required so the job can request an OIDC token for Workload Identity auth.
  id-token: write

env:
  SERVICE_NAME: evaluations
  REGISTRY: us-west1-docker.pkg.dev
  REPO: model-evals

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6

      - name: Setup uv
        uses: astral-sh/setup-uv@v7

      - name: Run Tests
        working-directory: apps/evaluations
        run: uv run --only-group dev pytest -v

  build-and-deploy:
    needs: test
    # Pull requests only run the tests; deployment happens on push/dispatch.
    if: github.event_name == 'push' || github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
    environment:
      name: ${{ github.ref_name }}
    steps:
      - uses: actions/checkout@v6 # remove this when switching back to shared action

      - name: Skiff2 Setup
        id: setup
        uses: ./.github/actions/skiff2/setup # temporary workaround until shared action is available
        with:
          workload_identity_provider: ${{ vars.SKIFF2_WORKLOAD_IDENTITY_PROVIDER }}
          service_account: ${{ vars.SKIFF2_SERVICE_ACCOUNT }}
          project_id: ${{ vars.SKIFF2_PROJECT_ID }}

      # Configure Docker for Artifact Registry
      - name: Configure Docker
        run: gcloud auth configure-docker ${REGISTRY} --quiet

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      # Custom build step for evaluations (handles GITHUB_TOKEN for private repo)
      # Once olmo-eval-internal is public, this can be replaced with Skiff2 Build
      - name: Build and Push Evaluations Image
        id: build
        uses: docker/build-push-action@v6
        with:
          context: apps/evaluations
          file: apps/evaluations/Dockerfile
          platforms: linux/amd64
          push: true
          tags: |
            ${{ env.REGISTRY }}/${{ vars.SKIFF2_PROJECT_ID }}/${{ env.REPO }}/${{ env.SERVICE_NAME }}:latest
            ${{ env.REGISTRY }}/${{ vars.SKIFF2_PROJECT_ID }}/${{ env.REPO }}/${{ env.SERVICE_NAME }}:${{ github.sha }}
          cache-from: type=gha
          cache-to: type=gha,mode=max
          secrets: |
            GITHUB_TOKEN=${{ secrets.OLMO_EVAL_INTERNAL_TOKEN }}

      # Setup uv for Python package management
      - name: Setup uv
        uses: astral-sh/setup-uv@v7

      # Configure git to use token for private repo access
      - name: Configure Git for Private Repos
        env:
          # Hand the secret to the shell via the environment rather than
          # expanding it into the command text of the generated script.
          OLMO_EVAL_INTERNAL_TOKEN: ${{ secrets.OLMO_EVAL_INTERNAL_TOKEN }}
        run: git config --global url."https://${OLMO_EVAL_INTERNAL_TOKEN}@github.com/".insteadOf "https://github.com/"

      # Generate Terraform variables from Python tier configs
      - name: Generate Terraform Variables
        working-directory: apps/evaluations
        run: uv run generate-tfvars -o terraform/terraform.tfvars.json

      # Setup Terraform
      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: "1.5"

      # Deploy with Terraform.
      # -input=false makes a missing variable fail the job immediately
      # instead of waiting on an interactive prompt that CI cannot answer.
      - name: Terraform Init
        working-directory: apps/evaluations/terraform
        run: terraform init -input=false

      - name: Terraform Plan
        working-directory: apps/evaluations/terraform
        run: |
          terraform plan \
            -input=false \
            -var="project_id=${{ vars.SKIFF2_PROJECT_ID }}" \
            -var="image_tag=${{ github.sha }}" \
            -out=tfplan

      - name: Terraform Apply
        working-directory: apps/evaluations/terraform
        run: terraform apply -input=false -auto-approve tfplan

.github/workflows/verify-api.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ jobs:
2323
uses: ./.github/actions/set-up-uv
2424

2525
- name: Test with pytest
26-
run: uv run pytest --ignore ./apps/flask-api/e2e --ignore ./apps/api/e2e
26+
run: uv run pytest --ignore ./apps/flask-api/e2e --ignore ./apps/api/e2e --ignore ./apps/evaluations
2727

2828
type-check:
2929
runs-on: ubuntu-latest
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Local environment variables for evaluations
2+
# Copy to .env.local and fill in values
3+
4+
# Required for Docker build (private repo access)
5+
GITHUB_TOKEN=
6+
7+
# Required for running evaluations
8+
LITELLM_PROXY_API_KEY=
9+
10+
# Required for storage (Postgres)
11+
PGHOST=
12+
PGPASSWORD=
13+
14+
# Required for storage (S3)
15+
AWS_ACCESS_KEY_ID=
16+
AWS_SECRET_ACCESS_KEY=
17+

apps/evaluations/.gitignore

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Python
2+
__pycache__/
3+
*.py[cod]
4+
*.egg-info/
5+
6+
# Build artifacts
7+
dist/
8+
build/
9+
10+
# Local environment
11+
.env.local

apps/evaluations/Dockerfile

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
# Evaluations Docker Image for Cloud Run Jobs
#
# Docker image with which to run a list of model evals based on tier configuration,
# and will run each individual model eval as its own Google Cloud Job
#
# Build (requires GitHub token for private repo access):
#   docker build --platform linux/amd64 --secret id=GITHUB_TOKEN \
#     -t evaluations -f apps/evaluations/Dockerfile apps/evaluations
#
# Once olmo-eval-internal is public, remove --secret GITHUB_TOKEN
#
# Run tier (local mode, no storage):
#   docker run -e EVAL_TIER=standard -e CLOUD_RUN_TASK_INDEX=0 -e LOCAL=true \
#     -e LITELLM_PROXY_API_KEY=$LITELLM_PROXY_API_KEY evaluations
#
# Run builds/evals locally with helper script:
#   uv run run-local --tier standard --build
#   uv run run-local --build-only
#

# ============================================================================
# Stage 1: Builder
# ============================================================================
FROM --platform=linux/amd64 ghcr.io/astral-sh/uv:python3.14-bookworm-slim AS builder

ENV UV_COMPILE_BYTECODE=1 UV_LINK_MODE=copy
ENV UV_PYTHON_DOWNLOADS=0

# Install git for cloning olmo-eval-internal
RUN apt-get update -qq && \
    apt-get install -y --no-install-recommends git && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Copy evaluations package
COPY src /app/src
COPY pyproject.toml /app/pyproject.toml

# Install evaluations package (pulls olmo-eval-internal from git).
#
# The GitHub token is mounted as a build secret and is only written to the
# global git config for the duration of this single RUN. The config file is
# deleted before the layer is committed, so the token is never stored in a
# builder layer or in an exported build cache. Doing the `git config` in its
# own RUN would persist the token in that layer's filesystem.
RUN --mount=type=secret,id=GITHUB_TOKEN \
    --mount=type=cache,target=/root/.cache/uv \
    git config --global url."https://$(cat /run/secrets/GITHUB_TOKEN)@github.com/".insteadOf "https://github.com/" && \
    uv pip install --system /app && \
    rm -f "$HOME/.gitconfig"

# ============================================================================
# Stage 2: Runtime
# ============================================================================
FROM --platform=linux/amd64 python:3.14-slim-bookworm AS runner

# Install runtime dependencies
RUN apt-get update -qq && \
    apt-get install -y --no-install-recommends ca-certificates && \
    rm -rf /var/lib/apt/lists/*

# Setup non-root user
RUN groupadd --system --gid 999 nonroot \
    && useradd --system --gid 999 --uid 999 --create-home nonroot

# Copy installed packages from builder
COPY --from=builder /usr/local/lib/python3.14/site-packages /usr/local/lib/python3.14/site-packages
COPY --from=builder /usr/local/bin/olmo-eval /usr/local/bin/olmo-eval
COPY --from=builder /usr/local/bin/evaluations /usr/local/bin/evaluations

WORKDIR /app

# Use non-root user
USER nonroot

ENV PYTHONUNBUFFERED=1
ENV TERM=dumb
ENV NO_COLOR=1

# Use Python CLI as entrypoint
ENTRYPOINT ["python", "-m", "evaluations.cli"]

0 commit comments

Comments
 (0)