allenai
diff --git a/‎.github/workflows/build-and-push-evals.yml‎
Lines changed: 0 additions & 18 deletions b/‎.github/workflows/build-and-push-evals.yml‎
Lines changed: 0 additions & 18 deletions
diff --git a/‎apps/evaluations/.env.local.example‎
Lines changed: 3 additions & 3 deletions b/‎apps/evaluations/.env.local.example‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎apps/evaluations/Dockerfile‎
Lines changed: 6 additions & 3 deletions b/‎apps/evaluations/Dockerfile‎
Lines changed: 6 additions & 3 deletions
diff --git a/‎apps/evaluations/README.md‎
Lines changed: 10 additions & 1 deletion b/‎apps/evaluations/README.md‎
Lines changed: 10 additions & 1 deletion
diff --git a/‎apps/evaluations/pyproject.toml‎
Lines changed: 2 additions & 5 deletions b/‎apps/evaluations/pyproject.toml‎
Lines changed: 2 additions & 5 deletions
diff --git a/‎apps/evaluations/requirements.txt‎
Lines changed: 2 additions & 0 deletions b/‎apps/evaluations/requirements.txt‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎apps/evaluations/src/evaluations/cli.py‎
Lines changed: 53 additions & 0 deletions b/‎apps/evaluations/src/evaluations/cli.py‎
Lines changed: 53 additions & 0 deletions
diff --git a/‎apps/evaluations/src/evaluations/run_local.py‎
Lines changed: 15 additions & 3 deletions b/‎apps/evaluations/src/evaluations/run_local.py‎
Lines changed: 15 additions & 3 deletions
diff --git a/‎apps/evaluations/src/evaluations/settings.py‎
Lines changed: 3 additions & 2 deletions b/‎apps/evaluations/src/evaluations/settings.py‎
Lines changed: 3 additions & 2 deletions
diff --git a/apps/evaluations/tests/__init__.py b/apps/evaluations/src/evaluations/tests/__init__.py (apps/evaluations/tests/__init__.py renamed to apps/evaluations/src/evaluations/tests/__init__.py)
@@ -7,10 +7,6 @@ on:
     paths:
       - 'apps/evaluations/**'
       - '.github/workflows/build-and-push-evals.yml'
-  pull_request:
-    paths:
-      - 'apps/evaluations/**'
-      - '.github/workflows/build-and-push-evals.yml'
   workflow_dispatch:
 
 permissions:
@@ -23,21 +19,7 @@ env:
   REPO: model-evals
 
 jobs:
-  test:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v6
-
-      - name: Setup uv
-        uses: astral-sh/setup-uv@v7
-
-      - name: Run Tests
-        working-directory: apps/evaluations
-        run: uv run --only-group dev pytest -v
-
   build-and-deploy:
-    needs: test
-    if: github.event_name == 'push' || github.event_name == 'workflow_dispatch'
     runs-on: ubuntu-latest
     environment:
       name: ${{ github.ref_name }}
 
@@ -7,11 +7,11 @@ GITHUB_TOKEN=
 # Required for running evaluations
 LITELLM_PROXY_API_KEY=
 
-# Required for storage (Postgres)
+# Required for storage (Postgres, S3)
+# Set either PGPASSWORD directly, or DB_SECRET_ARN to fetch the Postgres password from AWS Secrets Manager
 PGHOST=
 PGPASSWORD=
-
-# Required for storage (S3)
+DB_SECRET_ARN=
 AWS_ACCESS_KEY_ID=
 AWS_SECRET_ACCESS_KEY=
 
@@ -37,11 +37,14 @@ RUN --mount=type=secret,id=GITHUB_TOKEN \
 
 WORKDIR /app
 
-# Copy evaluations package
+# Copy and install requirements (olmo-eval-internal from private repo)
+COPY requirements.txt /app/requirements.txt
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system -r /app/requirements.txt
+
+# Copy and install evaluations package
 COPY src /app/src
 COPY pyproject.toml /app/pyproject.toml
-
-# Install evaluations package (pulls olmo-eval-internal from git)
 RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --system /app
 
 
@@ -273,7 +273,16 @@ gcloud run jobs execute eval --region us-west1 \
 
 ## CLI Reference
 
-The `evaluations` CLI is available inside the container:
+The `evaluations` CLI requires `olmo-eval-internal`, which is installed only inside the Docker container
+(not in the local dev environment). To run evaluations locally, use `run-local`, which runs them via Docker:
+
+```bash
+# Run evaluations locally via Docker (recommended)
+uv run run-local --tier smoke --task-index 0
+uv run run-local --tier standard --task-index 0 --with-storage
+```
+
+Inside the container, you can use the CLI directly:
 
 ```bash
 # Auto-run tier from env vars (used by Cloud Run)
 
@@ -5,14 +5,14 @@ description = "OLMo evaluation runner for Cloud Run Jobs"
 requires-python = ">=3.12"
 
 dependencies = [
-    "olmo-eval-internal[litellm,storage] @ git+https://github.com/allenai/olmo-eval-internal.git@main",
     "pydantic-settings>=2.0",
+    "boto3>=1.35",
 ]
 
 [dependency-groups]
 dev = [
     "pytest>=8.0",
-    "pydantic-settings>=2.0",
+    "boto3-stubs[secretsmanager]>=1.35",
 ]
 
 [project.scripts]
@@ -24,9 +24,6 @@ run-local = "evaluations.run_local:main"
 requires = ["hatchling"]
 build-backend = "hatchling.build"
 
-[tool.hatch.metadata]
-allow-direct-references = true
-
 [tool.hatch.build.targets.wheel]
 packages = ["src/evaluations"]
 
 
@@ -0,0 +1,2 @@
+# Runtime dependencies for Docker image (separate from evaluations package)
+olmo-eval-internal[litellm,storage] @ git+https://github.com/allenai/olmo-eval-internal.git@main
@@ -1,14 +1,64 @@
 """CLI for evaluations Cloud Run Jobs."""
 
 import argparse
+import json
+import os
 import subprocess
 import sys
 
+import boto3
+from botocore.exceptions import BotoCoreError, ClientError
+
 from evaluations.configs import ModelEval, TierName, get_tier
 from evaluations.logging import logger
 from evaluations.settings import settings
 
 
+def get_aws_secret_value(secret_arn: str, key: str | None = None) -> str:
+    """Fetch a secret value from AWS Secrets Manager.
+
+    Args:
+        secret_arn: The ARN of the secret.
+        key: If provided, parse secret as JSON and extract this key.
+
+    Returns:
+        The secret value (or extracted key value).
+    """
+    region = settings.AWS_REGION
+    client = boto3.client("secretsmanager", region_name=region)
+    response = client.get_secret_value(SecretId=secret_arn)
+    secret_string = response["SecretString"]
+
+    if key is None:
+        return secret_string
+
+    return json.loads(secret_string)[key]
+
+
+def setup_db_credentials() -> None:
+    """Set up database credentials from AWS Secrets Manager or settings.
+
+    If DB_SECRET_ARN is set, fetches the password from AWS Secrets Manager.
+    Otherwise, leaves PGPASSWORD as already set in the environment.
+
+    This supports automatically rotating database credentials when using AWS.
+    """
+    if not settings.DB_SECRET_ARN:
+        # Use PGPASSWORD as set in the environment
+        return
+
+    logger.info("Fetching database password from AWS Secrets Manager")
+
+    try:
+        password = get_aws_secret_value(settings.DB_SECRET_ARN, key="password")
+        # override PGPASSWORD in the environment
+        os.environ["PGPASSWORD"] = password
+        logger.info("Database password loaded from AWS Secrets Manager")
+    except (BotoCoreError, ClientError, KeyError, json.JSONDecodeError) as e:
+        logger.error("Failed to fetch database password from AWS: %s", e)
+        sys.exit(1)
+
+
 def run_ad_hoc(
     model: str,
     tasks: str,
@@ -99,6 +149,9 @@ def main() -> None:
     2. EVAL_TIER set: Run tier evaluation
     3. CLI arguments: Manual invocation
     """
+    # Fetch database credentials from AWS if DB_SECRET_ARN is set
+    setup_db_credentials()
+
     # Check for ad-hoc mode
     if settings.EVAL_MODE == "ad-hoc":
         if not settings.AD_HOC_MODEL or not settings.AD_HOC_TASKS:
 
@@ -89,13 +89,25 @@ def main() -> int:
             "-e",
             f"PGUSER={settings.PGUSER}",
             "-e",
-            f"PGPASSWORD={settings.PGPASSWORD}",
-            "-e",
             f"AWS_ACCESS_KEY_ID={settings.AWS_ACCESS_KEY_ID or ''}",
             "-e",
             f"AWS_SECRET_ACCESS_KEY={settings.AWS_SECRET_ACCESS_KEY or ''}",
         ])
-        logger.info("Running: tier=%s, task_index=%d (with storage)", args.tier, args.task_index)
+        # Use DB_SECRET_ARN if set, otherwise pass PGPASSWORD directly
+        if settings.DB_SECRET_ARN:
+            docker_cmd.extend([
+                "-e",
+                f"DB_SECRET_ARN={settings.DB_SECRET_ARN}",
+                "-e",
+                f"AWS_REGION={settings.AWS_REGION}",
+            ])
+            logger.info("Running: tier=%s, task_index=%d (with storage, AWS secrets)", args.tier, args.task_index)
+        elif settings.PGPASSWORD:
+            docker_cmd.extend(["-e", f"PGPASSWORD={settings.PGPASSWORD}"])
+            logger.info("Running: tier=%s, task_index=%d (with storage)", args.tier, args.task_index)
+        else:
+            logger.warning("Neither DB_SECRET_ARN nor PGPASSWORD is set")
+            logger.info("Running: tier=%s, task_index=%d (with storage, no password)", args.tier, args.task_index)
     else:
         docker_cmd.extend(["-e", "LOCAL=true"])
         logger.info("Running: tier=%s, task_index=%d (local mode, no storage)", args.tier, args.task_index)
 
@@ -14,12 +14,13 @@ class Settings(BaseSettings):
     PGPORT: str = "5432"
     PGDATABASE: str = "olmo_eval"
     PGUSER: str = "postgres"
-    PGPASSWORD: str = "postgres"
+    PGPASSWORD: str | None = None
     DB_SECRET_ARN: str | None = None
 
-    # AWS credentials
+    # AWS credentials and settings
     AWS_ACCESS_KEY_ID: str | None = None
     AWS_SECRET_ACCESS_KEY: str | None = None
+    AWS_REGION: str = "us-east-1"
 
     # Settings below are set at runtime and control evaluation behavior. They shouldn't
     # be set in .env files, but are included here for validation and documentation.
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+# Runtime dependencies for Docker image (separate from evaluations package)`
	`2`	`+olmo-eval-internal[litellm,storage] @ git+https://github.com/allenai/olmo-eval-internal.git@main`