Merged
6 changes: 5 additions & 1 deletion .github/workflows/ci-tests.yml
@@ -11,14 +11,18 @@ jobs:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v3
- name: Checkout code
uses: actions/checkout@v3

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.10'

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install .[dev]

- name: Run tests
run: pytest --maxfail=1 --disable-warnings -q
40 changes: 40 additions & 0 deletions .github/workflows/ci-validate-schema.yml
@@ -0,0 +1,40 @@
name: Validate dataset_features.yml

on:
push:
branches: [ main ]
pull_request:
branches: [ main ]

jobs:
check-schema:
runs-on: ubuntu-latest

steps:
- name: Checkout code
uses: actions/checkout@v3

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.10'

- name: Install dependencies
run: pip install -e .

- name: Check dataset_features.yml exists
run: |
if [ ! -e src/agenteval/dataset_features.yml ]; then
echo "dataset_features.yml is missing. Please run 'python scripts/update_schema.py' and commit the file to the repo." >&2
exit 1
fi

- name: Regenerate dataset_features.yml
run: python scripts/update_schema.py

- name: Verify schema is up to date
run: |
if ! git diff --quiet src/agenteval/dataset_features.yml; then
echo "dataset_features.yml is out of date. Please run 'python scripts/update_schema.py' and commit the updated file." >&2
exit 1
fi
12 changes: 11 additions & 1 deletion Development.md
@@ -10,5 +10,15 @@ To publish to pypi:

```shell
export PYPI_TOKEN=...
bash publish.sh
bash scripts/publish.sh
```

# Schema update

The results leaderboard on HuggingFace uses a fixed schema to avoid problems with automatic schema inference.
As the results model changes over time, schema adjustments may be required; a CI check fails when the committed schema is stale.
The schema can be inferred from the Pydantic model and re-computed using:

```shell
python scripts/update_schema.py
```
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -0,0 +1 @@
include src/agenteval/dataset_features.yml
7 changes: 6 additions & 1 deletion Makefile
@@ -12,7 +12,12 @@ tag:
# Upload package to PyPI
publish:
@echo "Uploading package to PyPI..."
@bash ./publish.sh
@bash scripts/publish.sh

# Update HF dataset features
update-schema:
@echo "Updating schema..."
python scripts/update_schema.py

test:
@echo "Running tests..."
12 changes: 11 additions & 1 deletion README.md
@@ -29,12 +29,22 @@ agenteval score [OPTIONS] LOG_DIR
```
Compute scores for the results in `agenteval.json` and update the file with the computed scores.

## Publish Scores
## Publish scores
```shell
agenteval publish [OPTIONS] LOG_DIR
```
Upload the scored results to HuggingFace datasets.

# Administer the HuggingFace datasets
Prior to publishing scores, two HuggingFace datasets should be set up, one for full submissions and one for results files.

If you want to call `load_dataset()` on the results dataset (e.g., for populating a leaderboard), you probably want to explicitly tell HuggingFace about the schema and dataset structure (otherwise, HuggingFace may fail to properly auto-convert to Parquet).
This is done by updating the `configs` attribute in the YAML metadata block at the top of the `README.md` file at the root of the results dataset (the metadata block is identified by lines with just `---` above and below it).
This attribute should contain a list of configs, each of which specifies the schema (under the `features` key) and dataset structure (under the `data_files` key).
See [sample-config-hf-readme-metadata.yml](sample-config-hf-readme-metadata.yml) for a sample metadata block corresponding to [sample-config.yml](sample-config.yml) (note that the metadata references the [raw schema data](src/agenteval/dataset_features.yml), which must be copied).

To facilitate initializing new configs, `agenteval publish` will automatically add this metadata if it is missing.

# Development

See [Development.md](Development.md) for development instructions.
13 changes: 11 additions & 2 deletions pyproject.toml
@@ -4,16 +4,19 @@ build-backend = "setuptools.build_meta"

[project]
name = "agent-eval"
version = "0.1.0"
version = "0.1.1"
description = "Agent evaluation toolkit"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"click",
"inspect-ai",
"huggingface_hub",
"litellm",
"pydantic>=2.0.0",
# For leaderboard
"huggingface_hub",
"pyarrow",
"datasets",
]

[project.urls]
@@ -40,3 +43,9 @@ where = ["src"]
[tool.pytest.ini_options]
testpaths = ["tests"]
python_files = ["test_*.py"]

[tool.setuptools]
include-package-data = true

[tool.setuptools.package-data]
"agenteval" = ["dataset_features.yml"]
11 changes: 11 additions & 0 deletions sample-config-hf-readme-metadata.yml
@@ -0,0 +1,11 @@
---
configs:
- config_name: 0.1-dev
data_files:
- split: validation
path: 0.1-dev/validation/*.json
- split: test
path: 0.1-dev/test/*.json
features:
# Insert src/agenteval/dataset_features.yml here at the proper indentation level.
---
9 changes: 9 additions & 0 deletions publish.sh → scripts/publish.sh
@@ -10,9 +10,18 @@ if ! git rev-parse "$version" >/dev/null 2>&1; then
exit 1
fi


# 🧹 Clean build artifacts
rm -rf dist

# 🔄 Regenerate schema file and verify it’s up to date
echo "Regenerating schema file..."
python scripts/update_schema.py
if ! git diff --quiet src/agenteval/dataset_features.yml; then
echo "dataset_features.yml schema file is outdated. Please commit the updated file before publishing." >&2
exit 1
fi

# 🔒 Set up PyPI credentials
export TWINE_NON_INTERACTIVE=1
export TWINE_USERNAME='__token__'
23 changes: 23 additions & 0 deletions scripts/update_schema.py
@@ -0,0 +1,23 @@
#!/usr/bin/env python3
"""
Script to regenerate dataset_features.yml from the Pydantic schema.
"""
from pathlib import Path

from agenteval.schema_generator import write_dataset_features


def update_schema():
repo_root = Path(__file__).parent.parent
output_path = repo_root / "src" / "agenteval" / "dataset_features.yml"
write_dataset_features(str(output_path))


def main():
"""Regenerate dataset_features.yml from Pydantic schema"""
update_schema()
print("✅ dataset_features.yml updated at src/agenteval/dataset_features.yml")


if __name__ == "__main__":
main()
48 changes: 24 additions & 24 deletions src/agenteval/config.py
@@ -6,17 +6,39 @@
from pydantic import BaseModel, ValidationError


class Task(BaseModel):
name: str
"""Canonical task name (used by the leaderboard)."""

path: str
"""Path to the task definition (used by Inspect)."""

primary_metric: str
"""Primary metric for the task, used for summary scores."""

tags: list[str] | None = None
"""List of tags, used for computing summary scores for task groups."""


class Split(BaseModel):
name: str
"""Name of the split."""

tasks: list[Task]
"""List of tasks associated with the split."""


class SuiteConfig(BaseModel):
name: str
"""Name of the suite."""

version: str | None = None
"""Version of the suite, e.g. '1.0.0.dev1'."""

splits: list["Split"]
splits: list[Split]
"""List of splits in the suite."""

def get_tasks(self, split_name: str) -> list["Task"]:
def get_tasks(self, split_name: str) -> list[Task]:
"""
Get the tasks for a specific split.

@@ -39,28 +61,6 @@ def get_tasks(self, split_name: str) -> list["Task"]:
)


class Split(BaseModel):
name: str
"""Name of the split."""

tasks: list["Task"]
"""List of tasks associated with the split."""


class Task(BaseModel):
name: str
"""Canonical task name (used by the leaderboard)."""

path: str
"""Path to the task definition (used by Inspect)."""

primary_metric: str
"""Primary metric for the task, used for summary scores."""

tags: list[str] | None = None
"""List of tags, used for computing summary scores for task groups."""


def load_suite_config(file_path: str) -> SuiteConfig:
"""
Load the suite configuration from the specified YAML file.
71 changes: 71 additions & 0 deletions src/agenteval/dataset_features.yml
@@ -0,0 +1,71 @@
- name: suite_config
struct:
- name: name
dtype: string
- name: version
dtype: string
- name: splits
list:
- name: name
dtype: string
- name: tasks
list:
- name: name
dtype: string
- name: path
dtype: string
- name: primary_metric
dtype: string
- name: tags
sequence: string
- name: split
dtype: string
- name: results
list:
- name: task_name
dtype: string
- name: metrics
list:
- name: name
dtype: string
- name: value
dtype: float64
- name: model_usages
list:
list:
- name: model
dtype: string
- name: usage
struct:
- name: input_tokens
dtype: int64
- name: output_tokens
dtype: int64
- name: total_tokens
dtype: int64
- name: input_tokens_cache_write
dtype: int64
- name: input_tokens_cache_read
dtype: int64
- name: reasoning_tokens
dtype: int64
- name: model_costs
sequence: float64
- name: submission
struct:
- name: submit_time
dtype: timestamp[us, tz=UTC]
- name: username
dtype: string
- name: agent_name
dtype: string
- name: agent_description
dtype: string
- name: agent_url
dtype: string
- name: logs_url
dtype: string
- name: logs_url_public
dtype: string
- name: summary_url
dtype: string
4 changes: 2 additions & 2 deletions src/agenteval/models.py
@@ -83,7 +83,7 @@ def dump_json_bytes(
"""
return self.model_dump_json(
indent=indent,
exclude_none=True,
exclude_defaults=True,
exclude_none=False,
exclude_defaults=False,
**model_dump_kwargs,
).encode("utf-8")