Skip to content

Commit bacb5bc

Browse files
committed
Package schema
1 parent ea191d1 commit bacb5bc

File tree

12 files changed

+354
-30
lines changed

12 files changed

+354
-30
lines changed

.github/workflows/ci-tests.yml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,18 @@ jobs:
1111
runs-on: ubuntu-latest
1212

1313
steps:
14-
- uses: actions/checkout@v3
14+
- name: Checkout code
15+
uses: actions/checkout@v3
16+
1517
- name: Set up Python
1618
uses: actions/setup-python@v4
1719
with:
1820
python-version: '3.10'
21+
1922
- name: Install dependencies
2023
run: |
2124
python -m pip install --upgrade pip
2225
pip install .[dev]
26+
2327
- name: Run tests
2428
run: pytest --maxfail=1 --disable-warnings -q
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# CI guard for the leaderboard schema: fails when dataset_infos.json is
# missing or no longer matches the schema regenerated from the Pydantic
# models (see scripts/update_schema.py).
name: Validate dataset_infos.json

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

jobs:
  check-schema:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      # Editable install so scripts/update_schema.py can import the package.
      - name: Install dependencies
        run: pip install -e .

      - name: Check dataset_infos.json exists
        run: |
          if [ ! -e dataset_infos.json ]; then
            echo "🚨 dataset_infos.json is missing. Please commit dataset_infos.json to the repo." >&2
            exit 1
          fi

      - name: Regenerate dataset_infos.json
        run: python scripts/update_schema.py

      # If regeneration changed the committed file, the schema was stale.
      - name: Verify schema is up to date
        run: |
          if ! git diff --quiet dataset_infos.json; then
            echo "🚨 dataset_infos.json is out of date. Please run 'python scripts/update_schema.py' and commit the updated file." >&2
            exit 1
          fi

Development.md

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,15 @@ To publish to pypi:
1010

1111
```shell
1212
export PYPI_TOKEN=...
13-
bash publish.sh
13+
bash scripts/publish.sh
14+
```
15+
16+
# Schema update
17+
18+
The results leaderboard on HuggingFace uses a fixed schema to prevent inferred schema problems.
19+
As the results model changes over time, schema adjustments may be required; if so, a CI check will fail until the schema is regenerated.
20+
The schema can be inferred from the Pydantic model and re-computed using:
21+
22+
```shell
23+
python scripts/update_schema.py
1424
```

Makefile

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,12 @@ tag:
1212
# Upload package to PyPI
1313
publish:
1414
@echo "Uploading package to PyPI..."
15-
@bash ./publish.sh
15+
@bash scripts/publish.sh
16+
17+
# Update dataset_infos.json
18+
update-schema:
19+
@echo "Updating schema..."
20+
python scripts/update_schema.py
1621

1722
test:
1823
@echo "Running tests..."

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@ agenteval publish [OPTIONS] LOG_DIR
3535
```
3636
Upload the scored results to HuggingFace datasets.
3737

38+
Prior to publishing scores, two HuggingFace datasets should be set up, one for full submissions and one for results files.
39+
For more reliable schema parsing, upload the [results schema](https://github.com/allenai/agent-eval/blob/main/dataset_infos.json) to the root of the results dataset.
40+
3841
# Development
3942

4043
See [Development.md](Development.md) for development instructions.

dataset_infos.json

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
{
2+
"default": {
3+
"features": {
4+
"suite_config": {
5+
"name": {
6+
"dtype": "string",
7+
"_type": "Value"
8+
},
9+
"version": {
10+
"dtype": "string",
11+
"_type": "Value"
12+
},
13+
"splits": [
14+
{
15+
"name": {
16+
"dtype": "string",
17+
"_type": "Value"
18+
},
19+
"tasks": [
20+
{
21+
"name": {
22+
"dtype": "string",
23+
"_type": "Value"
24+
},
25+
"path": {
26+
"dtype": "string",
27+
"_type": "Value"
28+
},
29+
"primary_metric": {
30+
"dtype": "string",
31+
"_type": "Value"
32+
},
33+
"tags": {
34+
"feature": {
35+
"dtype": "string",
36+
"_type": "Value"
37+
},
38+
"_type": "Sequence"
39+
}
40+
}
41+
]
42+
}
43+
]
44+
},
45+
"split": {
46+
"dtype": "string",
47+
"_type": "Value"
48+
},
49+
"results": [
50+
{
51+
"task_name": {
52+
"dtype": "string",
53+
"_type": "Value"
54+
},
55+
"metrics": [
56+
{
57+
"name": {
58+
"dtype": "string",
59+
"_type": "Value"
60+
},
61+
"value": {
62+
"dtype": "float64",
63+
"_type": "Value"
64+
}
65+
}
66+
],
67+
"model_usages": [
68+
[
69+
{
70+
"model": {
71+
"dtype": "string",
72+
"_type": "Value"
73+
},
74+
"usage": {
75+
"input_tokens": {
76+
"dtype": "int64",
77+
"_type": "Value"
78+
},
79+
"output_tokens": {
80+
"dtype": "int64",
81+
"_type": "Value"
82+
},
83+
"total_tokens": {
84+
"dtype": "int64",
85+
"_type": "Value"
86+
},
87+
"input_tokens_cache_write": {
88+
"dtype": "int64",
89+
"_type": "Value"
90+
},
91+
"input_tokens_cache_read": {
92+
"dtype": "int64",
93+
"_type": "Value"
94+
},
95+
"reasoning_tokens": {
96+
"dtype": "int64",
97+
"_type": "Value"
98+
}
99+
}
100+
}
101+
]
102+
],
103+
"model_costs": {
104+
"feature": {
105+
"dtype": "float64",
106+
"_type": "Value"
107+
},
108+
"_type": "Sequence"
109+
}
110+
}
111+
],
112+
"submission": {
113+
"submit_time": {
114+
"dtype": "timestamp[us, tz=UTC]",
115+
"_type": "Value"
116+
},
117+
"username": {
118+
"dtype": "string",
119+
"_type": "Value"
120+
},
121+
"agent_name": {
122+
"dtype": "string",
123+
"_type": "Value"
124+
},
125+
"agent_description": {
126+
"dtype": "string",
127+
"_type": "Value"
128+
},
129+
"agent_url": {
130+
"dtype": "string",
131+
"_type": "Value"
132+
},
133+
"logs_url": {
134+
"dtype": "string",
135+
"_type": "Value"
136+
},
137+
"logs_url_public": {
138+
"dtype": "string",
139+
"_type": "Value"
140+
},
141+
"summary_url": {
142+
"dtype": "string",
143+
"_type": "Value"
144+
}
145+
}
146+
}
147+
}
148+
}

pyproject.toml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,12 @@ requires-python = ">=3.10"
1111
dependencies = [
1212
"click",
1313
"inspect-ai",
14-
"huggingface_hub",
1514
"litellm",
1615
"pydantic>=2.0.0",
16+
# For leaderboard
17+
"huggingface_hub",
18+
"pyarrow",
19+
"datasets",
1720
]
1821

1922
[project.urls]

publish.sh renamed to scripts/publish.sh

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,18 @@ if ! git rev-parse "$version" >/dev/null 2>&1; then
1010
exit 1
1111
fi
1212

13+
1314
# 🧹 Clean build artifacts
1415
rm -rf dist
1516

17+
# 🔄 Regenerate dataset_infos.json and verify it’s up to date
18+
echo "Regenerating dataset_infos.json..."
19+
python scripts/update_schema.py ./dataset_infos.json
20+
if ! git diff --quiet ./dataset_infos.json; then
21+
echo -e "\n🚨 dataset_infos.json is outdated. Please commit the updated file before publishing.\n" >&2
22+
exit 1
23+
fi
24+
1625
# 🔒 Set up PyPI credentials
1726
export TWINE_NON_INTERACTIVE=1
1827
export TWINE_USERNAME='__token__'

scripts/update_schema.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Script to regenerate dataset_infos.json from the Pydantic schema.
4+
"""
5+
from pathlib import Path
6+
7+
from agenteval.schema_generator import generate_dataset_infos
8+
9+
10+
def update_schema():
11+
repo_root = Path(__file__).parent.parent
12+
output_path = repo_root / "dataset_infos.json"
13+
generate_dataset_infos(str(output_path))
14+
15+
16+
def main():
17+
"""Regenerate dataset_infos.json from Pydantic schema"""
18+
update_schema()
19+
print("✅ dataset_infos.json updated at dataset_infos.json")
20+
21+
22+
if __name__ == "__main__":
23+
main()

src/agenteval/config.py

Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -6,17 +6,39 @@
66
from pydantic import BaseModel, ValidationError
77

88

9+
class Task(BaseModel):
    # One evaluation task within a suite split. Field docstrings below are
    # kept verbatim; NOTE(review): they may be surfaced as schema/description
    # metadata by tooling, so treat their text as part of the contract.
    name: str
    """Canonical task name (used by the leaderboard)."""

    path: str
    """Path to the task definition (used by Inspect)."""

    primary_metric: str
    """Primary metric for the task, used for summary scores."""

    tags: list[str] | None = None
    """List of tags, used for computing summary scores for task groups."""
21+
22+
23+
class Split(BaseModel):
    # A named grouping of tasks within a SuiteConfig (see splits: list[Split]).
    name: str
    """Name of the split."""

    tasks: list[Task]
    """List of tasks associated with the split."""
29+
30+
931
class SuiteConfig(BaseModel):
1032
name: str
1133
"""Name of the suite."""
1234

1335
version: str | None = None
1436
"""Version of the suite, e.g. '1.0.0.dev1'."""
1537

16-
splits: list["Split"]
38+
splits: list[Split]
1739
"""List of splits in the suite."""
1840

19-
def get_tasks(self, split_name: str) -> list["Task"]:
41+
def get_tasks(self, split_name: str) -> list[Task]:
2042
"""
2143
Get the tasks for a specific split.
2244
@@ -39,28 +61,6 @@ def get_tasks(self, split_name: str) -> list["Task"]:
3961
)
4062

4163

42-
class Split(BaseModel):
43-
name: str
44-
"""Name of the split."""
45-
46-
tasks: list["Task"]
47-
"""List of tasks associated with the split."""
48-
49-
50-
class Task(BaseModel):
51-
name: str
52-
"""Canonical task name (used by the leaderboard)."""
53-
54-
path: str
55-
"""Path to the task definition (used by Inspect)."""
56-
57-
primary_metric: str
58-
"""Primary metric for the task, used for summary scores."""
59-
60-
tags: list[str] | None = None
61-
"""List of tags, used for computing summary scores for task groups."""
62-
63-
6464
def load_suite_config(file_path: str) -> SuiteConfig:
6565
"""
6666
Load the suite configuration from the specified YAML file.

0 commit comments

Comments
 (0)