Commit f672ad7

Populate dataset card metadata with schema information
1 parent 4ce39fd commit f672ad7

10 files changed: +138 −177 lines

.github/workflows/ci-validate-schema.yml (7 additions, 7 deletions)

```diff
@@ -1,4 +1,4 @@
-name: Validate dataset_infos.json
+name: Validate dataset_features.yml

 on:
   push:
@@ -22,19 +22,19 @@ jobs:
       - name: Install dependencies
         run: pip install -e .

-      - name: Check dataset_infos.json exists
+      - name: Check dataset_features.yml exists
         run: |
-          if [ ! -e dataset_infos.json ]; then
-            echo "dataset_infos.json is missing. Please run 'python scripts/update_schema.py' and commit the file to the repo." >&2
+          if [ ! -e dataset_features.yml ]; then
+            echo "dataset_features.yml is missing. Please run 'python scripts/update_schema.py' and commit the file to the repo." >&2
             exit 1
           fi

-      - name: Regenerate dataset_infos.json
+      - name: Regenerate dataset_features.yml
         run: python scripts/update_schema.py

       - name: Verify schema is up to date
         run: |
-          if ! git diff --quiet dataset_infos.json; then
-            echo "dataset_infos.json is out of date. Please run 'python scripts/update_schema.py' and commit the updated file." >&2
+          if ! git diff --quiet dataset_features.yml; then
+            echo "dataset_features.yml is out of date. Please run 'python scripts/update_schema.py' and commit the updated file." >&2
             exit 1
           fi
```

Makefile (1 addition, 1 deletion)

```diff
@@ -14,7 +14,7 @@ publish:
 	@echo "Uploading package to PyPI..."
 	@bash scripts/publish.sh

-# Update dataset_infos.json
+# Update HF dataset features
 update-schema:
 	@echo "Updating schema..."
 	python scripts/update_schema.py
```

README.md (6 additions, 3 deletions)

```diff
@@ -38,9 +38,12 @@ Upload the scored results to HuggingFace datasets.
 # Administer the HuggingFace datasets
 Prior to publishing scores, two HuggingFace datasets should be set up, one for full submissions and one for results files.

-If you want to call `load_dataset()` on the results dataset (e.g., for populating a leaderboard), you probably want to explicitly tell HuggingFace about the schema and dataset structure (otherwise, HuggingFace may fail to properly auto-convert to Parquet):
-- *Schema:* Upload the [results schema](https://github.com/allenai/agent-eval/blob/main/dataset_infos.json) to the root of the results dataset.
-- *Dataset structure:* Specify the `configs` attribute in the YAML metadata block at the top of the `README.md` file at the root of the results dataset. For example, see the [sample metadata block](sample-config-dataset-structure.yml) for the [sample config](sample-config.yml). Using `agenteval publish` will automatically add the corresponding config name and split to the YAML metadata if it is missing.
+If you want to call `load_dataset()` on the results dataset (e.g., for populating a leaderboard), you probably want to explicitly tell HuggingFace about the schema and dataset structure (otherwise, HuggingFace may fail to properly auto-convert to Parquet).
+This is done by updating the `configs` attribute in the YAML metadata block at the top of the `README.md` file at the root of the results dataset (the metadata block is delimited by lines containing just `---` above and below it).
+This attribute should contain a list of configs, each of which specifies the schema (under the `features` key) and the dataset structure (under the `data_files` key).
+See [sample-config-hf-readme-metadata.yml](sample-config-hf-readme-metadata.yml) for a sample metadata block corresponding to [sample-config.yml](sample-config.yml) (note that the metadata references the [raw schema data](dataset_features.yml), which must be copied in).
+
+To facilitate initializing new configs, `agenteval publish` will automatically add this metadata if it is missing.

 # Development
```
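Copying the raw schema into the README metadata block by hand requires re-indenting it under the config's `features:` key. A minimal stdlib sketch of that re-indentation step (the helper name and 4-space indent are illustrative assumptions; `agenteval publish` handles this bookkeeping automatically):

```python
def indent_yaml_block(raw_yaml: str, indent: int = 4) -> str:
    """Re-indent the contents of dataset_features.yml so its top-level
    list nests under a config's `features:` key in the README metadata
    block. (Helper name and indent width are illustrative assumptions.)"""
    pad = " " * indent
    # Indent non-blank lines only, so blank lines stay blank.
    return "\n".join(pad + line if line.strip() else line
                     for line in raw_yaml.splitlines())

snippet = "- name: split\n  dtype: string"
print(indent_yaml_block(snippet))
```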

dataset_features.yml (new file, 71 additions)

```yaml
- name: suite_config
  struct:
  - name: name
    dtype: string
  - name: version
    dtype: string
  - name: splits
    list:
    - name: name
      dtype: string
    - name: tasks
      list:
      - name: name
        dtype: string
      - name: path
        dtype: string
      - name: primary_metric
        dtype: string
      - name: tags
        sequence: string
- name: split
  dtype: string
- name: results
  list:
  - name: task_name
    dtype: string
  - name: metrics
    list:
    - name: name
      dtype: string
    - name: value
      dtype: float64
  - name: model_usages
    list:
      list:
      - name: model
        dtype: string
      - name: usage
        struct:
        - name: input_tokens
          dtype: int64
        - name: output_tokens
          dtype: int64
        - name: total_tokens
          dtype: int64
        - name: input_tokens_cache_write
          dtype: int64
        - name: input_tokens_cache_read
          dtype: int64
        - name: reasoning_tokens
          dtype: int64
  - name: model_costs
    sequence: float64
- name: submission
  struct:
  - name: submit_time
    dtype: timestamp[us, tz=UTC]
  - name: username
    dtype: string
  - name: agent_name
    dtype: string
  - name: agent_description
    dtype: string
  - name: agent_url
    dtype: string
  - name: logs_url
    dtype: string
  - name: logs_url_public
    dtype: string
  - name: summary_url
    dtype: string
```
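For orientation, this features list parses into plain Python data (a hand-transcribed excerpt, not generated output) in the shape that `datasets` expects for its features metadata:

```python
# Hand-transcribed excerpt of dataset_features.yml as Python data:
# each entry is a dict with a "name" key plus a type key
# ("dtype", "struct", "list", or "sequence").
features_excerpt = [
    {"name": "split", "dtype": "string"},
    {"name": "results", "list": [
        {"name": "task_name", "dtype": "string"},
        {"name": "metrics", "list": [
            {"name": "name", "dtype": "string"},
            {"name": "value", "dtype": "float64"},
        ]},
    ]},
]
top_level = [f["name"] for f in features_excerpt]
print(top_level)  # ['split', 'results']
```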

dataset_infos.json

Lines changed: 0 additions & 148 deletions
This file was deleted.

pyproject.toml (7 additions, 1 deletion)

```diff
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "agent-eval"
-version = "0.1.0"
+version = "0.1.1"
 description = "Agent evaluation toolkit"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -43,3 +43,9 @@ where = ["src"]
 [tool.pytest.ini_options]
 testpaths = ["tests"]
 python_files = ["test_*.py"]
+
+[tool.setuptools]
+include-package-data = true
+
+[tool.setuptools.data-files]
+"agenteval" = ["dataset_features.yml"]
```
Lines changed: 2 additions & 0 deletions

```diff
@@ -6,4 +6,6 @@ configs:
       path: 0.1-dev/validation/*.json
     - split: test
       path: 0.1-dev/test/*.json
+  features:
+    # Insert dataset_features.yml here at the proper indentation level.
 ---
```
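Filled in, the merged metadata block might look like the following sketch (the config name is a hypothetical placeholder; the `features` entries are copied from dataset_features.yml and elided here for brevity):

```yaml
---
configs:
- config_name: 0.1-dev   # hypothetical config name
  data_files:
  - split: validation
    path: 0.1-dev/validation/*.json
  - split: test
    path: 0.1-dev/test/*.json
  features:
  - name: split
    dtype: string
  # ...remaining entries copied verbatim from dataset_features.yml...
---
```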

scripts/update_schema.py (6 additions, 6 deletions)

```diff
@@ -1,22 +1,22 @@
 #!/usr/bin/env python3
 """
-Script to regenerate dataset_infos.json from the Pydantic schema.
+Script to regenerate dataset_features.yml from the Pydantic schema.
 """
 from pathlib import Path

-from agenteval.schema_generator import generate_dataset_infos
+from agenteval.schema_generator import write_dataset_features


 def update_schema():
     repo_root = Path(__file__).parent.parent
-    output_path = repo_root / "dataset_infos.json"
-    generate_dataset_infos(str(output_path))
+    output_path = repo_root / "dataset_features.yml"
+    write_dataset_features(str(output_path))


 def main():
-    """Regenerate dataset_infos.json from Pydantic schema"""
+    """Regenerate dataset_features.yml from Pydantic schema"""
     update_schema()
-    print("✅ dataset_infos.json updated at dataset_infos.json")
+    print("✅ dataset_features.yml updated")


 if __name__ == "__main__":
```

src/agenteval/schema_generator.py (23 additions, 7 deletions)

```diff
@@ -3,11 +3,12 @@
 """

 import datetime
-import json
 import types
+from importlib import resources
 from typing import Union, get_args, get_origin

 import pyarrow as pa
+import yaml
 from datasets import Features
 from pydantic import BaseModel

@@ -61,19 +62,34 @@ def _schema_from_pydantic(model: type[BaseModel]) -> list[pa.Field]:

 def features_from_pydantic(model: type[BaseModel]) -> Features:
     """
-    Build a Hugging Face Features object from a Pydantic BaseModel using PyArrow schema.
+    Build a HuggingFace Features object from a Pydantic BaseModel using PyArrow schema.
     """
     pa_fields = _schema_from_pydantic(model)
     pa_schema = pa.schema(pa_fields)
     return Features.from_arrow_schema(pa_schema)


-def generate_dataset_infos(output_path: str = "dataset_infos.json"):
+def write_dataset_features(output_path: str) -> None:
     """
-    Generate a dataset_infos.json file from the EvalResult schema.
+    Write the HuggingFace Features data inferred from the EvalResult schema.
     """
     features = features_from_pydantic(EvalResult)
-    infos = {"default": {"features": features.to_dict()}}
     with open(output_path, "w", encoding="utf-8") as f:
-        json.dump(infos, f, indent=2)
-    print(f"Generated dataset_infos.json at {output_path}")
+        yaml_values = features._to_yaml_list()
+        yaml.safe_dump(yaml_values, f, indent=2, sort_keys=False)
+
+
+def load_dataset_features(input_path: str | None = None) -> Features:
+    """
+    Load the HuggingFace Features data from a YAML file.
+    """
+    if input_path is None:
+        # load the shipped dataset_features.yml from the package
+        with resources.open_text(
+            "agenteval", "dataset_features.yml", encoding="utf-8"
+        ) as f:
+            yaml_values = yaml.safe_load(f)
+    else:
+        with open(input_path, "r", encoding="utf-8") as f:
+            yaml_values = yaml.safe_load(f)
+    return Features._from_yaml_list(yaml_values)
```
