Skip to content

Commit bacb5bc

Browse files
committed
Package schema
1 parent ea191d1 commit bacb5bc

File tree

12 files changed

+354
-30
lines changed

12 files changed

+354
-30
lines changed

.github/workflows/ci-tests.yml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,18 @@ jobs:
1111
runs-on: ubuntu-latest
1212

1313
steps:
14-
- uses: actions/checkout@v3
14+
- name: Checkout code
15+
uses: actions/checkout@v3
16+
1517
- name: Set up Python
1618
uses: actions/setup-python@v4
1719
with:
1820
python-version: '3.10'
21+
1922
- name: Install dependencies
2023
run: |
2124
python -m pip install --upgrade pip
2225
pip install .[dev]
26+
2327
- name: Run tests
2428
run: pytest --maxfail=1 --disable-warnings -q
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# CI guard for the leaderboard schema: fails when dataset_infos.json is
# missing or no longer matches the schema regenerated from the Pydantic
# models (see scripts/update_schema.py).
name: Validate dataset_infos.json

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

jobs:
  check-schema:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      # Editable install so scripts/update_schema.py can import the package.
      - name: Install dependencies
        run: pip install -e .

      - name: Check dataset_infos.json exists
        run: |
          if [ ! -e dataset_infos.json ]; then
            echo "🚨 dataset_infos.json is missing. Please commit dataset_infos.json to the repo." >&2
            exit 1
          fi

      - name: Regenerate dataset_infos.json
        run: python scripts/update_schema.py

      # If regeneration changed the committed file, the schema was stale.
      - name: Verify schema is up to date
        run: |
          if ! git diff --quiet dataset_infos.json; then
            echo "🚨 dataset_infos.json is out of date. Please run 'python scripts/update_schema.py' and commit the updated file." >&2
            exit 1
          fi

Development.md

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,15 @@ To publish to pypi:
1010

1111
```shell
1212
export PYPI_TOKEN=...
13-
bash publish.sh
13+
bash scripts/publish.sh
14+
```
15+
16+
# Schema update
17+
18+
The results leaderboard on HuggingFace uses a fixed schema to prevent inferred schema problems.
19+
As the results model changes over time, schema adjustments may be required; if so, a CI check will fail until the schema is regenerated.
20+
The schema can be inferred from the Pydantic model and re-computed using:
21+
22+
```shell
23+
python scripts/update_schema.py
1424
```

Makefile

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,12 @@ tag:
1212
# Upload package to PyPI
1313
publish:
1414
@echo "Uploading package to PyPI..."
15-
@bash ./publish.sh
15+
@bash scripts/publish.sh
16+
17+
# Update dataset_infos.json
18+
update-schema:
19+
@echo "Updating schema..."
20+
python scripts/update_schema.py
1621

1722
test:
1823
@echo "Running tests..."

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@ agenteval publish [OPTIONS] LOG_DIR
3535
```
3636
Upload the scored results to HuggingFace datasets.
3737

38+
Prior to publishing scores, two HuggingFace datasets should be set up, one for full submissions and one for results files.
39+
For more reliable schema parsing, upload the [results schema](https://github.com/allenai/agent-eval/blob/main/dataset_infos.json) to the root of the results dataset.
40+
3841
# Development
3942

4043
See [Development.md](Development.md) for development instructions.

dataset_infos.json

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
{
2+
"default": {
3+
"features": {
4+
"suite_config": {
5+
"name": {
6+
"dtype": "string",
7+
"_type": "Value"
8+
},
9+
"version": {
10+
"dtype": "string",
11+
"_type": "Value"
12+
},
13+
"splits": [
14+
{
15+
"name": {
16+
"dtype": "string",
17+
"_type": "Value"
18+
},
19+
"tasks": [
20+
{
21+
"name": {
22+
"dtype": "string",
23+
"_type": "Value"
24+
},
25+
"path": {
26+
"dtype": "string",
27+
"_type": "Value"
28+
},
29+
"primary_metric": {
30+
"dtype": "string",
31+
"_type": "Value"
32+
},
33+
"tags": {
34+
"feature": {
35+
"dtype": "string",
36+
"_type": "Value"
37+
},
38+
"_type": "Sequence"
39+
}
40+
}
41+
]
42+
}
43+
]
44+
},
45+
"split": {
46+
"dtype": "string",
47+
"_type": "Value"
48+
},
49+
"results": [
50+
{
51+
"task_name": {
52+
"dtype": "string",
53+
"_type": "Value"
54+
},
55+
"metrics": [
56+
{
57+
"name": {
58+
"dtype": "string",
59+
"_type": "Value"
60+
},
61+
"value": {
62+
"dtype": "float64",
63+
"_type": "Value"
64+
}
65+
}
66+
],
67+
"model_usages": [
68+
[
69+
{
70+
"model": {
71+
"dtype": "string",
72+
"_type": "Value"
73+
},
74+
"usage": {
75+
"input_tokens": {
76+
"dtype": "int64",
77+
"_type": "Value"
78+
},
79+
"output_tokens": {
80+
"dtype": "int64",
81+
"_type": "Value"
82+
},
83+
"total_tokens": {
84+
"dtype": "int64",
85+
"_type": "Value"
86+
},
87+
"input_tokens_cache_write": {
88+
"dtype": "int64",
89+
"_type": "Value"
90+
},
91+
"input_tokens_cache_read": {
92+
"dtype": "int64",
93+
"_type": "Value"
94+
},
95+
"reasoning_tokens": {
96+
"dtype": "int64",
97+
"_type": "Value"
98+
}
99+
}
100+
}
101+
]
102+
],
103+
"model_costs": {
104+
"feature": {
105+
"dtype": "float64",
106+
"_type": "Value"
107+
},
108+
"_type": "Sequence"
109+
}
110+
}
111+
],
112+
"submission": {
113+
"submit_time": {
114+
"dtype": "timestamp[us, tz=UTC]",
115+
"_type": "Value"
116+
},
117+
"username": {
118+
"dtype": "string",
119+
"_type": "Value"
120+
},
121+
"agent_name": {
122+
"dtype": "string",
123+
"_type": "Value"
124+
},
125+
"agent_description": {
126+
"dtype": "string",
127+
"_type": "Value"
128+
},
129+
"agent_url": {
130+
"dtype": "string",
131+
"_type": "Value"
132+
},
133+
"logs_url": {
134+
"dtype": "string",
135+
"_type": "Value"
136+
},
137+
"logs_url_public": {
138+
"dtype": "string",
139+
"_type": "Value"
140+
},
141+
"summary_url": {
142+
"dtype": "string",
143+
"_type": "Value"
144+
}
145+
}
146+
}
147+
}
148+
}

pyproject.toml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,12 @@ requires-python = ">=3.10"
1111
dependencies = [
1212
"click",
1313
"inspect-ai",
14-
"huggingface_hub",
1514
"litellm",
1615
"pydantic>=2.0.0",
16+
# For leaderboard
17+
"huggingface_hub",
18+
"pyarrow",
19+
"datasets",
1720
]
1821

1922
[project.urls]

publish.sh renamed to scripts/publish.sh

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,18 @@ if ! git rev-parse "$version" >/dev/null 2>&1; then
1010
exit 1
1111
fi
1212

13+
1314
# 🧹 Clean build artifacts
1415
rm -rf dist
1516

17+
# 🔄 Regenerate dataset_infos.json and verify it’s up to date
18+
echo "Regenerating dataset_infos.json..."
19+
python scripts/update_schema.py ./dataset_infos.json
20+
if ! git diff --quiet ./dataset_infos.json; then
21+
echo -e "\n🚨 dataset_infos.json is outdated. Please commit the updated file before publishing.\n" >&2
22+
exit 1
23+
fi
24+
1625
# 🔒 Set up PyPI credentials
1726
export TWINE_NON_INTERACTIVE=1
1827
export TWINE_USERNAME='__token__'

scripts/update_schema.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Script to regenerate dataset_infos.json from the Pydantic schema.
4+
"""
5+
from pathlib import Path
6+
7+
from agenteval.schema_generator import generate_dataset_infos
8+
9+
10+
def update_schema():
11+
repo_root = Path(__file__).parent.parent
12+
output_path = repo_root / "dataset_infos.json"
13+
generate_dataset_infos(str(output_path))
14+
15+
16+
def main():
17+
"""Regenerate dataset_infos.json from Pydantic schema"""
18+
update_schema()
19+
print("✅ dataset_infos.json updated at dataset_infos.json")
20+
21+
22+
if __name__ == "__main__":
23+
main()

src/agenteval/config.py

Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -6,17 +6,39 @@
66
from pydantic import BaseModel, ValidationError
77

88

9+
class Task(BaseModel):
    # One evaluation task within a suite split. Field docstrings below are
    # kept verbatim; NOTE(review): they may be surfaced as schema/description
    # metadata by tooling, so treat their text as part of the contract.
    name: str
    """Canonical task name (used by the leaderboard)."""

    path: str
    """Path to the task definition (used by Inspect)."""

    primary_metric: str
    """Primary metric for the task, used for summary scores."""

    tags: list[str] | None = None
    """List of tags, used for computing summary scores for task groups."""
21+
22+
23+
class Split(BaseModel):
    # A named grouping of tasks within a SuiteConfig (see splits: list[Split]).
    name: str
    """Name of the split."""

    tasks: list[Task]
    """List of tasks associated with the split."""
29+
30+
931
class SuiteConfig(BaseModel):
1032
name: str
1133
"""Name of the suite."""
1234

1335
version: str | None = None
1436
"""Version of the suite, e.g. '1.0.0.dev1'."""
1537

16-
splits: list["Split"]
38+
splits: list[Split]
1739
"""List of splits in the suite."""
1840

19-
def get_tasks(self, split_name: str) -> list["Task"]:
41+
def get_tasks(self, split_name: str) -> list[Task]:
2042
"""
2143
Get the tasks for a specific split.
2244
@@ -39,28 +61,6 @@ def get_tasks(self, split_name: str) -> list["Task"]:
3961
)
4062

4163

42-
class Split(BaseModel):
43-
name: str
44-
"""Name of the split."""
45-
46-
tasks: list["Task"]
47-
"""List of tasks associated with the split."""
48-
49-
50-
class Task(BaseModel):
51-
name: str
52-
"""Canonical task name (used by the leaderboard)."""
53-
54-
path: str
55-
"""Path to the task definition (used by Inspect)."""
56-
57-
primary_metric: str
58-
"""Primary metric for the task, used for summary scores."""
59-
60-
tags: list[str] | None = None
61-
"""List of tags, used for computing summary scores for task groups."""
62-
63-
6464
def load_suite_config(file_path: str) -> SuiteConfig:
6565
"""
6666
Load the suite configuration from the specified YAML file.

0 commit comments

Comments
 (0)