Commit 02151ef

add eval_set_id and eval set hooks (UKGovernmentBEIS#2407)
* introduce eval_set_id
* update schema
* add `eval_set_id` and eval set hooks
1 parent fc3a797 · commit 02151ef

16 files changed: +202 -20 lines


CHANGELOG.md

Lines changed: 2 additions & 0 deletions
@@ -1,6 +1,8 @@
 ## Unreleased

 - OpenAI Compatible: Add support for using Responses API via `responses_api` model arg.
+- Eval Set: Add `eval_set_id` to log file (unique id for eval set across invocations for the same `log_dir`).
+- Hooks: New `EvalSetStart` and `EvalSetEnd` hook methods.
 - Bugfix: Ensure ETags always match content when reading S3 logs to prevent write conflicts.

 ## 0.3.129 (03 September 2025)
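
Because `eval_set_id` is recorded in the `eval` header of each log, it can be read back with the log API. A minimal sketch, assuming the public `read_eval_log()` helper from `inspect_ai.log` and a hypothetical log path; logs written by a bare `eval()` call will have the field set to `None` (the schema marks it nullable with a `null` default):

```python
from inspect_ai.log import read_eval_log

# header_only avoids loading sample data; the path is hypothetical
log = read_eval_log(
    "logs/sweep/2025-09-05T12-00-00_my-task_abc123.eval", header_only=True
)

# all logs produced by the same eval set (including retries) share this id,
# while run_id still differs per invocation
print(log.eval.eval_set_id)
print(log.eval.run_id)
```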

docs/reference/_sidebar.yml

Lines changed: 4 additions & 0 deletions
@@ -678,6 +678,10 @@ website:
           href: reference/inspect_ai.hooks.qmd#apikeyoverride
         - text: ModelUsageData
           href: reference/inspect_ai.hooks.qmd#modelusagedata
+        - text: EvalSetStart
+          href: reference/inspect_ai.hooks.qmd#evalsetstart
+        - text: EvalSetEnd
+          href: reference/inspect_ai.hooks.qmd#evalsetend
         - text: RunEnd
           href: reference/inspect_ai.hooks.qmd#runend
         - text: RunStart

docs/reference/inspect_ai.hooks.qmd

Lines changed: 2 additions & 0 deletions
@@ -11,6 +11,8 @@ title: "inspect_ai.hooks"

 ### ApiKeyOverride
 ### ModelUsageData
+### EvalSetStart
+### EvalSetEnd
 ### RunEnd
 ### RunStart
 ### SampleEnd
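
The reference now documents two new hook payload types, `EvalSetStart` and `EvalSetEnd`, which bracket an entire `eval_set()` invocation (including retries) above the existing run and sample events. The sketch below is a hypothetical subscriber: the method names `on_eval_set_start`/`on_eval_set_end` are assumed by analogy with the existing `on_run_start`/`on_run_end` hooks, and the payload fields are inferred from the `emit_eval_set_start(eval_set_id, log_dir)` call added in this commit.

```python
from inspect_ai.hooks import EvalSetEnd, EvalSetStart, Hooks, hooks


@hooks(name="eval_set_audit", description="Record eval set lifecycle events")
class EvalSetAudit(Hooks):
    # method names assumed by analogy with on_run_start/on_run_end
    async def on_eval_set_start(self, data: EvalSetStart) -> None:
        # eval_set_id/log_dir fields inferred from emit_eval_set_start(eval_set_id, log_dir)
        print(f"eval set {data.eval_set_id} started in {data.log_dir}")

    async def on_eval_set_end(self, data: EvalSetEnd) -> None:
        print(f"eval set {data.eval_set_id} finished")
```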

src/inspect_ai/_eval/eval.py

Lines changed: 13 additions & 4 deletions
@@ -120,6 +120,7 @@ def eval(
     run_samples: bool = True,
     score: bool = True,
     score_display: bool | None = None,
+    eval_set_id: str | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
 ) -> list[EvalLog]:
     r"""Evaluate tasks using a Model.
@@ -203,6 +204,7 @@
             empty `samples` list is returned.
         score: Score output (defaults to True)
         score_display: Show scoring metrics in realtime (defaults to True)
+        eval_set_id: Unique id for eval set (this is passed from `eval_set()` and should not be specified directly).
         **kwargs: Model generation options.

     Returns:
@@ -260,6 +262,7 @@ async def run_task_app() -> list[EvalLog]:
             run_samples=run_samples,
             score=score,
             score_display=score_display,
+            eval_set_id=eval_set_id,
             **kwargs,
         )
     # exceptions can escape when debug_errors is True and that's okay
@@ -318,6 +321,7 @@ async def eval_async(
     run_samples: bool = True,
     score: bool = True,
     score_display: bool | None = None,
+    eval_set_id: str | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
 ) -> list[EvalLog]:
     r"""Evaluate tasks using a Model (async).
@@ -382,6 +386,7 @@
             empty `samples` list is returned.
         score: Score output (defaults to True)
         score_display: Show scoring metrics in realtime (defaults to True)
+        eval_set_id: Unique id for eval set (this is passed from `eval_set()` and should not be specified directly).
         **kwargs: Model generation options.

     Returns:
@@ -435,6 +440,7 @@ async def run(tg: TaskGroup) -> None:
             run_samples=run_samples,
             score=score,
             score_display=score_display,
+            eval_set_id=eval_set_id,
             **kwargs,
         )
     finally:
@@ -497,6 +503,7 @@ async def _eval_async_inner(
     run_samples: bool = True,
     score: bool = True,
     score_display: bool | None = None,
+    eval_set_id: str | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
 ) -> list[EvalLog]:
     from inspect_ai.hooks._hooks import emit_run_end, emit_run_start
@@ -652,7 +659,7 @@ async def _eval_async_inner(
         task_definitions = len(resolved_tasks) // len(model)
         parallel = 1 if (task_definitions == 1 or max_tasks is None) else max_tasks

-        await emit_run_start(run_id, resolved_tasks)
+        await emit_run_start(eval_set_id, run_id, resolved_tasks)

         # single task definition (could be multi-model) or max_tasks capped to 1
         if parallel == 1:
@@ -663,6 +670,7 @@
                 )
                 results.extend(
                     await eval_run(
+                        eval_set_id=eval_set_id,
                         run_id=run_id,
                         tasks=task_batch,
                         parallel=parallel,
@@ -690,6 +698,7 @@
         # multiple task definitions AND tasks not capped at 1
         else:
             results = await eval_run(
+                eval_set_id=eval_set_id,
                 run_id=run_id,
                 tasks=resolved_tasks,
                 parallel=parallel,
@@ -711,13 +720,13 @@
         cleanup_sample_buffers(log_dir)

         try:
-            await emit_run_end(run_id, logs)
+            await emit_run_end(eval_set_id, run_id, logs)
         except UnboundLocalError:
-            await emit_run_end(run_id, EvalLogs([]))
+            await emit_run_end(eval_set_id, run_id, EvalLogs([]))
         _eval_async_running = False

     except Exception as e:
-        await emit_run_end(run_id, EvalLogs([]), e)
+        await emit_run_end(eval_set_id, run_id, EvalLogs([]), e)
         _eval_async_running = False
         raise e

src/inspect_ai/_eval/evalset.py

Lines changed: 34 additions & 4 deletions
@@ -5,6 +5,7 @@
 import rich
 from pydantic_core import to_json
 from rich.status import Status
+from shortuuid import uuid
 from tenacity import (
     RetryCallState,
     Retrying,
@@ -15,8 +16,9 @@
 from typing_extensions import Unpack

 from inspect_ai._display import display as display_manager
+from inspect_ai._util._async import run_coroutine
 from inspect_ai._util.error import PrerequisiteError
-from inspect_ai._util.file import basename, filesystem
+from inspect_ai._util.file import basename, file, filesystem
 from inspect_ai._util.notgiven import NOT_GIVEN, NotGiven
 from inspect_ai.agent._agent import Agent
 from inspect_ai.approval._policy import ApprovalPolicy
@@ -206,9 +208,12 @@ def eval_set(
     Returns:
         A tuple of bool (whether all tasks completed successfully) and a list of `EvalLog` headers (i.e. raw sample data is not included in the logs returned).
     """
+    from inspect_ai.hooks._hooks import emit_eval_set_end, emit_eval_set_start

     # helper function to run a set of evals
-    def run_eval(tasks: list[ResolvedTask] | list[PreviousTask]) -> list[EvalLog]:
+    def run_eval(
+        eval_set_id: str, tasks: list[ResolvedTask] | list[PreviousTask]
+    ) -> list[EvalLog]:
         # run evals
         results = eval(
             tasks=tasks,
@@ -252,6 +257,7 @@ def run_eval(tasks: list[ResolvedTask] | list[PreviousTask]) -> list[EvalLog]:
             log_shared=log_shared,
             log_header_only=True,
             score=score,
+            eval_set_id=eval_set_id,
             **kwargs,
         )

@@ -289,6 +295,9 @@ def run_eval(tasks: list[ResolvedTask] | list[PreviousTask]) -> list[EvalLog]:
     fs = filesystem(log_dir)
     fs.mkdir(log_dir, exist_ok=True)

+    # get eval set id
+    eval_set_id = eval_set_id_for_log_dir(log_dir)
+
     # resolve some parameters
     retry_connections = retry_connections or 1.0
     retry_cleanup = retry_cleanup is not False
@@ -369,7 +378,7 @@ def try_eval() -> list[EvalLog]:
         # we have some pending tasks yet to run, run them
         if len(pending_tasks) > 0:
             # run the tasks
-            run_logs = run_eval(pending_tasks)
+            run_logs = run_eval(eval_set_id, pending_tasks)

             # if this was the entire list of resolved tasks, return results
             if len(pending_tasks) == len(all_tasks):
@@ -398,7 +407,8 @@ def try_eval() -> list[EvalLog]:

             # run previous tasks (no models passed b/c previous task already carries its model)
             retried_logs = run_eval(
-                tasks=as_previous_tasks(failed_tasks, failed_logs)
+                eval_set_id=eval_set_id,
+                tasks=as_previous_tasks(failed_tasks, failed_logs),
             )

             # return success
@@ -419,6 +429,9 @@ def try_eval() -> list[EvalLog]:
         before=before,
     )

+    # emit start event
+    run_coroutine(emit_eval_set_start(eval_set_id, log_dir))
+
     # execute w/ retry
     results = retry(try_eval)

@@ -438,10 +451,27 @@ def try_eval() -> list[EvalLog]:
     # update manifest
     write_log_dir_manifest(log_dir)

+    # emit end event
+    run_coroutine(emit_eval_set_end(eval_set_id, log_dir))
+
     # return status + results
     return success, results


+def eval_set_id_for_log_dir(log_dir: str) -> str:
+    EVAL_SET_ID_FILE = ".eval-set-id"
+    fs = filesystem(log_dir)
+    eval_set_id_file = f"{log_dir}{fs.sep}{EVAL_SET_ID_FILE}"
+    if fs.exists(eval_set_id_file):
+        with file(eval_set_id_file, "r") as f:
+            return f.read().strip()
+    else:
+        eval_set_id = uuid()
+        with file(eval_set_id_file, "w") as f:
+            f.write(eval_set_id)
+        return eval_set_id
+
+
 # convert resolved tasks to previous tasks
 def as_previous_tasks(
     tasks: list[ResolvedTask], failed_logs: list[Log]
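
The new `eval_set_id_for_log_dir()` helper ties the id to the log directory rather than to a single invocation: the first `eval_set()` call against a `log_dir` writes a `shortuuid` to a `.eval-set-id` marker file, and every subsequent call (for example, a retry after failures) reads the same id back, so all logs in that directory share one `eval_set_id`. A rough usage sketch, with a hypothetical task file, model, and log directory:

```python
from inspect_ai import eval_set

# first call creates logs/sweep/.eval-set-id and stamps the id into each log header
success, logs = eval_set(
    tasks="arithmetic.py",  # hypothetical task file
    log_dir="logs/sweep",
    model="openai/gpt-4o-mini",
)

# re-running against the same log_dir reuses the stored id, so retried
# logs carry the same eval_set_id as the originals
if not success:
    success, logs = eval_set(
        tasks="arithmetic.py",
        log_dir="logs/sweep",
        model="openai/gpt-4o-mini",
    )
```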

src/inspect_ai/_eval/run.py

Lines changed: 2 additions & 0 deletions
@@ -58,6 +58,7 @@


 async def eval_run(
+    eval_set_id: str | None,
     run_id: str,
     tasks: list[ResolvedTask],
     parallel: int,
@@ -207,6 +208,7 @@ async def eval_run(
             task_registry_name=resolved_task.task.registry_name,
             task_display_name=resolved_task.task.display_name,
             task_id=resolved_task.id if resolved_task.id else uuid(),
+            eval_set_id=eval_set_id,
             run_id=run_id,
             solver=eval_solver_spec,
             tags=tags,

src/inspect_ai/_eval/task/log.py

Lines changed: 2 additions & 0 deletions
@@ -61,6 +61,7 @@ def __init__(
         task_registry_name: str | None,
         task_display_name: str | None,
         task_id: str | None,
+        eval_set_id: str | None,
         run_id: str,
         solver: SolverSpec | None,
         tags: list[str] | None,
@@ -121,6 +122,7 @@ def __init__(

         # create eval spec
         self.eval = EvalSpec(
+            eval_set_id=eval_set_id,
             run_id=run_id,
             created=iso_now(),
             task=f"{task_name}",

src/inspect_ai/_eval/task/run.py

Lines changed: 9 additions & 2 deletions
@@ -355,6 +355,7 @@ async def run_sample(
             time_limit=config.time_limit,
             working_limit=config.working_limit,
             semaphore=sample_semaphore,
+            eval_set_id=logger.eval.eval_set_id,
             run_id=logger.eval.run_id,
             task_id=logger.eval.eval_id,
         )
@@ -553,6 +554,7 @@ async def task_run_sample(
     time_limit: int | None,
     working_limit: int | None,
     semaphore: anyio.Semaphore | None,
+    eval_set_id: str | None,
     run_id: str,
     task_id: str,
 ) -> dict[str, SampleScore] | None:
@@ -722,7 +724,11 @@ async def run(tg: TaskGroup) -> None:
                 # only emit the sample start once: not on retries
                 if not error_retries:
                     await emit_sample_start(
-                        run_id, task_id, state.uuid, sample_summary
+                        eval_set_id,
+                        run_id,
+                        task_id,
+                        state.uuid,
+                        sample_summary,
                     )

                 # set progress for plan then run it
@@ -933,7 +939,7 @@ async def run(tg: TaskGroup) -> None:
             await log_sample(
                 eval_sample=eval_sample, logger=logger, log_images=log_images
             )
-            await emit_sample_end(run_id, task_id, state.uuid, eval_sample)
+            await emit_sample_end(eval_set_id, run_id, task_id, state.uuid, eval_sample)

     # error that should be retried (we do this outside of the above scope so that we can
     # retry outside of the original semaphore -- our retry will therefore go to the back
@@ -970,6 +976,7 @@ async def run(tg: TaskGroup) -> None:
         time_limit=time_limit,
         working_limit=working_limit,
         semaphore=semaphore,
+        eval_set_id=eval_set_id,
         run_id=run_id,
         task_id=task_id,
     )

src/inspect_ai/_view/www/log-schema.json

Lines changed: 13 additions & 0 deletions
@@ -2968,6 +2968,18 @@
     "EvalSpec": {
       "description": "Eval target and configuration.",
       "properties": {
+        "eval_set_id": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "title": "Eval Set Id"
+        },
         "eval_id": {
           "title": "Eval Id",
           "type": "string"
@@ -3219,6 +3231,7 @@
         }
       },
       "required": [
+        "eval_set_id",
         "eval_id",
         "run_id",
         "created",

src/inspect_ai/_view/www/src/@types/log.d.ts

Lines changed: 2 additions & 0 deletions
@@ -7,6 +7,7 @@

 export type Version = number;
 export type Status = "started" | "success" | "cancelled" | "error";
+export type EvalSetId = string | null;
 export type EvalId = string;
 export type RunId = string;
 export type Created = string;
@@ -787,6 +788,7 @@ export interface EvalLog {
  * Eval target and configuration.
  */
 export interface EvalSpec {
+  eval_set_id: EvalSetId;
   eval_id: EvalId;
   run_id: RunId;
   created: Created;
