Commit 02151ef

add eval_set_id and eval set hooks (UKGovernmentBEIS#2407)
* introduce eval_set_id
* update schema
* add `eval_set_id` and eval set hooks
1 parent fc3a797 · commit 02151ef

16 files changed: +202 -20 lines


CHANGELOG.md

Lines changed: 2 additions & 0 deletions
@@ -1,6 +1,8 @@
 ## Unreleased

 - OpenAI Compatible: Add support for using Responses API via `responses_api` model arg.
+- Eval Set: Add `eval_set_id` to log file (unique id for eval set across invocations for the same `log_dir`).
+- Hooks: New `EvalSetStart` and `EvalSetEnd` hook methods.
 - Bugfix: Ensure ETags always match content when reading S3 logs to prevent write conflicts.

 ## 0.3.129 (03 September 2025)
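
Because `eval_set_id` is recorded in the `eval` header of each log, it can be read back with the log API. A minimal sketch, assuming the public `read_eval_log()` helper from `inspect_ai.log` and a hypothetical log path; logs written by a bare `eval()` call will have the field set to `None` (the schema marks it nullable with a `null` default):

```python
from inspect_ai.log import read_eval_log

# header_only avoids loading sample data; the path is hypothetical
log = read_eval_log(
    "logs/sweep/2025-09-05T12-00-00_my-task_abc123.eval", header_only=True
)

# all logs produced by the same eval set (including retries) share this id,
# while run_id still differs per invocation
print(log.eval.eval_set_id)
print(log.eval.run_id)
```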

docs/reference/_sidebar.yml

Lines changed: 4 additions & 0 deletions
@@ -678,6 +678,10 @@ website:
           href: reference/inspect_ai.hooks.qmd#apikeyoverride
         - text: ModelUsageData
           href: reference/inspect_ai.hooks.qmd#modelusagedata
+        - text: EvalSetStart
+          href: reference/inspect_ai.hooks.qmd#evalsetstart
+        - text: EvalSetEnd
+          href: reference/inspect_ai.hooks.qmd#evalsetend
         - text: RunEnd
           href: reference/inspect_ai.hooks.qmd#runend
         - text: RunStart

docs/reference/inspect_ai.hooks.qmd

Lines changed: 2 additions & 0 deletions
@@ -11,6 +11,8 @@ title: "inspect_ai.hooks"

 ### ApiKeyOverride
 ### ModelUsageData
+### EvalSetStart
+### EvalSetEnd
 ### RunEnd
 ### RunStart
 ### SampleEnd
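
The reference now documents two new hook payload types, `EvalSetStart` and `EvalSetEnd`, which bracket an entire `eval_set()` invocation (including retries) above the existing run and sample events. The sketch below is a hypothetical subscriber: the method names `on_eval_set_start`/`on_eval_set_end` are assumed by analogy with the existing `on_run_start`/`on_run_end` hooks, and the payload fields are inferred from the `emit_eval_set_start(eval_set_id, log_dir)` call added in this commit.

```python
from inspect_ai.hooks import EvalSetEnd, EvalSetStart, Hooks, hooks


@hooks(name="eval_set_audit", description="Record eval set lifecycle events")
class EvalSetAudit(Hooks):
    # method names assumed by analogy with on_run_start/on_run_end
    async def on_eval_set_start(self, data: EvalSetStart) -> None:
        # eval_set_id/log_dir fields inferred from emit_eval_set_start(eval_set_id, log_dir)
        print(f"eval set {data.eval_set_id} started in {data.log_dir}")

    async def on_eval_set_end(self, data: EvalSetEnd) -> None:
        print(f"eval set {data.eval_set_id} finished")
```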

src/inspect_ai/_eval/eval.py

Lines changed: 13 additions & 4 deletions
@@ -120,6 +120,7 @@ def eval(
     run_samples: bool = True,
     score: bool = True,
     score_display: bool | None = None,
+    eval_set_id: str | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
 ) -> list[EvalLog]:
     r"""Evaluate tasks using a Model.
@@ -203,6 +204,7 @@
             empty `samples` list is returned.
         score: Score output (defaults to True)
         score_display: Show scoring metrics in realtime (defaults to True)
+        eval_set_id: Unique id for eval set (this is passed from `eval_set()` and should not be specified directly).
         **kwargs: Model generation options.

     Returns:
@@ -260,6 +262,7 @@ async def run_task_app() -> list[EvalLog]:
             run_samples=run_samples,
             score=score,
             score_display=score_display,
+            eval_set_id=eval_set_id,
             **kwargs,
         )
     # exceptions can escape when debug_errors is True and that's okay
@@ -318,6 +321,7 @@ async def eval_async(
     run_samples: bool = True,
     score: bool = True,
     score_display: bool | None = None,
+    eval_set_id: str | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
 ) -> list[EvalLog]:
     r"""Evaluate tasks using a Model (async).
@@ -382,6 +386,7 @@
             empty `samples` list is returned.
         score: Score output (defaults to True)
         score_display: Show scoring metrics in realtime (defaults to True)
+        eval_set_id: Unique id for eval set (this is passed from `eval_set()` and should not be specified directly).
         **kwargs: Model generation options.

     Returns:
@@ -435,6 +440,7 @@ async def run(tg: TaskGroup) -> None:
             run_samples=run_samples,
             score=score,
             score_display=score_display,
+            eval_set_id=eval_set_id,
             **kwargs,
         )
     finally:
@@ -497,6 +503,7 @@ async def _eval_async_inner(
     run_samples: bool = True,
     score: bool = True,
     score_display: bool | None = None,
+    eval_set_id: str | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
 ) -> list[EvalLog]:
     from inspect_ai.hooks._hooks import emit_run_end, emit_run_start
@@ -652,7 +659,7 @@ async def _eval_async_inner(
         task_definitions = len(resolved_tasks) // len(model)
         parallel = 1 if (task_definitions == 1 or max_tasks is None) else max_tasks

-        await emit_run_start(run_id, resolved_tasks)
+        await emit_run_start(eval_set_id, run_id, resolved_tasks)

         # single task definition (could be multi-model) or max_tasks capped to 1
         if parallel == 1:
@@ -663,6 +670,7 @@
                 )
                 results.extend(
                     await eval_run(
+                        eval_set_id=eval_set_id,
                         run_id=run_id,
                         tasks=task_batch,
                         parallel=parallel,
@@ -690,6 +698,7 @@
         # multiple task definitions AND tasks not capped at 1
         else:
             results = await eval_run(
+                eval_set_id=eval_set_id,
                 run_id=run_id,
                 tasks=resolved_tasks,
                 parallel=parallel,
@@ -711,13 +720,13 @@
         cleanup_sample_buffers(log_dir)

         try:
-            await emit_run_end(run_id, logs)
+            await emit_run_end(eval_set_id, run_id, logs)
         except UnboundLocalError:
-            await emit_run_end(run_id, EvalLogs([]))
+            await emit_run_end(eval_set_id, run_id, EvalLogs([]))
         _eval_async_running = False

     except Exception as e:
-        await emit_run_end(run_id, EvalLogs([]), e)
+        await emit_run_end(eval_set_id, run_id, EvalLogs([]), e)
         _eval_async_running = False
         raise e

src/inspect_ai/_eval/evalset.py

Lines changed: 34 additions & 4 deletions
@@ -5,6 +5,7 @@
 import rich
 from pydantic_core import to_json
 from rich.status import Status
+from shortuuid import uuid
 from tenacity import (
     RetryCallState,
     Retrying,
@@ -15,8 +16,9 @@
 from typing_extensions import Unpack

 from inspect_ai._display import display as display_manager
+from inspect_ai._util._async import run_coroutine
 from inspect_ai._util.error import PrerequisiteError
-from inspect_ai._util.file import basename, filesystem
+from inspect_ai._util.file import basename, file, filesystem
 from inspect_ai._util.notgiven import NOT_GIVEN, NotGiven
 from inspect_ai.agent._agent import Agent
 from inspect_ai.approval._policy import ApprovalPolicy
@@ -206,9 +208,12 @@ def eval_set(
     Returns:
         A tuple of bool (whether all tasks completed successfully) and a list of `EvalLog` headers (i.e. raw sample data is not included in the logs returned).
     """
+    from inspect_ai.hooks._hooks import emit_eval_set_end, emit_eval_set_start

     # helper function to run a set of evals
-    def run_eval(tasks: list[ResolvedTask] | list[PreviousTask]) -> list[EvalLog]:
+    def run_eval(
+        eval_set_id: str, tasks: list[ResolvedTask] | list[PreviousTask]
+    ) -> list[EvalLog]:
         # run evals
         results = eval(
             tasks=tasks,
@@ -252,6 +257,7 @@ def run_eval(tasks: list[ResolvedTask] | list[PreviousTask]) -> list[EvalLog]:
             log_shared=log_shared,
             log_header_only=True,
             score=score,
+            eval_set_id=eval_set_id,
             **kwargs,
         )

@@ -289,6 +295,9 @@ def run_eval(tasks: list[ResolvedTask] | list[PreviousTask]) -> list[EvalLog]:
     fs = filesystem(log_dir)
     fs.mkdir(log_dir, exist_ok=True)

+    # get eval set id
+    eval_set_id = eval_set_id_for_log_dir(log_dir)
+
     # resolve some parameters
     retry_connections = retry_connections or 1.0
     retry_cleanup = retry_cleanup is not False
@@ -369,7 +378,7 @@ def try_eval() -> list[EvalLog]:
         # we have some pending tasks yet to run, run them
         if len(pending_tasks) > 0:
             # run the tasks
-            run_logs = run_eval(pending_tasks)
+            run_logs = run_eval(eval_set_id, pending_tasks)

             # if this was the entire list of resolved tasks, return results
             if len(pending_tasks) == len(all_tasks):
@@ -398,7 +407,8 @@ def try_eval() -> list[EvalLog]:

             # run previous tasks (no models passed b/c previous task already carries its model)
             retried_logs = run_eval(
-                tasks=as_previous_tasks(failed_tasks, failed_logs)
+                eval_set_id=eval_set_id,
+                tasks=as_previous_tasks(failed_tasks, failed_logs),
             )

             # return success
@@ -419,6 +429,9 @@ def try_eval() -> list[EvalLog]:
         before=before,
     )

+    # emit start event
+    run_coroutine(emit_eval_set_start(eval_set_id, log_dir))
+
     # execute w/ retry
     results = retry(try_eval)

@@ -438,10 +451,27 @@ def try_eval() -> list[EvalLog]:
     # update manifest
     write_log_dir_manifest(log_dir)

+    # emit end event
+    run_coroutine(emit_eval_set_end(eval_set_id, log_dir))
+
     # return status + results
     return success, results


+def eval_set_id_for_log_dir(log_dir: str) -> str:
+    EVAL_SET_ID_FILE = ".eval-set-id"
+    fs = filesystem(log_dir)
+    eval_set_id_file = f"{log_dir}{fs.sep}{EVAL_SET_ID_FILE}"
+    if fs.exists(eval_set_id_file):
+        with file(eval_set_id_file, "r") as f:
+            return f.read().strip()
+    else:
+        eval_set_id = uuid()
+        with file(eval_set_id_file, "w") as f:
+            f.write(eval_set_id)
+        return eval_set_id
+
+
 # convert resolved tasks to previous tasks
 def as_previous_tasks(
     tasks: list[ResolvedTask], failed_logs: list[Log]
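
The new `eval_set_id_for_log_dir()` helper ties the id to the log directory rather than to a single invocation: the first `eval_set()` call against a `log_dir` writes a `shortuuid` to a `.eval-set-id` marker file, and every subsequent call (for example, a retry after failures) reads the same id back, so all logs in that directory share one `eval_set_id`. A rough usage sketch, with a hypothetical task file, model, and log directory:

```python
from inspect_ai import eval_set

# first call creates logs/sweep/.eval-set-id and stamps the id into each log header
success, logs = eval_set(
    tasks="arithmetic.py",  # hypothetical task file
    log_dir="logs/sweep",
    model="openai/gpt-4o-mini",
)

# re-running against the same log_dir reuses the stored id, so retried
# logs carry the same eval_set_id as the originals
if not success:
    success, logs = eval_set(
        tasks="arithmetic.py",
        log_dir="logs/sweep",
        model="openai/gpt-4o-mini",
    )
```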

src/inspect_ai/_eval/run.py

Lines changed: 2 additions & 0 deletions
@@ -58,6 +58,7 @@


 async def eval_run(
+    eval_set_id: str | None,
     run_id: str,
     tasks: list[ResolvedTask],
     parallel: int,
@@ -207,6 +208,7 @@ async def eval_run(
             task_registry_name=resolved_task.task.registry_name,
             task_display_name=resolved_task.task.display_name,
             task_id=resolved_task.id if resolved_task.id else uuid(),
+            eval_set_id=eval_set_id,
             run_id=run_id,
             solver=eval_solver_spec,
             tags=tags,

src/inspect_ai/_eval/task/log.py

Lines changed: 2 additions & 0 deletions
@@ -61,6 +61,7 @@ def __init__(
         task_registry_name: str | None,
         task_display_name: str | None,
         task_id: str | None,
+        eval_set_id: str | None,
         run_id: str,
         solver: SolverSpec | None,
         tags: list[str] | None,
@@ -121,6 +122,7 @@ def __init__(

         # create eval spec
         self.eval = EvalSpec(
+            eval_set_id=eval_set_id,
             run_id=run_id,
             created=iso_now(),
             task=f"{task_name}",

src/inspect_ai/_eval/task/run.py

Lines changed: 9 additions & 2 deletions
@@ -355,6 +355,7 @@ async def run_sample(
             time_limit=config.time_limit,
             working_limit=config.working_limit,
             semaphore=sample_semaphore,
+            eval_set_id=logger.eval.eval_set_id,
             run_id=logger.eval.run_id,
             task_id=logger.eval.eval_id,
         )
@@ -553,6 +554,7 @@ async def task_run_sample(
     time_limit: int | None,
     working_limit: int | None,
     semaphore: anyio.Semaphore | None,
+    eval_set_id: str | None,
     run_id: str,
     task_id: str,
 ) -> dict[str, SampleScore] | None:
@@ -722,7 +724,11 @@ async def run(tg: TaskGroup) -> None:
                 # only emit the sample start once: not on retries
                 if not error_retries:
                     await emit_sample_start(
-                        run_id, task_id, state.uuid, sample_summary
+                        eval_set_id,
+                        run_id,
+                        task_id,
+                        state.uuid,
+                        sample_summary,
                     )

                 # set progress for plan then run it
@@ -933,7 +939,7 @@ async def run(tg: TaskGroup) -> None:
             await log_sample(
                 eval_sample=eval_sample, logger=logger, log_images=log_images
             )
-            await emit_sample_end(run_id, task_id, state.uuid, eval_sample)
+            await emit_sample_end(eval_set_id, run_id, task_id, state.uuid, eval_sample)

     # error that should be retried (we do this outside of the above scope so that we can
     # retry outside of the original semaphore -- our retry will therefore go to the back
@@ -970,6 +976,7 @@ async def run(tg: TaskGroup) -> None:
         time_limit=time_limit,
         working_limit=working_limit,
         semaphore=semaphore,
+        eval_set_id=eval_set_id,
         run_id=run_id,
         task_id=task_id,
     )

src/inspect_ai/_view/www/log-schema.json

Lines changed: 13 additions & 0 deletions
@@ -2968,6 +2968,18 @@
     "EvalSpec": {
       "description": "Eval target and configuration.",
       "properties": {
+        "eval_set_id": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "title": "Eval Set Id"
+        },
         "eval_id": {
           "title": "Eval Id",
           "type": "string"
@@ -3219,6 +3231,7 @@
         }
       },
       "required": [
+        "eval_set_id",
         "eval_id",
         "run_id",
         "created",

src/inspect_ai/_view/www/src/@types/log.d.ts

Lines changed: 2 additions & 0 deletions
@@ -7,6 +7,7 @@

 export type Version = number;
 export type Status = "started" | "success" | "cancelled" | "error";
+export type EvalSetId = string | null;
 export type EvalId = string;
 export type RunId = string;
 export type Created = string;
@@ -787,6 +788,7 @@ export interface EvalLog {
  * Eval target and configuration.
  */
 export interface EvalSpec {
+  eval_set_id: EvalSetId;
   eval_id: EvalId;
   run_id: RunId;
   created: Created;
