@@ -120,6 +120,7 @@ def eval(
     run_samples: bool = True,
     score: bool = True,
     score_display: bool | None = None,
+    eval_set_id: str | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
 ) -> list[EvalLog]:
     r"""Evaluate tasks using a Model.
@@ -203,6 +204,7 @@ def eval(
             empty `samples` list is returned.
         score: Score output (defaults to True)
         score_display: Show scoring metrics in realtime (defaults to True)
+        eval_set_id: Unique id for the eval set (this is passed from `eval_set()` and should not be specified directly).
         **kwargs: Model generation options.

     Returns:
@@ -260,6 +262,7 @@ async def run_task_app() -> list[EvalLog]:
             run_samples=run_samples,
             score=score,
             score_display=score_display,
+            eval_set_id=eval_set_id,
             **kwargs,
         )
     # exceptions can escape when debug_errors is True and that's okay
@@ -318,6 +321,7 @@ async def eval_async(
     run_samples: bool = True,
     score: bool = True,
     score_display: bool | None = None,
+    eval_set_id: str | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
 ) -> list[EvalLog]:
     r"""Evaluate tasks using a Model (async).
@@ -382,6 +386,7 @@ async def eval_async(
             empty `samples` list is returned.
         score: Score output (defaults to True)
         score_display: Show scoring metrics in realtime (defaults to True)
+        eval_set_id: Unique id for the eval set (this is passed from `eval_set()` and should not be specified directly).
         **kwargs: Model generation options.

     Returns:
@@ -435,6 +440,7 @@ async def run(tg: TaskGroup) -> None:
                 run_samples=run_samples,
                 score=score,
                 score_display=score_display,
+                eval_set_id=eval_set_id,
                 **kwargs,
             )
         finally:
@@ -497,6 +503,7 @@ async def _eval_async_inner(
     run_samples: bool = True,
     score: bool = True,
     score_display: bool | None = None,
+    eval_set_id: str | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
 ) -> list[EvalLog]:
     from inspect_ai.hooks._hooks import emit_run_end, emit_run_start
@@ -652,7 +659,7 @@ async def _eval_async_inner(
         task_definitions = len(resolved_tasks) // len(model)
         parallel = 1 if (task_definitions == 1 or max_tasks is None) else max_tasks

-        await emit_run_start(run_id, resolved_tasks)
+        await emit_run_start(eval_set_id, run_id, resolved_tasks)

         # single task definition (could be multi-model) or max_tasks capped to 1
         if parallel == 1:
@@ -663,6 +670,7 @@ async def _eval_async_inner(
                 )
                 results.extend(
                     await eval_run(
+                        eval_set_id=eval_set_id,
                         run_id=run_id,
                         tasks=task_batch,
                         parallel=parallel,
@@ -690,6 +698,7 @@ async def _eval_async_inner(
         # multiple task definitions AND tasks not capped at 1
         else:
             results = await eval_run(
+                eval_set_id=eval_set_id,
                 run_id=run_id,
                 tasks=resolved_tasks,
                 parallel=parallel,
@@ -711,13 +720,13 @@ async def _eval_async_inner(
         cleanup_sample_buffers(log_dir)

         try:
-            await emit_run_end(run_id, logs)
+            await emit_run_end(eval_set_id, run_id, logs)
         except UnboundLocalError:
-            await emit_run_end(run_id, EvalLogs([]))
+            await emit_run_end(eval_set_id, run_id, EvalLogs([]))
         _eval_async_running = False

     except Exception as e:
-        await emit_run_end(run_id, EvalLogs([]), e)
+        await emit_run_end(eval_set_id, run_id, EvalLogs([]), e)
         _eval_async_running = False
         raise e

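The hunks above thread `eval_set_id` through `eval()` → `eval_async()` → `_eval_async_inner()` and pass it as a new leading positional argument at every `emit_run_start` / `emit_run_end` call site. A minimal sketch of what the matching emitter signatures would have to look like for those calls to work (this is not part of the diff; the annotations and the `tasks`, `logs`, and `exception` parameter names are assumptions inferred from the call sites):

```python
from typing import Any


async def emit_run_start(
    eval_set_id: str | None,  # None for direct eval()/eval_async() runs
    run_id: str,
    tasks: list[Any],  # resolved tasks for this run
) -> None:
    ...


async def emit_run_end(
    eval_set_id: str | None,
    run_id: str,
    logs: Any,  # EvalLogs for the run (empty when the run produced no logs)
    exception: BaseException | None = None,  # set only when the run raised
) -> None:
    ...
```

Because a plain `eval()` or `eval_async()` call leaves `eval_set_id` as `None`, only runs launched via `eval_set()` carry a non-null id, which lets run-level hooks group the resulting logs by eval set.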