add --taskset flag to associate jobs with tasksets

shfunc · shfunc · commit 9a76b816b750 · 2026-01-12T15:11:38.000+01:00
diff --git a/hud/cli/eval.py b/hud/cli/eval.py
@@ -164,6 +164,7 @@ class EvalConfig(BaseModel):
         "auto_respond",
         "quiet",
         "gateway",
+        "taskset",
     }
     # Fields loaded from [agent] section
     _AGENT_FIELDS: ClassVar[set[str]] = {"allowed_tools", "disallowed_tools"}
@@ -184,6 +185,7 @@ class EvalConfig(BaseModel):
     remote: bool = False
     quiet: bool = False  # Suppress opening browser for eval links
     gateway: bool = False  # Use HUD Gateway for LLM API calls
+    taskset: str | None = None  # Taskset slug to associate job with
 
     # Base agent config (these merge with task's agent_config)
     allowed_tools: list[str] | None = None
@@ -701,6 +703,7 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
         max_concurrent=cfg.max_concurrent,
         group_size=cfg.group_size,
         quiet=cfg.quiet,
+        taskset=cfg.taskset,
     )
 
     # Show reward for single task
@@ -767,6 +770,9 @@ def eval_command(
     gateway: bool = typer.Option(
         False, "--gateway", "-g", help="Route LLM API calls through HUD Gateway"
     ),
+    taskset: str | None = typer.Option(
+        None, "--taskset", "-t", help="Taskset slug to associate job with"
+    ),
 ) -> None:
     """🚀 Run evaluation on datasets or individual tasks with agents.
 
@@ -801,6 +807,7 @@ def eval_command(
         byok=byok,
         quiet=quiet,
         gateway=gateway,
+        taskset=taskset,
     )
 
     # Find source if not provided
diff --git a/hud/datasets/runner.py b/hud/datasets/runner.py
@@ -29,6 +29,7 @@ async def run_dataset(
     max_concurrent: int = 30,
     group_size: int = 1,
     quiet: bool = True,
+    taskset: str | None = None,
 ) -> list[EvalContext]:
     """Run an agent on a dataset of tasks.
 
@@ -96,6 +97,7 @@ async def run_dataset(
         group=group_size,
         max_concurrent=max_concurrent,
         quiet=quiet,
+        taskset=taskset,
     ) as ctx:
         # Create agent using AgentType.cls.create()
         agent = agent_type.cls.create(**(agent_params or {}))
diff --git a/hud/eval/manager.py b/hud/eval/manager.py
@@ -62,6 +62,7 @@ def _send_job_enter(
     variants: dict[str, Any] | None,
     group: int,
     api_key: str | None,
+    taskset: str | None = None,
 ) -> None:
     """Send job enter payload (sync request before traces start)."""
     import httpx
@@ -77,6 +78,7 @@ def _send_job_enter(
         name=name,
         variants=variants,
         group=group,
+        taskset=taskset,
     )
 
     try:
@@ -105,6 +107,7 @@ async def run_eval(
     max_concurrent: int | None = None,
     trace: bool = True,
     quiet: bool = False,
+    taskset: str | None = None,
 ) -> AsyncGenerator[EvalContext, None]:
     """Standalone eval context manager.
 
@@ -279,6 +282,7 @@ async def run_eval(
             variants=variants,
             group=group,
             api_key=api_key,
+            taskset=taskset,
         )
 
         # Print job URL (not individual trace URLs)
diff --git a/hud/eval/types.py b/hud/eval/types.py
@@ -53,6 +53,7 @@ class JobEnterPayload(BaseModel):
     name: str | None = None
     variants: dict[str, Any] | None = None  # Full variant config
     group: int | None = None
+    taskset: str | None = None  # taskset slug to associate job with
 
 
 __all__ = [