7 changes: 7 additions & 0 deletions hud/cli/eval.py
@@ -164,6 +164,7 @@ class EvalConfig(BaseModel):
"auto_respond",
"quiet",
"gateway",
"taskset",
}
# Fields loaded from [agent] section
_AGENT_FIELDS: ClassVar[set[str]] = {"allowed_tools", "disallowed_tools"}
@@ -184,6 +185,7 @@ class EvalConfig(BaseModel):
remote: bool = False
quiet: bool = False # Suppress opening browser for eval links
gateway: bool = False # Use HUD Gateway for LLM API calls
taskset: str | None = None # Taskset slug to associate job with

# Base agent config (these merge with task's agent_config)
allowed_tools: list[str] | None = None
@@ -701,6 +703,7 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
max_concurrent=cfg.max_concurrent,
group_size=cfg.group_size,
quiet=cfg.quiet,
taskset=cfg.taskset,
)

# Show reward for single task
@@ -767,6 +770,9 @@ def eval_command(
gateway: bool = typer.Option(
False, "--gateway", "-g", help="Route LLM API calls through HUD Gateway"
),
taskset: str | None = typer.Option(
None, "--taskset", "-t", help="Taskset slug to associate job with"
),
) -> None:
"""🚀 Run evaluation on datasets or individual tasks with agents.

@@ -801,6 +807,7 @@
byok=byok,
quiet=quiet,
gateway=gateway,
taskset=taskset,
)

# Find source if not provided
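Taken together, the eval.py changes thread one new option from the CLI down to the runner. A minimal sketch of the intended usage follows; the dataset and slug names are illustrative, not from this PR, and the sketch assumes the remaining EvalConfig fields are optional like the ones shown:

# Hypothetical invocation ("my-org/browser-tasks" and "weekly-regression"
# are made-up names); --taskset also has the short form -t:
#
#   hud eval my-org/browser-tasks --taskset weekly-regression
#
# The flag lands on EvalConfig and is forwarded to run_dataset:
cfg = EvalConfig(taskset="weekly-regression")
assert cfg.taskset == "weekly-regression"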
2 changes: 2 additions & 0 deletions hud/datasets/runner.py
@@ -29,6 +29,7 @@ async def run_dataset(
max_concurrent: int = 30,
group_size: int = 1,
quiet: bool = True,
taskset: str | None = None,
) -> list[EvalContext]:
"""Run an agent on a dataset of tasks.

@@ -96,6 +97,7 @@
group=group_size,
max_concurrent=max_concurrent,
quiet=quiet,
taskset=taskset,
) as ctx:
# Create agent using AgentType.cls.create()
agent = agent_type.cls.create(**(agent_params or {}))
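The runner change is a pure pass-through: run_dataset accepts the slug and forwards it to the eval context. A sketch of a call site follows; the parameters before max_concurrent are assumptions inferred from the surrounding code (agent_type and agent_params appear in the hunk above), not spelled out in this diff:

# Sketch under assumptions: the "tasks" and "agent_type" parameter names are
# inferred, not confirmed by this hunk; the taskset slug is illustrative.
contexts = await run_dataset(
    tasks,                        # dataset of Task objects (assumed parameter)
    agent_type=agent_type,        # agent wrapper; .cls.create() is called per task
    max_concurrent=30,
    group_size=1,
    quiet=True,
    taskset="weekly-regression",  # new: links the implicit job to this taskset
)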
69 changes: 64 additions & 5 deletions hud/eval/manager.py
@@ -62,7 +62,9 @@ def _send_job_enter(
variants: dict[str, Any] | None,
group: int,
api_key: str | None,
) -> None:
taskset: str | None = None,
tasks: list[dict[str, Any]] | None = None,
) -> list[str] | None:
"""Send job enter payload (sync request before traces start)."""
import httpx

@@ -71,23 +73,35 @@

api_key = api_key or settings.api_key
if not settings.telemetry_enabled or not api_key:
return
return None

payload = JobEnterPayload(
name=name,
variants=variants,
group=group,
taskset=taskset,
tasks=tasks if taskset else None, # only send tasks if taskset specified
)

try:
httpx.post(
resp = httpx.post(
f"{settings.hud_api_url}/trace/job/{job_id}/enter",
json=payload.model_dump(exclude_none=True),
headers={"Authorization": f"Bearer {api_key}"},
timeout=10.0,
)
if resp.is_success:
try:
data = resp.json()
except Exception:
return None
if isinstance(data, dict):
ids = data.get("task_version_ids")
if isinstance(ids, list) and all(isinstance(x, str) for x in ids):
return ids
except Exception as e:
logger.warning("Failed to send job enter: %s", e)
return None
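
# Illustrative aside (not part of the diff): _send_job_enter now returns the
# created task version ids. The only response shape the parser above accepts
# is a JSON object with a "task_version_ids" list of strings; the id values
# here are made up, and anything else falls through to None.
example_response = {"task_version_ids": ["tv_123", "tv_456"]}
ids = example_response.get("task_version_ids")
assert isinstance(ids, list) and all(isinstance(x, str) for x in ids)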


@asynccontextmanager
@@ -105,6 +119,7 @@ async def run_eval(
max_concurrent: int | None = None,
trace: bool = True,
quiet: bool = False,
taskset: str | None = None,
) -> AsyncGenerator[EvalContext, None]:
"""Standalone eval context manager.

@@ -235,13 +250,37 @@

if total_evals == 1:
if tasks:
# Even for single-task evals, --taskset requires a job_enter call so the run
# and task are linked to the taskset (via job_id + task_version_id).
job_id_for_run = job_id
if taskset:
eval_name = _get_eval_name(tasks=tasks)
if job_id_for_run is None:
job_id_for_run = str(uuid.uuid4())

task_data = None
if not tasks[0].id:
task_data = [tasks[0].model_dump(mode="json", exclude_none=True)]

created_task_version_ids = _send_job_enter(
job_id=job_id_for_run,
name=eval_name,
variants=variants,
group=group,
api_key=api_key,
taskset=taskset,
tasks=task_data,
)
if created_task_version_ids and not tasks[0].id:
tasks[0].id = created_task_version_ids[0]
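
# Illustrative aside (not part of the diff): if tasks[0].id is already set
# (the task exists on the platform), task_data stays None, so the call above
# only links job_id to the taskset and no new task version is created.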

# Single task - use EvalContext.from_task()
ctx = EvalContext.from_task(
tasks[0],
name=name,
trace_id=trace_id,
api_key=api_key,
job_id=job_id,
job_id=job_id_for_run,
group_id=group_id,
variants=variant_combos[0],
code_snippet=code_snippet,
@@ -273,13 +312,33 @@
job_url = f"https://hud.ai/jobs/{implicit_job_id}"

# Send job enter (sync request before traces start)
_send_job_enter(
# Serialize tasks for auto-add to taskset (only tasks without existing backend id).
# For v5 scenario tasks, the backend task_version_id is carried in Task.id.
tasks_data = None
tasks_to_create: list[Task] = []
if taskset and tasks:
tasks_to_create = [t for t in tasks if not t.id]
tasks_data = [
t.model_dump(mode="json", exclude_none=True)
for t in tasks
if not t.id  # skip tasks that already exist on the platform
]
created_task_version_ids = _send_job_enter(
job_id=implicit_job_id,
name=eval_name,
variants=variants,
group=group,
api_key=api_key,
taskset=taskset,
tasks=tasks_data,
)
if created_task_version_ids and tasks_to_create:
# Assign backend IDs back onto the in-memory tasks so trace enter includes
# task_version_id.
for task_obj, task_version_id in zip(
tasks_to_create, created_task_version_ids, strict=False
):
task_obj.id = task_version_id
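
# Illustrative aside (not part of the diff): with tasks = [t1(id="tv_1"),
# t2(id=None), t3(id=None)], only t2 and t3 are serialized into tasks_data;
# if the backend returns ["tv_2", "tv_3"], the zip above back-fills t2.id and
# t3.id. This assumes the backend returns ids in submission order, and
# strict=False tolerates a shorter-than-expected response.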

# Print job URL (not individual trace URLs)
if not quiet:
2 changes: 2 additions & 0 deletions hud/eval/types.py
@@ -53,6 +53,8 @@ class JobEnterPayload(BaseModel):
name: str | None = None
variants: dict[str, Any] | None = None # Full variant config
group: int | None = None
taskset: str | None = None # taskset slug to associate job with
tasks: list[dict[str, Any]] | None = None # task definitions to add to taskset


__all__ = [
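
Putting the new payload fields together: a job that joins a taskset and registers two not-yet-uploaded tasks would send something like the following. Field values are illustrative, and the task dict shape is only a placeholder for whatever Task.model_dump(mode="json", exclude_none=True) produces:

# Sketch of the payload posted to /trace/job/{job_id}/enter; the "prompt"
# keys are hypothetical stand-ins for the serialized Task fields.
payload = JobEnterPayload(
    name="weekly-regression run",
    group=1,
    taskset="weekly-regression",
    tasks=[{"prompt": "Book a flight"}, {"prompt": "File an expense"}],
)
body = payload.model_dump(exclude_none=True)  # None fields are dropped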