Merged
28 commits
621d2f0
feat: add scenario improvements and agent factory
lorenss-m Jan 8, 2026
d70d2b0
scenario as tool simplification
lorenss-m Jan 8, 2026
b2de659
change agent resolution for easier model switching
lorenss-m Jan 8, 2026
32b3118
agent tool does not get optional params (eval params)
lorenss-m Jan 8, 2026
8f9f2ba
fix tests
lorenss-m Jan 8, 2026
b957818
change routing logic and add tests
lorenss-m Jan 8, 2026
219f255
lint
lorenss-m Jan 8, 2026
627a6e3
add convenience back
lorenss-m Jan 8, 2026
85aad98
fix edge cases
lorenss-m Jan 8, 2026
8e6b186
mock path fixes
lorenss-m Jan 8, 2026
760f6c8
change import paths
lorenss-m Jan 8, 2026
2a5f10b
format
lorenss-m Jan 8, 2026
99fd3c2
fix agent edge cases
lorenss-m Jan 8, 2026
d74edb4
nested tracing
lorenss-m Jan 8, 2026
6415762
fix tests
lorenss-m Jan 8, 2026
7027550
agent tool examples
lorenss-m Jan 8, 2026
332f42d
docs link
lorenss-m Jan 8, 2026
5325d29
fix env connector
lorenss-m Jan 8, 2026
f17a93b
add routing and tools updates for remote
lorenss-m Jan 8, 2026
c4188d2
add tests to remote connectors and improve connection
lorenss-m Jan 8, 2026
9f95e0f
more precise tests
lorenss-m Jan 8, 2026
f9e18eb
fix: strip format field from JSON schemas for OpenAI strict mode
lorenss-m Jan 8, 2026
757d645
Merge remote-tracking branch 'origin/main' into feat/scenario-improve…
lorenss-m Jan 9, 2026
f3c9e0c
move
lorenss-m Jan 9, 2026
2f67cde
Merge main into feat/scenario-improvements, combine cookbooks
lorenss-m Jan 9, 2026
ff91f24
rm commit
lorenss-m Jan 9, 2026
b1c91b5
format
lorenss-m Jan 9, 2026
cd0cc40
provider fix
lorenss-m Jan 9, 2026
57 changes: 52 additions & 5 deletions hud/agents/__init__.py
@@ -1,19 +1,66 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any

from .base import MCPAgent
from .openai import OpenAIAgent
from .openai_chat import OpenAIChatAgent
from .operator import OperatorAgent
from .resolver import resolve_cls

# Note: These agents are not exported here to avoid requiring optional dependencies.
# Import directly if needed:
# from hud.agents.claude import ClaudeAgent  # requires anthropic
# from hud.agents.gemini import GeminiAgent  # requires google-genai
# from hud.agents.gemini_cua import GeminiCUAAgent  # requires google-genai
if TYPE_CHECKING:
    from hud.types import AgentType

__all__ = [
    "MCPAgent",
    "OpenAIAgent",
    "OpenAIChatAgent",
    "OperatorAgent",
    "create_agent",
    "resolve_cls",
]


def create_agent(model: str | AgentType, **kwargs: Any) -> MCPAgent:
    """Create an agent from a model string or AgentType.

    Args:
        model: AgentType ("claude"), or gateway model name ("gpt-4o").
        **kwargs: Params passed to agent.create().

    Example:
        ```python
        agent = create_agent("claude", model="claude-sonnet-4-5")
        agent = create_agent("gpt-4o")  # auto-configures gateway
        ```
    """
    from hud.types import AgentType as AT

    # AgentType enum → just create
    if isinstance(model, AT):
        return model.cls.create(**kwargs)

    # Resolve class and optional gateway info
    agent_cls, gateway_info = resolve_cls(model)

    # If not a gateway model, just create
    if gateway_info is None:
        return agent_cls.create(**kwargs)

    # Build gateway params
    model_id = gateway_info.get("model") or gateway_info.get("id") or model
    kwargs.setdefault("model", model_id)
    kwargs.setdefault("validate_api_key", False)

    # Build model_client based on provider
    if "model_client" not in kwargs and "openai_client" not in kwargs:
        from hud.agents.gateway import build_gateway_client

        provider = gateway_info.get("provider", "openai_compatible")
        client = build_gateway_client(provider)

        # OpenAIChatAgent uses openai_client key, others use model_client
        key = "openai_client" if agent_cls == OpenAIChatAgent else "model_client"
        kwargs[key] = client

    return agent_cls.create(**kwargs)
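
For reference, a minimal usage sketch of the new factory (model names follow the docstring example above and are illustrative; a HUD API key is assumed for the gateway path):

```python
from hud.agents import create_agent

# Known AgentType value: resolves straight to the agent class.
agent = create_agent("claude", model="claude-sonnet-4-5")

# Gateway model name: resolve_cls() looks it up and a gateway-backed
# client is built automatically unless model_client/openai_client is passed.
agent = create_agent("gpt-4o")
```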
43 changes: 43 additions & 0 deletions hud/agents/gateway.py
@@ -0,0 +1,43 @@
"""Gateway client utilities for HUD inference gateway."""

from __future__ import annotations

from typing import Any


def build_gateway_client(provider: str) -> Any:
"""Build a client configured for HUD gateway routing.

Args:
provider: Provider name ("anthropic", "openai", "gemini", etc.)

Returns:
Configured async client for the provider.
"""
from hud.settings import settings

provider = provider.lower()

if provider == "anthropic":
from anthropic import AsyncAnthropic

return AsyncAnthropic(api_key=settings.api_key, base_url=settings.hud_gateway_url)

if provider == "gemini":
from google import genai
from google.genai.types import HttpOptions

return genai.Client(
api_key="PLACEHOLDER",
http_options=HttpOptions(
api_version="v1beta",
base_url=settings.hud_gateway_url,
headers={"Authorization": f"Bearer {settings.api_key}"},
),
)

# OpenAI-compatible (openai, azure, together, groq, fireworks, etc.)
from openai import AsyncOpenAI

return AsyncOpenAI(api_key=settings.api_key, base_url=settings.hud_gateway_url)
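
A hedged sketch of using the helper directly with an agent class (the ClaudeAgent import path follows the note in hud/agents/__init__.py and needs the optional anthropic dependency; the create() kwargs mirror those set by create_agent above):

```python
from hud.agents.claude import ClaudeAgent  # requires anthropic
from hud.agents.gateway import build_gateway_client

# AsyncAnthropic client pointed at the HUD gateway instead of the default API host.
client = build_gateway_client("anthropic")
agent = ClaudeAgent.create(model_client=client, validate_api_key=False)
```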

70 changes: 70 additions & 0 deletions hud/agents/resolver.py
@@ -0,0 +1,70 @@
"""Model resolution - maps model strings to agent classes."""

from __future__ import annotations

from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
from hud.agents.base import MCPAgent

__all__ = ["resolve_cls"]

_models_cache: list[dict[str, Any]] | None = None

# Provider name → AgentType value (only anthropic differs)
_PROVIDER_TO_AGENT = {"anthropic": "claude"}


def _fetch_gateway_models() -> list[dict[str, Any]]:
"""Fetch available models from HUD gateway (cached)."""
global _models_cache
if _models_cache is not None:
return _models_cache

import httpx

from hud.settings import settings

if not settings.api_key:
return []

try:
resp = httpx.get(
f"{settings.hud_gateway_url}/models",
headers={"Authorization": f"Bearer {settings.api_key}"},
timeout=10.0,
)
resp.raise_for_status()
data = resp.json()
_models_cache = data.get("data", data) if isinstance(data, dict) else data
return _models_cache or []
except Exception:
return []


def resolve_cls(model: str) -> tuple[type[MCPAgent], dict[str, Any] | None]:
"""Resolve model string to (agent_class, gateway_info).
Returns:
(agent_class, None) for known AgentTypes
(agent_class, gateway_model_info) for gateway models
"""
from hud.types import AgentType

# Known AgentType → no gateway info
try:
return AgentType(model).cls, None
except ValueError:
pass

# Gateway lookup
for m in _fetch_gateway_models():
if model in (m.get("id"), m.get("name"), m.get("model")):
provider = m.get("provider", "openai_compatible").lower()
agent_str = _PROVIDER_TO_AGENT.get(provider, provider)
try:
return AgentType(agent_str).cls, m
except ValueError:
return AgentType.OPENAI_COMPATIBLE.cls, m

raise ValueError(f"Model '{model}' not found")
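
Resolution order is AgentType first, then the cached gateway lookup; an illustrative sketch (the gateway model name is assumed to appear in the /models listing):

```python
from hud.agents.resolver import resolve_cls

cls, info = resolve_cls("claude")  # known AgentType -> (agent class, None)
cls, info = resolve_cls("gpt-4o")  # gateway model   -> (agent class, model record)

# `info` is the raw gateway record ("id"/"name"/"model", "provider", ...),
# which create_agent() uses to pick the client type and default model id.
```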
54 changes: 17 additions & 37 deletions hud/cli/eval.py
@@ -338,47 +338,27 @@ def get_agent_kwargs(self) -> dict[str, Any]:

# Configure gateway mode - route LLM API calls through HUD gateway
if self.gateway:
hud_api_key = settings.api_key
if not hud_api_key:
if not settings.api_key:
raise typer.Exit(1) # Already validated in validate_api_keys()

if self.agent_type == AgentType.CLAUDE:
from anthropic import AsyncAnthropic

kwargs["model_client"] = AsyncAnthropic(
api_key=hud_api_key,
base_url=settings.hud_gateway_url,
)
hud_console.info("🌐 Using HUD Gateway for Claude API")
elif self.agent_type in (AgentType.OPENAI, AgentType.OPERATOR):
from openai import AsyncOpenAI
from hud.agents.gateway import build_gateway_client

kwargs["model_client"] = AsyncOpenAI(
api_key=hud_api_key,
base_url=settings.hud_gateway_url,
)
hud_console.info("🌐 Using HUD Gateway for OpenAI API")
elif self.agent_type == AgentType.OPENAI_COMPATIBLE:
from openai import AsyncOpenAI
# Map AgentType to provider
agent_to_provider = {
AgentType.CLAUDE: "anthropic",
AgentType.OPENAI: "openai",
AgentType.OPERATOR: "openai",
AgentType.GEMINI: "gemini",
AgentType.GEMINI_CUA: "gemini",
AgentType.OPENAI_COMPATIBLE: "openai",
}
provider = agent_to_provider.get(self.agent_type, "openai")
client = build_gateway_client(provider)

kwargs["openai_client"] = AsyncOpenAI(
api_key=hud_api_key,
base_url=settings.hud_gateway_url,
)
hud_console.info("🌐 Using HUD Gateway for OpenAI-compatible API")
elif self.agent_type in (AgentType.GEMINI, AgentType.GEMINI_CUA):
from google import genai
from google.genai.types import HttpOptions

kwargs["model_client"] = genai.Client(
api_key="PLACEHOLDER",
http_options=HttpOptions(
api_version="v1beta",
base_url=settings.hud_gateway_url,
headers={"Authorization": f"Bearer {hud_api_key}"},
),
)
hud_console.info("🌐 Using HUD Gateway for Gemini API")
# OpenAI-compatible uses openai_client key
is_oai_compat = self.agent_type == AgentType.OPENAI_COMPATIBLE
kwargs["openai_client" if is_oai_compat else "model_client"] = client
hud_console.info(f"🌐 Using HUD Gateway for {provider} API")

return kwargs

2 changes: 1 addition & 1 deletion hud/datasets/loader.py
@@ -303,7 +303,7 @@ def save_tasks(
)
response.raise_for_status()
data = response.json()
taskset_id = data.get("evalset_id") or data.get("id") or name
taskset_id = data.get("taskset_id") or data.get("evalset_id") or data.get("id") or name
logger.info("Saved %d tasks to taskset: %s", len(tasks), taskset_id)
return taskset_id
except httpx.HTTPStatusError as e:
19 changes: 9 additions & 10 deletions hud/datasets/runner.py
@@ -86,19 +86,17 @@ async def run_dataset(
if not task_list:
raise ValueError("No tasks to run")

# Resolve agent class
agent_type_enum = agent_type if isinstance(agent_type, AgentType) else AgentType(agent_type)
agent_cls = agent_type_enum.cls

# Use hud.eval() for both single and parallel execution
async with hud.eval(
task_list,
group=group_size,
max_concurrent=max_concurrent,
quiet=quiet,
) as ctx:
# Create agent fresh for each context (ensures correct tool initialization)
agent = agent_cls.create(**(agent_params or {}))
# Create agent (handles AgentType, gateway models, etc.)
from hud.agents import create_agent

agent = create_agent(agent_type, **(agent_params or {}))
await agent.run(ctx, max_steps=max_steps)
# Reward is computed by EvalContext.__aexit__ from evaluate tools

@@ -112,7 +110,7 @@
async def run_single_task(
task: Task,
*,
agent_type: AgentType,
agent_type: str | AgentType,
agent_params: dict[str, Any] | None = None,
max_steps: int = 10,
job_id: str | None = None,
@@ -198,9 +196,10 @@ async def run_single_task(
if ctx.system_prompt and "system_prompt" not in final_agent_params:
final_agent_params["system_prompt"] = ctx.system_prompt

# Create agent inside ctx so it has access to context-derived values
agent_cls = agent_type.cls
agent = agent_cls.create(**final_agent_params)
# Create agent (handles AgentType, gateway models, etc.)
from hud.agents import create_agent

agent = create_agent(agent_type, **final_agent_params)

# Store metadata if provided
if metadata:
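
With resolution delegated to create_agent, the runner now accepts either form; a sketch with a hypothetical task (environment, scenario, and args are placeholders):

```python
from hud.datasets.runner import run_single_task
from hud.eval.task import Task
from hud.types import AgentType

task = Task(env=my_env, scenario="checkout", args={"item_id": 42})  # hypothetical

# Inside an async context; both forms resolve through create_agent().
await run_single_task(task, agent_type=AgentType.CLAUDE, max_steps=10)
await run_single_task(task, agent_type="gpt-4o", max_steps=10)
```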
6 changes: 3 additions & 3 deletions hud/datasets/tests/test_loader.py
@@ -22,10 +22,10 @@ def test_load_tasks_success(
mock_settings.api_key = "test_key"

mock_response = MagicMock()
# EvalsetTasksResponse format: tasks keyed by task ID
# TasksetTasksResponse format: tasks keyed by task ID
mock_response.json.return_value = {
"evalset_id": "evalset-123",
"evalset_name": "test-dataset",
"taskset_id": "taskset-123",
"taskset_name": "test-dataset",
"tasks": {
"task-1": {
"env": {"name": "test"},
10 changes: 10 additions & 0 deletions hud/eval/context.py
@@ -302,10 +302,20 @@ def from_task(
code_snippet: Code being evaluated
trace: Whether to send traces to backend
quiet: Whether to suppress output

Raises:
ValueError: If task.args is None (template tasks cannot be run directly)
"""
from hud.environment import Environment
from hud.eval.task import build_eval_name

# Validate that task has args (not a template)
if task.args is None:
raise ValueError(
f"Cannot run task with args=None (this is a template). "
f"Provide args when creating the task: env('{task.scenario}', **args)"
)

eval_name = name or build_eval_name(task.scenario, task.args)

# task.env is guaranteed to be Environment after Task.__post_init__
7 changes: 5 additions & 2 deletions hud/eval/task.py
@@ -148,7 +148,10 @@ class Task(BaseModel):
env: Any = Field(default=None) # Typed as Any for input flexibility, validated below
scenario: str | None = None
id: str | None = None
args: dict[str, Any] = Field(default_factory=dict)
args: dict[str, Any] | None = Field(
default=None,
description="Scenario arguments. None indicates a template (args filled in later).",
)
validation: list[MCPToolCall] | None = None

# Agent config - settings passed to agent (system_prompt, etc.)
@@ -335,6 +338,6 @@ def copy(self) -> Task:
id=self.id,
env=self.env, # Share reference
scenario=self.scenario,
args=self.args.copy() if self.args else {},
args=self.args.copy() if self.args else None,
validation=self.validation.copy() if self.validation else None,
)
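
A sketch of the template convention these args changes introduce, paired with the new from_task guard in hud/eval/context.py (environment and argument names are hypothetical):

```python
from hud.eval.task import Task

template = Task(env=my_env, scenario="checkout")                        # args=None -> template
runnable = Task(env=my_env, scenario="checkout", args={"item_id": 42})  # ready to run

# EvalContext.from_task(template) now raises ValueError instead of silently
# running with empty args; from_task(runnable) behaves as before.
```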
2 changes: 2 additions & 0 deletions hud/tools/__init__.py
@@ -4,6 +4,7 @@

from typing import TYPE_CHECKING, Any

from .agent import AgentTool
from .base import BaseHub, BaseTool
from .bash import BashTool
from .edit import EditTool
@@ -21,6 +22,7 @@
)

__all__ = [
"AgentTool",
"AnthropicComputerTool",
"BaseHub",
"BaseTool",