Commit afeb3a5
Merge pull request #261 from hud-evals/feat/scenario-improvements
Feat/scenario improvements
1 parent 88f5732 commit afeb3a5

File tree

24 files changed: +1855 -74 lines changed

docs/cookbooks/ops-diagnostics.mdx

Lines changed: 478 additions & 0 deletions
Large diffs are not rendered by default.

docs/docs.json

Lines changed: 3 additions & 2 deletions
@@ -33,7 +33,7 @@
       "icon": "code",
       "versions": [
         {
-          "version": "0.5.2",
+          "version": "0.5.3",
           "groups": [
             {
               "group": "Get Started",
@@ -63,7 +63,8 @@
         {
           "group": "Cookbooks",
           "pages": [
-            "cookbooks/codex-coding"
+            "cookbooks/codex-coding",
+            "cookbooks/ops-diagnostics"
           ]
         },
         {

docs/reference/environments.mdx

Lines changed: 75 additions & 0 deletions
@@ -266,6 +266,81 @@ env.unmock() # Disable mock mode
| `mock_tool(name, output)` | Set specific mock output |
| `is_mock` | Check if mock mode is enabled |

## Serving as MCP Server

An `Environment` can serve its tools over the MCP protocol, either as a standalone server or mounted on an existing server.

### serve()

Start a standalone MCP server:

```python
from hud import Environment

env = Environment("my-env")

@env.tool()
def greet(name: str) -> str:
    return f"Hello, {name}!"

# Run as MCP server (blocking)
env.serve()
```

| Parameter | Type | Description | Default |
|-----------|------|-------------|---------|
| `transport` | `Literal["stdio", "sse", "streamable-http"]` | Transport protocol | `"streamable-http"` |
| `host` | `str` | Host address to bind | `"0.0.0.0"` |
| `port` | `int` | Port to bind | `8000` |

```python
# Serve over stdio (for CLI tools)
env.serve(transport="stdio")

# Serve over HTTP on a custom port
env.serve(transport="streamable-http", host="0.0.0.0", port=8765)
```

### http_app()

Get a Starlette/ASGI app to mount on an existing FastAPI server:

```python
from fastapi import FastAPI

from hud import Environment

app = FastAPI()
env = Environment("my-env")

@env.tool()
def my_tool(arg: str) -> str:
    return f"Got: {arg}"

# Mount the HUD environment's MCP endpoint at /mcp
app.mount("/mcp", env.http_app())

# Your other FastAPI routes work normally
@app.get("/health")
def health():
    return {"status": "ok"}
```

| Parameter | Type | Description | Default |
|-----------|------|-------------|---------|
| `path` | `str \| None` | Internal path for the MCP endpoint | `"/"` |
| `transport` | `Literal["http", "streamable-http", "sse"]` | Transport protocol | `"http"` |
| `middleware` | `list[ASGIMiddleware] \| None` | Starlette middleware | `None` |
| `json_response` | `bool \| None` | Use JSON response format | `None` |
| `stateless_http` | `bool \| None` | Use stateless HTTP mode | `None` |

MCP clients can then connect at `http://your-server/mcp`:

```python
# Client connecting to the mounted environment
env.connect_url("http://localhost:8000/mcp")
```

## Properties

| Property | Type | Description |

docs/reference/tools.mdx

Lines changed: 87 additions & 0 deletions
@@ -69,6 +69,93 @@ async def url_match(url: str) -> EvaluationResult:
# Agents call: evaluators(name="url_match", arguments={"url": "..."})
```

## Agent Tools

### AgentTool

```python
from hud.tools import AgentTool
```

Wraps a scenario as a tool that can be called by another agent. Essential for building **hierarchical agent systems** where an orchestrator delegates to specialized subagents.

**Constructor Parameters:**

| Parameter | Type | Description | Default |
|-----------|------|-------------|---------|
| `task` | `Task` | Task template from `env("scenario_name")` | Required |
| `model` | `str` | Model for the subagent (via gateway) | `None` |
| `agent` | `type[MCPAgent]` | Custom agent class | `None` |
| `agent_params` | `dict` | Additional agent parameters | `{}` |
| `name` | `str` | Tool name for the orchestrator | From scenario |
| `description` | `str` | Tool description | Auto-generated |
| `trace` | `bool` | Enable tracing for standalone runs | `False` |

<Note>Provide either `model` or `agent`, not both.</Note>

**Eval-Only Parameters:**

Parameters annotated `| None = None` are hidden from the orchestrator but available for evaluation:

```python
@env.scenario("investigate")
async def investigate(
    query: str,  # Visible - orchestrator passes this
    expected_finding: str | None = None,  # Hidden - only used in eval scoring
):
    response = yield f"Investigate: {query}"

    # Scoring uses expected_finding, but the orchestrator never sees it
    if expected_finding and response:
        yield 1.0 if expected_finding in response else 0.5
    else:
        yield 1.0 if response else 0.0
```

**Usage:**

```python
from hud import Environment
from hud.tools import AgentTool

# Subagent environment with a scenario
sentry_env = Environment(name="sentry-agent")

@sentry_env.scenario("investigate")
async def investigate_sentry(query: str):
    yield f"Investigate Sentry: {query}"

# Create the orchestrator
orchestrator = Environment(name="orchestrator")

# Wrap the subagent scenario as a tool
tool = AgentTool(
    sentry_env("investigate"),  # Task template
    model="gpt-4o-mini",
    name="investigate_sentry",
    description="Investigate errors in Sentry",
)
orchestrator.add_tool(tool.mcp)

# Now the orchestrator agent can call investigate_sentry(query="...")
```

**Trace Continuity:**

When called from within an eval context, AgentTool automatically:

1. Inherits the parent's trace_id
2. Skips duplicate trace registration
3. Routes all inference/tool calls to the parent trace

```python
async with hud.eval(task) as ctx:
    agent = create_agent("gpt-4o")
    result = await agent.run(ctx)
    # All subagent activity appears in this single trace
```

**See Also:** [Ops Diagnostics Cookbook](/cookbooks/ops-diagnostics) for a complete hierarchical agent example.

---

## Core Tools

### BashTool

hud/agents/__init__.py

Lines changed: 69 additions & 6 deletions
@@ -1,19 +1,82 @@
 from __future__ import annotations
 
+from typing import Any
+
 from .base import MCPAgent
 from .openai import OpenAIAgent
 from .openai_chat import OpenAIChatAgent
 from .operator import OperatorAgent
 
-# Note: These agents are not exported here to avoid requiring optional dependencies.
-# Import directly if needed:
-# from hud.agents.claude import ClaudeAgent  # requires anthropic
-# from hud.agents.gemini import GeminiAgent  # requires google-genai
-# from hud.agents.gemini_cua import GeminiCUAAgent  # requires google-genai
-
 __all__ = [
     "MCPAgent",
     "OpenAIAgent",
     "OpenAIChatAgent",
     "OperatorAgent",
+    "create_agent",
 ]
+
+
+def create_agent(model: str, **kwargs: Any) -> MCPAgent:
+    """Create an agent for a gateway model.
+
+    This routes ALL requests through the HUD gateway. For direct API access
+    (using your own API keys), use the agent classes directly.
+
+    Args:
+        model: Model name (e.g., "gpt-4o", "claude-sonnet-4-5").
+        **kwargs: Additional params passed to agent.create().
+
+    Returns:
+        Configured MCPAgent instance with gateway routing.
+
+    Example:
+        ```python
+        # Gateway routing (recommended)
+        agent = create_agent("gpt-4o")
+        agent = create_agent("claude-sonnet-4-5", temperature=0.7)
+
+        # Direct API access (use agent classes)
+        from hud.agents.claude import ClaudeAgent
+
+        agent = ClaudeAgent.create(model="claude-sonnet-4-5")
+        ```
+    """
+    from hud.agents.gateway import build_gateway_client
+    from hud.agents.resolver import resolve_cls
+
+    # Resolve class and gateway info
+    agent_cls, gateway_info = resolve_cls(model)
+
+    # Get model ID from gateway info or use the input
+    model_id = model
+    if gateway_info:
+        model_id = gateway_info.get("model") or gateway_info.get("id") or model
+
+    # Determine provider: from gateway info, or infer from the agent class
+    if gateway_info:
+        provider = gateway_info.get("provider") or "openai"
+    else:
+        # Map agent class to provider for known types
+        from hud.agents.claude import ClaudeAgent
+        from hud.agents.gemini import GeminiAgent
+
+        _AGENT_TO_PROVIDER = {
+            ClaudeAgent: "anthropic",
+            GeminiAgent: "google",
+        }
+        provider = _AGENT_TO_PROVIDER.get(agent_cls, "openai")
+
+    client = build_gateway_client(provider)
+
+    # Set up kwargs
+    kwargs.setdefault("model", model_id)
+
+    # Use the correct client key based on agent type
+    if agent_cls == OpenAIChatAgent:
+        kwargs.setdefault("openai_client", client)
+    else:
+        # Claude and other agents use model_client and validate_api_key
+        kwargs.setdefault("model_client", client)
+        kwargs.setdefault("validate_api_key", False)
+
+    return agent_cls.create(**kwargs)
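The client-key branching at the end of `create_agent` can be summarized as a small pure function. A simplified stand-in for that wiring (not the shipped code; `client_kwargs` and the `agent_kind` strings are illustrative):

```python
def client_kwargs(agent_kind: str, client: object, model_id: str) -> dict:
    """Mirror create_agent()'s wiring: chat-completions agents take an
    `openai_client`; other agents take `model_client` and, because the
    gateway key replaces provider keys, skip API-key validation."""
    kwargs = {"model": model_id}
    if agent_kind == "openai_chat":
        kwargs["openai_client"] = client
    else:
        kwargs["model_client"] = client
        kwargs["validate_api_key"] = False
    return kwargs

print(client_kwargs("claude", object(), "claude-sonnet-4-5")["validate_api_key"])  # False
```

Because the real code uses `setdefault`, callers can still override any of these keys explicitly.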

hud/agents/gateway.py

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
"""Gateway client utilities for HUD inference gateway."""

from __future__ import annotations

from typing import Any


def build_gateway_client(provider: str) -> Any:
    """Build a client configured for HUD gateway routing.

    Args:
        provider: Provider name ("anthropic", "openai", "gemini", etc.)

    Returns:
        Configured async client for the provider.
    """
    from hud.settings import settings

    provider = provider.lower()

    if provider == "anthropic":
        from anthropic import AsyncAnthropic

        return AsyncAnthropic(api_key=settings.api_key, base_url=settings.hud_gateway_url)

    if provider == "gemini":
        from google import genai
        from google.genai.types import HttpOptions

        return genai.Client(
            api_key="PLACEHOLDER",
            http_options=HttpOptions(
                api_version="v1beta",
                base_url=settings.hud_gateway_url,
                headers={"Authorization": f"Bearer {settings.api_key}"},
            ),
        )

    # OpenAI-compatible (openai, azure, together, groq, fireworks, etc.)
    from openai import AsyncOpenAI

    return AsyncOpenAI(api_key=settings.api_key, base_url=settings.hud_gateway_url)
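The dispatch above reduces to a small config table: native SDK clients for anthropic and gemini (with gemini routing auth through a header rather than its api_key field), and an OpenAI-compatible client for everything else. A sketch of that shape (`gateway_config` is a hypothetical summary helper, not part of the module):

```python
def gateway_config(provider: str, api_key: str, gateway_url: str) -> dict:
    """Summarize what each branch of build_gateway_client() configures:
    which SDK is used, where the key goes, and the shared base URL."""
    provider = provider.lower()
    if provider == "anthropic":
        return {"sdk": "anthropic", "api_key": api_key, "base_url": gateway_url}
    if provider == "gemini":
        # Gemini auth travels in an Authorization header, not api_key
        return {
            "sdk": "google-genai",
            "api_key": "PLACEHOLDER",
            "base_url": gateway_url,
            "headers": {"Authorization": f"Bearer {api_key}"},
        }
    # Everything else is OpenAI-compatible
    return {"sdk": "openai", "api_key": api_key, "base_url": gateway_url}
```

The common thread is that every client points its `base_url` at the gateway while the HUD API key carries authentication.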

hud/agents/resolver.py

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
"""Model resolution - maps model strings to agent classes."""

from __future__ import annotations

from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    from hud.agents.base import MCPAgent

__all__ = ["resolve_cls"]

_models_cache: list[dict[str, Any]] | None = None

# Provider name → AgentType value (only anthropic differs)
_PROVIDER_TO_AGENT = {"anthropic": "claude"}


def _fetch_gateway_models() -> list[dict[str, Any]]:
    """Fetch available models from HUD gateway (cached)."""
    global _models_cache
    if _models_cache is not None:
        return _models_cache

    import httpx

    from hud.settings import settings

    if not settings.api_key:
        return []

    try:
        resp = httpx.get(
            f"{settings.hud_gateway_url}/models",
            headers={"Authorization": f"Bearer {settings.api_key}"},
            timeout=10.0,
        )
        resp.raise_for_status()
        data = resp.json()
        _models_cache = data.get("data", data) if isinstance(data, dict) else data
        return _models_cache or []
    except Exception:
        return []


def resolve_cls(model: str) -> tuple[type[MCPAgent], dict[str, Any] | None]:
    """Resolve a model string to (agent_class, gateway_info).

    Returns:
        (agent_class, None) for known AgentTypes
        (agent_class, gateway_model_info) for gateway models
    """
    from hud.types import AgentType

    # Known AgentType → no gateway info
    try:
        return AgentType(model).cls, None
    except ValueError:
        pass

    # Gateway lookup
    for m in _fetch_gateway_models():
        if model in (m.get("id"), m.get("name"), m.get("model")):
            provider = (m.get("provider") or "openai_compatible").lower()
            agent_str = _PROVIDER_TO_AGENT.get(provider, provider)
            try:
                return AgentType(agent_str).cls, m
            except ValueError:
                return AgentType.OPENAI_COMPATIBLE.cls, m

    raise ValueError(f"Model '{model}' not found")
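The resolution order — known agent type first, then gateway catalog lookup, then error — can be exercised against a static catalog. A self-contained sketch of the same logic (the `KNOWN_TYPES` set and `CATALOG` entries are made up for illustration; the real code consults `AgentType` and the live `/models` endpoint):

```python
# Hypothetical stand-ins for AgentType values and the gateway /models payload
KNOWN_TYPES = {"openai", "claude", "operator", "openai_compatible"}
CATALOG = [
    {"id": "gpt-4o", "provider": "openai"},
    {"id": "claude-sonnet-4-5", "provider": "anthropic"},
    {"id": "llama-3.1-70b", "provider": "together"},
]
PROVIDER_TO_AGENT = {"anthropic": "claude"}  # only anthropic differs

def resolve(model: str):
    """Mimic resolve_cls(): exact agent-type match first, then catalog
    lookup by id/name/model, with unknown providers falling back to the
    OpenAI-compatible agent."""
    if model in KNOWN_TYPES:
        return model, None
    for m in CATALOG:
        if model in (m.get("id"), m.get("name"), m.get("model")):
            provider = (m.get("provider") or "openai_compatible").lower()
            agent = PROVIDER_TO_AGENT.get(provider, provider)
            if agent not in KNOWN_TYPES:
                agent = "openai_compatible"
            return agent, m
    raise ValueError(f"Model '{model}' not found")

print(resolve("claude-sonnet-4-5")[0])  # claude
print(resolve("llama-3.1-70b")[0])  # openai_compatible
```

Note that because `_fetch_gateway_models()` swallows network errors and returns `[]`, an unreachable gateway surfaces as the final "not found" error rather than an HTTP exception.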
