diff --git a/docs/cookbooks/ops-diagnostics.mdx b/docs/cookbooks/ops-diagnostics.mdx new file mode 100644 index 00000000..558d3d2d --- /dev/null +++ b/docs/cookbooks/ops-diagnostics.mdx @@ -0,0 +1,478 @@ +--- +title: "Ops Diagnostics Agent" +description: "How we built a hierarchical agent to diagnose production issues across our infrastructure" +icon: "stethoscope" +--- + +At HUD, we run a complex stack: Sentry for errors, Supabase for data, Railway for deployments, and Kubernetes for orchestration. When something breaks, we wanted an agent that could investigate across all services and provide a unified diagnosis. + +This cookbook walks through how we built it—focusing on **environment design**, **hierarchical delegation**, and **practical patterns** for production agent systems. + +## Why Hierarchical? + +When you connect multiple MCP servers to a single environment, the agent sees all tools at once. For diagnostics across four services, this meant 60+ tools in a flat list. The cognitive load made it harder for the model to select the right tool for the job. + +We restructured into a hierarchy: an orchestrator that delegates to specialized subagents. + +```mermaid +flowchart TD + subgraph orch["Orchestrator"] + O["4 subagent tools"] + end + + subgraph sentry["Sentry Agent"] + S1["search_issues"] + S2["get_issue_details"] + S3["analyze_with_seer"] + end + + subgraph supabase["Supabase Agent"] + SU1["list_tables"] + SU2["execute_sql"] + SU3["get_logs"] + end + + subgraph railway["Railway Agent"] + R1["list_projects"] + R2["get_deployments"] + R3["get_logs"] + end + + subgraph kubectl["kubectl Agent"] + K1["get_pods"] + K2["get_events"] + K3["describe_pod"] + end + + O --> sentry + O --> supabase + O --> railway + O --> kubectl +``` + +The orchestrator sees only 4 tools—one per specialist. Each specialist has a focused toolset for its domain. + +## Environment Design + +Good environment design is the foundation. Each subagent is an `Environment` with: +- A **focused toolset** (only what's needed for this domain) +- A **single scenario** that defines the interface +- **Read-only constraints** for safety + +### Connecting to MCP Servers + +For services with official MCP servers (Sentry, Supabase), connect via `connect_mcp_config`: + +```python +# environments/sentry.py +from hud import Environment +import os +import platform + +sentry_env = Environment(name="sentry-agent") + +IS_WINDOWS = platform.system() == "Windows" +token = os.getenv("SENTRY_AUTH_TOKEN") + +if token: + config = { + "command": "cmd" if IS_WINDOWS else "npx", + "args": ["/c", "npx", "-y", "@sentry/mcp-server@latest"] if IS_WINDOWS + else ["-y", "@sentry/mcp-server@latest"], + "env": {"SENTRY_ACCESS_TOKEN": token} + } + sentry_env.connect_mcp_config({"sentry": config}) +``` + +### Custom Tools When Needed + +Railway's MCP server requires browser OAuth—not ideal for headless agents. 
We built custom tools using their GraphQL API: + +```python +# environments/tools/railway.py +from hud.server import MCPRouter +import httpx +import os + +router = MCPRouter() +RAILWAY_API = "https://backboard.railway.com/graphql/v2" + + +async def _graphql(query: str, variables: dict | None = None) -> dict: + token = os.getenv("RAILWAY_API_TOKEN") + async with httpx.AsyncClient() as client: + resp = await client.post( + RAILWAY_API, + headers={"Authorization": f"Bearer {token}"}, + json={"query": query, "variables": variables} + ) + return resp.json() + + +@router.tool() +async def railway_list_projects() -> dict: + """List all projects with their services.""" + return await _graphql(""" + query { + projects { + edges { node { id name } } + } + } + """) + + +@router.tool() +async def railway_get_deployment_logs(deployment_id: str) -> dict: + """Get logs for a deployment.""" + return await _graphql(""" + query($id: String!) { + deploymentLogs(deploymentId: $id) { + ... on Log { message timestamp severity } + } + } + """, {"id": deployment_id}) +``` + +Then include the router in your environment: + +```python +# environments/railway.py +from hud import Environment +from .tools.railway import router + +railway_env = Environment(name="railway-agent") +railway_env.include_router(router) +``` + +### Defining the Scenario + +The scenario is the contract between orchestrator and subagent: + +```python +@sentry_env.scenario("investigate") +async def investigate_issue( + query: str, # Orchestrator provides this + expected_finding: str | None = None, # Hidden from orchestrator (eval-only) +): + """Investigate errors in Sentry.""" + + prompt = f"""You are a Sentry specialist. Investigate: + +**Query:** {query} + +**IMPORTANT: This is a READ-ONLY investigation.** + +Provide findings, root cause analysis, and recommended fixes.""" + + response = yield prompt + + # Scoring for evals + if expected_finding and response: + yield 1.0 if expected_finding.lower() in response.lower() else 0.5 + else: + yield 1.0 if response else 0.0 +``` + + +**Eval-only parameters**: Parameters with `| None = None` are automatically hidden from the orchestrator's tool schema but available for evaluation scoring. + + +## Building the Orchestrator + +The orchestrator wraps each subagent's scenario as an `AgentTool`: + +```python +# orchestrator.py +from hud import Environment +from hud.tools import AgentTool +from hud.agents import create_agent +import hud + +from environments import sentry_env, supabase_env, railway_env, kubectl_env + + +async def diagnose(query: str, model: str = "claude-sonnet-4-5"): + orchestrator = Environment(name="ops-orchestrator") + + # Wrap each subagent as a tool + for name, env, desc in [ + ("investigate_sentry", sentry_env, "Check error monitoring"), + ("investigate_supabase", supabase_env, "Check database/auth"), + ("investigate_railway", railway_env, "Check deployments"), + ("investigate_kubernetes", kubectl_env, "Check cluster health"), + ]: + tool = AgentTool( + env("investigate"), + model=model, + name=name, + description=desc, + ) + orchestrator.add_tool(tool.mcp) + + @orchestrator.scenario("diagnose") + async def run_diagnosis(issue: str): + yield f"""You are an ops diagnostics orchestrator. + +**Issue:** {issue} + +You have READ-ONLY subagents for Sentry, Supabase, Railway, and Kubernetes. 
+Investigate systematically and correlate findings across services.""" + + task = orchestrator("diagnose", issue=query) + + async with hud.eval(task) as ctx: + agent = create_agent(model) + return await agent.run(ctx, max_steps=20) +``` + +### Trace Continuity + +All subagent activity appears in a single trace on the HUD platform. When the orchestrator calls a subagent tool, the inference and tool calls are recorded under the parent trace—no separate URLs to track. + +## The READ-ONLY Constraint + + +We tested and operated this environment directly on our production systems, so all scenarios enforce read-only constraints. We removed mutation tools like `kubectl_exec`, `railway_redeploy`, and Supabase DDL operations. + +Every prompt includes: **"This is a READ-ONLY investigation."** + + +## Sample Output + +Running against a real production issue: + +```bash +python orchestrator.py --model claude-sonnet-4-5 \ + "Failed to delete pod: 429 Too Many Requests. 7451 events, escalating." +``` + +The orchestrator delegates to `investigate_sentry`, `investigate_railway`, and `investigate_supabase`, then correlates findings across services. After about 5 minutes: + +```text Diagnosis +COMPREHENSIVE DIAGNOSIS REPORT + +Issue Summary + - Error: Failed to delete pod ████████████████████████████████████: 429 Too Many Requests + - Impact: 7,451 events over 5 days, 16 users affected, escalating state + - Project: Orchestrator / mcp-server + - Alert ID: ORCHESTRATOR-AC + +ROOT CAUSE ANALYSIS + + Primary Root Cause: Kubernetes API Rate Limiting + + The orchestrator service is hitting Kubernetes API server rate limits when + attempting to delete pods at scale. This is occurring in the + ████████.hud_gym.utils.kubernetes module. + + Key Contributing Factors: + + 1. Excessive Deletion Frequency: ~1,491 errors/day (~62/hour) indicates + aggressive pod deletion attempts + 2. No Retry/Backoff Logic: Code lacks exponential backoff when encountering + 429 responses + 3. High Concurrency: Service runs with 50 uvicorn workers + 32 Railway + replicas, amplifying concurrent API calls + 4. Burst Traffic Pattern: Correlated with API usage spikes (313 inference + calls/minute at peak) + 5. 
No Client-Side Rate Limiting: Kubernetes client not configured with QPS + limits + +CORRELATED FINDINGS ACROSS SERVICES + + Sentry (Error Tracking) + - 7,455 occurrences of the 429 error between ██████████████ + - Last occurrence: ████████████████████ + - Error originates from: ████████.hud_gym.utils.kubernetes logger + - Associated with HTTP PATCH to Supabase /rest/v1/environments endpoint + - Part of environment update/cleanup workflow + + Railway (Deployment Platform) + - Production service: 32 replicas in us-west2 + - Latest successful deployment: ████████████████████ (30 min AFTER last + Sentry error) + - Historical failures (██████): AWS EKS credential issues (now resolved) + - No current rate limiting errors in deployment logs + - Pod deletions working normally post-fix + + Supabase (Database/API) + - API burst traffic spike: 313 calls/minute at ████████████████████ + - ████ Team (22 members, free tier): 15,933 inference calls/24h - prime + candidate for "16 users" + - Connection pool saturation: 49 waiting connections out of 52 + - Security vulnerabilities: 38 tables with RLS enabled but NO policies + - Performance issues: 52 unindexed foreign keys, inefficient RLS policies + - 429 errors occur at API gateway layer (not visible in Postgres logs) + + Kubernetes + - Investigation unavailable due to response size (cluster likely healthy + but under load) + +CORRELATION & TIMELINE + + ██████████████: 7,455 pod deletion failures (continuous) + - ████████████████████: Last 429 error recorded in Sentry + - ████████████████████: New production deployment (likely contained fix) + - ████████████████████: API traffic spike (313 req/min) + + Pattern Identified: + + 1. Orchestrator creates ephemeral pods for task execution (inference + workloads) + 2. High inference API traffic (15,933 calls/day from ████ team) triggers + frequent pod creation + 3. Cleanup attempts to delete pods rapidly after task completion + 4. Kubernetes API rate limits exceeded due to: + - 50 uvicorn workers x 32 Railway replicas = 1,600 concurrent processes + - No client-side rate limiting or request coordination + - No exponential backoff on 429 responses + 5. Failed deletions likely retry aggressively, compounding the problem + +RECOMMENDED ACTIONS FOR HUMANS + + CRITICAL - Immediate Actions + + 1. Implement Exponential Backoff + Location: ████████.hud_gym.utils.kubernetes + + def delete_pod_with_retry(api_client, pod_name, namespace, max_retries=5): + for attempt in range(max_retries): + try: + api_client.delete_namespaced_pod(pod_name, namespace) + return True + except ApiException as e: + if e.status == 429 and attempt < max_retries - 1: + wait_time = (2 ** attempt) + random.uniform(0, 1) + logger.warning(f"Rate limited, retrying in {wait_time:.1f}s") + time.sleep(wait_time) + continue + raise + return False + + 2. Configure Kubernetes Client Rate Limiting + + configuration = client.Configuration() + configuration.qps = 5.0 # Max 5 queries per second per client + configuration.burst = 10 # Allow bursts up to 10 + + 3. Add Concurrency Control + + k8s_api_semaphore = Semaphore(10) # Max 10 concurrent API calls + + async def delete_pod_rate_limited(pod_name, namespace): + async with k8s_api_semaphore: + return await delete_pod_with_retry(pod_name, namespace) + + HIGH PRIORITY - Within 48 Hours + + 4. 
Optimize Worker Configuration + - Current: 50 uvicorn workers x 32 Railway replicas = 1,600 processes + - Recommendation: Reduce uvicorn workers to 10-20 per replica + - Why: Excessive concurrency amplifies K8s API load + + 5. Implement Pod Deletion Queue + - Use background queue (Redis, Celery) for pod deletions + - Process deletions with controlled rate (e.g., 100/minute globally) + - Provides visibility into deletion backlog + + 6. Fix Supabase Security Issues + - URGENT: Add RLS policies to 38 tables currently without policies + - Enable leaked password protection + - Reduce OTP expiry to < 1 hour + - Index 52 foreign keys for query performance + - Remove 5 duplicate indexes + + 7. Upgrade ████ Team or Implement Graduated Rate Limits + - ████ team (22 members, free tier) using 15,933 API calls/day + (enterprise-level) + - Either upgrade to paid tier or implement request throttling + - Add monitoring for teams exceeding tier limits + + MEDIUM PRIORITY - Within 1 Week + + 8. Add Monitoring & Alerting + - Track pod deletion success/failure rates + - Monitor K8s API rate limit headers (X-RateLimit-Remaining) + - Alert when deletion failure rate > 5% + - Add dashboards for pod lifecycle metrics + + 9. Implement Circuit Breaker Pattern + + k8s_breaker = CircuitBreaker(fail_max=5, timeout_duration=60) + + @k8s_breaker + def delete_pod_protected(pod_name, namespace): + return delete_pod_with_retry(pod_name, namespace) + + 10. Optimize Pod Lifecycle + - Review if pods can be longer-lived (reduce churn) + - Consider pod pooling/reuse for similar tasks + - Use K8s native garbage collection where possible + - Set propagationPolicy=Background for async cleanup + + 11. Fix Supabase Connection Pool + - Switch auth server to percentage-based connection allocation + - Current: 49 waiting connections out of 52 (saturation) + - Monitor connection wait times and adjust pool size + + LOW PRIORITY - Technical Debt + + 12. Update Deprecated Dependencies + - Replace close() with aclose() for Redis connections + - Update Supabase client for new parameter configuration + - Address deprecation warnings in logs + + 13. Add Request Coalescing + - Batch multiple pod deletions into single API calls where possible + - Implement request deduplication for identical operations + +VALIDATION STEPS + + After implementing fixes, validate with: + + 1. Sentry: Monitor ORCHESTRATOR-AC for decreased error frequency (target: 0 + errors) + 2. Kubernetes: Check API server metrics for reduced throttling events + 3. Railway: Verify pod deletion logs show successful operations + 4. Supabase: Confirm API traffic patterns stay within rate limits + 5. 
Metrics: Track pod deletion latency and success rate + +COMMIT MESSAGE TEMPLATE + + fix: implement exponential backoff for K8s pod deletions + + - Add retry logic with exponential backoff for 429 errors + - Configure client-side rate limiting (5 QPS, 10 burst) + - Add concurrency control with semaphore (max 10 concurrent) + - Reduce uvicorn workers from 50 to 20 per replica + + Fixes ORCHESTRATOR-AC + Resolves rate limiting issues affecting 16 users over 5 days + +SUCCESS CRITERIA + + - Zero 429 errors in Sentry for 7 consecutive days + - Pod deletion success rate > 99.9% + - Average deletion latency < 2 seconds + - No user-facing impact from pod lifecycle operations + - Supabase API calls stay within tier limits + +Investigation Status: Complete +Next Review: After fix deployment (monitor for 48 hours) +``` + +The entire investigation—from initial query to actionable recommendations—took about 5 minutes across the specialized subagents. + +## What We Learned + +1. **Environment design matters.** A focused toolset per domain outperforms a flat list of everything. + +2. **Scenarios are contracts.** They define what the orchestrator can ask and what the subagent returns. + +3. **Custom tools fill gaps.** When MCP servers don't fit your auth model, build direct API integrations. + +## See Also + +- [AgentTool Reference](/reference/tools#agenttool) +- [Building Environments](/build-environments) +- [Scenarios](/reference/environments#scenarios) diff --git a/docs/docs.json b/docs/docs.json index 114ba090..d3f7332c 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -33,7 +33,7 @@ "icon": "code", "versions": [ { - "version": "0.5.2", + "version": "0.5.3", "groups": [ { "group": "Get Started", @@ -63,7 +63,8 @@ { "group": "Cookbooks", "pages": [ - "cookbooks/codex-coding" + "cookbooks/codex-coding", + "cookbooks/ops-diagnostics" ] }, { diff --git a/docs/reference/environments.mdx b/docs/reference/environments.mdx index 1e6fc107..21fc0ce7 100644 --- a/docs/reference/environments.mdx +++ b/docs/reference/environments.mdx @@ -266,6 +266,81 @@ env.unmock() # Disable mock mode | `mock_tool(name, output)` | Set specific mock output | | `is_mock` | Check if mock mode is enabled | +## Serving as MCP Server + +Environment can serve its tools over MCP protocols, either standalone or mounted on an existing server. + +### serve() + +Start a standalone MCP server: + +```python +from hud import Environment + +env = Environment("my-env") + +@env.tool() +def greet(name: str) -> str: + return f"Hello, {name}!" 
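+
+# (Illustrative) register any number of tools before serving
+@env.tool()
+def add(a: int, b: int) -> int:
+    return a + b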
+ +# Run as MCP server (blocking) +env.serve() +``` + +| Parameter | Type | Description | Default | +|-----------|------|-------------|---------| +| `transport` | `Literal["stdio", "sse", "streamable-http"]` | Transport protocol | `"streamable-http"` | +| `host` | `str` | Host address to bind | `"0.0.0.0"` | +| `port` | `int` | Port to bind | `8000` | + +```python +# Serve over stdio (for CLI tools) +env.serve(transport="stdio") + +# Serve over HTTP on custom port +env.serve(transport="streamable-http", host="0.0.0.0", port=8765) +``` + +### http_app() + +Get a Starlette/ASGI app to mount on an existing FastAPI server: + +```python +from fastapi import FastAPI +from hud import Environment + +app = FastAPI() +env = Environment("my-env") + +@env.tool() +def my_tool(arg: str) -> str: + return f"Got: {arg}" + +# Mount the HUD environment's MCP endpoint at /mcp +app.mount("/mcp", env.http_app()) + +# Your other FastAPI routes work normally +@app.get("/health") +def health(): + return {"status": "ok"} +``` + +| Parameter | Type | Description | Default | +|-----------|------|-------------|---------| +| `path` | `str \| None` | Internal path for the MCP endpoint | `"/"` | +| `transport` | `Literal["http", "streamable-http", "sse"]` | Transport protocol | `"http"` | +| `middleware` | `list[ASGIMiddleware] \| None` | Starlette middleware | `None` | +| `json_response` | `bool \| None` | Use JSON response format | `None` | +| `stateless_http` | `bool \| None` | Use stateless HTTP mode | `None` | + +MCP clients can then connect at `http://your-server/mcp`: + +```python +# Client connecting to mounted environment +env.connect_url("http://localhost:8000/mcp") +``` + + ## Properties | Property | Type | Description | diff --git a/docs/reference/tools.mdx b/docs/reference/tools.mdx index 3d12e0b9..bf5b208c 100644 --- a/docs/reference/tools.mdx +++ b/docs/reference/tools.mdx @@ -69,6 +69,93 @@ async def url_match(url: str) -> EvaluationResult: # Agents call: evaluators(name="url_match", arguments={"url": "..."}) ``` +## Agent Tools + +### AgentTool + +```python +from hud.tools import AgentTool +``` + +Wraps a scenario as a tool that can be called by another agent. Essential for building **hierarchical agent systems** where an orchestrator delegates to specialized subagents. + +**Constructor Parameters:** +| Parameter | Type | Description | Default | +|-----------|------|-------------|---------| +| `task` | `Task` | Task template from `env("scenario_name")` | Required | +| `model` | `str` | Model for subagent (via gateway) | `None` | +| `agent` | `type[MCPAgent]` | Custom agent class | `None` | +| `agent_params` | `dict` | Additional agent parameters | `{}` | +| `name` | `str` | Tool name for orchestrator | From scenario | +| `description` | `str` | Tool description | Auto-generated | +| `trace` | `bool` | Enable tracing for standalone runs | `False` | + +Must provide either `model` or `agent`, not both. 
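+
+For example, the two construction styles (a minimal sketch; `sentry_env` and its
+`investigate` scenario are assumed to exist):
+
+```python
+from hud.agents.claude import ClaudeAgent
+from hud.tools import AgentTool
+
+# Route the subagent's inference through the gateway by model name
+tool = AgentTool(sentry_env("investigate"), model="claude-sonnet-4-5")
+
+# ...or supply a custom agent class instead (never both)
+tool = AgentTool(sentry_env("investigate"), agent=ClaudeAgent)
+```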
+
+**Eval-Only Parameters:**
+
+Parameters with `| None = None` are hidden from the orchestrator but available for evaluation:
+
+```python
+@env.scenario("investigate")
+async def investigate(
+    query: str,  # Visible - orchestrator passes this
+    expected_finding: str | None = None,  # Hidden - only used in eval scoring
+):
+    response = yield f"Investigate: {query}"
+
+    # Scoring uses expected_finding but orchestrator never sees it
+    if expected_finding and response:
+        yield 1.0 if expected_finding in response else 0.5
+    else:
+        yield 1.0 if response else 0.0
+```
+
+**Usage:**
+```python
+from hud import Environment
+from hud.tools import AgentTool
+
+# Subagent environment with scenario
+sentry_env = Environment(name="sentry-agent")
+
+@sentry_env.scenario("investigate")
+async def investigate_sentry(query: str):
+    yield f"Investigate Sentry: {query}"
+
+# Create orchestrator
+orchestrator = Environment(name="orchestrator")
+
+# Wrap subagent scenario as tool
+tool = AgentTool(
+    sentry_env("investigate"),  # Task template
+    model="gpt-4o-mini",
+    name="investigate_sentry",
+    description="Investigate errors in Sentry",
+)
+orchestrator.add_tool(tool.mcp)
+
+# Now orchestrator agent can call investigate_sentry(query="...")
+```
+
+**Trace Continuity:**
+
+When called from within an eval context, AgentTool automatically:
+1. Inherits the parent's trace_id
+2. Skips duplicate trace registration
+3. Routes all inference/tool calls to the parent trace
+
+```python
+async with hud.eval(task) as ctx:
+    agent = create_agent("gpt-4o")
+    result = await agent.run(ctx)
+    # All subagent activity appears in this single trace
+```
+
+**See Also:** [Ops Diagnostics Cookbook](/cookbooks/ops-diagnostics) for a complete hierarchical agent example.
+
+---
+
 ## Core Tools
 
 ### BashTool
diff --git a/hud/agents/__init__.py b/hud/agents/__init__.py
index 547d876b..03f9512a 100644
--- a/hud/agents/__init__.py
+++ b/hud/agents/__init__.py
@@ -1,19 +1,82 @@
 from __future__ import annotations
 
+from typing import Any
+
 from .base import MCPAgent
 from .openai import OpenAIAgent
 from .openai_chat import OpenAIChatAgent
 from .operator import OperatorAgent
 
-# Note: These agents are not exported here to avoid requiring optional dependencies.
-# Import directly if needed:
-# from hud.agents.claude import ClaudeAgent # requires anthropic
-# from hud.agents.gemini import GeminiAgent # requires google-genai
-# from hud.agents.gemini_cua import GeminiCUAAgent # requires google-genai
-
 __all__ = [
     "MCPAgent",
     "OpenAIAgent",
     "OpenAIChatAgent",
     "OperatorAgent",
+    "create_agent",
 ]
+
+
+def create_agent(model: str, **kwargs: Any) -> MCPAgent:
+    """Create an agent for a gateway model.
+
+    This routes ALL requests through the HUD gateway. For direct API access
+    (using your own API keys), use the agent classes directly.
+
+    Args:
+        model: Model name (e.g., "gpt-4o", "claude-sonnet-4-5").
+        **kwargs: Additional params passed to agent.create().
+
+    Returns:
+        Configured MCPAgent instance with gateway routing.
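+
+    Raises:
+        ValueError: If the model cannot be resolved to a known agent type
+            or a gateway model.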
+ + Example: + ```python + # Gateway routing (recommended) + agent = create_agent("gpt-4o") + agent = create_agent("claude-sonnet-4-5", temperature=0.7) + + # Direct API access (use agent classes) + from hud.agents.claude import ClaudeAgent + + agent = ClaudeAgent.create(model="claude-sonnet-4-5") + ``` + """ + from hud.agents.gateway import build_gateway_client + from hud.agents.resolver import resolve_cls + + # Resolve class and gateway info + agent_cls, gateway_info = resolve_cls(model) + + # Get model ID from gateway info or use input + model_id = model + if gateway_info: + model_id = gateway_info.get("model") or gateway_info.get("id") or model + + # Determine provider: from gateway info, or infer from agent class + if gateway_info: + provider = gateway_info.get("provider") or "openai" + else: + # Map agent class to provider for known types + from hud.agents.claude import ClaudeAgent + from hud.agents.gemini import GeminiAgent + + _AGENT_TO_PROVIDER = { + ClaudeAgent: "anthropic", + GeminiAgent: "google", + } + provider = _AGENT_TO_PROVIDER.get(agent_cls, "openai") + + client = build_gateway_client(provider) + + # Set up kwargs + kwargs.setdefault("model", model_id) + + # Use correct client key based on agent type + if agent_cls == OpenAIChatAgent: + kwargs.setdefault("openai_client", client) + else: + # Claude and other agents use model_client and validate_api_key + kwargs.setdefault("model_client", client) + kwargs.setdefault("validate_api_key", False) + + return agent_cls.create(**kwargs) diff --git a/hud/agents/gateway.py b/hud/agents/gateway.py new file mode 100644 index 00000000..4d0973f8 --- /dev/null +++ b/hud/agents/gateway.py @@ -0,0 +1,42 @@ +"""Gateway client utilities for HUD inference gateway.""" + +from __future__ import annotations + +from typing import Any + + +def build_gateway_client(provider: str) -> Any: + """Build a client configured for HUD gateway routing. + + Args: + provider: Provider name ("anthropic", "openai", "gemini", etc.) + + Returns: + Configured async client for the provider. + """ + from hud.settings import settings + + provider = provider.lower() + + if provider == "anthropic": + from anthropic import AsyncAnthropic + + return AsyncAnthropic(api_key=settings.api_key, base_url=settings.hud_gateway_url) + + if provider == "gemini": + from google import genai + from google.genai.types import HttpOptions + + return genai.Client( + api_key="PLACEHOLDER", + http_options=HttpOptions( + api_version="v1beta", + base_url=settings.hud_gateway_url, + headers={"Authorization": f"Bearer {settings.api_key}"}, + ), + ) + + # OpenAI-compatible (openai, azure, together, groq, fireworks, etc.) 
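+    # Any other provider falls through to a single AsyncOpenAI client pointed
+    # at the gateway (assumed to expose an OpenAI-compatible endpoint for them).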
+ from openai import AsyncOpenAI + + return AsyncOpenAI(api_key=settings.api_key, base_url=settings.hud_gateway_url) diff --git a/hud/agents/resolver.py b/hud/agents/resolver.py new file mode 100644 index 00000000..80351800 --- /dev/null +++ b/hud/agents/resolver.py @@ -0,0 +1,70 @@ +"""Model resolution - maps model strings to agent classes.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from hud.agents.base import MCPAgent + +__all__ = ["resolve_cls"] + +_models_cache: list[dict[str, Any]] | None = None + +# Provider name → AgentType value (only anthropic differs) +_PROVIDER_TO_AGENT = {"anthropic": "claude"} + + +def _fetch_gateway_models() -> list[dict[str, Any]]: + """Fetch available models from HUD gateway (cached).""" + global _models_cache + if _models_cache is not None: + return _models_cache + + import httpx + + from hud.settings import settings + + if not settings.api_key: + return [] + + try: + resp = httpx.get( + f"{settings.hud_gateway_url}/models", + headers={"Authorization": f"Bearer {settings.api_key}"}, + timeout=10.0, + ) + resp.raise_for_status() + data = resp.json() + _models_cache = data.get("data", data) if isinstance(data, dict) else data + return _models_cache or [] + except Exception: + return [] + + +def resolve_cls(model: str) -> tuple[type[MCPAgent], dict[str, Any] | None]: + """Resolve model string to (agent_class, gateway_info). + + Returns: + (agent_class, None) for known AgentTypes + (agent_class, gateway_model_info) for gateway models + """ + from hud.types import AgentType + + # Known AgentType → no gateway info + try: + return AgentType(model).cls, None + except ValueError: + pass + + # Gateway lookup + for m in _fetch_gateway_models(): + if model in (m.get("id"), m.get("name"), m.get("model")): + provider = (m.get("provider") or "openai_compatible").lower() + agent_str = _PROVIDER_TO_AGENT.get(provider, provider) + try: + return AgentType(agent_str).cls, m + except ValueError: + return AgentType.OPENAI_COMPATIBLE.cls, m + + raise ValueError(f"Model '{model}' not found") diff --git a/hud/agents/tests/test_resolver.py b/hud/agents/tests/test_resolver.py new file mode 100644 index 00000000..04e6f51e --- /dev/null +++ b/hud/agents/tests/test_resolver.py @@ -0,0 +1,192 @@ +"""Tests for model resolution and create_agent.""" + +from __future__ import annotations + +from unittest.mock import MagicMock, patch + +import pytest + +from hud.agents import create_agent +from hud.agents.resolver import resolve_cls + + +class TestResolveCls: + """Tests for resolve_cls function.""" + + def test_resolves_known_agent_type(self) -> None: + """Known AgentType strings resolve to their class.""" + from hud.agents.claude import ClaudeAgent + + cls, gateway_info = resolve_cls("claude") + assert cls == ClaudeAgent + assert gateway_info is None + + def test_resolves_openai(self) -> None: + """Resolves 'openai' to OpenAIAgent.""" + from hud.agents import OpenAIAgent + + cls, _gateway_info = resolve_cls("openai") + assert cls == OpenAIAgent + + def test_resolves_gemini(self) -> None: + """Resolves 'gemini' to GeminiAgent.""" + from hud.agents.gemini import GeminiAgent + + cls, _gateway_info = resolve_cls("gemini") + assert cls == GeminiAgent + + def test_unknown_model_without_gateway_raises(self) -> None: + """Unknown model with no gateway models raises ValueError.""" + with ( + patch("hud.agents.resolver._fetch_gateway_models", return_value=[]), + pytest.raises(ValueError, match="not found"), + ): + 
resolve_cls("unknown-model-xyz") + + def test_resolves_gateway_model(self) -> None: + """Resolves model found in gateway.""" + from hud.agents import OpenAIAgent + + mock_models = [ + {"id": "gpt-4o", "model": "gpt-4o", "provider": "openai"}, + ] + + with patch("hud.agents.resolver._fetch_gateway_models", return_value=mock_models): + cls, info = resolve_cls("gpt-4o") + assert cls == OpenAIAgent + assert info is not None + assert info["id"] == "gpt-4o" + + def test_resolves_anthropic_provider_to_claude(self) -> None: + """Provider 'anthropic' maps to ClaudeAgent.""" + from hud.agents.claude import ClaudeAgent + + mock_models = [ + {"id": "claude-sonnet", "model": "claude-3-sonnet", "provider": "anthropic"}, + ] + + with patch("hud.agents.resolver._fetch_gateway_models", return_value=mock_models): + cls, _info = resolve_cls("claude-sonnet") + assert cls == ClaudeAgent + + def test_resolves_unknown_provider_to_openai_compatible(self) -> None: + """Unknown provider maps to OpenAIChatAgent.""" + from hud.agents.openai_chat import OpenAIChatAgent + + mock_models = [ + {"id": "custom-model", "model": "custom", "provider": "custom-provider"}, + ] + + with patch("hud.agents.resolver._fetch_gateway_models", return_value=mock_models): + cls, _info = resolve_cls("custom-model") + assert cls == OpenAIChatAgent + + +class TestCreateAgent: + """Tests for create_agent function - gateway-only.""" + + def test_creates_with_gateway_client(self) -> None: + """create_agent always uses gateway routing.""" + from hud.agents import OpenAIAgent + + mock_models = [ + {"id": "gpt-4o", "model": "gpt-4o", "provider": "openai"}, + ] + + with ( + patch("hud.agents.resolver._fetch_gateway_models", return_value=mock_models), + patch.object(OpenAIAgent, "create") as mock_create, + patch("hud.agents.gateway.build_gateway_client") as mock_build_client, + ): + mock_client = MagicMock() + mock_build_client.return_value = mock_client + mock_agent = MagicMock() + mock_create.return_value = mock_agent + + agent = create_agent("gpt-4o") + + # Should have set model and model_client + call_kwargs = mock_create.call_args.kwargs + assert call_kwargs["model"] == "gpt-4o" + assert "model_client" in call_kwargs + assert agent == mock_agent + + def test_passes_kwargs_to_create(self) -> None: + """Extra kwargs are passed to agent.create().""" + from hud.agents import OpenAIAgent + + mock_models = [ + {"id": "gpt-4o", "model": "gpt-4o", "provider": "openai"}, + ] + + with ( + patch("hud.agents.resolver._fetch_gateway_models", return_value=mock_models), + patch.object(OpenAIAgent, "create") as mock_create, + patch("hud.agents.gateway.build_gateway_client"), + ): + mock_create.return_value = MagicMock() + + create_agent("gpt-4o", temperature=0.5, max_tokens=1000) + + call_kwargs = mock_create.call_args.kwargs + assert call_kwargs["temperature"] == 0.5 + assert call_kwargs["max_tokens"] == 1000 + + def test_known_agent_type_also_uses_gateway(self) -> None: + """Even 'claude' string uses gateway (it's a gateway shortcut).""" + from hud.agents.claude import ClaudeAgent + + with ( + patch.object(ClaudeAgent, "create") as mock_create, + patch("hud.agents.gateway.build_gateway_client") as mock_build_client, + ): + mock_client = MagicMock() + mock_build_client.return_value = mock_client + mock_create.return_value = MagicMock() + + create_agent("claude") + + # Should still build gateway client + mock_build_client.assert_called_once() + call_kwargs = mock_create.call_args.kwargs + assert "model_client" in call_kwargs + + +class TestBuildGatewayClient: 
+ """Tests for build_gateway_client function.""" + + def test_builds_anthropic_client(self) -> None: + """Builds AsyncAnthropic for anthropic provider.""" + from hud.agents.gateway import build_gateway_client + + with patch("hud.settings.settings") as mock_settings: + mock_settings.api_key = "test-key" + mock_settings.hud_gateway_url = "https://gateway.hud.ai" + + with patch("anthropic.AsyncAnthropic") as mock_client_cls: + build_gateway_client("anthropic") + mock_client_cls.assert_called_once() + + def test_builds_openai_client_for_openai(self) -> None: + """Builds AsyncOpenAI for openai provider.""" + from hud.agents.gateway import build_gateway_client + + with patch("hud.settings.settings") as mock_settings: + mock_settings.api_key = "test-key" + mock_settings.hud_gateway_url = "https://gateway.hud.ai" + + with patch("openai.AsyncOpenAI") as mock_client_cls: + build_gateway_client("openai") + mock_client_cls.assert_called_once() + + def test_builds_openai_client_for_unknown(self) -> None: + """Builds AsyncOpenAI for unknown providers (openai-compatible).""" + from hud.agents.gateway import build_gateway_client + + with patch("hud.settings.settings") as mock_settings: + mock_settings.api_key = "test-key" + mock_settings.hud_gateway_url = "https://gateway.hud.ai" + + with patch("openai.AsyncOpenAI") as mock_client_cls: + build_gateway_client("together") + mock_client_cls.assert_called_once() diff --git a/hud/cli/eval.py b/hud/cli/eval.py index eb13ce34..faedb107 100644 --- a/hud/cli/eval.py +++ b/hud/cli/eval.py @@ -338,47 +338,27 @@ def get_agent_kwargs(self) -> dict[str, Any]: # Configure gateway mode - route LLM API calls through HUD gateway if self.gateway: - hud_api_key = settings.api_key - if not hud_api_key: + if not settings.api_key: raise typer.Exit(1) # Already validated in validate_api_keys() - if self.agent_type == AgentType.CLAUDE: - from anthropic import AsyncAnthropic - - kwargs["model_client"] = AsyncAnthropic( - api_key=hud_api_key, - base_url=settings.hud_gateway_url, - ) - hud_console.info("🌐 Using HUD Gateway for Claude API") - elif self.agent_type in (AgentType.OPENAI, AgentType.OPERATOR): - from openai import AsyncOpenAI + from hud.agents.gateway import build_gateway_client - kwargs["model_client"] = AsyncOpenAI( - api_key=hud_api_key, - base_url=settings.hud_gateway_url, - ) - hud_console.info("🌐 Using HUD Gateway for OpenAI API") - elif self.agent_type == AgentType.OPENAI_COMPATIBLE: - from openai import AsyncOpenAI + # Map AgentType to provider + agent_to_provider = { + AgentType.CLAUDE: "anthropic", + AgentType.OPENAI: "openai", + AgentType.OPERATOR: "openai", + AgentType.GEMINI: "gemini", + AgentType.GEMINI_CUA: "gemini", + AgentType.OPENAI_COMPATIBLE: "openai", + } + provider = agent_to_provider.get(self.agent_type, "openai") + client = build_gateway_client(provider) - kwargs["openai_client"] = AsyncOpenAI( - api_key=hud_api_key, - base_url=settings.hud_gateway_url, - ) - hud_console.info("🌐 Using HUD Gateway for OpenAI-compatible API") - elif self.agent_type in (AgentType.GEMINI, AgentType.GEMINI_CUA): - from google import genai - from google.genai.types import HttpOptions - - kwargs["model_client"] = genai.Client( - api_key="PLACEHOLDER", - http_options=HttpOptions( - api_version="v1beta", - base_url=settings.hud_gateway_url, - headers={"Authorization": f"Bearer {hud_api_key}"}, - ), - ) - hud_console.info("🌐 Using HUD Gateway for Gemini API") + # OpenAI-compatible uses openai_client key + is_oai_compat = self.agent_type == AgentType.OPENAI_COMPATIBLE + 
kwargs["openai_client" if is_oai_compat else "model_client"] = client + hud_console.info(f"🌐 Using HUD Gateway for {provider} API") return kwargs diff --git a/hud/datasets/loader.py b/hud/datasets/loader.py index 0c1982f2..d313228e 100644 --- a/hud/datasets/loader.py +++ b/hud/datasets/loader.py @@ -63,7 +63,8 @@ def _load_from_file(path: Path) -> list[Task]: from hud.eval.task import Task raw_items = _load_raw_from_file(path) - return [Task(**item) for item in raw_items] + # Default args to {} for runnable tasks (None = template) + return [Task(**{**item, "args": item.get("args") or {}}) for item in raw_items] def _load_raw_from_huggingface(dataset_name: str) -> list[dict[str, Any]]: @@ -99,7 +100,8 @@ def _load_from_huggingface(dataset_name: str) -> list[Task]: raw_items = _load_raw_from_huggingface(dataset_name) from hud.eval.task import Task - return [Task(**item) for item in raw_items] + # Default args to {} for runnable tasks (None = template) + return [Task(**{**item, "args": item.get("args") or {}}) for item in raw_items] def _load_raw_from_api(dataset_name: str) -> list[dict[str, Any]]: @@ -138,7 +140,8 @@ def _load_from_api(dataset_name: str) -> list[Task]: from hud.eval.task import Task raw_items = _load_raw_from_api(dataset_name) - return [Task(**item) for item in raw_items] + # Default args to {} for runnable tasks (None = template) + return [Task(**{**item, "args": item.get("args") or {}}) for item in raw_items] @overload diff --git a/hud/datasets/runner.py b/hud/datasets/runner.py index 3b4b1162..1a27074f 100644 --- a/hud/datasets/runner.py +++ b/hud/datasets/runner.py @@ -40,7 +40,7 @@ async def run_dataset( - A source string (file path, API slug) - loaded via load_tasks() - A single TaskInput (Task, LegacyTask, or dict) - A list of TaskInput objects - agent_type: Type of agent to create (e.g., "claude", "openai", AgentType.CLAUDE). + agent_type: Agent type (e.g., "claude", "openai", AgentType.CLAUDE). agent_params: Parameters to pass to agent.create(). max_steps: Maximum steps per task. max_concurrent: Maximum concurrent tasks (for parallel execution). 
@@ -70,6 +70,10 @@ async def run_dataset( from hud.datasets.loader import load_tasks from hud.eval.task import Task + # Normalize agent_type to AgentType enum + if isinstance(agent_type, str): + agent_type = AgentType(agent_type) + # Normalize tasks to list[Task] task_list: list[Task] if isinstance(tasks, str): @@ -86,10 +90,6 @@ async def run_dataset( if not task_list: raise ValueError("No tasks to run") - # Resolve agent class - agent_type_enum = agent_type if isinstance(agent_type, AgentType) else AgentType(agent_type) - agent_cls = agent_type_enum.cls - # Use hud.eval() for both single and parallel execution async with hud.eval( task_list, @@ -97,8 +97,8 @@ async def run_dataset( max_concurrent=max_concurrent, quiet=quiet, ) as ctx: - # Create agent fresh for each context (ensures correct tool initialization) - agent = agent_cls.create(**(agent_params or {})) + # Create agent using AgentType.cls.create() + agent = agent_type.cls.create(**(agent_params or {})) await agent.run(ctx, max_steps=max_steps) # Reward is computed by EvalContext.__aexit__ from evaluate tools @@ -198,9 +198,8 @@ async def run_single_task( if ctx.system_prompt and "system_prompt" not in final_agent_params: final_agent_params["system_prompt"] = ctx.system_prompt - # Create agent inside ctx so it has access to context-derived values - agent_cls = agent_type.cls - agent = agent_cls.create(**final_agent_params) + # Create agent using AgentType.cls.create() + agent = agent_type.cls.create(**final_agent_params) # Store metadata if provided if metadata: diff --git a/hud/environment/environment.py b/hud/environment/environment.py index 83924cd2..4ed44b32 100644 --- a/hud/environment/environment.py +++ b/hud/environment/environment.py @@ -362,6 +362,22 @@ async def __aexit__( await asyncio.gather(*[c.disconnect() for c in self._connections.values()]) self._router.clear() + async def run_async( + self, + transport: Literal["stdio", "http", "sse"] | None = None, + show_banner: bool = True, + **transport_kwargs: Any, + ) -> None: + """Run the MCP server, auto-connecting all connectors first. + + This ensures that tools from external MCP servers (via connect_mcp_config) + are discovered and available when the server starts. 
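+
+        Example (a sketch; assumes connectors were attached beforehand):
+            env = Environment("my-env")
+            env.connect_mcp_config({"sentry": {...}})
+            await env.run_async(transport="http")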
+ """ + async with self: # Connect all connectors via __aenter__ + await super().run_async( + transport=transport, show_banner=show_banner, **transport_kwargs + ) + async def _build_routing(self) -> None: """Build tool routing from local tools and connection caches.""" # Use get_tools() not list_tools() - it includes mounted servers without @@ -376,6 +392,27 @@ async def _build_routing(self) -> None: # Populate mock schemas for auto-generated mock values self._populate_mock_schemas() + # ========================================================================= + # MCP Protocol Overrides - Include connector tools in MCP responses + # ========================================================================= + + def _setup_handlers(self) -> None: + """Override FastMCP to register our custom handlers for tools.""" + # Call parent to set up all standard handlers + super()._setup_handlers() + # Re-register our custom handlers (overwrites parent's registrations) + self._mcp_server.list_tools()(self._env_list_tools) + self._mcp_server.call_tool()(self._env_call_tool) + + async def _env_list_tools(self) -> list[mcp_types.Tool]: + """Return all tools including those from connectors.""" + return self._router.tools + + async def _env_call_tool(self, name: str, arguments: dict[str, Any] | None = None) -> list[Any]: + """Route tool calls through our router (handles both local and connector tools).""" + result = await self._execute_tool(name, arguments or {}) + return result.content or [] + # ========================================================================= # Tool Operations # ========================================================================= diff --git a/hud/environment/tests/test_environment.py b/hud/environment/tests/test_environment.py index 44febe88..60b544e7 100644 --- a/hud/environment/tests/test_environment.py +++ b/hud/environment/tests/test_environment.py @@ -159,3 +159,171 @@ def test_chaining_multiple_setup_calls(self) -> None: ) assert len(env._setup_calls) == 2 + + +class TestEnvironmentMCPProtocol: + """Tests for MCP protocol overrides - Environment._env_list_tools and _env_call_tool. + + These test that Environment properly exposes connector tools via MCP handlers. 
+ """ + + @pytest.mark.asyncio + async def test_env_list_tools_includes_local_tools(self) -> None: + """_env_list_tools returns local tools after routing is built.""" + from hud.environment import Environment + + env = Environment("test") + + @env.tool() + def my_tool(x: int) -> int: + """A test tool.""" + return x * 2 + + # Build routing (simulates what __aenter__ does) + await env._build_routing() + + # Call the handler that MCP will call + tools = await env._env_list_tools() + + assert len(tools) == 1 + assert tools[0].name == "my_tool" + + @pytest.mark.asyncio + async def test_env_list_tools_includes_connector_tools(self) -> None: + """_env_list_tools returns tools from connectors (the key feature).""" + import mcp.types as mcp_types + + from hud.environment import Environment + + env = Environment("test") + + # Create a mock connector with cached tools + mock_tools = [ + mcp_types.Tool( + name="remote_tool", + description="A remote tool", + inputSchema={"type": "object"}, + ) + ] + + class MockConnector: + is_connected = True + _tools_cache = mock_tools + + @property + def cached_tools(self) -> list[mcp_types.Tool]: + return self._tools_cache + + async def connect(self) -> None: + pass + + async def disconnect(self) -> None: + pass + + async def list_tools(self) -> list[mcp_types.Tool]: + return self._tools_cache + + # Add the mock connector + env._connections["mock"] = MockConnector() # type: ignore + + # Build routing + await env._build_routing() + + # Call the handler that MCP will call + tools = await env._env_list_tools() + + # Should include the remote tool + tool_names = [t.name for t in tools] + assert "remote_tool" in tool_names + + @pytest.mark.asyncio + async def test_env_call_tool_routes_to_local(self) -> None: + """_env_call_tool routes local tool calls correctly.""" + from hud.environment import Environment + + env = Environment("test") + called_with: list[int] = [] + + @env.tool() + def my_tool(x: int) -> str: + """A test tool.""" + called_with.append(x) + return f"result: {x}" + + # Build routing + await env._build_routing() + + # Call the handler that MCP will call + result = await env._env_call_tool("my_tool", {"x": 42}) + + assert called_with == [42] + assert len(result) == 1 + + @pytest.mark.asyncio + async def test_env_call_tool_routes_to_connector(self) -> None: + """_env_call_tool routes connector tool calls correctly.""" + from unittest.mock import AsyncMock + + import mcp.types as mcp_types + + from hud.environment import Environment + from hud.types import MCPToolResult + + env = Environment("test") + + # Create a mock connector + mock_tools = [ + mcp_types.Tool( + name="remote_tool", + description="A remote tool", + inputSchema={"type": "object"}, + ) + ] + + class MockConnector: + is_connected = True + _tools_cache = mock_tools + call_tool = AsyncMock( + return_value=MCPToolResult( + content=[mcp_types.TextContent(type="text", text="remote result")], + isError=False, + ) + ) + + @property + def cached_tools(self) -> list[mcp_types.Tool]: + return self._tools_cache + + async def connect(self) -> None: + pass + + async def disconnect(self) -> None: + pass + + async def list_tools(self) -> list[mcp_types.Tool]: + return self._tools_cache + + mock_conn = MockConnector() + env._connections["mock"] = mock_conn # type: ignore + + # Build routing + await env._build_routing() + + # Call the handler that MCP will call + result = await env._env_call_tool("remote_tool", {"arg": "value"}) + + # Verify the connector was called + 
mock_conn.call_tool.assert_called_once_with("remote_tool", {"arg": "value"}) + assert len(result) == 1 + + def test_setup_handlers_registers_custom_handlers(self) -> None: + """Verify _setup_handlers registers our _env_list_tools and _env_call_tool.""" + from hud.environment import Environment + + env = Environment("test") + + # Verify the custom handlers exist + assert hasattr(env, "_env_list_tools") + assert hasattr(env, "_env_call_tool") + assert callable(env._env_list_tools) + assert callable(env._env_call_tool) diff --git a/hud/eval/context.py b/hud/eval/context.py index ca0704f5..e7d4be14 100644 --- a/hud/eval/context.py +++ b/hud/eval/context.py @@ -302,10 +302,20 @@ def from_task( code_snippet: Code being evaluated trace: Whether to send traces to backend quiet: Whether to suppress output + + Raises: + ValueError: If task.args is None (template tasks cannot be run directly) """ from hud.environment import Environment from hud.eval.task import build_eval_name + # Validate that task has args (not a template) + if task.args is None: + raise ValueError( + f"Cannot run task with args=None (this is a template). " + f"Provide args when creating the task: env('{task.scenario}', **args)" + ) + eval_name = name or build_eval_name(task.scenario, task.args) # task.env is guaranteed to be Environment after Task.__post_init__ @@ -343,7 +353,7 @@ async def _run_task_scenario_setup(self) -> None: if self._task is None or self._task.scenario is None: return - prompt = await self.run_scenario_setup(self._task.scenario, self._task.args) + prompt = await self.run_scenario_setup(self._task.scenario, self._task.args or {}) if prompt: self.prompt = prompt diff --git a/hud/eval/task.py b/hud/eval/task.py index 085f1bf8..cfa6d64a 100644 --- a/hud/eval/task.py +++ b/hud/eval/task.py @@ -148,7 +148,10 @@ class Task(BaseModel): env: Any = Field(default=None) # Typed as Any for input flexibility, validated below scenario: str | None = None id: str | None = None - args: dict[str, Any] = Field(default_factory=dict) + args: dict[str, Any] | None = Field( + default=None, + description="Scenario arguments. None indicates a template (args filled in later).", + ) validation: list[MCPToolCall] | None = None # Agent config - settings passed to agent (system_prompt, etc.) @@ -335,6 +338,6 @@ def copy(self) -> Task: id=self.id, env=self.env, # Share reference scenario=self.scenario, - args=self.args.copy() if self.args else {}, + args=self.args.copy() if self.args is not None else None, validation=self.validation.copy() if self.validation else None, ) diff --git a/hud/eval/tests/test_eval.py b/hud/eval/tests/test_eval.py index 6d470808..ea958af4 100644 --- a/hud/eval/tests/test_eval.py +++ b/hud/eval/tests/test_eval.py @@ -16,7 +16,7 @@ def test_init_defaults(self) -> None: assert task.env is None assert task.scenario is None - assert task.args == {} + assert task.args is None # None = template, {} = runnable with no args def test_init_with_env_dict(self) -> None: """Task auto-converts env dict to Environment via validator.""" diff --git a/hud/telemetry/tests/test_eval_telemetry.py b/hud/telemetry/tests/test_eval_telemetry.py index 8849cd13..bfb61004 100644 --- a/hud/telemetry/tests/test_eval_telemetry.py +++ b/hud/telemetry/tests/test_eval_telemetry.py @@ -49,8 +49,8 @@ async def greet(name: str) -> str: """Say hello.""" return f"Hello, {name}!" 
- # Create task from environment - task = Task(env=env) + # Create task from environment (args={} = runnable, args=None = template) + task = Task(env=env, args={}) with ( patch("hud.settings.settings") as mock_settings, @@ -110,7 +110,7 @@ async def failing_tool() -> str: """Always fails.""" raise ValueError("Tool error") - task = Task(env=env) + task = Task(env=env, args={}) with ( patch("hud.settings.settings") as mock_settings, @@ -162,7 +162,7 @@ async def multiply(a: int, b: int) -> int: """Multiply two numbers.""" return a * b - task = Task(env=env) + task = Task(env=env, args={}) with ( patch("hud.settings.settings") as mock_settings, @@ -195,7 +195,7 @@ async def test_flush_called_on_context_exit(self): async def simple_tool() -> str: return "done" - task = Task(env=env) + task = Task(env=env, args={}) with ( patch("hud.eval.context.flush") as mock_flush, @@ -229,7 +229,7 @@ def should_not_be_called(*args: Any, **kwargs: Any) -> bool: async def test_tool() -> str: return "ok" - task = Task(env=env) + task = Task(env=env, args={}) with ( patch("hud.settings.settings") as mock_settings, @@ -272,7 +272,7 @@ def capture_upload( async def echo(message: str) -> str: return message - task = Task(env=env) + task = Task(env=env, args={}) with ( patch("hud.settings.settings") as mock_settings, @@ -329,7 +329,7 @@ def capture_upload( async def noop() -> None: pass - task = Task(env=env) + task = Task(env=env, args={}) with ( patch("hud.settings.settings") as mock_settings, diff --git a/hud/tools/__init__.py b/hud/tools/__init__.py index 8451a04f..26495d33 100644 --- a/hud/tools/__init__.py +++ b/hud/tools/__init__.py @@ -4,6 +4,7 @@ from typing import TYPE_CHECKING, Any +from .agent import AgentTool from .base import BaseHub, BaseTool from .bash import BashTool from .edit import EditTool @@ -21,6 +22,7 @@ ) __all__ = [ + "AgentTool", "AnthropicComputerTool", "BaseHub", "BaseTool", diff --git a/hud/tools/agent.py b/hud/tools/agent.py new file mode 100644 index 00000000..2f5ad377 --- /dev/null +++ b/hud/tools/agent.py @@ -0,0 +1,216 @@ +"""AgentTool - run a Task with an agent as a tool.""" + +from __future__ import annotations + +import inspect +from typing import TYPE_CHECKING, Any, Union, get_args, get_origin + +from fastmcp.tools.tool import FunctionTool, ToolResult +from mcp.types import TextContent + +from hud.tools.base import BaseTool + +if TYPE_CHECKING: + from hud.agents.base import MCPAgent + from hud.eval.task import Task + +__all__ = ["AgentTool"] + + +def _is_eval_only(param: inspect.Parameter) -> bool: + """Check if param is eval-only: has None default AND None in type union. + + Handles both runtime types and string annotations (PEP 563). 
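+
+    Example: `x: str | None = None` is eval-only (hidden from the orchestrator),
+    while `x: str` or `x: str = "default"` remains visible.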
+ """ + # Must have default of None + if param.default is not None: + return False + if param.annotation is inspect.Parameter.empty: + return False + + annotation = param.annotation + + # Handle string annotations (from __future__ annotations or quoted) + if isinstance(annotation, str): + # Check if it looks like "X | None", "Union[X, None]", or "Optional[X]" + return ( + "| None" in annotation + or "None |" in annotation + or "Optional[" in annotation + or ("Union[" in annotation and "None" in annotation) + ) + + # Handle runtime type annotations + origin = get_origin(annotation) + + # Union types (X | None or Union[X, None]) + if origin is Union: + return type(None) in get_args(annotation) + + # For Python 3.10+ union syntax at runtime (types.UnionType) + try: + import types + + if isinstance(annotation, types.UnionType): + return type(None) in get_args(annotation) + except (ImportError, AttributeError): + pass + + return False + + +class AgentTool(BaseTool): + """Tool that runs a Task template with an agent. + + Parameters with `| None = None` are eval-only and hidden from the tool schema. + + Example: + ```python + @env.scenario() + async def investigate( + issue_id: str, # Required - orchestrator sees + expected_cause: str | None = None, # Eval only - hidden + ): + yield {"task": f"Investigate {issue_id}"} + + + seer = AgentTool(env("investigate"), model="ft:seer-v2") + ``` + """ + + def __init__( + self, + task: Task, + *, + model: str | None = None, + agent: type[MCPAgent] | None = None, + agent_params: dict[str, Any] | None = None, + name: str | None = None, + description: str | None = None, + trace: bool = False, + ) -> None: + if not model and agent is None: + raise ValueError("Must provide either 'model' or 'agent'") + if model and agent is not None: + raise ValueError("Cannot provide both 'model' and 'agent'") + + self._task = task + self._model = model + self._agent_cls = agent + self._agent_params = agent_params or {} + self._trace = trace + + # Get visible params from scenario function + self._visible_params: set[str] = set() + self._param_schema: dict[str, Any] = { + "type": "object", + "properties": {}, + "required": [], + } + + if task.env and task.scenario: + scenario_fn = task.env._scenarios.get(task.scenario) + if scenario_fn: + sig = inspect.signature(scenario_fn) + visible = {name: p for name, p in sig.parameters.items() if not _is_eval_only(p)} + self._visible_params = set(visible.keys()) + self._param_schema = self._build_schema(visible) + + tool_name = name or task.scenario or "agent_tool" + tool_desc = description or f"Run scenario: {task.scenario}" + + super().__init__(name=tool_name, description=tool_desc) + + def _build_schema(self, params: dict[str, inspect.Parameter]) -> dict[str, Any]: + """Build JSON schema using Pydantic TypeAdapter.""" + from pydantic import TypeAdapter + + properties: dict[str, Any] = {} + required: list[str] = [] + + for name, param in params.items(): + if param.annotation is not inspect.Parameter.empty: + try: + # Handle string annotations + annotation = param.annotation + if isinstance(annotation, str): + # Try to evaluate the annotation + try: + annotation = eval(annotation) # noqa: S307 + except Exception: + # Fall back to string type but don't skip required handling + annotation = None + + if annotation is not None: + adapter = TypeAdapter(annotation) + properties[name] = adapter.json_schema() + else: + properties[name] = {"type": "string"} + except Exception: + properties[name] = {"type": "string"} + else: + properties[name] = 
{"type": "string"} + + if param.default is inspect.Parameter.empty: + required.append(name) + elif param.default is not None: + properties[name]["default"] = param.default + + return {"type": "object", "properties": properties, "required": required} + + @property + def mcp(self) -> FunctionTool: + """Get as FastMCP FunctionTool with filtered schema.""" + if not hasattr(self, "_mcp_tool"): + # Directly instantiate FunctionTool with our callable and schema + # This bypasses from_function's signature parsing + self._mcp_tool = FunctionTool( + name=self.name, + description=self.description or "", + parameters=self._param_schema, + fn=self._execute_with_args, + ) + return self._mcp_tool + + async def _execute_with_args(self, **kwargs: Any) -> ToolResult: + """Internal executor that FastMCP calls with parsed arguments.""" + return await self(**kwargs) + + async def __call__(self, **kwargs: Any) -> ToolResult: + """Execute the task with a fresh agent.""" + from hud.eval.context import get_current_trace_id + from hud.eval.manager import run_eval + + # Filter to visible params only + filtered = {k: v for k, v in kwargs.items() if k in self._visible_params} + + # Merge with template args + base_args = self._task.args or {} + task = self._task.model_copy(update={"args": {**base_args, **filtered}}) + + # Use parent trace if available (for hierarchical agents) + parent_trace_id = get_current_trace_id() + + # If nested (has parent), skip subagent's enter/exit registration + # Tool calls are still recorded via the shared trace_id's context + is_nested = parent_trace_id is not None + + # Trace if explicitly requested AND not nested (nested uses parent trace) + should_trace = self._trace and not is_nested + + async with run_eval( + task, + trace=should_trace, + trace_id=parent_trace_id, + quiet=True, + ) as ctx: + if self._model: + from hud.agents import create_agent + + agent = create_agent(self._model, **self._agent_params) + else: + agent = self._agent_cls.create(**self._agent_params) # type: ignore + + result = await agent.run(ctx) + content = result.content if hasattr(result, "content") and result.content else "" + return ToolResult(content=[TextContent(type="text", text=content)]) diff --git a/hud/tools/tests/test_agent_tool.py b/hud/tools/tests/test_agent_tool.py new file mode 100644 index 00000000..de8196c3 --- /dev/null +++ b/hud/tools/tests/test_agent_tool.py @@ -0,0 +1,355 @@ +"""Tests for AgentTool - scenario-to-agent composition.""" + +from __future__ import annotations + +import inspect +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from hud.environment import Environment +from hud.eval.task import Task +from hud.tools.agent import AgentTool, _is_eval_only + + +class TestIsEvalOnly: + """Tests for _is_eval_only helper function.""" + + def test_required_param_not_eval_only(self) -> None: + """Required params (no default) are not eval-only.""" + + def fn(x: str) -> None: + pass + + sig = inspect.signature(fn) + param = sig.parameters["x"] + assert not _is_eval_only(param) + + def test_optional_with_value_not_eval_only(self) -> None: + """Optional params with non-None default are not eval-only.""" + + def fn(x: str = "default") -> None: + pass + + sig = inspect.signature(fn) + param = sig.parameters["x"] + assert not _is_eval_only(param) + + def test_optional_none_without_union_not_eval_only(self) -> None: + """Optional with None default but no None in type is not eval-only.""" + + def fn(x: str = None) -> None: # type: ignore[assignment] # noqa: RUF013 + pass + + sig 
diff --git a/hud/tools/tests/test_agent_tool.py b/hud/tools/tests/test_agent_tool.py
new file mode 100644
index 00000000..de8196c3
--- /dev/null
+++ b/hud/tools/tests/test_agent_tool.py
@@ -0,0 +1,355 @@

"""Tests for AgentTool - scenario-to-agent composition."""

from __future__ import annotations

import inspect
from unittest.mock import AsyncMock, MagicMock, patch

import pytest

from hud.environment import Environment
from hud.eval.task import Task
from hud.tools.agent import AgentTool, _is_eval_only


class TestIsEvalOnly:
    """Tests for the _is_eval_only helper function."""

    def test_required_param_not_eval_only(self) -> None:
        """Required params (no default) are not eval-only."""

        def fn(x: str) -> None:
            pass

        sig = inspect.signature(fn)
        param = sig.parameters["x"]
        assert not _is_eval_only(param)

    def test_optional_with_value_not_eval_only(self) -> None:
        """Optional params with a non-None default are not eval-only."""

        def fn(x: str = "default") -> None:
            pass

        sig = inspect.signature(fn)
        param = sig.parameters["x"]
        assert not _is_eval_only(param)

    def test_optional_none_without_union_not_eval_only(self) -> None:
        """A None default without None in the type is not eval-only."""

        def fn(x: str = None) -> None:  # type: ignore[assignment]  # noqa: RUF013
            pass

        sig = inspect.signature(fn)
        param = sig.parameters["x"]
        assert not _is_eval_only(param)

    def test_optional_none_with_union_is_eval_only(self) -> None:
        """Params with the `X | None = None` pattern are eval-only."""

        def fn(x: str | None = None) -> None:
            pass

        sig = inspect.signature(fn)
        param = sig.parameters["x"]
        assert _is_eval_only(param)

    def test_optional_int_none_is_eval_only(self) -> None:
        """Works with `int | None = None` too."""

        def fn(x: int | None = None) -> None:
            pass

        sig = inspect.signature(fn)
        param = sig.parameters["x"]
        assert _is_eval_only(param)

    def test_string_annotation_with_none_union(self) -> None:
        """Handles string annotations like 'str | None'."""
        # Simulate a string annotation
        param = inspect.Parameter(
            "x",
            inspect.Parameter.POSITIONAL_OR_KEYWORD,
            default=None,
            annotation="str | None",
        )
        assert _is_eval_only(param)

    def test_string_annotation_without_none(self) -> None:
        """String annotations without None are not eval-only."""
        param = inspect.Parameter(
            "x",
            inspect.Parameter.POSITIONAL_OR_KEYWORD,
            default=None,
            annotation="str",
        )
        assert not _is_eval_only(param)
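
# Note on the string-annotation cases above: this module enables
# `from __future__ import annotations` (PEP 563), under which
# inspect.signature() reports annotations as plain strings rather than
# type objects. An illustrative (hypothetical) check:
#
#     def fn(x: str | None = None) -> None: ...
#     inspect.signature(fn).parameters["x"].annotation  # the string "str | None"
#
# This is why _is_eval_only must recognize the `X | None` pattern both as a
# real union type and in its string form.
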

class TestAgentToolInit:
    """Tests for AgentTool initialization."""

    def test_requires_model_or_agent(self) -> None:
        """Must provide either model or agent."""
        task = Task(args={})

        with pytest.raises(ValueError, match="Must provide either"):
            AgentTool(task)

    def test_cannot_provide_both_model_and_agent(self) -> None:
        """Cannot provide both model and agent."""
        task = Task(args={})
        mock_agent = MagicMock()

        with pytest.raises(ValueError, match="Cannot provide both"):
            AgentTool(task, model="claude", agent=mock_agent)  # type: ignore[arg-type]

    def test_accepts_model_string(self) -> None:
        """Can create with a model string."""
        task = Task(scenario="test", args={})
        tool = AgentTool(task, model="claude")

        assert tool._model == "claude"
        assert tool._agent_cls is None

    def test_accepts_agent_class(self) -> None:
        """Can create with a custom agent class."""
        task = Task(scenario="test", args={})
        mock_agent_cls = MagicMock()
        tool = AgentTool(task, agent=mock_agent_cls)  # type: ignore[arg-type]

        assert tool._model is None
        assert tool._agent_cls is mock_agent_cls

    def test_name_defaults_to_scenario(self) -> None:
        """Tool name defaults to the scenario name."""
        task = Task(scenario="investigate", args={})
        tool = AgentTool(task, model="claude")

        assert tool.name == "investigate"

    def test_name_can_be_overridden(self) -> None:
        """Tool name can be overridden."""
        task = Task(scenario="investigate", args={})
        tool = AgentTool(task, model="claude", name="custom_name")

        assert tool.name == "custom_name"


class TestAgentToolParamFiltering:
    """Tests for parameter filtering (eval-only params hidden)."""

    def test_filters_eval_only_params(self) -> None:
        """Eval-only params (`| None = None`) are filtered from visible_params."""
        env = Environment("test")

        # Use union syntax for consistency across Python versions
        @env.scenario()
        async def investigate(
            issue_id: str,
            include_traces: bool = True,
            expected_cause: str | None = None,  # Eval-only
        ):
            yield {"task": f"Investigate {issue_id}"}

        task = env("investigate")
        tool = AgentTool(task, model="claude")

        # visible_params should only have issue_id and include_traces
        assert "issue_id" in tool._visible_params
        assert "include_traces" in tool._visible_params
        assert "expected_cause" not in tool._visible_params

    def test_all_required_params_visible(self) -> None:
        """All required params are visible."""
        env = Environment("test")

        @env.scenario()
        async def search(query: str, limit: int):
            yield {"task": f"Search: {query}"}

        task = env("search")
        tool = AgentTool(task, model="claude")

        assert "query" in tool._visible_params
        assert "limit" in tool._visible_params

    def test_optional_with_default_visible(self) -> None:
        """Optional params with non-None defaults are visible."""
        env = Environment("test")

        @env.scenario()
        async def fetch(url: str, request_timeout: int = 30, retries: int = 3):
            yield {"task": f"Fetch {url}"}

        task = env("fetch")
        tool = AgentTool(task, model="claude")

        assert "url" in tool._visible_params
        assert "request_timeout" in tool._visible_params
        assert "retries" in tool._visible_params


class TestAgentToolSchema:
    """Tests for JSON schema generation."""

    def test_builds_json_schema(self) -> None:
        """Builds a proper JSON schema from visible params."""
        env = Environment("test")

        @env.scenario()
        async def investigate(issue_id: str, verbose: bool = False):
            yield {"task": f"Investigate {issue_id}"}

        task = env("investigate")
        tool = AgentTool(task, model="claude")

        schema = tool._param_schema
        assert schema is not None
        assert schema["type"] == "object"
        assert "issue_id" in schema["properties"]
        assert "verbose" in schema["properties"]
        assert "issue_id" in schema["required"]
        assert "verbose" not in schema["required"]  # Has a default

    def test_schema_excludes_eval_only(self) -> None:
        """Schema excludes eval-only params."""
        env = Environment("test")

        @env.scenario()
        async def check(
            item_id: str,
            expected_status: str | None = None,  # Eval-only
        ):
            yield {"task": f"Check {item_id}"}

        task = env("check")
        tool = AgentTool(task, model="claude")

        schema = tool._param_schema
        assert schema is not None
        assert "item_id" in schema["properties"]
        assert "expected_status" not in schema["properties"]


class TestAgentToolMCP:
    """Tests for MCP tool integration."""

    def test_mcp_property_returns_tool(self) -> None:
        """The mcp property returns a FastMCP FunctionTool."""
        from fastmcp.tools import FunctionTool

        env = Environment("test")

        @env.scenario()
        async def greet(name: str):
            yield {"task": f"Greet {name}"}

        task = env("greet")
        tool = AgentTool(task, model="claude")

        mcp_tool = tool.mcp
        assert isinstance(mcp_tool, FunctionTool)

    def test_mcp_has_filtered_parameters(self) -> None:
        """MCP tool has the filtered parameter schema."""
        env = Environment("test")

        @env.scenario()
        async def analyze(
            data: str,
            expected_result: str | None = None,  # Eval-only
        ):
            yield {"task": f"Analyze {data}"}

        task = env("analyze")
        tool = AgentTool(task, model="claude")

        mcp_tool = tool.mcp
        params = mcp_tool.parameters  # FunctionTool uses 'parameters'

        assert "data" in params["properties"]
        assert "expected_result" not in params["properties"]

class TestAgentToolCall:
    """Tests for AgentTool.__call__."""

    @pytest.mark.asyncio
    async def test_filters_kwargs_to_visible_only(self) -> None:
        """Call filters kwargs to visible params only."""
        # Import modules first so patches work
        import hud.agents
        import hud.eval.manager  # noqa: F401

        env = Environment("test")

        @env.scenario()
        async def process(item: str, expected: str | None = None):
            yield {"task": f"Process {item}"}

        task = env("process")
        tool = AgentTool(task, model="claude")

        # Mock the eval context and agent
        with (
            patch("hud.eval.manager.run_eval") as mock_run_eval,
            patch("hud.agents.create_agent") as mock_create_agent,
        ):
            mock_ctx = AsyncMock()
            mock_ctx.__aenter__ = AsyncMock(return_value=mock_ctx)
            mock_ctx.__aexit__ = AsyncMock(return_value=None)
            mock_run_eval.return_value = mock_ctx

            mock_agent = MagicMock()
            mock_agent.run = AsyncMock(return_value=MagicMock(content="result"))
            mock_create_agent.return_value = mock_agent

            # Call with both visible and eval-only params
            await tool(item="test", expected="should_be_filtered")

            # Check that the task was created with filtered args
            call_args = mock_run_eval.call_args
            task_arg = call_args[0][0]
            assert "item" in task_arg.args
            assert "expected" not in task_arg.args  # Filtered out

    @pytest.mark.asyncio
    async def test_merges_template_args(self) -> None:
        """Call merges kwargs with template args."""
        # Import modules first so patches work
        import hud.agents
        import hud.eval.manager  # noqa: F401

        env = Environment("test")

        @env.scenario()
        async def search(query: str, limit: int = 10):
            yield {"task": f"Search {query}"}

        # Create a template with some args pre-filled
        task = env("search", limit=5)
        tool = AgentTool(task, model="claude")

        with (
            patch("hud.eval.manager.run_eval") as mock_run_eval,
            patch("hud.agents.create_agent") as mock_create_agent,
        ):
            mock_ctx = AsyncMock()
            mock_ctx.__aenter__ = AsyncMock(return_value=mock_ctx)
            mock_ctx.__aexit__ = AsyncMock(return_value=None)
            mock_run_eval.return_value = mock_ctx

            mock_agent = MagicMock()
            mock_agent.run = AsyncMock(return_value=MagicMock(content="result"))
            mock_create_agent.return_value = mock_agent

            # Call with an additional arg
            await tool(query="test query")

            # Check the merged args
            call_args = mock_run_eval.call_args
            task_arg = call_args[0][0]
            assert task_arg.args["query"] == "test query"
            assert task_arg.args["limit"] == 5  # From the template

diff --git a/hud/utils/strict_schema.py b/hud/utils/strict_schema.py
index 5d3fa0da..263919b3 100644
--- a/hud/utils/strict_schema.py
+++ b/hud/utils/strict_schema.py
@@ -118,7 +118,7 @@ def _ensure_strict_json_schema(
     if "default" in json_schema:
         json_schema.pop("default")

-    for keyword in ("title", "examples"):
+    for keyword in ("title", "examples", "format"):
         json_schema.pop(keyword, None)

     ref = json_schema.get("$ref")
diff --git a/hud/utils/tests/test_version.py b/hud/utils/tests/test_version.py
index 8ba18f35..1d40fb59 100644
--- a/hud/utils/tests/test_version.py
+++ b/hud/utils/tests/test_version.py
@@ -5,4 +5,4 @@ def test_import():
     """Test that the package can be imported."""
     import hud

-    assert hud.__version__ == "0.5.2"
+    assert hud.__version__ == "0.5.3"
diff --git a/hud/version.py b/hud/version.py
index 6fbebfff..c9ba33a6 100644
--- a/hud/version.py
+++ b/hud/version.py
@@ -4,4 +4,4 @@
 from __future__ import annotations

-__version__ = "0.5.2"
+__version__ = "0.5.3"
diff --git a/pyproject.toml b/pyproject.toml
index 3cb5c7d3..b1ad8b5a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "hud-python"
-version = "0.5.2"
+version = "0.5.3"
 description = "SDK for the HUD platform."
 readme = "README.md"
 requires-python = ">=3.11, <3.13"
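
The `strict_schema` change above is small but behavior-affecting: when coercing a tool schema to a strict JSON schema, `format` is now stripped along with `title` and `examples` (a `default`, if present, is already removed just before this loop), likely because some providers reject `format` in strict tool schemas. A minimal sketch of the effect on a hypothetical field schema:

```python
# Hypothetical input: a Pydantic-style field schema that uses `format`
schema = {
    "type": "string",
    "format": "uri",      # now stripped, as of this change
    "title": "Endpoint",  # stripped before this change too
    "examples": ["https://example.com"],
}

# The same loop as in _ensure_strict_json_schema; only the core type survives
for keyword in ("title", "examples", "format"):
    schema.pop(keyword, None)

print(schema)  # {'type': 'string'}
```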