diff --git a/docs/cookbooks/ops-diagnostics.mdx b/docs/cookbooks/ops-diagnostics.mdx
new file mode 100644
index 00000000..558d3d2d
--- /dev/null
+++ b/docs/cookbooks/ops-diagnostics.mdx
@@ -0,0 +1,478 @@
+---
+title: "Ops Diagnostics Agent"
+description: "How we built a hierarchical agent to diagnose production issues across our infrastructure"
+icon: "stethoscope"
+---
+
+At HUD, we run a complex stack: Sentry for errors, Supabase for data, Railway for deployments, and Kubernetes for orchestration. When something breaks, we wanted an agent that could investigate across all services and provide a unified diagnosis.
+
+This cookbook walks through how we built it—focusing on **environment design**, **hierarchical delegation**, and **practical patterns** for production agent systems.
+
+## Why Hierarchical?
+
+When you connect multiple MCP servers to a single environment, the agent sees all tools at once. For diagnostics across four services, this meant 60+ tools in a flat list. The cognitive load made it harder for the model to select the right tool for the job.
+
+We restructured into a hierarchy: an orchestrator that delegates to specialized subagents.
+
+```mermaid
+flowchart TD
+ subgraph orch["Orchestrator"]
+ O["4 subagent tools"]
+ end
+
+ subgraph sentry["Sentry Agent"]
+ S1["search_issues"]
+ S2["get_issue_details"]
+ S3["analyze_with_seer"]
+ end
+
+ subgraph supabase["Supabase Agent"]
+ SU1["list_tables"]
+ SU2["execute_sql"]
+ SU3["get_logs"]
+ end
+
+ subgraph railway["Railway Agent"]
+ R1["list_projects"]
+ R2["get_deployments"]
+ R3["get_logs"]
+ end
+
+ subgraph kubectl["kubectl Agent"]
+ K1["get_pods"]
+ K2["get_events"]
+ K3["describe_pod"]
+ end
+
+ O --> sentry
+ O --> supabase
+ O --> railway
+ O --> kubectl
+```
+
+The orchestrator sees only 4 tools—one per specialist. Each specialist has a focused toolset for its domain.
+
+## Environment Design
+
+Good environment design is the foundation. Each subagent is an `Environment` with:
+- A **focused toolset** (only what's needed for this domain)
+- A **single scenario** that defines the interface
+- **Read-only constraints** for safety
+
+### Connecting to MCP Servers
+
+For services with official MCP servers (Sentry, Supabase), connect via `connect_mcp_config`:
+
+```python
+# environments/sentry.py
+from hud import Environment
+import os
+import platform
+
+sentry_env = Environment(name="sentry-agent")
+
+IS_WINDOWS = platform.system() == "Windows"
+token = os.getenv("SENTRY_AUTH_TOKEN")
+
+if token:
+    config = {
+        "command": "cmd" if IS_WINDOWS else "npx",
+        "args": (
+            ["/c", "npx", "-y", "@sentry/mcp-server@latest"]
+            if IS_WINDOWS
+            else ["-y", "@sentry/mcp-server@latest"]
+        ),
+        "env": {"SENTRY_ACCESS_TOKEN": token},
+    }
+ sentry_env.connect_mcp_config({"sentry": config})
+```
+
+### Custom Tools When Needed
+
+Railway's MCP server requires browser OAuth—not ideal for headless agents. We built custom tools using their GraphQL API:
+
+```python
+# environments/tools/railway.py
+from hud.server import MCPRouter
+import httpx
+import os
+
+router = MCPRouter()
+RAILWAY_API = "https://backboard.railway.com/graphql/v2"
+
+
+async def _graphql(query: str, variables: dict | None = None) -> dict:
+ token = os.getenv("RAILWAY_API_TOKEN")
+ async with httpx.AsyncClient() as client:
+        resp = await client.post(
+            RAILWAY_API,
+            headers={"Authorization": f"Bearer {token}"},
+            json={"query": query, "variables": variables or {}},
+        )
+        resp.raise_for_status()
+        return resp.json()
+
+
+@router.tool()
+async def railway_list_projects() -> dict:
+ """List all projects with their services."""
+ return await _graphql("""
+ query {
+ projects {
+ edges { node { id name } }
+ }
+ }
+ """)
+
+
+@router.tool()
+async def railway_get_deployment_logs(deployment_id: str) -> dict:
+ """Get logs for a deployment."""
+ return await _graphql("""
+ query($id: String!) {
+ deploymentLogs(deploymentId: $id) {
+ ... on Log { message timestamp severity }
+ }
+ }
+ """, {"id": deployment_id})
+```
+
+Then include the router in your environment:
+
+```python
+# environments/railway.py
+from hud import Environment
+from .tools.railway import router
+
+railway_env = Environment(name="railway-agent")
+railway_env.include_router(router)
+```
+
+### Defining the Scenario
+
+The scenario is the contract between orchestrator and subagent:
+
+```python
+@sentry_env.scenario("investigate")
+async def investigate_issue(
+ query: str, # Orchestrator provides this
+ expected_finding: str | None = None, # Hidden from orchestrator (eval-only)
+):
+ """Investigate errors in Sentry."""
+
+ prompt = f"""You are a Sentry specialist. Investigate:
+
+**Query:** {query}
+
+**IMPORTANT: This is a READ-ONLY investigation.**
+
+Provide findings, root cause analysis, and recommended fixes."""
+
+ response = yield prompt
+
+ # Scoring for evals
+ if expected_finding and response:
+ yield 1.0 if expected_finding.lower() in response.lower() else 0.5
+ else:
+ yield 1.0 if response else 0.0
+```
+
+
+**Eval-only parameters**: Parameters with `| None = None` are automatically hidden from the orchestrator's tool schema but available for evaluation scoring.
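+
+For example, a subagent can be scored on its own by passing the hidden parameter when building the task. A minimal sketch (the query and expected finding are illustrative):
+
+```python
+from hud.agents import create_agent
+import hud
+
+# expected_finding feeds the scenario's scoring logic but never
+# appears in the orchestrator's tool schema.
+task = sentry_env(
+    "investigate",
+    query="429 Too Many Requests on pod deletion",
+    expected_finding="rate limit",
+)
+
+async with hud.eval(task) as ctx:
+    agent = create_agent("claude-sonnet-4-5")
+    await agent.run(ctx, max_steps=10)
+```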
+
+
+## Building the Orchestrator
+
+The orchestrator wraps each subagent's scenario as an `AgentTool`:
+
+```python
+# orchestrator.py
+from hud import Environment
+from hud.tools import AgentTool
+from hud.agents import create_agent
+import hud
+
+from environments import sentry_env, supabase_env, railway_env, kubectl_env
+
+
+async def diagnose(query: str, model: str = "claude-sonnet-4-5"):
+ orchestrator = Environment(name="ops-orchestrator")
+
+ # Wrap each subagent as a tool
+ for name, env, desc in [
+ ("investigate_sentry", sentry_env, "Check error monitoring"),
+ ("investigate_supabase", supabase_env, "Check database/auth"),
+ ("investigate_railway", railway_env, "Check deployments"),
+ ("investigate_kubernetes", kubectl_env, "Check cluster health"),
+ ]:
+ tool = AgentTool(
+ env("investigate"),
+ model=model,
+ name=name,
+ description=desc,
+ )
+ orchestrator.add_tool(tool.mcp)
+
+ @orchestrator.scenario("diagnose")
+ async def run_diagnosis(issue: str):
+ yield f"""You are an ops diagnostics orchestrator.
+
+**Issue:** {issue}
+
+You have READ-ONLY subagents for Sentry, Supabase, Railway, and Kubernetes.
+Investigate systematically and correlate findings across services."""
+
+ task = orchestrator("diagnose", issue=query)
+
+ async with hud.eval(task) as ctx:
+ agent = create_agent(model)
+ return await agent.run(ctx, max_steps=20)
+```
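+
+To invoke this from the command line as in the sample run below, a minimal entry point might look like this (argument parsing is illustrative):
+
+```python
+if __name__ == "__main__":
+    import argparse
+    import asyncio
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("query", help="Issue description to diagnose")
+    parser.add_argument("--model", default="claude-sonnet-4-5")
+    cli_args = parser.parse_args()
+
+    asyncio.run(diagnose(cli_args.query, model=cli_args.model))
+```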
+
+### Trace Continuity
+
+All subagent activity appears in a single trace on the HUD platform. When the orchestrator calls a subagent tool, the inference and tool calls are recorded under the parent trace—no separate URLs to track.
+
+## The READ-ONLY Constraint
+
+
+Because this agent runs directly against our production systems, every scenario enforces a read-only constraint. We removed mutation tools like `kubectl_exec`, `railway_redeploy`, and Supabase DDL operations.
+
+Every prompt includes: **"This is a READ-ONLY investigation."**
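+
+One way to build the kubectl toolset under this constraint is to shell out to `kubectl` with read verbs only. A minimal sketch (assumes cluster credentials are already configured; tool names match the diagram above, and the implementation is illustrative):
+
+```python
+# environments/tools/kubectl.py
+import asyncio
+
+from hud.server import MCPRouter
+
+router = MCPRouter()
+
+
+async def _kubectl(*args: str) -> str:
+    # Helper only ever receives read verbs (get, describe).
+    proc = await asyncio.create_subprocess_exec(
+        "kubectl", *args,
+        stdout=asyncio.subprocess.PIPE,
+        stderr=asyncio.subprocess.STDOUT,
+    )
+    out, _ = await proc.communicate()
+    return out.decode()
+
+
+@router.tool()
+async def get_pods(namespace: str = "default") -> str:
+    """List pods in a namespace (read-only)."""
+    return await _kubectl("get", "pods", "-n", namespace)
+
+
+@router.tool()
+async def describe_pod(name: str, namespace: str = "default") -> str:
+    """Describe a single pod (read-only)."""
+    return await _kubectl("describe", "pod", name, "-n", namespace)
+```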
+
+
+## Sample Output
+
+Running against a real production issue:
+
+```bash
+python orchestrator.py --model claude-sonnet-4-5 \
+ "Failed to delete pod: 429 Too Many Requests. 7451 events, escalating."
+```
+
+The orchestrator delegates to `investigate_sentry`, `investigate_railway`, and `investigate_supabase`, then correlates findings across services. After about 5 minutes:
+
+```text Diagnosis
+COMPREHENSIVE DIAGNOSIS REPORT
+
+Issue Summary
+ - Error: Failed to delete pod ████████████████████████████████████: 429 Too Many Requests
+ - Impact: 7,451 events over 5 days, 16 users affected, escalating state
+ - Project: Orchestrator / mcp-server
+ - Alert ID: ORCHESTRATOR-AC
+
+ROOT CAUSE ANALYSIS
+
+ Primary Root Cause: Kubernetes API Rate Limiting
+
+ The orchestrator service is hitting Kubernetes API server rate limits when
+ attempting to delete pods at scale. This is occurring in the
+ ████████.hud_gym.utils.kubernetes module.
+
+ Key Contributing Factors:
+
+ 1. Excessive Deletion Frequency: ~1,491 errors/day (~62/hour) indicates
+ aggressive pod deletion attempts
+ 2. No Retry/Backoff Logic: Code lacks exponential backoff when encountering
+ 429 responses
+ 3. High Concurrency: Service runs with 50 uvicorn workers + 32 Railway
+ replicas, amplifying concurrent API calls
+ 4. Burst Traffic Pattern: Correlated with API usage spikes (313 inference
+ calls/minute at peak)
+ 5. No Client-Side Rate Limiting: Kubernetes client not configured with QPS
+ limits
+
+CORRELATED FINDINGS ACROSS SERVICES
+
+ Sentry (Error Tracking)
+ - 7,455 occurrences of the 429 error between ██████████████
+ - Last occurrence: ████████████████████
+ - Error originates from: ████████.hud_gym.utils.kubernetes logger
+ - Associated with HTTP PATCH to Supabase /rest/v1/environments endpoint
+ - Part of environment update/cleanup workflow
+
+ Railway (Deployment Platform)
+ - Production service: 32 replicas in us-west2
+ - Latest successful deployment: ████████████████████ (30 min AFTER last
+ Sentry error)
+ - Historical failures (██████): AWS EKS credential issues (now resolved)
+ - No current rate limiting errors in deployment logs
+ - Pod deletions working normally post-fix
+
+ Supabase (Database/API)
+ - API burst traffic spike: 313 calls/minute at ████████████████████
+ - ████ Team (22 members, free tier): 15,933 inference calls/24h - prime
+ candidate for "16 users"
+ - Connection pool saturation: 49 waiting connections out of 52
+ - Security vulnerabilities: 38 tables with RLS enabled but NO policies
+ - Performance issues: 52 unindexed foreign keys, inefficient RLS policies
+ - 429 errors occur at API gateway layer (not visible in Postgres logs)
+
+ Kubernetes
+ - Investigation unavailable due to response size (cluster likely healthy
+ but under load)
+
+CORRELATION & TIMELINE
+
+ ██████████████: 7,455 pod deletion failures (continuous)
+ - ████████████████████: Last 429 error recorded in Sentry
+ - ████████████████████: New production deployment (likely contained fix)
+ - ████████████████████: API traffic spike (313 req/min)
+
+ Pattern Identified:
+
+ 1. Orchestrator creates ephemeral pods for task execution (inference
+ workloads)
+ 2. High inference API traffic (15,933 calls/day from ████ team) triggers
+ frequent pod creation
+ 3. Cleanup attempts to delete pods rapidly after task completion
+ 4. Kubernetes API rate limits exceeded due to:
+ - 50 uvicorn workers x 32 Railway replicas = 1,600 concurrent processes
+ - No client-side rate limiting or request coordination
+ - No exponential backoff on 429 responses
+ 5. Failed deletions likely retry aggressively, compounding the problem
+
+RECOMMENDED ACTIONS FOR HUMANS
+
+ CRITICAL - Immediate Actions
+
+ 1. Implement Exponential Backoff
+ Location: ████████.hud_gym.utils.kubernetes
+
+ def delete_pod_with_retry(api_client, pod_name, namespace, max_retries=5):
+ for attempt in range(max_retries):
+ try:
+ api_client.delete_namespaced_pod(pod_name, namespace)
+ return True
+ except ApiException as e:
+ if e.status == 429 and attempt < max_retries - 1:
+ wait_time = (2 ** attempt) + random.uniform(0, 1)
+ logger.warning(f"Rate limited, retrying in {wait_time:.1f}s")
+ time.sleep(wait_time)
+ continue
+ raise
+ return False
+
+ 2. Configure Kubernetes Client Rate Limiting
+
+ configuration = client.Configuration()
+ configuration.qps = 5.0 # Max 5 queries per second per client
+ configuration.burst = 10 # Allow bursts up to 10
+
+ 3. Add Concurrency Control
+
+ k8s_api_semaphore = Semaphore(10) # Max 10 concurrent API calls
+
+ async def delete_pod_rate_limited(pod_name, namespace):
+ async with k8s_api_semaphore:
+ return await delete_pod_with_retry(pod_name, namespace)
+
+ HIGH PRIORITY - Within 48 Hours
+
+ 4. Optimize Worker Configuration
+ - Current: 50 uvicorn workers x 32 Railway replicas = 1,600 processes
+ - Recommendation: Reduce uvicorn workers to 10-20 per replica
+ - Why: Excessive concurrency amplifies K8s API load
+
+ 5. Implement Pod Deletion Queue
+ - Use background queue (Redis, Celery) for pod deletions
+ - Process deletions with controlled rate (e.g., 100/minute globally)
+ - Provides visibility into deletion backlog
+
+ 6. Fix Supabase Security Issues
+ - URGENT: Add RLS policies to 38 tables currently without policies
+ - Enable leaked password protection
+ - Reduce OTP expiry to < 1 hour
+ - Index 52 foreign keys for query performance
+ - Remove 5 duplicate indexes
+
+ 7. Upgrade ████ Team or Implement Graduated Rate Limits
+ - ████ team (22 members, free tier) using 15,933 API calls/day
+ (enterprise-level)
+ - Either upgrade to paid tier or implement request throttling
+ - Add monitoring for teams exceeding tier limits
+
+ MEDIUM PRIORITY - Within 1 Week
+
+ 8. Add Monitoring & Alerting
+ - Track pod deletion success/failure rates
+ - Monitor K8s API rate limit headers (X-RateLimit-Remaining)
+ - Alert when deletion failure rate > 5%
+ - Add dashboards for pod lifecycle metrics
+
+ 9. Implement Circuit Breaker Pattern
+
+ k8s_breaker = CircuitBreaker(fail_max=5, timeout_duration=60)
+
+ @k8s_breaker
+ def delete_pod_protected(pod_name, namespace):
+ return delete_pod_with_retry(pod_name, namespace)
+
+ 10. Optimize Pod Lifecycle
+ - Review if pods can be longer-lived (reduce churn)
+ - Consider pod pooling/reuse for similar tasks
+ - Use K8s native garbage collection where possible
+ - Set propagationPolicy=Background for async cleanup
+
+ 11. Fix Supabase Connection Pool
+ - Switch auth server to percentage-based connection allocation
+ - Current: 49 waiting connections out of 52 (saturation)
+ - Monitor connection wait times and adjust pool size
+
+ LOW PRIORITY - Technical Debt
+
+ 12. Update Deprecated Dependencies
+ - Replace close() with aclose() for Redis connections
+ - Update Supabase client for new parameter configuration
+ - Address deprecation warnings in logs
+
+ 13. Add Request Coalescing
+ - Batch multiple pod deletions into single API calls where possible
+ - Implement request deduplication for identical operations
+
+VALIDATION STEPS
+
+ After implementing fixes, validate with:
+
+ 1. Sentry: Monitor ORCHESTRATOR-AC for decreased error frequency (target: 0
+ errors)
+ 2. Kubernetes: Check API server metrics for reduced throttling events
+ 3. Railway: Verify pod deletion logs show successful operations
+ 4. Supabase: Confirm API traffic patterns stay within rate limits
+ 5. Metrics: Track pod deletion latency and success rate
+
+COMMIT MESSAGE TEMPLATE
+
+ fix: implement exponential backoff for K8s pod deletions
+
+ - Add retry logic with exponential backoff for 429 errors
+ - Configure client-side rate limiting (5 QPS, 10 burst)
+ - Add concurrency control with semaphore (max 10 concurrent)
+ - Reduce uvicorn workers from 50 to 20 per replica
+
+ Fixes ORCHESTRATOR-AC
+ Resolves rate limiting issues affecting 16 users over 5 days
+
+SUCCESS CRITERIA
+
+ - Zero 429 errors in Sentry for 7 consecutive days
+ - Pod deletion success rate > 99.9%
+ - Average deletion latency < 2 seconds
+ - No user-facing impact from pod lifecycle operations
+ - Supabase API calls stay within tier limits
+
+Investigation Status: Complete
+Next Review: After fix deployment (monitor for 48 hours)
+```
+
+The entire investigation—from initial query to actionable recommendations—took about 5 minutes across the specialized subagents.
+
+## What We Learned
+
+1. **Environment design matters.** A focused toolset per domain outperforms a flat list of everything.
+
+2. **Scenarios are contracts.** They define what the orchestrator can ask and what the subagent returns.
+
+3. **Custom tools fill gaps.** When MCP servers don't fit your auth model, build direct API integrations.
+
+## See Also
+
+- [AgentTool Reference](/reference/tools#agenttool)
+- [Building Environments](/build-environments)
+- [Scenarios](/reference/environments#scenarios)
diff --git a/docs/docs.json b/docs/docs.json
index 114ba090..d3f7332c 100644
--- a/docs/docs.json
+++ b/docs/docs.json
@@ -33,7 +33,7 @@
"icon": "code",
"versions": [
{
- "version": "0.5.2",
+ "version": "0.5.3",
"groups": [
{
"group": "Get Started",
@@ -63,7 +63,8 @@
{
"group": "Cookbooks",
"pages": [
- "cookbooks/codex-coding"
+ "cookbooks/codex-coding",
+ "cookbooks/ops-diagnostics"
]
},
{
diff --git a/docs/reference/environments.mdx b/docs/reference/environments.mdx
index 1e6fc107..21fc0ce7 100644
--- a/docs/reference/environments.mdx
+++ b/docs/reference/environments.mdx
@@ -266,6 +266,81 @@ env.unmock() # Disable mock mode
| `mock_tool(name, output)` | Set specific mock output |
| `is_mock` | Check if mock mode is enabled |
+## Serving as MCP Server
+
+An `Environment` can serve its tools over MCP, either as a standalone server or mounted on an existing ASGI application.
+
+### serve()
+
+Start a standalone MCP server:
+
+```python
+from hud import Environment
+
+env = Environment("my-env")
+
+@env.tool()
+def greet(name: str) -> str:
+ return f"Hello, {name}!"
+
+# Run as MCP server (blocking)
+env.serve()
+```
+
+| Parameter | Type | Description | Default |
+|-----------|------|-------------|---------|
+| `transport` | `Literal["stdio", "sse", "streamable-http"]` | Transport protocol | `"streamable-http"` |
+| `host` | `str` | Host address to bind | `"0.0.0.0"` |
+| `port` | `int` | Port to bind | `8000` |
+
+```python
+# Serve over stdio (for CLI tools)
+env.serve(transport="stdio")
+
+# Serve over HTTP on custom port
+env.serve(transport="streamable-http", host="0.0.0.0", port=8765)
+```
+
+### http_app()
+
+Get a Starlette/ASGI app to mount on an existing FastAPI server:
+
+```python
+from fastapi import FastAPI
+from hud import Environment
+
+app = FastAPI()
+env = Environment("my-env")
+
+@env.tool()
+def my_tool(arg: str) -> str:
+ return f"Got: {arg}"
+
+# Mount the HUD environment's MCP endpoint at /mcp
+app.mount("/mcp", env.http_app())
+
+# Your other FastAPI routes work normally
+@app.get("/health")
+def health():
+ return {"status": "ok"}
+```
+
+| Parameter | Type | Description | Default |
+|-----------|------|-------------|---------|
+| `path` | `str \| None` | Internal path for the MCP endpoint | `"/"` |
+| `transport` | `Literal["http", "streamable-http", "sse"]` | Transport protocol | `"http"` |
+| `middleware` | `list[ASGIMiddleware] \| None` | Starlette middleware | `None` |
+| `json_response` | `bool \| None` | Use JSON response format | `None` |
+| `stateless_http` | `bool \| None` | Use stateless HTTP mode | `None` |
+
+MCP clients can then connect at `http://your-server/mcp`:
+
+```python
+# Client connecting to mounted environment
+env.connect_url("http://localhost:8000/mcp")
+```
+
+
## Properties
| Property | Type | Description |
diff --git a/docs/reference/tools.mdx b/docs/reference/tools.mdx
index 3d12e0b9..bf5b208c 100644
--- a/docs/reference/tools.mdx
+++ b/docs/reference/tools.mdx
@@ -69,6 +69,93 @@ async def url_match(url: str) -> EvaluationResult:
# Agents call: evaluators(name="url_match", arguments={"url": "..."})
```
+## Agent Tools
+
+### AgentTool
+
+```python
+from hud.tools import AgentTool
+```
+
+Wraps a scenario as a tool that can be called by another agent. Essential for building **hierarchical agent systems** where an orchestrator delegates to specialized subagents.
+
+**Constructor Parameters:**
+| Parameter | Type | Description | Default |
+|-----------|------|-------------|---------|
+| `task` | `Task` | Task template from `env("scenario_name")` | Required |
+| `model` | `str` | Model for subagent (via gateway) | `None` |
+| `agent` | `type[MCPAgent]` | Custom agent class | `None` |
+| `agent_params` | `dict` | Additional agent parameters | `{}` |
+| `name` | `str` | Tool name for orchestrator | From scenario |
+| `description` | `str` | Tool description | Auto-generated |
+| `trace` | `bool` | Enable tracing for standalone runs | `False` |
+
+Must provide either `model` or `agent`, not both.
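+
+To use a custom agent class instead of a gateway model, pass `agent`. A sketch (`ClaudeAgent` requires the optional `anthropic` dependency):
+
+```python
+from hud.agents.claude import ClaudeAgent
+
+tool = AgentTool(
+    env("investigate"),
+    agent=ClaudeAgent,
+    agent_params={"model": "claude-sonnet-4-5"},
+)
+```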
+
+**Eval-Only Parameters:**
+
+Parameters with `| None = None` are hidden from the orchestrator but available for evaluation:
+
+```python
+@env.scenario("investigate")
+async def investigate(
+ query: str, # Visible - orchestrator passes this
+ expected_finding: str | None = None, # Hidden - only used in eval scoring
+):
+ response = yield f"Investigate: {query}"
+
+ # Scoring uses expected_finding but orchestrator never sees it
+ if expected_finding and response:
+ yield 1.0 if expected_finding in response else 0.5
+ else:
+ yield 1.0 if response else 0.0
+```
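+
+The schema the orchestrator sees then contains only the visible parameter, roughly:
+
+```python
+# Generated tool schema (illustrative) - eval-only params are filtered out
+{
+    "type": "object",
+    "properties": {"query": {"type": "string"}},
+    "required": ["query"],
+}
+```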
+
+**Usage:**
+```python
+from hud import Environment
+from hud.tools import AgentTool
+
+# Subagent environment with scenario
+sentry_env = Environment(name="sentry-agent")
+
+@sentry_env.scenario("investigate")
+async def investigate_sentry(query: str):
+ yield f"Investigate Sentry: {query}"
+
+# Create orchestrator
+orchestrator = Environment(name="orchestrator")
+
+# Wrap subagent scenario as tool
+tool = AgentTool(
+ sentry_env("investigate"), # Task template
+ model="gpt-4o-mini",
+ name="investigate_sentry",
+ description="Investigate errors in Sentry",
+)
+orchestrator.add_tool(tool.mcp)
+
+# Now orchestrator agent can call investigate_sentry(query="...")
+```
+
+**Trace Continuity:**
+
+When called from within an eval context, AgentTool automatically:
+1. Inherits the parent's trace_id
+2. Skips duplicate trace registration
+3. Routes all inference/tool calls to the parent trace
+
+```python
+async with hud.eval(task) as ctx:
+ agent = create_agent("gpt-4o")
+ result = await agent.run(ctx)
+ # All subagent activity appears in this single trace
+```
+
+**See Also:** [Ops Diagnostics Cookbook](/cookbooks/ops-diagnostics) for a complete hierarchical agent example.
+
+---
+
## Core Tools
### BashTool
diff --git a/hud/agents/__init__.py b/hud/agents/__init__.py
index 547d876b..03f9512a 100644
--- a/hud/agents/__init__.py
+++ b/hud/agents/__init__.py
@@ -1,19 +1,82 @@
from __future__ import annotations
+from typing import Any
+
from .base import MCPAgent
from .openai import OpenAIAgent
from .openai_chat import OpenAIChatAgent
from .operator import OperatorAgent
-# Note: These agents are not exported here to avoid requiring optional dependencies.
-# Import directly if needed:
-# from hud.agents.claude import ClaudeAgent # requires anthropic
-# from hud.agents.gemini import GeminiAgent # requires google-genai
-# from hud.agents.gemini_cua import GeminiCUAAgent # requires google-genai
-
__all__ = [
"MCPAgent",
"OpenAIAgent",
"OpenAIChatAgent",
"OperatorAgent",
+ "create_agent",
]
+
+
+def create_agent(model: str, **kwargs: Any) -> MCPAgent:
+ """Create an agent for a gateway model.
+
+ This routes ALL requests through the HUD gateway. For direct API access
+ (using your own API keys), use the agent classes directly.
+
+ Args:
+ model: Model name (e.g., "gpt-4o", "claude-sonnet-4-5").
+ **kwargs: Additional params passed to agent.create().
+
+ Returns:
+ Configured MCPAgent instance with gateway routing.
+
+ Example:
+ ```python
+ # Gateway routing (recommended)
+ agent = create_agent("gpt-4o")
+ agent = create_agent("claude-sonnet-4-5", temperature=0.7)
+
+ # Direct API access (use agent classes)
+ from hud.agents.claude import ClaudeAgent
+
+ agent = ClaudeAgent.create(model="claude-sonnet-4-5")
+ ```
+ """
+ from hud.agents.gateway import build_gateway_client
+ from hud.agents.resolver import resolve_cls
+
+ # Resolve class and gateway info
+ agent_cls, gateway_info = resolve_cls(model)
+
+ # Get model ID from gateway info or use input
+ model_id = model
+ if gateway_info:
+ model_id = gateway_info.get("model") or gateway_info.get("id") or model
+
+ # Determine provider: from gateway info, or infer from agent class
+ if gateway_info:
+ provider = gateway_info.get("provider") or "openai"
+ else:
+ # Map agent class to provider for known types
+ from hud.agents.claude import ClaudeAgent
+ from hud.agents.gemini import GeminiAgent
+
+ _AGENT_TO_PROVIDER = {
+ ClaudeAgent: "anthropic",
+ GeminiAgent: "google",
+ }
+ provider = _AGENT_TO_PROVIDER.get(agent_cls, "openai")
+
+ client = build_gateway_client(provider)
+
+ # Set up kwargs
+ kwargs.setdefault("model", model_id)
+
+ # Use correct client key based on agent type
+ if agent_cls == OpenAIChatAgent:
+ kwargs.setdefault("openai_client", client)
+ else:
+ # Claude and other agents use model_client and validate_api_key
+ kwargs.setdefault("model_client", client)
+ kwargs.setdefault("validate_api_key", False)
+
+ return agent_cls.create(**kwargs)
diff --git a/hud/agents/gateway.py b/hud/agents/gateway.py
new file mode 100644
index 00000000..4d0973f8
--- /dev/null
+++ b/hud/agents/gateway.py
@@ -0,0 +1,42 @@
+"""Gateway client utilities for HUD inference gateway."""
+
+from __future__ import annotations
+
+from typing import Any
+
+
+def build_gateway_client(provider: str) -> Any:
+ """Build a client configured for HUD gateway routing.
+
+ Args:
+ provider: Provider name ("anthropic", "openai", "gemini", etc.)
+
+ Returns:
+ Configured async client for the provider.
+ """
+ from hud.settings import settings
+
+ provider = provider.lower()
+
+ if provider == "anthropic":
+ from anthropic import AsyncAnthropic
+
+ return AsyncAnthropic(api_key=settings.api_key, base_url=settings.hud_gateway_url)
+
+ if provider == "gemini":
+ from google import genai
+ from google.genai.types import HttpOptions
+
+ return genai.Client(
+ api_key="PLACEHOLDER",
+ http_options=HttpOptions(
+ api_version="v1beta",
+ base_url=settings.hud_gateway_url,
+ headers={"Authorization": f"Bearer {settings.api_key}"},
+ ),
+ )
+
+ # OpenAI-compatible (openai, azure, together, groq, fireworks, etc.)
+ from openai import AsyncOpenAI
+
+ return AsyncOpenAI(api_key=settings.api_key, base_url=settings.hud_gateway_url)
diff --git a/hud/agents/resolver.py b/hud/agents/resolver.py
new file mode 100644
index 00000000..80351800
--- /dev/null
+++ b/hud/agents/resolver.py
@@ -0,0 +1,70 @@
+"""Model resolution - maps model strings to agent classes."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+ from hud.agents.base import MCPAgent
+
+__all__ = ["resolve_cls"]
+
+_models_cache: list[dict[str, Any]] | None = None
+
+# Provider name → AgentType value (only anthropic differs)
+_PROVIDER_TO_AGENT = {"anthropic": "claude"}
+
+
+def _fetch_gateway_models() -> list[dict[str, Any]]:
+ """Fetch available models from HUD gateway (cached)."""
+ global _models_cache
+ if _models_cache is not None:
+ return _models_cache
+
+ import httpx
+
+ from hud.settings import settings
+
+ if not settings.api_key:
+ return []
+
+ try:
+ resp = httpx.get(
+ f"{settings.hud_gateway_url}/models",
+ headers={"Authorization": f"Bearer {settings.api_key}"},
+ timeout=10.0,
+ )
+ resp.raise_for_status()
+ data = resp.json()
+ _models_cache = data.get("data", data) if isinstance(data, dict) else data
+ return _models_cache or []
+ except Exception:
+ return []
+
+
+def resolve_cls(model: str) -> tuple[type[MCPAgent], dict[str, Any] | None]:
+ """Resolve model string to (agent_class, gateway_info).
+
+ Returns:
+ (agent_class, None) for known AgentTypes
+ (agent_class, gateway_model_info) for gateway models
+ """
+ from hud.types import AgentType
+
+ # Known AgentType → no gateway info
+ try:
+ return AgentType(model).cls, None
+ except ValueError:
+ pass
+
+ # Gateway lookup
+ for m in _fetch_gateway_models():
+ if model in (m.get("id"), m.get("name"), m.get("model")):
+ provider = (m.get("provider") or "openai_compatible").lower()
+ agent_str = _PROVIDER_TO_AGENT.get(provider, provider)
+ try:
+ return AgentType(agent_str).cls, m
+ except ValueError:
+ return AgentType.OPENAI_COMPATIBLE.cls, m
+
+ raise ValueError(f"Model '{model}' not found")
diff --git a/hud/agents/tests/test_resolver.py b/hud/agents/tests/test_resolver.py
new file mode 100644
index 00000000..04e6f51e
--- /dev/null
+++ b/hud/agents/tests/test_resolver.py
@@ -0,0 +1,192 @@
+"""Tests for model resolution and create_agent."""
+
+from __future__ import annotations
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from hud.agents import create_agent
+from hud.agents.resolver import resolve_cls
+
+
+class TestResolveCls:
+ """Tests for resolve_cls function."""
+
+ def test_resolves_known_agent_type(self) -> None:
+ """Known AgentType strings resolve to their class."""
+ from hud.agents.claude import ClaudeAgent
+
+ cls, gateway_info = resolve_cls("claude")
+ assert cls == ClaudeAgent
+ assert gateway_info is None
+
+ def test_resolves_openai(self) -> None:
+ """Resolves 'openai' to OpenAIAgent."""
+ from hud.agents import OpenAIAgent
+
+ cls, _gateway_info = resolve_cls("openai")
+ assert cls == OpenAIAgent
+
+ def test_resolves_gemini(self) -> None:
+ """Resolves 'gemini' to GeminiAgent."""
+ from hud.agents.gemini import GeminiAgent
+
+ cls, _gateway_info = resolve_cls("gemini")
+ assert cls == GeminiAgent
+
+ def test_unknown_model_without_gateway_raises(self) -> None:
+ """Unknown model with no gateway models raises ValueError."""
+ with (
+ patch("hud.agents.resolver._fetch_gateway_models", return_value=[]),
+ pytest.raises(ValueError, match="not found"),
+ ):
+ resolve_cls("unknown-model-xyz")
+
+ def test_resolves_gateway_model(self) -> None:
+ """Resolves model found in gateway."""
+ from hud.agents import OpenAIAgent
+
+ mock_models = [
+ {"id": "gpt-4o", "model": "gpt-4o", "provider": "openai"},
+ ]
+
+ with patch("hud.agents.resolver._fetch_gateway_models", return_value=mock_models):
+ cls, info = resolve_cls("gpt-4o")
+ assert cls == OpenAIAgent
+ assert info is not None
+ assert info["id"] == "gpt-4o"
+
+ def test_resolves_anthropic_provider_to_claude(self) -> None:
+ """Provider 'anthropic' maps to ClaudeAgent."""
+ from hud.agents.claude import ClaudeAgent
+
+ mock_models = [
+ {"id": "claude-sonnet", "model": "claude-3-sonnet", "provider": "anthropic"},
+ ]
+
+ with patch("hud.agents.resolver._fetch_gateway_models", return_value=mock_models):
+ cls, _info = resolve_cls("claude-sonnet")
+ assert cls == ClaudeAgent
+
+ def test_resolves_unknown_provider_to_openai_compatible(self) -> None:
+ """Unknown provider maps to OpenAIChatAgent."""
+ from hud.agents.openai_chat import OpenAIChatAgent
+
+ mock_models = [
+ {"id": "custom-model", "model": "custom", "provider": "custom-provider"},
+ ]
+
+ with patch("hud.agents.resolver._fetch_gateway_models", return_value=mock_models):
+ cls, _info = resolve_cls("custom-model")
+ assert cls == OpenAIChatAgent
+
+
+class TestCreateAgent:
+ """Tests for create_agent function - gateway-only."""
+
+ def test_creates_with_gateway_client(self) -> None:
+ """create_agent always uses gateway routing."""
+ from hud.agents import OpenAIAgent
+
+ mock_models = [
+ {"id": "gpt-4o", "model": "gpt-4o", "provider": "openai"},
+ ]
+
+ with (
+ patch("hud.agents.resolver._fetch_gateway_models", return_value=mock_models),
+ patch.object(OpenAIAgent, "create") as mock_create,
+ patch("hud.agents.gateway.build_gateway_client") as mock_build_client,
+ ):
+ mock_client = MagicMock()
+ mock_build_client.return_value = mock_client
+ mock_agent = MagicMock()
+ mock_create.return_value = mock_agent
+
+ agent = create_agent("gpt-4o")
+
+ # Should have set model and model_client
+ call_kwargs = mock_create.call_args.kwargs
+ assert call_kwargs["model"] == "gpt-4o"
+ assert "model_client" in call_kwargs
+ assert agent == mock_agent
+
+ def test_passes_kwargs_to_create(self) -> None:
+ """Extra kwargs are passed to agent.create()."""
+ from hud.agents import OpenAIAgent
+
+ mock_models = [
+ {"id": "gpt-4o", "model": "gpt-4o", "provider": "openai"},
+ ]
+
+ with (
+ patch("hud.agents.resolver._fetch_gateway_models", return_value=mock_models),
+ patch.object(OpenAIAgent, "create") as mock_create,
+ patch("hud.agents.gateway.build_gateway_client"),
+ ):
+ mock_create.return_value = MagicMock()
+
+ create_agent("gpt-4o", temperature=0.5, max_tokens=1000)
+
+ call_kwargs = mock_create.call_args.kwargs
+ assert call_kwargs["temperature"] == 0.5
+ assert call_kwargs["max_tokens"] == 1000
+
+ def test_known_agent_type_also_uses_gateway(self) -> None:
+ """Even 'claude' string uses gateway (it's a gateway shortcut)."""
+ from hud.agents.claude import ClaudeAgent
+
+ with (
+ patch.object(ClaudeAgent, "create") as mock_create,
+ patch("hud.agents.gateway.build_gateway_client") as mock_build_client,
+ ):
+ mock_client = MagicMock()
+ mock_build_client.return_value = mock_client
+ mock_create.return_value = MagicMock()
+
+ create_agent("claude")
+
+ # Should still build gateway client
+ mock_build_client.assert_called_once()
+ call_kwargs = mock_create.call_args.kwargs
+ assert "model_client" in call_kwargs
+
+
+class TestBuildGatewayClient:
+ """Tests for build_gateway_client function."""
+
+ def test_builds_anthropic_client(self) -> None:
+ """Builds AsyncAnthropic for anthropic provider."""
+ from hud.agents.gateway import build_gateway_client
+
+ with patch("hud.settings.settings") as mock_settings:
+ mock_settings.api_key = "test-key"
+ mock_settings.hud_gateway_url = "https://gateway.hud.ai"
+
+ with patch("anthropic.AsyncAnthropic") as mock_client_cls:
+ build_gateway_client("anthropic")
+ mock_client_cls.assert_called_once()
+
+ def test_builds_openai_client_for_openai(self) -> None:
+ """Builds AsyncOpenAI for openai provider."""
+ from hud.agents.gateway import build_gateway_client
+
+ with patch("hud.settings.settings") as mock_settings:
+ mock_settings.api_key = "test-key"
+ mock_settings.hud_gateway_url = "https://gateway.hud.ai"
+
+ with patch("openai.AsyncOpenAI") as mock_client_cls:
+ build_gateway_client("openai")
+ mock_client_cls.assert_called_once()
+
+ def test_builds_openai_client_for_unknown(self) -> None:
+ """Builds AsyncOpenAI for unknown providers (openai-compatible)."""
+ from hud.agents.gateway import build_gateway_client
+
+ with patch("hud.settings.settings") as mock_settings:
+ mock_settings.api_key = "test-key"
+ mock_settings.hud_gateway_url = "https://gateway.hud.ai"
+
+ with patch("openai.AsyncOpenAI") as mock_client_cls:
+ build_gateway_client("together")
+ mock_client_cls.assert_called_once()
diff --git a/hud/cli/eval.py b/hud/cli/eval.py
index eb13ce34..faedb107 100644
--- a/hud/cli/eval.py
+++ b/hud/cli/eval.py
@@ -338,47 +338,27 @@ def get_agent_kwargs(self) -> dict[str, Any]:
# Configure gateway mode - route LLM API calls through HUD gateway
if self.gateway:
- hud_api_key = settings.api_key
- if not hud_api_key:
+ if not settings.api_key:
raise typer.Exit(1) # Already validated in validate_api_keys()
- if self.agent_type == AgentType.CLAUDE:
- from anthropic import AsyncAnthropic
-
- kwargs["model_client"] = AsyncAnthropic(
- api_key=hud_api_key,
- base_url=settings.hud_gateway_url,
- )
- hud_console.info("🌐 Using HUD Gateway for Claude API")
- elif self.agent_type in (AgentType.OPENAI, AgentType.OPERATOR):
- from openai import AsyncOpenAI
+ from hud.agents.gateway import build_gateway_client
- kwargs["model_client"] = AsyncOpenAI(
- api_key=hud_api_key,
- base_url=settings.hud_gateway_url,
- )
- hud_console.info("🌐 Using HUD Gateway for OpenAI API")
- elif self.agent_type == AgentType.OPENAI_COMPATIBLE:
- from openai import AsyncOpenAI
+ # Map AgentType to provider
+ agent_to_provider = {
+ AgentType.CLAUDE: "anthropic",
+ AgentType.OPENAI: "openai",
+ AgentType.OPERATOR: "openai",
+ AgentType.GEMINI: "gemini",
+ AgentType.GEMINI_CUA: "gemini",
+ AgentType.OPENAI_COMPATIBLE: "openai",
+ }
+ provider = agent_to_provider.get(self.agent_type, "openai")
+ client = build_gateway_client(provider)
- kwargs["openai_client"] = AsyncOpenAI(
- api_key=hud_api_key,
- base_url=settings.hud_gateway_url,
- )
- hud_console.info("🌐 Using HUD Gateway for OpenAI-compatible API")
- elif self.agent_type in (AgentType.GEMINI, AgentType.GEMINI_CUA):
- from google import genai
- from google.genai.types import HttpOptions
-
- kwargs["model_client"] = genai.Client(
- api_key="PLACEHOLDER",
- http_options=HttpOptions(
- api_version="v1beta",
- base_url=settings.hud_gateway_url,
- headers={"Authorization": f"Bearer {hud_api_key}"},
- ),
- )
- hud_console.info("🌐 Using HUD Gateway for Gemini API")
+ # OpenAI-compatible uses openai_client key
+ is_oai_compat = self.agent_type == AgentType.OPENAI_COMPATIBLE
+ kwargs["openai_client" if is_oai_compat else "model_client"] = client
+ hud_console.info(f"🌐 Using HUD Gateway for {provider} API")
return kwargs
diff --git a/hud/datasets/loader.py b/hud/datasets/loader.py
index 0c1982f2..d313228e 100644
--- a/hud/datasets/loader.py
+++ b/hud/datasets/loader.py
@@ -63,7 +63,8 @@ def _load_from_file(path: Path) -> list[Task]:
from hud.eval.task import Task
raw_items = _load_raw_from_file(path)
- return [Task(**item) for item in raw_items]
+ # Default args to {} for runnable tasks (None = template)
+ return [Task(**{**item, "args": item.get("args") or {}}) for item in raw_items]
def _load_raw_from_huggingface(dataset_name: str) -> list[dict[str, Any]]:
@@ -99,7 +100,8 @@ def _load_from_huggingface(dataset_name: str) -> list[Task]:
raw_items = _load_raw_from_huggingface(dataset_name)
from hud.eval.task import Task
- return [Task(**item) for item in raw_items]
+ # Default args to {} for runnable tasks (None = template)
+ return [Task(**{**item, "args": item.get("args") or {}}) for item in raw_items]
def _load_raw_from_api(dataset_name: str) -> list[dict[str, Any]]:
@@ -138,7 +140,8 @@ def _load_from_api(dataset_name: str) -> list[Task]:
from hud.eval.task import Task
raw_items = _load_raw_from_api(dataset_name)
- return [Task(**item) for item in raw_items]
+ # Default args to {} for runnable tasks (None = template)
+ return [Task(**{**item, "args": item.get("args") or {}}) for item in raw_items]
@overload
diff --git a/hud/datasets/runner.py b/hud/datasets/runner.py
index 3b4b1162..1a27074f 100644
--- a/hud/datasets/runner.py
+++ b/hud/datasets/runner.py
@@ -40,7 +40,7 @@ async def run_dataset(
- A source string (file path, API slug) - loaded via load_tasks()
- A single TaskInput (Task, LegacyTask, or dict)
- A list of TaskInput objects
- agent_type: Type of agent to create (e.g., "claude", "openai", AgentType.CLAUDE).
+ agent_type: Agent type (e.g., "claude", "openai", AgentType.CLAUDE).
agent_params: Parameters to pass to agent.create().
max_steps: Maximum steps per task.
max_concurrent: Maximum concurrent tasks (for parallel execution).
@@ -70,6 +70,10 @@ async def run_dataset(
from hud.datasets.loader import load_tasks
from hud.eval.task import Task
+ # Normalize agent_type to AgentType enum
+ if isinstance(agent_type, str):
+ agent_type = AgentType(agent_type)
+
# Normalize tasks to list[Task]
task_list: list[Task]
if isinstance(tasks, str):
@@ -86,10 +90,6 @@ async def run_dataset(
if not task_list:
raise ValueError("No tasks to run")
- # Resolve agent class
- agent_type_enum = agent_type if isinstance(agent_type, AgentType) else AgentType(agent_type)
- agent_cls = agent_type_enum.cls
-
# Use hud.eval() for both single and parallel execution
async with hud.eval(
task_list,
@@ -97,8 +97,8 @@ async def run_dataset(
max_concurrent=max_concurrent,
quiet=quiet,
) as ctx:
- # Create agent fresh for each context (ensures correct tool initialization)
- agent = agent_cls.create(**(agent_params or {}))
+ # Create agent using AgentType.cls.create()
+ agent = agent_type.cls.create(**(agent_params or {}))
await agent.run(ctx, max_steps=max_steps)
# Reward is computed by EvalContext.__aexit__ from evaluate tools
@@ -198,9 +198,8 @@ async def run_single_task(
if ctx.system_prompt and "system_prompt" not in final_agent_params:
final_agent_params["system_prompt"] = ctx.system_prompt
- # Create agent inside ctx so it has access to context-derived values
- agent_cls = agent_type.cls
- agent = agent_cls.create(**final_agent_params)
+ # Create agent using AgentType.cls.create()
+ agent = agent_type.cls.create(**final_agent_params)
# Store metadata if provided
if metadata:
diff --git a/hud/environment/environment.py b/hud/environment/environment.py
index 83924cd2..4ed44b32 100644
--- a/hud/environment/environment.py
+++ b/hud/environment/environment.py
@@ -362,6 +362,22 @@ async def __aexit__(
await asyncio.gather(*[c.disconnect() for c in self._connections.values()])
self._router.clear()
+ async def run_async(
+ self,
+ transport: Literal["stdio", "http", "sse"] | None = None,
+ show_banner: bool = True,
+ **transport_kwargs: Any,
+ ) -> None:
+ """Run the MCP server, auto-connecting all connectors first.
+
+ This ensures that tools from external MCP servers (via connect_mcp_config)
+ are discovered and available when the server starts.
+ """
+ async with self: # Connect all connectors via __aenter__
+ await super().run_async(
+ transport=transport, show_banner=show_banner, **transport_kwargs
+ )
+
async def _build_routing(self) -> None:
"""Build tool routing from local tools and connection caches."""
# Use get_tools() not list_tools() - it includes mounted servers without
@@ -376,6 +392,27 @@ async def _build_routing(self) -> None:
# Populate mock schemas for auto-generated mock values
self._populate_mock_schemas()
+ # =========================================================================
+ # MCP Protocol Overrides - Include connector tools in MCP responses
+ # =========================================================================
+
+ def _setup_handlers(self) -> None:
+ """Override FastMCP to register our custom handlers for tools."""
+ # Call parent to set up all standard handlers
+ super()._setup_handlers()
+ # Re-register our custom handlers (overwrites parent's registrations)
+ self._mcp_server.list_tools()(self._env_list_tools)
+ self._mcp_server.call_tool()(self._env_call_tool)
+
+ async def _env_list_tools(self) -> list[mcp_types.Tool]:
+ """Return all tools including those from connectors."""
+ return self._router.tools
+
+ async def _env_call_tool(self, name: str, arguments: dict[str, Any] | None = None) -> list[Any]:
+ """Route tool calls through our router (handles both local and connector tools)."""
+ result = await self._execute_tool(name, arguments or {})
+ return result.content or []
+
# =========================================================================
# Tool Operations
# =========================================================================
diff --git a/hud/environment/tests/test_environment.py b/hud/environment/tests/test_environment.py
index 44febe88..60b544e7 100644
--- a/hud/environment/tests/test_environment.py
+++ b/hud/environment/tests/test_environment.py
@@ -159,3 +159,171 @@ def test_chaining_multiple_setup_calls(self) -> None:
)
assert len(env._setup_calls) == 2
+
+
+class TestEnvironmentMCPProtocol:
+ """Tests for MCP protocol overrides - Environment._env_list_tools and _env_call_tool.
+
+ These test that Environment properly exposes connector tools via MCP handlers.
+ """
+
+ @pytest.mark.asyncio
+ async def test_env_list_tools_includes_local_tools(self) -> None:
+ """_env_list_tools returns local tools after routing is built."""
+ from hud.environment import Environment
+
+ env = Environment("test")
+
+ @env.tool()
+ def my_tool(x: int) -> int:
+ """A test tool."""
+ return x * 2
+
+ # Build routing (simulates what __aenter__ does)
+ await env._build_routing()
+
+ # Call the handler that MCP will call
+ tools = await env._env_list_tools()
+
+ assert len(tools) == 1
+ assert tools[0].name == "my_tool"
+
+ @pytest.mark.asyncio
+ async def test_env_list_tools_includes_connector_tools(self) -> None:
+ """_env_list_tools returns tools from connectors (the key feature)."""
+ import mcp.types as mcp_types
+
+ from hud.environment import Environment
+
+ env = Environment("test")
+
+ # Create a mock connector with cached tools
+ mock_tools = [
+ mcp_types.Tool(
+ name="remote_tool",
+ description="A remote tool",
+ inputSchema={"type": "object"},
+ )
+ ]
+
+ class MockConnector:
+ is_connected = True
+ _tools_cache = mock_tools
+
+ @property
+ def cached_tools(self) -> list[mcp_types.Tool]:
+ return self._tools_cache
+
+ async def connect(self) -> None:
+ pass
+
+ async def disconnect(self) -> None:
+ pass
+
+ async def list_tools(self) -> list[mcp_types.Tool]:
+ return self._tools_cache
+
+ # Add the mock connector
+ env._connections["mock"] = MockConnector() # type: ignore
+
+ # Build routing
+ await env._build_routing()
+
+ # Call the handler that MCP will call
+ tools = await env._env_list_tools()
+
+ # Should include the remote tool
+ tool_names = [t.name for t in tools]
+ assert "remote_tool" in tool_names
+
+ @pytest.mark.asyncio
+ async def test_env_call_tool_routes_to_local(self) -> None:
+ """_env_call_tool routes local tool calls correctly."""
+ from hud.environment import Environment
+
+ env = Environment("test")
+ called_with: list[int] = []
+
+ @env.tool()
+ def my_tool(x: int) -> str:
+ """A test tool."""
+ called_with.append(x)
+ return f"result: {x}"
+
+ # Build routing
+ await env._build_routing()
+
+ # Call the handler that MCP will call
+ result = await env._env_call_tool("my_tool", {"x": 42})
+
+ assert called_with == [42]
+ assert len(result) == 1
+
+ @pytest.mark.asyncio
+ async def test_env_call_tool_routes_to_connector(self) -> None:
+ """_env_call_tool routes connector tool calls correctly."""
+ from unittest.mock import AsyncMock
+
+ import mcp.types as mcp_types
+
+ from hud.environment import Environment
+ from hud.types import MCPToolResult
+
+ env = Environment("test")
+
+ # Create a mock connector
+ mock_tools = [
+ mcp_types.Tool(
+ name="remote_tool",
+ description="A remote tool",
+ inputSchema={"type": "object"},
+ )
+ ]
+
+ class MockConnector:
+ is_connected = True
+ _tools_cache = mock_tools
+ call_tool = AsyncMock(
+ return_value=MCPToolResult(
+ content=[mcp_types.TextContent(type="text", text="remote result")],
+ isError=False,
+ )
+ )
+
+ @property
+ def cached_tools(self) -> list[mcp_types.Tool]:
+ return self._tools_cache
+
+ async def connect(self) -> None:
+ pass
+
+ async def disconnect(self) -> None:
+ pass
+
+ async def list_tools(self) -> list[mcp_types.Tool]:
+ return self._tools_cache
+
+ mock_conn = MockConnector()
+ env._connections["mock"] = mock_conn # type: ignore
+
+ # Build routing
+ await env._build_routing()
+
+ # Call the handler that MCP will call
+ result = await env._env_call_tool("remote_tool", {"arg": "value"})
+
+ # Verify the connector was called
+ mock_conn.call_tool.assert_called_once_with("remote_tool", {"arg": "value"})
+ assert len(result) == 1
+
+ def test_setup_handlers_registers_custom_handlers(self) -> None:
+ """Verify _setup_handlers registers our _env_list_tools and _env_call_tool."""
+ from hud.environment import Environment
+
+ env = Environment("test")
+
+ # Verify the custom handlers exist
+ assert hasattr(env, "_env_list_tools")
+ assert hasattr(env, "_env_call_tool")
+ assert callable(env._env_list_tools)
+ assert callable(env._env_call_tool)
diff --git a/hud/eval/context.py b/hud/eval/context.py
index ca0704f5..e7d4be14 100644
--- a/hud/eval/context.py
+++ b/hud/eval/context.py
@@ -302,10 +302,20 @@ def from_task(
code_snippet: Code being evaluated
trace: Whether to send traces to backend
quiet: Whether to suppress output
+
+ Raises:
+ ValueError: If task.args is None (template tasks cannot be run directly)
"""
from hud.environment import Environment
from hud.eval.task import build_eval_name
+ # Validate that task has args (not a template)
+ if task.args is None:
+ raise ValueError(
+ f"Cannot run task with args=None (this is a template). "
+ f"Provide args when creating the task: env('{task.scenario}', **args)"
+ )
+
eval_name = name or build_eval_name(task.scenario, task.args)
# task.env is guaranteed to be Environment after Task.__post_init__
@@ -343,7 +353,7 @@ async def _run_task_scenario_setup(self) -> None:
if self._task is None or self._task.scenario is None:
return
- prompt = await self.run_scenario_setup(self._task.scenario, self._task.args)
+ prompt = await self.run_scenario_setup(self._task.scenario, self._task.args or {})
if prompt:
self.prompt = prompt
diff --git a/hud/eval/task.py b/hud/eval/task.py
index 085f1bf8..cfa6d64a 100644
--- a/hud/eval/task.py
+++ b/hud/eval/task.py
@@ -148,7 +148,10 @@ class Task(BaseModel):
env: Any = Field(default=None) # Typed as Any for input flexibility, validated below
scenario: str | None = None
id: str | None = None
- args: dict[str, Any] = Field(default_factory=dict)
+ args: dict[str, Any] | None = Field(
+ default=None,
+ description="Scenario arguments. None indicates a template (args filled in later).",
+ )
validation: list[MCPToolCall] | None = None
# Agent config - settings passed to agent (system_prompt, etc.)
@@ -335,6 +338,6 @@ def copy(self) -> Task:
id=self.id,
env=self.env, # Share reference
scenario=self.scenario,
- args=self.args.copy() if self.args else {},
+ args=self.args.copy() if self.args is not None else None,
validation=self.validation.copy() if self.validation else None,
)
diff --git a/hud/eval/tests/test_eval.py b/hud/eval/tests/test_eval.py
index 6d470808..ea958af4 100644
--- a/hud/eval/tests/test_eval.py
+++ b/hud/eval/tests/test_eval.py
@@ -16,7 +16,7 @@ def test_init_defaults(self) -> None:
assert task.env is None
assert task.scenario is None
- assert task.args == {}
+ assert task.args is None # None = template, {} = runnable with no args
def test_init_with_env_dict(self) -> None:
"""Task auto-converts env dict to Environment via validator."""
diff --git a/hud/telemetry/tests/test_eval_telemetry.py b/hud/telemetry/tests/test_eval_telemetry.py
index 8849cd13..bfb61004 100644
--- a/hud/telemetry/tests/test_eval_telemetry.py
+++ b/hud/telemetry/tests/test_eval_telemetry.py
@@ -49,8 +49,8 @@ async def greet(name: str) -> str:
"""Say hello."""
return f"Hello, {name}!"
- # Create task from environment
- task = Task(env=env)
+ # Create task from environment (args={} = runnable, args=None = template)
+ task = Task(env=env, args={})
with (
patch("hud.settings.settings") as mock_settings,
@@ -110,7 +110,7 @@ async def failing_tool() -> str:
"""Always fails."""
raise ValueError("Tool error")
- task = Task(env=env)
+ task = Task(env=env, args={})
with (
patch("hud.settings.settings") as mock_settings,
@@ -162,7 +162,7 @@ async def multiply(a: int, b: int) -> int:
"""Multiply two numbers."""
return a * b
- task = Task(env=env)
+ task = Task(env=env, args={})
with (
patch("hud.settings.settings") as mock_settings,
@@ -195,7 +195,7 @@ async def test_flush_called_on_context_exit(self):
async def simple_tool() -> str:
return "done"
- task = Task(env=env)
+ task = Task(env=env, args={})
with (
patch("hud.eval.context.flush") as mock_flush,
@@ -229,7 +229,7 @@ def should_not_be_called(*args: Any, **kwargs: Any) -> bool:
async def test_tool() -> str:
return "ok"
- task = Task(env=env)
+ task = Task(env=env, args={})
with (
patch("hud.settings.settings") as mock_settings,
@@ -272,7 +272,7 @@ def capture_upload(
async def echo(message: str) -> str:
return message
- task = Task(env=env)
+ task = Task(env=env, args={})
with (
patch("hud.settings.settings") as mock_settings,
@@ -329,7 +329,7 @@ def capture_upload(
async def noop() -> None:
pass
- task = Task(env=env)
+ task = Task(env=env, args={})
with (
patch("hud.settings.settings") as mock_settings,
diff --git a/hud/tools/__init__.py b/hud/tools/__init__.py
index 8451a04f..26495d33 100644
--- a/hud/tools/__init__.py
+++ b/hud/tools/__init__.py
@@ -4,6 +4,7 @@
from typing import TYPE_CHECKING, Any
+from .agent import AgentTool
from .base import BaseHub, BaseTool
from .bash import BashTool
from .edit import EditTool
@@ -21,6 +22,7 @@
)
__all__ = [
+ "AgentTool",
"AnthropicComputerTool",
"BaseHub",
"BaseTool",
diff --git a/hud/tools/agent.py b/hud/tools/agent.py
new file mode 100644
index 00000000..2f5ad377
--- /dev/null
+++ b/hud/tools/agent.py
@@ -0,0 +1,216 @@
+"""AgentTool - run a Task with an agent as a tool."""
+
+from __future__ import annotations
+
+import inspect
+from typing import TYPE_CHECKING, Any, Union, get_args, get_origin
+
+from fastmcp.tools.tool import FunctionTool, ToolResult
+from mcp.types import TextContent
+
+from hud.tools.base import BaseTool
+
+if TYPE_CHECKING:
+ from hud.agents.base import MCPAgent
+ from hud.eval.task import Task
+
+__all__ = ["AgentTool"]
+
+
+def _is_eval_only(param: inspect.Parameter) -> bool:
+ """Check if param is eval-only: has None default AND None in type union.
+
+ Handles both runtime types and string annotations (PEP 563).
+ """
+ # Must have default of None
+ if param.default is not None:
+ return False
+ if param.annotation is inspect.Parameter.empty:
+ return False
+
+ annotation = param.annotation
+
+ # Handle string annotations (from __future__ annotations or quoted)
+ if isinstance(annotation, str):
+ # Check if it looks like "X | None", "Union[X, None]", or "Optional[X]"
+ return (
+ "| None" in annotation
+ or "None |" in annotation
+ or "Optional[" in annotation
+ or ("Union[" in annotation and "None" in annotation)
+ )
+
+ # Handle runtime type annotations
+ origin = get_origin(annotation)
+
+ # Union types (X | None or Union[X, None])
+ if origin is Union:
+ return type(None) in get_args(annotation)
+
+ # For Python 3.10+ union syntax at runtime (types.UnionType)
+ try:
+ import types
+
+ if isinstance(annotation, types.UnionType):
+ return type(None) in get_args(annotation)
+ except (ImportError, AttributeError):
+ pass
+
+ return False
+
+
+class AgentTool(BaseTool):
+ """Tool that runs a Task template with an agent.
+
+ Parameters with `| None = None` are eval-only and hidden from the tool schema.
+
+ Example:
+ ```python
+ @env.scenario()
+ async def investigate(
+ issue_id: str, # Required - orchestrator sees
+ expected_cause: str | None = None, # Eval only - hidden
+ ):
+ yield {"task": f"Investigate {issue_id}"}
+
+
+ seer = AgentTool(env("investigate"), model="ft:seer-v2")
+ ```
+ """
+
+ def __init__(
+ self,
+ task: Task,
+ *,
+ model: str | None = None,
+ agent: type[MCPAgent] | None = None,
+ agent_params: dict[str, Any] | None = None,
+ name: str | None = None,
+ description: str | None = None,
+ trace: bool = False,
+ ) -> None:
+ if not model and agent is None:
+ raise ValueError("Must provide either 'model' or 'agent'")
+ if model and agent is not None:
+ raise ValueError("Cannot provide both 'model' and 'agent'")
+
+ self._task = task
+ self._model = model
+ self._agent_cls = agent
+ self._agent_params = agent_params or {}
+ self._trace = trace
+
+ # Get visible params from scenario function
+ self._visible_params: set[str] = set()
+ self._param_schema: dict[str, Any] = {
+ "type": "object",
+ "properties": {},
+ "required": [],
+ }
+
+ if task.env and task.scenario:
+ scenario_fn = task.env._scenarios.get(task.scenario)
+ if scenario_fn:
+ sig = inspect.signature(scenario_fn)
+ visible = {name: p for name, p in sig.parameters.items() if not _is_eval_only(p)}
+ self._visible_params = set(visible.keys())
+ self._param_schema = self._build_schema(visible)
+
+ tool_name = name or task.scenario or "agent_tool"
+ tool_desc = description or f"Run scenario: {task.scenario}"
+
+ super().__init__(name=tool_name, description=tool_desc)
+
+ def _build_schema(self, params: dict[str, inspect.Parameter]) -> dict[str, Any]:
+ """Build JSON schema using Pydantic TypeAdapter."""
+ from pydantic import TypeAdapter
+
+ properties: dict[str, Any] = {}
+ required: list[str] = []
+
+ for name, param in params.items():
+ if param.annotation is not inspect.Parameter.empty:
+ try:
+ # Handle string annotations
+ annotation = param.annotation
+ if isinstance(annotation, str):
+ # Try to evaluate the annotation
+ try:
+ annotation = eval(annotation) # noqa: S307
+ except Exception:
+ # Fall back to string type but don't skip required handling
+ annotation = None
+
+ if annotation is not None:
+ adapter = TypeAdapter(annotation)
+ properties[name] = adapter.json_schema()
+ else:
+ properties[name] = {"type": "string"}
+ except Exception:
+ properties[name] = {"type": "string"}
+ else:
+ properties[name] = {"type": "string"}
+
+ if param.default is inspect.Parameter.empty:
+ required.append(name)
+ elif param.default is not None:
+ properties[name]["default"] = param.default
+
+ return {"type": "object", "properties": properties, "required": required}
+
+ @property
+ def mcp(self) -> FunctionTool:
+ """Get as FastMCP FunctionTool with filtered schema."""
+ if not hasattr(self, "_mcp_tool"):
+ # Directly instantiate FunctionTool with our callable and schema
+ # This bypasses from_function's signature parsing
+ self._mcp_tool = FunctionTool(
+ name=self.name,
+ description=self.description or "",
+ parameters=self._param_schema,
+ fn=self._execute_with_args,
+ )
+ return self._mcp_tool
+
+ async def _execute_with_args(self, **kwargs: Any) -> ToolResult:
+ """Internal executor that FastMCP calls with parsed arguments."""
+ return await self(**kwargs)
+
+ async def __call__(self, **kwargs: Any) -> ToolResult:
+ """Execute the task with a fresh agent."""
+ from hud.eval.context import get_current_trace_id
+ from hud.eval.manager import run_eval
+
+ # Filter to visible params only
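+        # e.g. kwargs {"issue_id": "X", "expected_cause": "Y"} -> {"issue_id": "X"}
+        # (hypothetical values; eval-only keys a caller passes anyway are dropped)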
+ filtered = {k: v for k, v in kwargs.items() if k in self._visible_params}
+
+        # Merge with template args; call-time kwargs override template values
+ base_args = self._task.args or {}
+ task = self._task.model_copy(update={"args": {**base_args, **filtered}})
+
+ # Use parent trace if available (for hierarchical agents)
+ parent_trace_id = get_current_trace_id()
+
+ # If nested (has parent), skip subagent's enter/exit registration
+ # Tool calls are still recorded via the shared trace_id's context
+ is_nested = parent_trace_id is not None
+
+ # Trace if explicitly requested AND not nested (nested uses parent trace)
+ should_trace = self._trace and not is_nested
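+        # Net effect:
+        #   top-level call with trace=True -> a new trace is registered
+        #   nested under an orchestrator   -> parent_trace_id is reused, no re-registration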
+
+ async with run_eval(
+ task,
+ trace=should_trace,
+ trace_id=parent_trace_id,
+ quiet=True,
+ ) as ctx:
+ if self._model:
+ from hud.agents import create_agent
+
+ agent = create_agent(self._model, **self._agent_params)
+ else:
+ agent = self._agent_cls.create(**self._agent_params) # type: ignore
+
+ result = await agent.run(ctx)
+ content = result.content if hasattr(result, "content") and result.content else ""
+ return ToolResult(content=[TextContent(type="text", text=content)])
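+
+
+# Wiring sketch (hedged): expose the tool to an orchestrator via FastMCP.
+# `FastMCP.add_tool` accepting a Tool instance is assumed from the fastmcp API;
+# `sentry_env("investigate")` is a hypothetical scenario from the cookbook.
+#
+#     from fastmcp import FastMCP
+#
+#     server = FastMCP("orchestrator")
+#     tool = AgentTool(sentry_env("investigate"), model="claude")
+#     server.add_tool(tool.mcp)
+#
+# The orchestrator connected to this server sees a single `investigate` tool
+# whose schema contains only the scenario's visible parameters.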
diff --git a/hud/tools/tests/test_agent_tool.py b/hud/tools/tests/test_agent_tool.py
new file mode 100644
index 00000000..de8196c3
--- /dev/null
+++ b/hud/tools/tests/test_agent_tool.py
@@ -0,0 +1,355 @@
+"""Tests for AgentTool - scenario-to-agent composition."""
+
+from __future__ import annotations
+
+import inspect
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+from hud.environment import Environment
+from hud.eval.task import Task
+from hud.tools.agent import AgentTool, _is_eval_only
+
+
+class TestIsEvalOnly:
+ """Tests for _is_eval_only helper function."""
+
+ def test_required_param_not_eval_only(self) -> None:
+ """Required params (no default) are not eval-only."""
+
+ def fn(x: str) -> None:
+ pass
+
+ sig = inspect.signature(fn)
+ param = sig.parameters["x"]
+ assert not _is_eval_only(param)
+
+ def test_optional_with_value_not_eval_only(self) -> None:
+ """Optional params with non-None default are not eval-only."""
+
+ def fn(x: str = "default") -> None:
+ pass
+
+ sig = inspect.signature(fn)
+ param = sig.parameters["x"]
+ assert not _is_eval_only(param)
+
+ def test_optional_none_without_union_not_eval_only(self) -> None:
+ """Optional with None default but no None in type is not eval-only."""
+
+ def fn(x: str = None) -> None: # type: ignore[assignment] # noqa: RUF013
+ pass
+
+ sig = inspect.signature(fn)
+ param = sig.parameters["x"]
+ assert not _is_eval_only(param)
+
+ def test_optional_none_with_union_is_eval_only(self) -> None:
+ """Params with `X | None = None` pattern are eval-only."""
+
+ def fn(x: str | None = None) -> None:
+ pass
+
+ sig = inspect.signature(fn)
+ param = sig.parameters["x"]
+ assert _is_eval_only(param)
+
+ def test_optional_int_none_is_eval_only(self) -> None:
+ """Works with int | None = None too."""
+
+ def fn(x: int | None = None) -> None:
+ pass
+
+ sig = inspect.signature(fn)
+ param = sig.parameters["x"]
+ assert _is_eval_only(param)
+
+ def test_string_annotation_with_none_union(self) -> None:
+ """Handles string annotations like 'str | None'."""
+ # Simulate string annotation
+ param = inspect.Parameter(
+ "x",
+ inspect.Parameter.POSITIONAL_OR_KEYWORD,
+ default=None,
+ annotation="str | None",
+ )
+ assert _is_eval_only(param)
+
+ def test_string_annotation_without_none(self) -> None:
+ """String annotations without None are not eval-only."""
+ param = inspect.Parameter(
+ "x",
+ inspect.Parameter.POSITIONAL_OR_KEYWORD,
+ default=None,
+ annotation="str",
+ )
+ assert not _is_eval_only(param)
+
+
+class TestAgentToolInit:
+ """Tests for AgentTool initialization."""
+
+ def test_requires_model_or_agent(self) -> None:
+ """Must provide either model or agent."""
+ task = Task(args={})
+
+ with pytest.raises(ValueError, match="Must provide either"):
+ AgentTool(task)
+
+ def test_cannot_provide_both_model_and_agent(self) -> None:
+ """Cannot provide both model and agent."""
+ task = Task(args={})
+ mock_agent = MagicMock()
+
+ with pytest.raises(ValueError, match="Cannot provide both"):
+ AgentTool(task, model="claude", agent=mock_agent) # type: ignore[arg-type]
+
+ def test_accepts_model_string(self) -> None:
+ """Can create with model string."""
+ task = Task(scenario="test", args={})
+ tool = AgentTool(task, model="claude")
+
+ assert tool._model == "claude"
+ assert tool._agent_cls is None
+
+ def test_accepts_agent_class(self) -> None:
+ """Can create with custom agent class."""
+ task = Task(scenario="test", args={})
+ mock_agent_cls = MagicMock()
+ tool = AgentTool(task, agent=mock_agent_cls) # type: ignore[arg-type]
+
+ assert tool._model is None
+ assert tool._agent_cls is mock_agent_cls
+
+ def test_name_defaults_to_scenario(self) -> None:
+ """Tool name defaults to scenario name."""
+ task = Task(scenario="investigate", args={})
+ tool = AgentTool(task, model="claude")
+
+ assert tool.name == "investigate"
+
+ def test_name_can_be_overridden(self) -> None:
+ """Tool name can be overridden."""
+ task = Task(scenario="investigate", args={})
+ tool = AgentTool(task, model="claude", name="custom_name")
+
+ assert tool.name == "custom_name"
+
+
+class TestAgentToolParamFiltering:
+ """Tests for parameter filtering (eval-only params hidden)."""
+
+ def test_filters_eval_only_params(self) -> None:
+ """Eval-only params (| None = None) are filtered from visible_params."""
+ env = Environment("test")
+
+        # PEP 604 unions (str | None) are fine here; pyproject pins Python >= 3.11
+ @env.scenario()
+ async def investigate(
+ issue_id: str,
+ include_traces: bool = True,
+ expected_cause: str | None = None, # Eval only
+ ):
+ yield {"task": f"Investigate {issue_id}"}
+
+ task = env("investigate")
+ tool = AgentTool(task, model="claude")
+
+ # visible_params should only have issue_id and include_traces
+ assert "issue_id" in tool._visible_params
+ assert "include_traces" in tool._visible_params
+ assert "expected_cause" not in tool._visible_params
+
+ def test_all_required_params_visible(self) -> None:
+ """All required params are visible."""
+ env = Environment("test")
+
+ @env.scenario()
+ async def search(query: str, limit: int):
+ yield {"task": f"Search: {query}"}
+
+ task = env("search")
+ tool = AgentTool(task, model="claude")
+
+ assert "query" in tool._visible_params
+ assert "limit" in tool._visible_params
+
+ def test_optional_with_default_visible(self) -> None:
+ """Optional params with non-None defaults are visible."""
+ env = Environment("test")
+
+ @env.scenario()
+ async def fetch(url: str, request_timeout: int = 30, retries: int = 3):
+ yield {"task": f"Fetch {url}"}
+
+ task = env("fetch")
+ tool = AgentTool(task, model="claude")
+
+ assert "url" in tool._visible_params
+ assert "request_timeout" in tool._visible_params
+ assert "retries" in tool._visible_params
+
+
+class TestAgentToolSchema:
+ """Tests for JSON schema generation."""
+
+ def test_builds_json_schema(self) -> None:
+ """Builds proper JSON schema from visible params."""
+ env = Environment("test")
+
+ @env.scenario()
+ async def investigate(issue_id: str, verbose: bool = False):
+ yield {"task": f"Investigate {issue_id}"}
+
+ task = env("investigate")
+ tool = AgentTool(task, model="claude")
+
+ schema = tool._param_schema
+ assert schema is not None
+ assert schema["type"] == "object"
+ assert "issue_id" in schema["properties"]
+ assert "verbose" in schema["properties"]
+ assert "issue_id" in schema["required"]
+ assert "verbose" not in schema["required"] # Has default
+
+ def test_schema_excludes_eval_only(self) -> None:
+ """Schema excludes eval-only params."""
+ env = Environment("test")
+
+ @env.scenario()
+ async def check(
+ item_id: str,
+ expected_status: str | None = None, # Eval only
+ ):
+ yield {"task": f"Check {item_id}"}
+
+ task = env("check")
+ tool = AgentTool(task, model="claude")
+
+ schema = tool._param_schema
+ assert schema is not None
+ assert "item_id" in schema["properties"]
+ assert "expected_status" not in schema["properties"]
+
+
+class TestAgentToolMCP:
+ """Tests for MCP tool integration."""
+
+ def test_mcp_property_returns_tool(self) -> None:
+ """The mcp property returns a FastMCP FunctionTool."""
+ from fastmcp.tools import FunctionTool
+
+ env = Environment("test")
+
+ @env.scenario()
+ async def greet(name: str):
+ yield {"task": f"Greet {name}"}
+
+ task = env("greet")
+ tool = AgentTool(task, model="claude")
+
+ mcp_tool = tool.mcp
+ assert isinstance(mcp_tool, FunctionTool)
+
+ def test_mcp_has_filtered_parameters(self) -> None:
+ """MCP tool has filtered parameter schema."""
+ env = Environment("test")
+
+ @env.scenario()
+ async def analyze(
+ data: str,
+ expected_result: str | None = None, # Eval only
+ ):
+ yield {"task": f"Analyze {data}"}
+
+ task = env("analyze")
+ tool = AgentTool(task, model="claude")
+
+ mcp_tool = tool.mcp
+ params = mcp_tool.parameters # FunctionTool uses 'parameters'
+
+ assert "data" in params["properties"]
+ assert "expected_result" not in params["properties"]
+
+
+class TestAgentToolCall:
+ """Tests for AgentTool.__call__."""
+
+ @pytest.mark.asyncio
+ async def test_filters_kwargs_to_visible_only(self) -> None:
+ """Call filters kwargs to visible params only."""
+ # Import modules first so patches work
+ import hud.agents
+ import hud.eval.manager # noqa: F401
+
+ env = Environment("test")
+
+ @env.scenario()
+ async def process(item: str, expected: str | None = None):
+ yield {"task": f"Process {item}"}
+
+ task = env("process")
+ tool = AgentTool(task, model="claude")
+
+ # Mock the eval context and agent
+ with (
+ patch("hud.eval.manager.run_eval") as mock_run_eval,
+ patch("hud.agents.create_agent") as mock_create_agent,
+ ):
+ mock_ctx = AsyncMock()
+ mock_ctx.__aenter__ = AsyncMock(return_value=mock_ctx)
+ mock_ctx.__aexit__ = AsyncMock(return_value=None)
+ mock_run_eval.return_value = mock_ctx
+
+ mock_agent = MagicMock()
+ mock_agent.run = AsyncMock(return_value=MagicMock(content="result"))
+ mock_create_agent.return_value = mock_agent
+
+ # Call with both visible and eval-only params
+ await tool(item="test", expected="should_be_filtered")
+
+ # Check that task was created with filtered args
+ call_args = mock_run_eval.call_args
+ task_arg = call_args[0][0]
+ assert "item" in task_arg.args
+ assert "expected" not in task_arg.args # Filtered out
+
+ @pytest.mark.asyncio
+ async def test_merges_template_args(self) -> None:
+ """Call merges kwargs with template args."""
+ # Import modules first so patches work
+ import hud.agents
+ import hud.eval.manager # noqa: F401
+
+ env = Environment("test")
+
+ @env.scenario()
+ async def search(query: str, limit: int = 10):
+ yield {"task": f"Search {query}"}
+
+ # Create template with some args pre-filled
+ task = env("search", limit=5)
+ tool = AgentTool(task, model="claude")
+
+ with (
+ patch("hud.eval.manager.run_eval") as mock_run_eval,
+ patch("hud.agents.create_agent") as mock_create_agent,
+ ):
+ mock_ctx = AsyncMock()
+ mock_ctx.__aenter__ = AsyncMock(return_value=mock_ctx)
+ mock_ctx.__aexit__ = AsyncMock(return_value=None)
+ mock_run_eval.return_value = mock_ctx
+
+ mock_agent = MagicMock()
+ mock_agent.run = AsyncMock(return_value=MagicMock(content="result"))
+ mock_create_agent.return_value = mock_agent
+
+ # Call with additional arg
+ await tool(query="test query")
+
+ # Check merged args
+ call_args = mock_run_eval.call_args
+ task_arg = call_args[0][0]
+ assert task_arg.args["query"] == "test query"
+ assert task_arg.args["limit"] == 5 # From template
diff --git a/hud/utils/strict_schema.py b/hud/utils/strict_schema.py
index 5d3fa0da..263919b3 100644
--- a/hud/utils/strict_schema.py
+++ b/hud/utils/strict_schema.py
@@ -118,7 +118,7 @@ def _ensure_strict_json_schema(
if "default" in json_schema:
json_schema.pop("default")
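+    # Hedged note: strict-mode JSON schemas commonly reject the "format"
+    # keyword, so it is stripped below alongside "title" and "examples".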
- for keyword in ("title", "examples"):
+ for keyword in ("title", "examples", "format"):
json_schema.pop(keyword, None)
ref = json_schema.get("$ref")
diff --git a/hud/utils/tests/test_version.py b/hud/utils/tests/test_version.py
index 8ba18f35..1d40fb59 100644
--- a/hud/utils/tests/test_version.py
+++ b/hud/utils/tests/test_version.py
@@ -5,4 +5,4 @@ def test_import():
"""Test that the package can be imported."""
import hud
- assert hud.__version__ == "0.5.2"
+ assert hud.__version__ == "0.5.3"
diff --git a/hud/version.py b/hud/version.py
index 6fbebfff..c9ba33a6 100644
--- a/hud/version.py
+++ b/hud/version.py
@@ -4,4 +4,4 @@
from __future__ import annotations
-__version__ = "0.5.2"
+__version__ = "0.5.3"
diff --git a/pyproject.toml b/pyproject.toml
index 3cb5c7d3..b1ad8b5a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "hud-python"
-version = "0.5.2"
+version = "0.5.3"
description = "SDK for the HUD platform."
readme = "README.md"
requires-python = ">=3.11, <3.13"