diff --git a/docs/cookbooks/ops-diagnostics.mdx b/docs/cookbooks/ops-diagnostics.mdx new file mode 100644 index 00000000..558d3d2d --- /dev/null +++ b/docs/cookbooks/ops-diagnostics.mdx @@ -0,0 +1,478 @@ +--- +title: "Ops Diagnostics Agent" +description: "How we built a hierarchical agent to diagnose production issues across our infrastructure" +icon: "stethoscope" +--- + +At HUD, we run a complex stack: Sentry for errors, Supabase for data, Railway for deployments, and Kubernetes for orchestration. When something breaks, we wanted an agent that could investigate across all services and provide a unified diagnosis. + +This cookbook walks through how we built it—focusing on **environment design**, **hierarchical delegation**, and **practical patterns** for production agent systems. + +## Why Hierarchical? + +When you connect multiple MCP servers to a single environment, the agent sees all tools at once. For diagnostics across four services, this meant 60+ tools in a flat list. The cognitive load made it harder for the model to select the right tool for the job. + +We restructured into a hierarchy: an orchestrator that delegates to specialized subagents. + +```mermaid +flowchart TD + subgraph orch["Orchestrator"] + O["4 subagent tools"] + end + + subgraph sentry["Sentry Agent"] + S1["search_issues"] + S2["get_issue_details"] + S3["analyze_with_seer"] + end + + subgraph supabase["Supabase Agent"] + SU1["list_tables"] + SU2["execute_sql"] + SU3["get_logs"] + end + + subgraph railway["Railway Agent"] + R1["list_projects"] + R2["get_deployments"] + R3["get_logs"] + end + + subgraph kubectl["kubectl Agent"] + K1["get_pods"] + K2["get_events"] + K3["describe_pod"] + end + + O --> sentry + O --> supabase + O --> railway + O --> kubectl +``` + +The orchestrator sees only 4 tools—one per specialist. Each specialist has a focused toolset for its domain. + +## Environment Design + +Good environment design is the foundation. Each subagent is an `Environment` with: +- A **focused toolset** (only what's needed for this domain) +- A **single scenario** that defines the interface +- **Read-only constraints** for safety + +### Connecting to MCP Servers + +For services with official MCP servers (Sentry, Supabase), connect via `connect_mcp_config`: + +```python +# environments/sentry.py +from hud import Environment +import os +import platform + +sentry_env = Environment(name="sentry-agent") + +IS_WINDOWS = platform.system() == "Windows" +token = os.getenv("SENTRY_AUTH_TOKEN") + +if token: + config = { + "command": "cmd" if IS_WINDOWS else "npx", + "args": ["/c", "npx", "-y", "@sentry/mcp-server@latest"] if IS_WINDOWS + else ["-y", "@sentry/mcp-server@latest"], + "env": {"SENTRY_ACCESS_TOKEN": token} + } + sentry_env.connect_mcp_config({"sentry": config}) +``` + +### Custom Tools When Needed + +Railway's MCP server requires browser OAuth—not ideal for headless agents. 
We built custom tools using their GraphQL API: + +```python +# environments/tools/railway.py +from hud.server import MCPRouter +import httpx +import os + +router = MCPRouter() +RAILWAY_API = "https://backboard.railway.com/graphql/v2" + + +async def _graphql(query: str, variables: dict | None = None) -> dict: + token = os.getenv("RAILWAY_API_TOKEN") + async with httpx.AsyncClient() as client: + resp = await client.post( + RAILWAY_API, + headers={"Authorization": f"Bearer {token}"}, + json={"query": query, "variables": variables} + ) + return resp.json() + + +@router.tool() +async def railway_list_projects() -> dict: + """List all projects with their services.""" + return await _graphql(""" + query { + projects { + edges { node { id name } } + } + } + """) + + +@router.tool() +async def railway_get_deployment_logs(deployment_id: str) -> dict: + """Get logs for a deployment.""" + return await _graphql(""" + query($id: String!) { + deploymentLogs(deploymentId: $id) { + ... on Log { message timestamp severity } + } + } + """, {"id": deployment_id}) +``` + +Then include the router in your environment: + +```python +# environments/railway.py +from hud import Environment +from .tools.railway import router + +railway_env = Environment(name="railway-agent") +railway_env.include_router(router) +``` + +### Defining the Scenario + +The scenario is the contract between orchestrator and subagent: + +```python +@sentry_env.scenario("investigate") +async def investigate_issue( + query: str, # Orchestrator provides this + expected_finding: str | None = None, # Hidden from orchestrator (eval-only) +): + """Investigate errors in Sentry.""" + + prompt = f"""You are a Sentry specialist. Investigate: + +**Query:** {query} + +**IMPORTANT: This is a READ-ONLY investigation.** + +Provide findings, root cause analysis, and recommended fixes.""" + + response = yield prompt + + # Scoring for evals + if expected_finding and response: + yield 1.0 if expected_finding.lower() in response.lower() else 0.5 + else: + yield 1.0 if response else 0.0 +``` + + +**Eval-only parameters**: Parameters with `| None = None` are automatically hidden from the orchestrator's tool schema but available for evaluation scoring. + + +## Building the Orchestrator + +The orchestrator wraps each subagent's scenario as an `AgentTool`: + +```python +# orchestrator.py +from hud import Environment +from hud.tools import AgentTool +from hud.agents import create_agent +import hud + +from environments import sentry_env, supabase_env, railway_env, kubectl_env + + +async def diagnose(query: str, model: str = "claude-sonnet-4-5"): + orchestrator = Environment(name="ops-orchestrator") + + # Wrap each subagent as a tool + for name, env, desc in [ + ("investigate_sentry", sentry_env, "Check error monitoring"), + ("investigate_supabase", supabase_env, "Check database/auth"), + ("investigate_railway", railway_env, "Check deployments"), + ("investigate_kubernetes", kubectl_env, "Check cluster health"), + ]: + tool = AgentTool( + env("investigate"), + model=model, + name=name, + description=desc, + ) + orchestrator.add_tool(tool.mcp) + + @orchestrator.scenario("diagnose") + async def run_diagnosis(issue: str): + yield f"""You are an ops diagnostics orchestrator. + +**Issue:** {issue} + +You have READ-ONLY subagents for Sentry, Supabase, Railway, and Kubernetes. 
+Investigate systematically and correlate findings across services.""" + + task = orchestrator("diagnose", issue=query) + + async with hud.eval(task) as ctx: + agent = create_agent(model) + return await agent.run(ctx, max_steps=20) +``` + +### Trace Continuity + +All subagent activity appears in a single trace on the HUD platform. When the orchestrator calls a subagent tool, the inference and tool calls are recorded under the parent trace—no separate URLs to track. + +## The READ-ONLY Constraint + + +We tested and operated this environment directly on our production systems, so all scenarios enforce read-only constraints. We removed mutation tools like `kubectl_exec`, `railway_redeploy`, and Supabase DDL operations. + +Every prompt includes: **"This is a READ-ONLY investigation."** + + +## Sample Output + +Running against a real production issue: + +```bash +python orchestrator.py --model claude-sonnet-4-5 \ + "Failed to delete pod: 429 Too Many Requests. 7451 events, escalating." +``` + +The orchestrator delegates to `investigate_sentry`, `investigate_railway`, and `investigate_supabase`, then correlates findings across services. After about 5 minutes: + +```text Diagnosis +COMPREHENSIVE DIAGNOSIS REPORT + +Issue Summary + - Error: Failed to delete pod ████████████████████████████████████: 429 Too Many Requests + - Impact: 7,451 events over 5 days, 16 users affected, escalating state + - Project: Orchestrator / mcp-server + - Alert ID: ORCHESTRATOR-AC + +ROOT CAUSE ANALYSIS + + Primary Root Cause: Kubernetes API Rate Limiting + + The orchestrator service is hitting Kubernetes API server rate limits when + attempting to delete pods at scale. This is occurring in the + ████████.hud_gym.utils.kubernetes module. + + Key Contributing Factors: + + 1. Excessive Deletion Frequency: ~1,491 errors/day (~62/hour) indicates + aggressive pod deletion attempts + 2. No Retry/Backoff Logic: Code lacks exponential backoff when encountering + 429 responses + 3. High Concurrency: Service runs with 50 uvicorn workers + 32 Railway + replicas, amplifying concurrent API calls + 4. Burst Traffic Pattern: Correlated with API usage spikes (313 inference + calls/minute at peak) + 5. 
No Client-Side Rate Limiting: Kubernetes client not configured with QPS + limits + +CORRELATED FINDINGS ACROSS SERVICES + + Sentry (Error Tracking) + - 7,455 occurrences of the 429 error between ██████████████ + - Last occurrence: ████████████████████ + - Error originates from: ████████.hud_gym.utils.kubernetes logger + - Associated with HTTP PATCH to Supabase /rest/v1/environments endpoint + - Part of environment update/cleanup workflow + + Railway (Deployment Platform) + - Production service: 32 replicas in us-west2 + - Latest successful deployment: ████████████████████ (30 min AFTER last + Sentry error) + - Historical failures (██████): AWS EKS credential issues (now resolved) + - No current rate limiting errors in deployment logs + - Pod deletions working normally post-fix + + Supabase (Database/API) + - API burst traffic spike: 313 calls/minute at ████████████████████ + - ████ Team (22 members, free tier): 15,933 inference calls/24h - prime + candidate for "16 users" + - Connection pool saturation: 49 waiting connections out of 52 + - Security vulnerabilities: 38 tables with RLS enabled but NO policies + - Performance issues: 52 unindexed foreign keys, inefficient RLS policies + - 429 errors occur at API gateway layer (not visible in Postgres logs) + + Kubernetes + - Investigation unavailable due to response size (cluster likely healthy + but under load) + +CORRELATION & TIMELINE + + ██████████████: 7,455 pod deletion failures (continuous) + - ████████████████████: Last 429 error recorded in Sentry + - ████████████████████: New production deployment (likely contained fix) + - ████████████████████: API traffic spike (313 req/min) + + Pattern Identified: + + 1. Orchestrator creates ephemeral pods for task execution (inference + workloads) + 2. High inference API traffic (15,933 calls/day from ████ team) triggers + frequent pod creation + 3. Cleanup attempts to delete pods rapidly after task completion + 4. Kubernetes API rate limits exceeded due to: + - 50 uvicorn workers x 32 Railway replicas = 1,600 concurrent processes + - No client-side rate limiting or request coordination + - No exponential backoff on 429 responses + 5. Failed deletions likely retry aggressively, compounding the problem + +RECOMMENDED ACTIONS FOR HUMANS + + CRITICAL - Immediate Actions + + 1. Implement Exponential Backoff + Location: ████████.hud_gym.utils.kubernetes + + def delete_pod_with_retry(api_client, pod_name, namespace, max_retries=5): + for attempt in range(max_retries): + try: + api_client.delete_namespaced_pod(pod_name, namespace) + return True + except ApiException as e: + if e.status == 429 and attempt < max_retries - 1: + wait_time = (2 ** attempt) + random.uniform(0, 1) + logger.warning(f"Rate limited, retrying in {wait_time:.1f}s") + time.sleep(wait_time) + continue + raise + return False + + 2. Configure Kubernetes Client Rate Limiting + + configuration = client.Configuration() + configuration.qps = 5.0 # Max 5 queries per second per client + configuration.burst = 10 # Allow bursts up to 10 + + 3. Add Concurrency Control + + k8s_api_semaphore = Semaphore(10) # Max 10 concurrent API calls + + async def delete_pod_rate_limited(pod_name, namespace): + async with k8s_api_semaphore: + return await delete_pod_with_retry(pod_name, namespace) + + HIGH PRIORITY - Within 48 Hours + + 4. 
Optimize Worker Configuration + - Current: 50 uvicorn workers x 32 Railway replicas = 1,600 processes + - Recommendation: Reduce uvicorn workers to 10-20 per replica + - Why: Excessive concurrency amplifies K8s API load + + 5. Implement Pod Deletion Queue + - Use background queue (Redis, Celery) for pod deletions + - Process deletions with controlled rate (e.g., 100/minute globally) + - Provides visibility into deletion backlog + + 6. Fix Supabase Security Issues + - URGENT: Add RLS policies to 38 tables currently without policies + - Enable leaked password protection + - Reduce OTP expiry to < 1 hour + - Index 52 foreign keys for query performance + - Remove 5 duplicate indexes + + 7. Upgrade ████ Team or Implement Graduated Rate Limits + - ████ team (22 members, free tier) using 15,933 API calls/day + (enterprise-level) + - Either upgrade to paid tier or implement request throttling + - Add monitoring for teams exceeding tier limits + + MEDIUM PRIORITY - Within 1 Week + + 8. Add Monitoring & Alerting + - Track pod deletion success/failure rates + - Monitor K8s API rate limit headers (X-RateLimit-Remaining) + - Alert when deletion failure rate > 5% + - Add dashboards for pod lifecycle metrics + + 9. Implement Circuit Breaker Pattern + + k8s_breaker = CircuitBreaker(fail_max=5, timeout_duration=60) + + @k8s_breaker + def delete_pod_protected(pod_name, namespace): + return delete_pod_with_retry(pod_name, namespace) + + 10. Optimize Pod Lifecycle + - Review if pods can be longer-lived (reduce churn) + - Consider pod pooling/reuse for similar tasks + - Use K8s native garbage collection where possible + - Set propagationPolicy=Background for async cleanup + + 11. Fix Supabase Connection Pool + - Switch auth server to percentage-based connection allocation + - Current: 49 waiting connections out of 52 (saturation) + - Monitor connection wait times and adjust pool size + + LOW PRIORITY - Technical Debt + + 12. Update Deprecated Dependencies + - Replace close() with aclose() for Redis connections + - Update Supabase client for new parameter configuration + - Address deprecation warnings in logs + + 13. Add Request Coalescing + - Batch multiple pod deletions into single API calls where possible + - Implement request deduplication for identical operations + +VALIDATION STEPS + + After implementing fixes, validate with: + + 1. Sentry: Monitor ORCHESTRATOR-AC for decreased error frequency (target: 0 + errors) + 2. Kubernetes: Check API server metrics for reduced throttling events + 3. Railway: Verify pod deletion logs show successful operations + 4. Supabase: Confirm API traffic patterns stay within rate limits + 5. 
Metrics: Track pod deletion latency and success rate + +COMMIT MESSAGE TEMPLATE + + fix: implement exponential backoff for K8s pod deletions + + - Add retry logic with exponential backoff for 429 errors + - Configure client-side rate limiting (5 QPS, 10 burst) + - Add concurrency control with semaphore (max 10 concurrent) + - Reduce uvicorn workers from 50 to 20 per replica + + Fixes ORCHESTRATOR-AC + Resolves rate limiting issues affecting 16 users over 5 days + +SUCCESS CRITERIA + + - Zero 429 errors in Sentry for 7 consecutive days + - Pod deletion success rate > 99.9% + - Average deletion latency < 2 seconds + - No user-facing impact from pod lifecycle operations + - Supabase API calls stay within tier limits + +Investigation Status: Complete +Next Review: After fix deployment (monitor for 48 hours) +``` + +The entire investigation—from initial query to actionable recommendations—took about 5 minutes across the specialized subagents. + +## What We Learned + +1. **Environment design matters.** A focused toolset per domain outperforms a flat list of everything. + +2. **Scenarios are contracts.** They define what the orchestrator can ask and what the subagent returns. + +3. **Custom tools fill gaps.** When MCP servers don't fit your auth model, build direct API integrations. + +## See Also + +- [AgentTool Reference](/reference/tools#agenttool) +- [Building Environments](/build-environments) +- [Scenarios](/reference/environments#scenarios) diff --git a/docs/docs.json b/docs/docs.json index 114ba090..d3f7332c 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -33,7 +33,7 @@ "icon": "code", "versions": [ { - "version": "0.5.2", + "version": "0.5.3", "groups": [ { "group": "Get Started", @@ -63,7 +63,8 @@ { "group": "Cookbooks", "pages": [ - "cookbooks/codex-coding" + "cookbooks/codex-coding", + "cookbooks/ops-diagnostics" ] }, { diff --git a/docs/reference/environments.mdx b/docs/reference/environments.mdx index 1e6fc107..21fc0ce7 100644 --- a/docs/reference/environments.mdx +++ b/docs/reference/environments.mdx @@ -266,6 +266,81 @@ env.unmock() # Disable mock mode | `mock_tool(name, output)` | Set specific mock output | | `is_mock` | Check if mock mode is enabled | +## Serving as MCP Server + +Environment can serve its tools over MCP protocols, either standalone or mounted on an existing server. + +### serve() + +Start a standalone MCP server: + +```python +from hud import Environment + +env = Environment("my-env") + +@env.tool() +def greet(name: str) -> str: + return f"Hello, {name}!" 
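+
+# (Illustrative) register any number of tools before serving
+@env.tool()
+def add(a: int, b: int) -> int:
+    return a + b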
+ +# Run as MCP server (blocking) +env.serve() +``` + +| Parameter | Type | Description | Default | +|-----------|------|-------------|---------| +| `transport` | `Literal["stdio", "sse", "streamable-http"]` | Transport protocol | `"streamable-http"` | +| `host` | `str` | Host address to bind | `"0.0.0.0"` | +| `port` | `int` | Port to bind | `8000` | + +```python +# Serve over stdio (for CLI tools) +env.serve(transport="stdio") + +# Serve over HTTP on custom port +env.serve(transport="streamable-http", host="0.0.0.0", port=8765) +``` + +### http_app() + +Get a Starlette/ASGI app to mount on an existing FastAPI server: + +```python +from fastapi import FastAPI +from hud import Environment + +app = FastAPI() +env = Environment("my-env") + +@env.tool() +def my_tool(arg: str) -> str: + return f"Got: {arg}" + +# Mount the HUD environment's MCP endpoint at /mcp +app.mount("/mcp", env.http_app()) + +# Your other FastAPI routes work normally +@app.get("/health") +def health(): + return {"status": "ok"} +``` + +| Parameter | Type | Description | Default | +|-----------|------|-------------|---------| +| `path` | `str \| None` | Internal path for the MCP endpoint | `"/"` | +| `transport` | `Literal["http", "streamable-http", "sse"]` | Transport protocol | `"http"` | +| `middleware` | `list[ASGIMiddleware] \| None` | Starlette middleware | `None` | +| `json_response` | `bool \| None` | Use JSON response format | `None` | +| `stateless_http` | `bool \| None` | Use stateless HTTP mode | `None` | + +MCP clients can then connect at `http://your-server/mcp`: + +```python +# Client connecting to mounted environment +env.connect_url("http://localhost:8000/mcp") +``` + + ## Properties | Property | Type | Description | diff --git a/docs/reference/tools.mdx b/docs/reference/tools.mdx index 3d12e0b9..bf5b208c 100644 --- a/docs/reference/tools.mdx +++ b/docs/reference/tools.mdx @@ -69,6 +69,93 @@ async def url_match(url: str) -> EvaluationResult: # Agents call: evaluators(name="url_match", arguments={"url": "..."}) ``` +## Agent Tools + +### AgentTool + +```python +from hud.tools import AgentTool +``` + +Wraps a scenario as a tool that can be called by another agent. Essential for building **hierarchical agent systems** where an orchestrator delegates to specialized subagents. + +**Constructor Parameters:** +| Parameter | Type | Description | Default | +|-----------|------|-------------|---------| +| `task` | `Task` | Task template from `env("scenario_name")` | Required | +| `model` | `str` | Model for subagent (via gateway) | `None` | +| `agent` | `type[MCPAgent]` | Custom agent class | `None` | +| `agent_params` | `dict` | Additional agent parameters | `{}` | +| `name` | `str` | Tool name for orchestrator | From scenario | +| `description` | `str` | Tool description | Auto-generated | +| `trace` | `bool` | Enable tracing for standalone runs | `False` | + +Must provide either `model` or `agent`, not both. 
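+
+For example, the two construction styles (a minimal sketch; `sentry_env` and its
+`investigate` scenario are assumed to exist):
+
+```python
+from hud.agents.claude import ClaudeAgent
+from hud.tools import AgentTool
+
+# Route the subagent's inference through the gateway by model name
+tool = AgentTool(sentry_env("investigate"), model="claude-sonnet-4-5")
+
+# ...or supply a custom agent class instead (never both)
+tool = AgentTool(sentry_env("investigate"), agent=ClaudeAgent)
+```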
+
+**Eval-Only Parameters:**
+
+Parameters with `| None = None` are hidden from the orchestrator but available for evaluation:
+
+```python
+@env.scenario("investigate")
+async def investigate(
+    query: str,  # Visible - orchestrator passes this
+    expected_finding: str | None = None,  # Hidden - only used in eval scoring
+):
+    response = yield f"Investigate: {query}"
+
+    # Scoring uses expected_finding but orchestrator never sees it
+    if expected_finding and response:
+        yield 1.0 if expected_finding in response else 0.5
+    else:
+        yield 1.0 if response else 0.0
+```
+
+**Usage:**
+```python
+from hud import Environment
+from hud.tools import AgentTool
+
+# Subagent environment with scenario
+sentry_env = Environment(name="sentry-agent")
+
+@sentry_env.scenario("investigate")
+async def investigate_sentry(query: str):
+    yield f"Investigate Sentry: {query}"
+
+# Create orchestrator
+orchestrator = Environment(name="orchestrator")
+
+# Wrap subagent scenario as tool
+tool = AgentTool(
+    sentry_env("investigate"),  # Task template
+    model="gpt-4o-mini",
+    name="investigate_sentry",
+    description="Investigate errors in Sentry",
+)
+orchestrator.add_tool(tool.mcp)
+
+# Now orchestrator agent can call investigate_sentry(query="...")
+```
+
+**Trace Continuity:**
+
+When called from within an eval context, AgentTool automatically:
+1. Inherits the parent's trace_id
+2. Skips duplicate trace registration
+3. Routes all inference/tool calls to the parent trace
+
+```python
+async with hud.eval(task) as ctx:
+    agent = create_agent("gpt-4o")
+    result = await agent.run(ctx)
+    # All subagent activity appears in this single trace
+```
+
+**See Also:** [Ops Diagnostics Cookbook](/cookbooks/ops-diagnostics) for a complete hierarchical agent example.
+
+---
+
 ## Core Tools
 
 ### BashTool
diff --git a/hud/agents/__init__.py b/hud/agents/__init__.py
index 547d876b..03f9512a 100644
--- a/hud/agents/__init__.py
+++ b/hud/agents/__init__.py
@@ -1,19 +1,82 @@
 from __future__ import annotations
 
+from typing import Any
+
 from .base import MCPAgent
 from .openai import OpenAIAgent
 from .openai_chat import OpenAIChatAgent
 from .operator import OperatorAgent
 
-# Note: These agents are not exported here to avoid requiring optional dependencies.
-# Import directly if needed:
-# from hud.agents.claude import ClaudeAgent # requires anthropic
-# from hud.agents.gemini import GeminiAgent # requires google-genai
-# from hud.agents.gemini_cua import GeminiCUAAgent # requires google-genai
-
 __all__ = [
     "MCPAgent",
     "OpenAIAgent",
     "OpenAIChatAgent",
     "OperatorAgent",
+    "create_agent",
 ]
+
+
+def create_agent(model: str, **kwargs: Any) -> MCPAgent:
+    """Create an agent for a gateway model.
+
+    This routes ALL requests through the HUD gateway. For direct API access
+    (using your own API keys), use the agent classes directly.
+
+    Args:
+        model: Model name (e.g., "gpt-4o", "claude-sonnet-4-5").
+        **kwargs: Additional params passed to agent.create().
+
+    Returns:
+        Configured MCPAgent instance with gateway routing.
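+
+    Raises:
+        ValueError: If the model cannot be resolved to a known agent type
+            or a gateway model.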
+ + Example: + ```python + # Gateway routing (recommended) + agent = create_agent("gpt-4o") + agent = create_agent("claude-sonnet-4-5", temperature=0.7) + + # Direct API access (use agent classes) + from hud.agents.claude import ClaudeAgent + + agent = ClaudeAgent.create(model="claude-sonnet-4-5") + ``` + """ + from hud.agents.gateway import build_gateway_client + from hud.agents.resolver import resolve_cls + + # Resolve class and gateway info + agent_cls, gateway_info = resolve_cls(model) + + # Get model ID from gateway info or use input + model_id = model + if gateway_info: + model_id = gateway_info.get("model") or gateway_info.get("id") or model + + # Determine provider: from gateway info, or infer from agent class + if gateway_info: + provider = gateway_info.get("provider") or "openai" + else: + # Map agent class to provider for known types + from hud.agents.claude import ClaudeAgent + from hud.agents.gemini import GeminiAgent + + _AGENT_TO_PROVIDER = { + ClaudeAgent: "anthropic", + GeminiAgent: "google", + } + provider = _AGENT_TO_PROVIDER.get(agent_cls, "openai") + + client = build_gateway_client(provider) + + # Set up kwargs + kwargs.setdefault("model", model_id) + + # Use correct client key based on agent type + if agent_cls == OpenAIChatAgent: + kwargs.setdefault("openai_client", client) + else: + # Claude and other agents use model_client and validate_api_key + kwargs.setdefault("model_client", client) + kwargs.setdefault("validate_api_key", False) + + return agent_cls.create(**kwargs) diff --git a/hud/agents/gateway.py b/hud/agents/gateway.py new file mode 100644 index 00000000..4d0973f8 --- /dev/null +++ b/hud/agents/gateway.py @@ -0,0 +1,42 @@ +"""Gateway client utilities for HUD inference gateway.""" + +from __future__ import annotations + +from typing import Any + + +def build_gateway_client(provider: str) -> Any: + """Build a client configured for HUD gateway routing. + + Args: + provider: Provider name ("anthropic", "openai", "gemini", etc.) + + Returns: + Configured async client for the provider. + """ + from hud.settings import settings + + provider = provider.lower() + + if provider == "anthropic": + from anthropic import AsyncAnthropic + + return AsyncAnthropic(api_key=settings.api_key, base_url=settings.hud_gateway_url) + + if provider == "gemini": + from google import genai + from google.genai.types import HttpOptions + + return genai.Client( + api_key="PLACEHOLDER", + http_options=HttpOptions( + api_version="v1beta", + base_url=settings.hud_gateway_url, + headers={"Authorization": f"Bearer {settings.api_key}"}, + ), + ) + + # OpenAI-compatible (openai, azure, together, groq, fireworks, etc.) 
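+    # Any other provider falls through to a single AsyncOpenAI client pointed
+    # at the gateway (assumed to expose an OpenAI-compatible endpoint for them).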
+ from openai import AsyncOpenAI + + return AsyncOpenAI(api_key=settings.api_key, base_url=settings.hud_gateway_url) diff --git a/hud/agents/resolver.py b/hud/agents/resolver.py new file mode 100644 index 00000000..80351800 --- /dev/null +++ b/hud/agents/resolver.py @@ -0,0 +1,70 @@ +"""Model resolution - maps model strings to agent classes.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from hud.agents.base import MCPAgent + +__all__ = ["resolve_cls"] + +_models_cache: list[dict[str, Any]] | None = None + +# Provider name → AgentType value (only anthropic differs) +_PROVIDER_TO_AGENT = {"anthropic": "claude"} + + +def _fetch_gateway_models() -> list[dict[str, Any]]: + """Fetch available models from HUD gateway (cached).""" + global _models_cache + if _models_cache is not None: + return _models_cache + + import httpx + + from hud.settings import settings + + if not settings.api_key: + return [] + + try: + resp = httpx.get( + f"{settings.hud_gateway_url}/models", + headers={"Authorization": f"Bearer {settings.api_key}"}, + timeout=10.0, + ) + resp.raise_for_status() + data = resp.json() + _models_cache = data.get("data", data) if isinstance(data, dict) else data + return _models_cache or [] + except Exception: + return [] + + +def resolve_cls(model: str) -> tuple[type[MCPAgent], dict[str, Any] | None]: + """Resolve model string to (agent_class, gateway_info). + + Returns: + (agent_class, None) for known AgentTypes + (agent_class, gateway_model_info) for gateway models + """ + from hud.types import AgentType + + # Known AgentType → no gateway info + try: + return AgentType(model).cls, None + except ValueError: + pass + + # Gateway lookup + for m in _fetch_gateway_models(): + if model in (m.get("id"), m.get("name"), m.get("model")): + provider = (m.get("provider") or "openai_compatible").lower() + agent_str = _PROVIDER_TO_AGENT.get(provider, provider) + try: + return AgentType(agent_str).cls, m + except ValueError: + return AgentType.OPENAI_COMPATIBLE.cls, m + + raise ValueError(f"Model '{model}' not found") diff --git a/hud/agents/tests/test_resolver.py b/hud/agents/tests/test_resolver.py new file mode 100644 index 00000000..04e6f51e --- /dev/null +++ b/hud/agents/tests/test_resolver.py @@ -0,0 +1,192 @@ +"""Tests for model resolution and create_agent.""" + +from __future__ import annotations + +from unittest.mock import MagicMock, patch + +import pytest + +from hud.agents import create_agent +from hud.agents.resolver import resolve_cls + + +class TestResolveCls: + """Tests for resolve_cls function.""" + + def test_resolves_known_agent_type(self) -> None: + """Known AgentType strings resolve to their class.""" + from hud.agents.claude import ClaudeAgent + + cls, gateway_info = resolve_cls("claude") + assert cls == ClaudeAgent + assert gateway_info is None + + def test_resolves_openai(self) -> None: + """Resolves 'openai' to OpenAIAgent.""" + from hud.agents import OpenAIAgent + + cls, _gateway_info = resolve_cls("openai") + assert cls == OpenAIAgent + + def test_resolves_gemini(self) -> None: + """Resolves 'gemini' to GeminiAgent.""" + from hud.agents.gemini import GeminiAgent + + cls, _gateway_info = resolve_cls("gemini") + assert cls == GeminiAgent + + def test_unknown_model_without_gateway_raises(self) -> None: + """Unknown model with no gateway models raises ValueError.""" + with ( + patch("hud.agents.resolver._fetch_gateway_models", return_value=[]), + pytest.raises(ValueError, match="not found"), + ): + 
resolve_cls("unknown-model-xyz") + + def test_resolves_gateway_model(self) -> None: + """Resolves model found in gateway.""" + from hud.agents import OpenAIAgent + + mock_models = [ + {"id": "gpt-4o", "model": "gpt-4o", "provider": "openai"}, + ] + + with patch("hud.agents.resolver._fetch_gateway_models", return_value=mock_models): + cls, info = resolve_cls("gpt-4o") + assert cls == OpenAIAgent + assert info is not None + assert info["id"] == "gpt-4o" + + def test_resolves_anthropic_provider_to_claude(self) -> None: + """Provider 'anthropic' maps to ClaudeAgent.""" + from hud.agents.claude import ClaudeAgent + + mock_models = [ + {"id": "claude-sonnet", "model": "claude-3-sonnet", "provider": "anthropic"}, + ] + + with patch("hud.agents.resolver._fetch_gateway_models", return_value=mock_models): + cls, _info = resolve_cls("claude-sonnet") + assert cls == ClaudeAgent + + def test_resolves_unknown_provider_to_openai_compatible(self) -> None: + """Unknown provider maps to OpenAIChatAgent.""" + from hud.agents.openai_chat import OpenAIChatAgent + + mock_models = [ + {"id": "custom-model", "model": "custom", "provider": "custom-provider"}, + ] + + with patch("hud.agents.resolver._fetch_gateway_models", return_value=mock_models): + cls, _info = resolve_cls("custom-model") + assert cls == OpenAIChatAgent + + +class TestCreateAgent: + """Tests for create_agent function - gateway-only.""" + + def test_creates_with_gateway_client(self) -> None: + """create_agent always uses gateway routing.""" + from hud.agents import OpenAIAgent + + mock_models = [ + {"id": "gpt-4o", "model": "gpt-4o", "provider": "openai"}, + ] + + with ( + patch("hud.agents.resolver._fetch_gateway_models", return_value=mock_models), + patch.object(OpenAIAgent, "create") as mock_create, + patch("hud.agents.gateway.build_gateway_client") as mock_build_client, + ): + mock_client = MagicMock() + mock_build_client.return_value = mock_client + mock_agent = MagicMock() + mock_create.return_value = mock_agent + + agent = create_agent("gpt-4o") + + # Should have set model and model_client + call_kwargs = mock_create.call_args.kwargs + assert call_kwargs["model"] == "gpt-4o" + assert "model_client" in call_kwargs + assert agent == mock_agent + + def test_passes_kwargs_to_create(self) -> None: + """Extra kwargs are passed to agent.create().""" + from hud.agents import OpenAIAgent + + mock_models = [ + {"id": "gpt-4o", "model": "gpt-4o", "provider": "openai"}, + ] + + with ( + patch("hud.agents.resolver._fetch_gateway_models", return_value=mock_models), + patch.object(OpenAIAgent, "create") as mock_create, + patch("hud.agents.gateway.build_gateway_client"), + ): + mock_create.return_value = MagicMock() + + create_agent("gpt-4o", temperature=0.5, max_tokens=1000) + + call_kwargs = mock_create.call_args.kwargs + assert call_kwargs["temperature"] == 0.5 + assert call_kwargs["max_tokens"] == 1000 + + def test_known_agent_type_also_uses_gateway(self) -> None: + """Even 'claude' string uses gateway (it's a gateway shortcut).""" + from hud.agents.claude import ClaudeAgent + + with ( + patch.object(ClaudeAgent, "create") as mock_create, + patch("hud.agents.gateway.build_gateway_client") as mock_build_client, + ): + mock_client = MagicMock() + mock_build_client.return_value = mock_client + mock_create.return_value = MagicMock() + + create_agent("claude") + + # Should still build gateway client + mock_build_client.assert_called_once() + call_kwargs = mock_create.call_args.kwargs + assert "model_client" in call_kwargs + + +class TestBuildGatewayClient: 
+ """Tests for build_gateway_client function.""" + + def test_builds_anthropic_client(self) -> None: + """Builds AsyncAnthropic for anthropic provider.""" + from hud.agents.gateway import build_gateway_client + + with patch("hud.settings.settings") as mock_settings: + mock_settings.api_key = "test-key" + mock_settings.hud_gateway_url = "https://gateway.hud.ai" + + with patch("anthropic.AsyncAnthropic") as mock_client_cls: + build_gateway_client("anthropic") + mock_client_cls.assert_called_once() + + def test_builds_openai_client_for_openai(self) -> None: + """Builds AsyncOpenAI for openai provider.""" + from hud.agents.gateway import build_gateway_client + + with patch("hud.settings.settings") as mock_settings: + mock_settings.api_key = "test-key" + mock_settings.hud_gateway_url = "https://gateway.hud.ai" + + with patch("openai.AsyncOpenAI") as mock_client_cls: + build_gateway_client("openai") + mock_client_cls.assert_called_once() + + def test_builds_openai_client_for_unknown(self) -> None: + """Builds AsyncOpenAI for unknown providers (openai-compatible).""" + from hud.agents.gateway import build_gateway_client + + with patch("hud.settings.settings") as mock_settings: + mock_settings.api_key = "test-key" + mock_settings.hud_gateway_url = "https://gateway.hud.ai" + + with patch("openai.AsyncOpenAI") as mock_client_cls: + build_gateway_client("together") + mock_client_cls.assert_called_once() diff --git a/hud/cli/eval.py b/hud/cli/eval.py index eb13ce34..faedb107 100644 --- a/hud/cli/eval.py +++ b/hud/cli/eval.py @@ -338,47 +338,27 @@ def get_agent_kwargs(self) -> dict[str, Any]: # Configure gateway mode - route LLM API calls through HUD gateway if self.gateway: - hud_api_key = settings.api_key - if not hud_api_key: + if not settings.api_key: raise typer.Exit(1) # Already validated in validate_api_keys() - if self.agent_type == AgentType.CLAUDE: - from anthropic import AsyncAnthropic - - kwargs["model_client"] = AsyncAnthropic( - api_key=hud_api_key, - base_url=settings.hud_gateway_url, - ) - hud_console.info("🌐 Using HUD Gateway for Claude API") - elif self.agent_type in (AgentType.OPENAI, AgentType.OPERATOR): - from openai import AsyncOpenAI + from hud.agents.gateway import build_gateway_client - kwargs["model_client"] = AsyncOpenAI( - api_key=hud_api_key, - base_url=settings.hud_gateway_url, - ) - hud_console.info("🌐 Using HUD Gateway for OpenAI API") - elif self.agent_type == AgentType.OPENAI_COMPATIBLE: - from openai import AsyncOpenAI + # Map AgentType to provider + agent_to_provider = { + AgentType.CLAUDE: "anthropic", + AgentType.OPENAI: "openai", + AgentType.OPERATOR: "openai", + AgentType.GEMINI: "gemini", + AgentType.GEMINI_CUA: "gemini", + AgentType.OPENAI_COMPATIBLE: "openai", + } + provider = agent_to_provider.get(self.agent_type, "openai") + client = build_gateway_client(provider) - kwargs["openai_client"] = AsyncOpenAI( - api_key=hud_api_key, - base_url=settings.hud_gateway_url, - ) - hud_console.info("🌐 Using HUD Gateway for OpenAI-compatible API") - elif self.agent_type in (AgentType.GEMINI, AgentType.GEMINI_CUA): - from google import genai - from google.genai.types import HttpOptions - - kwargs["model_client"] = genai.Client( - api_key="PLACEHOLDER", - http_options=HttpOptions( - api_version="v1beta", - base_url=settings.hud_gateway_url, - headers={"Authorization": f"Bearer {hud_api_key}"}, - ), - ) - hud_console.info("🌐 Using HUD Gateway for Gemini API") + # OpenAI-compatible uses openai_client key + is_oai_compat = self.agent_type == AgentType.OPENAI_COMPATIBLE + 
kwargs["openai_client" if is_oai_compat else "model_client"] = client + hud_console.info(f"🌐 Using HUD Gateway for {provider} API") return kwargs diff --git a/hud/datasets/loader.py b/hud/datasets/loader.py index 0c1982f2..d313228e 100644 --- a/hud/datasets/loader.py +++ b/hud/datasets/loader.py @@ -63,7 +63,8 @@ def _load_from_file(path: Path) -> list[Task]: from hud.eval.task import Task raw_items = _load_raw_from_file(path) - return [Task(**item) for item in raw_items] + # Default args to {} for runnable tasks (None = template) + return [Task(**{**item, "args": item.get("args") or {}}) for item in raw_items] def _load_raw_from_huggingface(dataset_name: str) -> list[dict[str, Any]]: @@ -99,7 +100,8 @@ def _load_from_huggingface(dataset_name: str) -> list[Task]: raw_items = _load_raw_from_huggingface(dataset_name) from hud.eval.task import Task - return [Task(**item) for item in raw_items] + # Default args to {} for runnable tasks (None = template) + return [Task(**{**item, "args": item.get("args") or {}}) for item in raw_items] def _load_raw_from_api(dataset_name: str) -> list[dict[str, Any]]: @@ -138,7 +140,8 @@ def _load_from_api(dataset_name: str) -> list[Task]: from hud.eval.task import Task raw_items = _load_raw_from_api(dataset_name) - return [Task(**item) for item in raw_items] + # Default args to {} for runnable tasks (None = template) + return [Task(**{**item, "args": item.get("args") or {}}) for item in raw_items] @overload diff --git a/hud/datasets/runner.py b/hud/datasets/runner.py index 3b4b1162..1a27074f 100644 --- a/hud/datasets/runner.py +++ b/hud/datasets/runner.py @@ -40,7 +40,7 @@ async def run_dataset( - A source string (file path, API slug) - loaded via load_tasks() - A single TaskInput (Task, LegacyTask, or dict) - A list of TaskInput objects - agent_type: Type of agent to create (e.g., "claude", "openai", AgentType.CLAUDE). + agent_type: Agent type (e.g., "claude", "openai", AgentType.CLAUDE). agent_params: Parameters to pass to agent.create(). max_steps: Maximum steps per task. max_concurrent: Maximum concurrent tasks (for parallel execution). 
@@ -70,6 +70,10 @@ async def run_dataset( from hud.datasets.loader import load_tasks from hud.eval.task import Task + # Normalize agent_type to AgentType enum + if isinstance(agent_type, str): + agent_type = AgentType(agent_type) + # Normalize tasks to list[Task] task_list: list[Task] if isinstance(tasks, str): @@ -86,10 +90,6 @@ async def run_dataset( if not task_list: raise ValueError("No tasks to run") - # Resolve agent class - agent_type_enum = agent_type if isinstance(agent_type, AgentType) else AgentType(agent_type) - agent_cls = agent_type_enum.cls - # Use hud.eval() for both single and parallel execution async with hud.eval( task_list, @@ -97,8 +97,8 @@ async def run_dataset( max_concurrent=max_concurrent, quiet=quiet, ) as ctx: - # Create agent fresh for each context (ensures correct tool initialization) - agent = agent_cls.create(**(agent_params or {})) + # Create agent using AgentType.cls.create() + agent = agent_type.cls.create(**(agent_params or {})) await agent.run(ctx, max_steps=max_steps) # Reward is computed by EvalContext.__aexit__ from evaluate tools @@ -198,9 +198,8 @@ async def run_single_task( if ctx.system_prompt and "system_prompt" not in final_agent_params: final_agent_params["system_prompt"] = ctx.system_prompt - # Create agent inside ctx so it has access to context-derived values - agent_cls = agent_type.cls - agent = agent_cls.create(**final_agent_params) + # Create agent using AgentType.cls.create() + agent = agent_type.cls.create(**final_agent_params) # Store metadata if provided if metadata: diff --git a/hud/environment/environment.py b/hud/environment/environment.py index 83924cd2..4ed44b32 100644 --- a/hud/environment/environment.py +++ b/hud/environment/environment.py @@ -362,6 +362,22 @@ async def __aexit__( await asyncio.gather(*[c.disconnect() for c in self._connections.values()]) self._router.clear() + async def run_async( + self, + transport: Literal["stdio", "http", "sse"] | None = None, + show_banner: bool = True, + **transport_kwargs: Any, + ) -> None: + """Run the MCP server, auto-connecting all connectors first. + + This ensures that tools from external MCP servers (via connect_mcp_config) + are discovered and available when the server starts. 
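+
+        Example (a sketch; assumes connectors were attached beforehand):
+            env = Environment("my-env")
+            env.connect_mcp_config({"sentry": {...}})
+            await env.run_async(transport="http")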
+ """ + async with self: # Connect all connectors via __aenter__ + await super().run_async( + transport=transport, show_banner=show_banner, **transport_kwargs + ) + async def _build_routing(self) -> None: """Build tool routing from local tools and connection caches.""" # Use get_tools() not list_tools() - it includes mounted servers without @@ -376,6 +392,27 @@ async def _build_routing(self) -> None: # Populate mock schemas for auto-generated mock values self._populate_mock_schemas() + # ========================================================================= + # MCP Protocol Overrides - Include connector tools in MCP responses + # ========================================================================= + + def _setup_handlers(self) -> None: + """Override FastMCP to register our custom handlers for tools.""" + # Call parent to set up all standard handlers + super()._setup_handlers() + # Re-register our custom handlers (overwrites parent's registrations) + self._mcp_server.list_tools()(self._env_list_tools) + self._mcp_server.call_tool()(self._env_call_tool) + + async def _env_list_tools(self) -> list[mcp_types.Tool]: + """Return all tools including those from connectors.""" + return self._router.tools + + async def _env_call_tool(self, name: str, arguments: dict[str, Any] | None = None) -> list[Any]: + """Route tool calls through our router (handles both local and connector tools).""" + result = await self._execute_tool(name, arguments or {}) + return result.content or [] + # ========================================================================= # Tool Operations # ========================================================================= diff --git a/hud/environment/tests/test_environment.py b/hud/environment/tests/test_environment.py index 44febe88..60b544e7 100644 --- a/hud/environment/tests/test_environment.py +++ b/hud/environment/tests/test_environment.py @@ -159,3 +159,171 @@ def test_chaining_multiple_setup_calls(self) -> None: ) assert len(env._setup_calls) == 2 + + +class TestEnvironmentMCPProtocol: + """Tests for MCP protocol overrides - Environment._env_list_tools and _env_call_tool. + + These test that Environment properly exposes connector tools via MCP handlers. 
+ """ + + @pytest.mark.asyncio + async def test_env_list_tools_includes_local_tools(self) -> None: + """_env_list_tools returns local tools after routing is built.""" + from hud.environment import Environment + + env = Environment("test") + + @env.tool() + def my_tool(x: int) -> int: + """A test tool.""" + return x * 2 + + # Build routing (simulates what __aenter__ does) + await env._build_routing() + + # Call the handler that MCP will call + tools = await env._env_list_tools() + + assert len(tools) == 1 + assert tools[0].name == "my_tool" + + @pytest.mark.asyncio + async def test_env_list_tools_includes_connector_tools(self) -> None: + """_env_list_tools returns tools from connectors (the key feature).""" + import mcp.types as mcp_types + + from hud.environment import Environment + + env = Environment("test") + + # Create a mock connector with cached tools + mock_tools = [ + mcp_types.Tool( + name="remote_tool", + description="A remote tool", + inputSchema={"type": "object"}, + ) + ] + + class MockConnector: + is_connected = True + _tools_cache = mock_tools + + @property + def cached_tools(self) -> list[mcp_types.Tool]: + return self._tools_cache + + async def connect(self) -> None: + pass + + async def disconnect(self) -> None: + pass + + async def list_tools(self) -> list[mcp_types.Tool]: + return self._tools_cache + + # Add the mock connector + env._connections["mock"] = MockConnector() # type: ignore + + # Build routing + await env._build_routing() + + # Call the handler that MCP will call + tools = await env._env_list_tools() + + # Should include the remote tool + tool_names = [t.name for t in tools] + assert "remote_tool" in tool_names + + @pytest.mark.asyncio + async def test_env_call_tool_routes_to_local(self) -> None: + """_env_call_tool routes local tool calls correctly.""" + from hud.environment import Environment + + env = Environment("test") + called_with: list[int] = [] + + @env.tool() + def my_tool(x: int) -> str: + """A test tool.""" + called_with.append(x) + return f"result: {x}" + + # Build routing + await env._build_routing() + + # Call the handler that MCP will call + result = await env._env_call_tool("my_tool", {"x": 42}) + + assert called_with == [42] + assert len(result) == 1 + + @pytest.mark.asyncio + async def test_env_call_tool_routes_to_connector(self) -> None: + """_env_call_tool routes connector tool calls correctly.""" + from unittest.mock import AsyncMock + + import mcp.types as mcp_types + + from hud.environment import Environment + from hud.types import MCPToolResult + + env = Environment("test") + + # Create a mock connector + mock_tools = [ + mcp_types.Tool( + name="remote_tool", + description="A remote tool", + inputSchema={"type": "object"}, + ) + ] + + class MockConnector: + is_connected = True + _tools_cache = mock_tools + call_tool = AsyncMock( + return_value=MCPToolResult( + content=[mcp_types.TextContent(type="text", text="remote result")], + isError=False, + ) + ) + + @property + def cached_tools(self) -> list[mcp_types.Tool]: + return self._tools_cache + + async def connect(self) -> None: + pass + + async def disconnect(self) -> None: + pass + + async def list_tools(self) -> list[mcp_types.Tool]: + return self._tools_cache + + mock_conn = MockConnector() + env._connections["mock"] = mock_conn # type: ignore + + # Build routing + await env._build_routing() + + # Call the handler that MCP will call + result = await env._env_call_tool("remote_tool", {"arg": "value"}) + + # Verify the connector was called + 
mock_conn.call_tool.assert_called_once_with("remote_tool", {"arg": "value"}) + assert len(result) == 1 + + def test_setup_handlers_registers_custom_handlers(self) -> None: + """Verify _setup_handlers registers our _env_list_tools and _env_call_tool.""" + from hud.environment import Environment + + env = Environment("test") + + # Verify the custom handlers exist + assert hasattr(env, "_env_list_tools") + assert hasattr(env, "_env_call_tool") + assert callable(env._env_list_tools) + assert callable(env._env_call_tool) diff --git a/hud/eval/context.py b/hud/eval/context.py index ca0704f5..e7d4be14 100644 --- a/hud/eval/context.py +++ b/hud/eval/context.py @@ -302,10 +302,20 @@ def from_task( code_snippet: Code being evaluated trace: Whether to send traces to backend quiet: Whether to suppress output + + Raises: + ValueError: If task.args is None (template tasks cannot be run directly) """ from hud.environment import Environment from hud.eval.task import build_eval_name + # Validate that task has args (not a template) + if task.args is None: + raise ValueError( + f"Cannot run task with args=None (this is a template). " + f"Provide args when creating the task: env('{task.scenario}', **args)" + ) + eval_name = name or build_eval_name(task.scenario, task.args) # task.env is guaranteed to be Environment after Task.__post_init__ @@ -343,7 +353,7 @@ async def _run_task_scenario_setup(self) -> None: if self._task is None or self._task.scenario is None: return - prompt = await self.run_scenario_setup(self._task.scenario, self._task.args) + prompt = await self.run_scenario_setup(self._task.scenario, self._task.args or {}) if prompt: self.prompt = prompt diff --git a/hud/eval/task.py b/hud/eval/task.py index 085f1bf8..cfa6d64a 100644 --- a/hud/eval/task.py +++ b/hud/eval/task.py @@ -148,7 +148,10 @@ class Task(BaseModel): env: Any = Field(default=None) # Typed as Any for input flexibility, validated below scenario: str | None = None id: str | None = None - args: dict[str, Any] = Field(default_factory=dict) + args: dict[str, Any] | None = Field( + default=None, + description="Scenario arguments. None indicates a template (args filled in later).", + ) validation: list[MCPToolCall] | None = None # Agent config - settings passed to agent (system_prompt, etc.) @@ -335,6 +338,6 @@ def copy(self) -> Task: id=self.id, env=self.env, # Share reference scenario=self.scenario, - args=self.args.copy() if self.args else {}, + args=self.args.copy() if self.args is not None else None, validation=self.validation.copy() if self.validation else None, ) diff --git a/hud/eval/tests/test_eval.py b/hud/eval/tests/test_eval.py index 6d470808..ea958af4 100644 --- a/hud/eval/tests/test_eval.py +++ b/hud/eval/tests/test_eval.py @@ -16,7 +16,7 @@ def test_init_defaults(self) -> None: assert task.env is None assert task.scenario is None - assert task.args == {} + assert task.args is None # None = template, {} = runnable with no args def test_init_with_env_dict(self) -> None: """Task auto-converts env dict to Environment via validator.""" diff --git a/hud/telemetry/tests/test_eval_telemetry.py b/hud/telemetry/tests/test_eval_telemetry.py index 8849cd13..bfb61004 100644 --- a/hud/telemetry/tests/test_eval_telemetry.py +++ b/hud/telemetry/tests/test_eval_telemetry.py @@ -49,8 +49,8 @@ async def greet(name: str) -> str: """Say hello.""" return f"Hello, {name}!" 
- # Create task from environment - task = Task(env=env) + # Create task from environment (args={} = runnable, args=None = template) + task = Task(env=env, args={}) with ( patch("hud.settings.settings") as mock_settings, @@ -110,7 +110,7 @@ async def failing_tool() -> str: """Always fails.""" raise ValueError("Tool error") - task = Task(env=env) + task = Task(env=env, args={}) with ( patch("hud.settings.settings") as mock_settings, @@ -162,7 +162,7 @@ async def multiply(a: int, b: int) -> int: """Multiply two numbers.""" return a * b - task = Task(env=env) + task = Task(env=env, args={}) with ( patch("hud.settings.settings") as mock_settings, @@ -195,7 +195,7 @@ async def test_flush_called_on_context_exit(self): async def simple_tool() -> str: return "done" - task = Task(env=env) + task = Task(env=env, args={}) with ( patch("hud.eval.context.flush") as mock_flush, @@ -229,7 +229,7 @@ def should_not_be_called(*args: Any, **kwargs: Any) -> bool: async def test_tool() -> str: return "ok" - task = Task(env=env) + task = Task(env=env, args={}) with ( patch("hud.settings.settings") as mock_settings, @@ -272,7 +272,7 @@ def capture_upload( async def echo(message: str) -> str: return message - task = Task(env=env) + task = Task(env=env, args={}) with ( patch("hud.settings.settings") as mock_settings, @@ -329,7 +329,7 @@ def capture_upload( async def noop() -> None: pass - task = Task(env=env) + task = Task(env=env, args={}) with ( patch("hud.settings.settings") as mock_settings, diff --git a/hud/tools/__init__.py b/hud/tools/__init__.py index 8451a04f..26495d33 100644 --- a/hud/tools/__init__.py +++ b/hud/tools/__init__.py @@ -4,6 +4,7 @@ from typing import TYPE_CHECKING, Any +from .agent import AgentTool from .base import BaseHub, BaseTool from .bash import BashTool from .edit import EditTool @@ -21,6 +22,7 @@ ) __all__ = [ + "AgentTool", "AnthropicComputerTool", "BaseHub", "BaseTool", diff --git a/hud/tools/agent.py b/hud/tools/agent.py new file mode 100644 index 00000000..2f5ad377 --- /dev/null +++ b/hud/tools/agent.py @@ -0,0 +1,216 @@ +"""AgentTool - run a Task with an agent as a tool.""" + +from __future__ import annotations + +import inspect +from typing import TYPE_CHECKING, Any, Union, get_args, get_origin + +from fastmcp.tools.tool import FunctionTool, ToolResult +from mcp.types import TextContent + +from hud.tools.base import BaseTool + +if TYPE_CHECKING: + from hud.agents.base import MCPAgent + from hud.eval.task import Task + +__all__ = ["AgentTool"] + + +def _is_eval_only(param: inspect.Parameter) -> bool: + """Check if param is eval-only: has None default AND None in type union. + + Handles both runtime types and string annotations (PEP 563). 
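+
+    Example: `x: str | None = None` is eval-only (hidden from the orchestrator),
+    while `x: str` or `x: str = "default"` remains visible.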
+ """ + # Must have default of None + if param.default is not None: + return False + if param.annotation is inspect.Parameter.empty: + return False + + annotation = param.annotation + + # Handle string annotations (from __future__ annotations or quoted) + if isinstance(annotation, str): + # Check if it looks like "X | None", "Union[X, None]", or "Optional[X]" + return ( + "| None" in annotation + or "None |" in annotation + or "Optional[" in annotation + or ("Union[" in annotation and "None" in annotation) + ) + + # Handle runtime type annotations + origin = get_origin(annotation) + + # Union types (X | None or Union[X, None]) + if origin is Union: + return type(None) in get_args(annotation) + + # For Python 3.10+ union syntax at runtime (types.UnionType) + try: + import types + + if isinstance(annotation, types.UnionType): + return type(None) in get_args(annotation) + except (ImportError, AttributeError): + pass + + return False + + +class AgentTool(BaseTool): + """Tool that runs a Task template with an agent. + + Parameters with `| None = None` are eval-only and hidden from the tool schema. + + Example: + ```python + @env.scenario() + async def investigate( + issue_id: str, # Required - orchestrator sees + expected_cause: str | None = None, # Eval only - hidden + ): + yield {"task": f"Investigate {issue_id}"} + + + seer = AgentTool(env("investigate"), model="ft:seer-v2") + ``` + """ + + def __init__( + self, + task: Task, + *, + model: str | None = None, + agent: type[MCPAgent] | None = None, + agent_params: dict[str, Any] | None = None, + name: str | None = None, + description: str | None = None, + trace: bool = False, + ) -> None: + if not model and agent is None: + raise ValueError("Must provide either 'model' or 'agent'") + if model and agent is not None: + raise ValueError("Cannot provide both 'model' and 'agent'") + + self._task = task + self._model = model + self._agent_cls = agent + self._agent_params = agent_params or {} + self._trace = trace + + # Get visible params from scenario function + self._visible_params: set[str] = set() + self._param_schema: dict[str, Any] = { + "type": "object", + "properties": {}, + "required": [], + } + + if task.env and task.scenario: + scenario_fn = task.env._scenarios.get(task.scenario) + if scenario_fn: + sig = inspect.signature(scenario_fn) + visible = {name: p for name, p in sig.parameters.items() if not _is_eval_only(p)} + self._visible_params = set(visible.keys()) + self._param_schema = self._build_schema(visible) + + tool_name = name or task.scenario or "agent_tool" + tool_desc = description or f"Run scenario: {task.scenario}" + + super().__init__(name=tool_name, description=tool_desc) + + def _build_schema(self, params: dict[str, inspect.Parameter]) -> dict[str, Any]: + """Build JSON schema using Pydantic TypeAdapter.""" + from pydantic import TypeAdapter + + properties: dict[str, Any] = {} + required: list[str] = [] + + for name, param in params.items(): + if param.annotation is not inspect.Parameter.empty: + try: + # Handle string annotations + annotation = param.annotation + if isinstance(annotation, str): + # Try to evaluate the annotation + try: + annotation = eval(annotation) # noqa: S307 + except Exception: + # Fall back to string type but don't skip required handling + annotation = None + + if annotation is not None: + adapter = TypeAdapter(annotation) + properties[name] = adapter.json_schema() + else: + properties[name] = {"type": "string"} + except Exception: + properties[name] = {"type": "string"} + else: + properties[name] = 
{"type": "string"} + + if param.default is inspect.Parameter.empty: + required.append(name) + elif param.default is not None: + properties[name]["default"] = param.default + + return {"type": "object", "properties": properties, "required": required} + + @property + def mcp(self) -> FunctionTool: + """Get as FastMCP FunctionTool with filtered schema.""" + if not hasattr(self, "_mcp_tool"): + # Directly instantiate FunctionTool with our callable and schema + # This bypasses from_function's signature parsing + self._mcp_tool = FunctionTool( + name=self.name, + description=self.description or "", + parameters=self._param_schema, + fn=self._execute_with_args, + ) + return self._mcp_tool + + async def _execute_with_args(self, **kwargs: Any) -> ToolResult: + """Internal executor that FastMCP calls with parsed arguments.""" + return await self(**kwargs) + + async def __call__(self, **kwargs: Any) -> ToolResult: + """Execute the task with a fresh agent.""" + from hud.eval.context import get_current_trace_id + from hud.eval.manager import run_eval + + # Filter to visible params only + filtered = {k: v for k, v in kwargs.items() if k in self._visible_params} + + # Merge with template args + base_args = self._task.args or {} + task = self._task.model_copy(update={"args": {**base_args, **filtered}}) + + # Use parent trace if available (for hierarchical agents) + parent_trace_id = get_current_trace_id() + + # If nested (has parent), skip subagent's enter/exit registration + # Tool calls are still recorded via the shared trace_id's context + is_nested = parent_trace_id is not None + + # Trace if explicitly requested AND not nested (nested uses parent trace) + should_trace = self._trace and not is_nested + + async with run_eval( + task, + trace=should_trace, + trace_id=parent_trace_id, + quiet=True, + ) as ctx: + if self._model: + from hud.agents import create_agent + + agent = create_agent(self._model, **self._agent_params) + else: + agent = self._agent_cls.create(**self._agent_params) # type: ignore + + result = await agent.run(ctx) + content = result.content if hasattr(result, "content") and result.content else "" + return ToolResult(content=[TextContent(type="text", text=content)]) diff --git a/hud/tools/tests/test_agent_tool.py b/hud/tools/tests/test_agent_tool.py new file mode 100644 index 00000000..de8196c3 --- /dev/null +++ b/hud/tools/tests/test_agent_tool.py @@ -0,0 +1,355 @@ +"""Tests for AgentTool - scenario-to-agent composition.""" + +from __future__ import annotations + +import inspect +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from hud.environment import Environment +from hud.eval.task import Task +from hud.tools.agent import AgentTool, _is_eval_only + + +class TestIsEvalOnly: + """Tests for _is_eval_only helper function.""" + + def test_required_param_not_eval_only(self) -> None: + """Required params (no default) are not eval-only.""" + + def fn(x: str) -> None: + pass + + sig = inspect.signature(fn) + param = sig.parameters["x"] + assert not _is_eval_only(param) + + def test_optional_with_value_not_eval_only(self) -> None: + """Optional params with non-None default are not eval-only.""" + + def fn(x: str = "default") -> None: + pass + + sig = inspect.signature(fn) + param = sig.parameters["x"] + assert not _is_eval_only(param) + + def test_optional_none_without_union_not_eval_only(self) -> None: + """Optional with None default but no None in type is not eval-only.""" + + def fn(x: str = None) -> None: # type: ignore[assignment] # noqa: RUF013 + pass + + sig 
diff --git a/hud/tools/tests/test_agent_tool.py b/hud/tools/tests/test_agent_tool.py
new file mode 100644
index 00000000..de8196c3
--- /dev/null
+++ b/hud/tools/tests/test_agent_tool.py
@@ -0,0 +1,355 @@

"""Tests for AgentTool - scenario-to-agent composition."""

from __future__ import annotations

import inspect
from unittest.mock import AsyncMock, MagicMock, patch

import pytest

from hud.environment import Environment
from hud.eval.task import Task
from hud.tools.agent import AgentTool, _is_eval_only


class TestIsEvalOnly:
    """Tests for the _is_eval_only helper function."""

    def test_required_param_not_eval_only(self) -> None:
        """Required params (no default) are not eval-only."""

        def fn(x: str) -> None:
            pass

        sig = inspect.signature(fn)
        param = sig.parameters["x"]
        assert not _is_eval_only(param)

    def test_optional_with_value_not_eval_only(self) -> None:
        """Optional params with a non-None default are not eval-only."""

        def fn(x: str = "default") -> None:
            pass

        sig = inspect.signature(fn)
        param = sig.parameters["x"]
        assert not _is_eval_only(param)

    def test_optional_none_without_union_not_eval_only(self) -> None:
        """A None default without None in the type is not eval-only."""

        def fn(x: str = None) -> None:  # type: ignore[assignment]  # noqa: RUF013
            pass

        sig = inspect.signature(fn)
        param = sig.parameters["x"]
        assert not _is_eval_only(param)

    def test_optional_none_with_union_is_eval_only(self) -> None:
        """Params with the `X | None = None` pattern are eval-only."""

        def fn(x: str | None = None) -> None:
            pass

        sig = inspect.signature(fn)
        param = sig.parameters["x"]
        assert _is_eval_only(param)

    def test_optional_int_none_is_eval_only(self) -> None:
        """Works with `int | None = None` too."""

        def fn(x: int | None = None) -> None:
            pass

        sig = inspect.signature(fn)
        param = sig.parameters["x"]
        assert _is_eval_only(param)

    def test_string_annotation_with_none_union(self) -> None:
        """Handles string annotations like 'str | None'."""
        # Simulate a string annotation
        param = inspect.Parameter(
            "x",
            inspect.Parameter.POSITIONAL_OR_KEYWORD,
            default=None,
            annotation="str | None",
        )
        assert _is_eval_only(param)

    def test_string_annotation_without_none(self) -> None:
        """String annotations without None are not eval-only."""
        param = inspect.Parameter(
            "x",
            inspect.Parameter.POSITIONAL_OR_KEYWORD,
            default=None,
            annotation="str",
        )
        assert not _is_eval_only(param)
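
# Note on the string-annotation cases above: this module enables
# `from __future__ import annotations` (PEP 563), under which
# inspect.signature() reports annotations as plain strings rather than
# type objects. An illustrative (hypothetical) check:
#
#     def fn(x: str | None = None) -> None: ...
#     inspect.signature(fn).parameters["x"].annotation  # the string "str | None"
#
# This is why _is_eval_only must recognize the `X | None` pattern both as a
# real union type and in its string form.
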

class TestAgentToolInit:
    """Tests for AgentTool initialization."""

    def test_requires_model_or_agent(self) -> None:
        """Must provide either model or agent."""
        task = Task(args={})

        with pytest.raises(ValueError, match="Must provide either"):
            AgentTool(task)

    def test_cannot_provide_both_model_and_agent(self) -> None:
        """Cannot provide both model and agent."""
        task = Task(args={})
        mock_agent = MagicMock()

        with pytest.raises(ValueError, match="Cannot provide both"):
            AgentTool(task, model="claude", agent=mock_agent)  # type: ignore[arg-type]

    def test_accepts_model_string(self) -> None:
        """Can create with a model string."""
        task = Task(scenario="test", args={})
        tool = AgentTool(task, model="claude")

        assert tool._model == "claude"
        assert tool._agent_cls is None

    def test_accepts_agent_class(self) -> None:
        """Can create with a custom agent class."""
        task = Task(scenario="test", args={})
        mock_agent_cls = MagicMock()
        tool = AgentTool(task, agent=mock_agent_cls)  # type: ignore[arg-type]

        assert tool._model is None
        assert tool._agent_cls is mock_agent_cls

    def test_name_defaults_to_scenario(self) -> None:
        """Tool name defaults to the scenario name."""
        task = Task(scenario="investigate", args={})
        tool = AgentTool(task, model="claude")

        assert tool.name == "investigate"

    def test_name_can_be_overridden(self) -> None:
        """Tool name can be overridden."""
        task = Task(scenario="investigate", args={})
        tool = AgentTool(task, model="claude", name="custom_name")

        assert tool.name == "custom_name"


class TestAgentToolParamFiltering:
    """Tests for parameter filtering (eval-only params hidden)."""

    def test_filters_eval_only_params(self) -> None:
        """Eval-only params (`| None = None`) are filtered from visible_params."""
        env = Environment("test")

        # Use union syntax for consistency across Python versions
        @env.scenario()
        async def investigate(
            issue_id: str,
            include_traces: bool = True,
            expected_cause: str | None = None,  # Eval-only
        ):
            yield {"task": f"Investigate {issue_id}"}

        task = env("investigate")
        tool = AgentTool(task, model="claude")

        # visible_params should only have issue_id and include_traces
        assert "issue_id" in tool._visible_params
        assert "include_traces" in tool._visible_params
        assert "expected_cause" not in tool._visible_params

    def test_all_required_params_visible(self) -> None:
        """All required params are visible."""
        env = Environment("test")

        @env.scenario()
        async def search(query: str, limit: int):
            yield {"task": f"Search: {query}"}

        task = env("search")
        tool = AgentTool(task, model="claude")

        assert "query" in tool._visible_params
        assert "limit" in tool._visible_params

    def test_optional_with_default_visible(self) -> None:
        """Optional params with non-None defaults are visible."""
        env = Environment("test")

        @env.scenario()
        async def fetch(url: str, request_timeout: int = 30, retries: int = 3):
            yield {"task": f"Fetch {url}"}

        task = env("fetch")
        tool = AgentTool(task, model="claude")

        assert "url" in tool._visible_params
        assert "request_timeout" in tool._visible_params
        assert "retries" in tool._visible_params


class TestAgentToolSchema:
    """Tests for JSON schema generation."""

    def test_builds_json_schema(self) -> None:
        """Builds a proper JSON schema from visible params."""
        env = Environment("test")

        @env.scenario()
        async def investigate(issue_id: str, verbose: bool = False):
            yield {"task": f"Investigate {issue_id}"}

        task = env("investigate")
        tool = AgentTool(task, model="claude")

        schema = tool._param_schema
        assert schema is not None
        assert schema["type"] == "object"
        assert "issue_id" in schema["properties"]
        assert "verbose" in schema["properties"]
        assert "issue_id" in schema["required"]
        assert "verbose" not in schema["required"]  # Has a default

    def test_schema_excludes_eval_only(self) -> None:
        """Schema excludes eval-only params."""
        env = Environment("test")

        @env.scenario()
        async def check(
            item_id: str,
            expected_status: str | None = None,  # Eval-only
        ):
            yield {"task": f"Check {item_id}"}

        task = env("check")
        tool = AgentTool(task, model="claude")

        schema = tool._param_schema
        assert schema is not None
        assert "item_id" in schema["properties"]
        assert "expected_status" not in schema["properties"]


class TestAgentToolMCP:
    """Tests for MCP tool integration."""

    def test_mcp_property_returns_tool(self) -> None:
        """The mcp property returns a FastMCP FunctionTool."""
        from fastmcp.tools import FunctionTool

        env = Environment("test")

        @env.scenario()
        async def greet(name: str):
            yield {"task": f"Greet {name}"}

        task = env("greet")
        tool = AgentTool(task, model="claude")

        mcp_tool = tool.mcp
        assert isinstance(mcp_tool, FunctionTool)

    def test_mcp_has_filtered_parameters(self) -> None:
        """MCP tool has the filtered parameter schema."""
        env = Environment("test")

        @env.scenario()
        async def analyze(
            data: str,
            expected_result: str | None = None,  # Eval-only
        ):
            yield {"task": f"Analyze {data}"}

        task = env("analyze")
        tool = AgentTool(task, model="claude")

        mcp_tool = tool.mcp
        params = mcp_tool.parameters  # FunctionTool uses 'parameters'

        assert "data" in params["properties"]
        assert "expected_result" not in params["properties"]

class TestAgentToolCall:
    """Tests for AgentTool.__call__."""

    @pytest.mark.asyncio
    async def test_filters_kwargs_to_visible_only(self) -> None:
        """Call filters kwargs to visible params only."""
        # Import modules first so patches work
        import hud.agents
        import hud.eval.manager  # noqa: F401

        env = Environment("test")

        @env.scenario()
        async def process(item: str, expected: str | None = None):
            yield {"task": f"Process {item}"}

        task = env("process")
        tool = AgentTool(task, model="claude")

        # Mock the eval context and agent
        with (
            patch("hud.eval.manager.run_eval") as mock_run_eval,
            patch("hud.agents.create_agent") as mock_create_agent,
        ):
            mock_ctx = AsyncMock()
            mock_ctx.__aenter__ = AsyncMock(return_value=mock_ctx)
            mock_ctx.__aexit__ = AsyncMock(return_value=None)
            mock_run_eval.return_value = mock_ctx

            mock_agent = MagicMock()
            mock_agent.run = AsyncMock(return_value=MagicMock(content="result"))
            mock_create_agent.return_value = mock_agent

            # Call with both visible and eval-only params
            await tool(item="test", expected="should_be_filtered")

            # Check that the task was created with filtered args
            call_args = mock_run_eval.call_args
            task_arg = call_args[0][0]
            assert "item" in task_arg.args
            assert "expected" not in task_arg.args  # Filtered out

    @pytest.mark.asyncio
    async def test_merges_template_args(self) -> None:
        """Call merges kwargs with template args."""
        # Import modules first so patches work
        import hud.agents
        import hud.eval.manager  # noqa: F401

        env = Environment("test")

        @env.scenario()
        async def search(query: str, limit: int = 10):
            yield {"task": f"Search {query}"}

        # Create a template with some args pre-filled
        task = env("search", limit=5)
        tool = AgentTool(task, model="claude")

        with (
            patch("hud.eval.manager.run_eval") as mock_run_eval,
            patch("hud.agents.create_agent") as mock_create_agent,
        ):
            mock_ctx = AsyncMock()
            mock_ctx.__aenter__ = AsyncMock(return_value=mock_ctx)
            mock_ctx.__aexit__ = AsyncMock(return_value=None)
            mock_run_eval.return_value = mock_ctx

            mock_agent = MagicMock()
            mock_agent.run = AsyncMock(return_value=MagicMock(content="result"))
            mock_create_agent.return_value = mock_agent

            # Call with an additional arg
            await tool(query="test query")

            # Check the merged args
            call_args = mock_run_eval.call_args
            task_arg = call_args[0][0]
            assert task_arg.args["query"] == "test query"
            assert task_arg.args["limit"] == 5  # From the template

diff --git a/hud/utils/strict_schema.py b/hud/utils/strict_schema.py
index 5d3fa0da..263919b3 100644
--- a/hud/utils/strict_schema.py
+++ b/hud/utils/strict_schema.py
@@ -118,7 +118,7 @@ def _ensure_strict_json_schema(
     if "default" in json_schema:
         json_schema.pop("default")

-    for keyword in ("title", "examples"):
+    for keyword in ("title", "examples", "format"):
         json_schema.pop(keyword, None)

     ref = json_schema.get("$ref")
diff --git a/hud/utils/tests/test_version.py b/hud/utils/tests/test_version.py
index 8ba18f35..1d40fb59 100644
--- a/hud/utils/tests/test_version.py
+++ b/hud/utils/tests/test_version.py
@@ -5,4 +5,4 @@ def test_import():
     """Test that the package can be imported."""
     import hud

-    assert hud.__version__ == "0.5.2"
+    assert hud.__version__ == "0.5.3"
diff --git a/hud/version.py b/hud/version.py
index 6fbebfff..c9ba33a6 100644
--- a/hud/version.py
+++ b/hud/version.py
@@ -4,4 +4,4 @@
 from __future__ import annotations

-__version__ = "0.5.2"
+__version__ = "0.5.3"
diff --git a/pyproject.toml b/pyproject.toml
index 3cb5c7d3..b1ad8b5a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "hud-python"
-version = "0.5.2"
+version = "0.5.3"
 description = "SDK for the HUD platform."
 readme = "README.md"
 requires-python = ">=3.11, <3.13"
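
The `strict_schema` change above is small but behavior-affecting: when coercing a tool schema to a strict JSON schema, `format` is now stripped along with `title` and `examples` (a `default`, if present, is already removed just before this loop), likely because some providers reject `format` in strict tool schemas. A minimal sketch of the effect on a hypothetical field schema:

```python
# Hypothetical input: a Pydantic-style field schema that uses `format`
schema = {
    "type": "string",
    "format": "uri",      # now stripped, as of this change
    "title": "Endpoint",  # stripped before this change too
    "examples": ["https://example.com"],
}

# The same loop as in _ensure_strict_json_schema; only the core type survives
for keyword in ("title", "examples", "format"):
    schema.pop(keyword, None)

print(schema)  # {'type': 'string'}
```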