Merge branch 'feat/multi-agent-trace-context-propagation' of https://github.com/hud-evals/hud-python into l/fmt-server

lorenss-m · lorenss-m · commit 4082bedd0dde · 2026-01-22T12:28:23.000-08:00
diff --git a/hud/environment/connection.py b/hud/environment/connection.py
@@ -141,9 +141,10 @@ async def list_tools(self) -> list[mcp_types.Tool]:
         Always fetches fresh data from the server (no caching check).
         The result is cached for use by router.build() via cached_tools property.
         """
-        if self.client is None:
+        client = self.client
+        if client is None:
             raise RuntimeError("Not connected - call connect() first")
-        tools = await self.client.list_tools()
+        tools = await client.list_tools()
 
         result: list[mcp_types.Tool] = []
         for tool in tools:
@@ -188,12 +189,54 @@ async def call_tool(
         self, name: str, arguments: dict[str, Any] | None = None
     ) -> mcp_types.CallToolResult:
         """Call a tool, stripping prefix if needed."""
-        if self.client is None:
+        client = self.client
+        if client is None:
             raise RuntimeError("Not connected - call connect() first")
         # Strip prefix when calling remote
         if self.config.prefix and name.startswith(f"{self.config.prefix}_"):
             name = name[len(self.config.prefix) + 1 :]
-        return await self.client.call_tool_mcp(name, arguments or {})
+
+        from hud.eval.context import get_current_trace_id  # presumably avoids an import cycle
+
+        args = dict(arguments or {})  # shallow copy; None becomes {}
+        trace_id = get_current_trace_id()
+        meta = {"_hud_trace_id": trace_id} if trace_id else None
+
+        if meta:
+            try:
+                meta_kwargs: dict[str, Any] = {"meta": meta}  # first try keyword `meta`
+                result = await client.call_tool(name=name, arguments=args, **meta_kwargs)
+            except TypeError as e:
+                if "unexpected keyword argument" not in str(e):  # unrelated TypeError
+                    raise
+                try:
+                    meta_kwargs = {"_meta": meta}  # then retry with keyword `_meta`
+                    result = await client.call_tool(name=name, arguments=args, **meta_kwargs)
+                except TypeError as e2:
+                    if "unexpected keyword argument" not in str(e2):
+                        raise
+                    result = await client.call_tool(name=name, arguments=args)  # no meta sent
+        else:
+            result = await client.call_tool(name=name, arguments=args)
+
+        # Normalize FastMCP / mcp-python result shapes to mcp.types.CallToolResult.
+        # NOTE(review): confirm call_tool() returns isError results rather than raising.
+        is_error = getattr(result, "isError", None)
+        if is_error is None:
+            is_error = getattr(result, "is_error", False)
+        structured = getattr(result, "structuredContent", None)
+        if structured is None:
+            structured = getattr(result, "structured_content", None)
+
+        content = getattr(result, "content", None)
+        if content is None:
+            content = []
+
+        return mcp_types.CallToolResult(
+            content=content,
+            isError=bool(is_error),
+            structuredContent=structured,
+        )
 
     async def list_resources(self) -> list[mcp_types.Resource]:
         """Fetch resources from server and cache.
diff --git a/hud/environment/environment.py b/hud/environment/environment.py
@@ -512,9 +512,26 @@ async def _env_list_tools(self) -> list[mcp_types.Tool]:
             await self._build_tool_routing()
         return self._router.tools
 
-    async def _env_call_tool(self, name: str, arguments: dict[str, Any] | None = None) -> list[Any]:
+    async def _env_call_tool(
+        self, name: str, arguments: dict[str, Any] | None = None, **kwargs: Any
+    ) -> list[Any]:
         """Route tool calls through our router (handles both local and connector tools)."""
-        result = await self._execute_tool(name, arguments or {})
+        args = dict(arguments or {})  # copy, so the pop() below leaves the caller's dict intact
+
+        # Trace id from the MCP request: arguments first (key is removed), else meta kwargs
+        trace_id = args.pop("_hud_trace_id", None)
+        meta = kwargs.get("_meta") or kwargs.get("meta")
+        if not trace_id and isinstance(meta, dict):
+            trace_id = meta.get("_hud_trace_id") or meta.get("trace_id")
+
+        if trace_id:
+            from hud.eval.context import set_trace_context
+
+            with set_trace_context(trace_id):  # tool runs with the propagated trace id set
+                result = await self._execute_tool(name, args)
+        else:
+            result = await self._execute_tool(name, args)
+
         return result.content or []
 
     # =========================================================================
diff --git a/hud/environment/tests/test_connection.py b/hud/environment/tests/test_connection.py
@@ -281,13 +281,13 @@ async def test_call_tool_strips_prefix(self) -> None:
 
         mock_result = mcp_types.CallToolResult(content=[], isError=False)
         mock_client = MagicMock()
-        mock_client.call_tool_mcp = AsyncMock(return_value=mock_result)
+        mock_client.call_tool = AsyncMock(return_value=mock_result)
         connector.client = mock_client
 
         await connector.call_tool("myprefix_tool1", {"arg": "value"})
 
         # Prefix should be stripped
-        mock_client.call_tool_mcp.assert_called_once_with("tool1", {"arg": "value"})
+        mock_client.call_tool.assert_called_once_with(name="tool1", arguments={"arg": "value"})
 
     @pytest.mark.asyncio
     async def test_call_tool_raises_when_not_connected(self) -> None:
diff --git a/hud/eval/context.py b/hud/eval/context.py
@@ -12,6 +12,7 @@
 import contextvars
 import logging
 import uuid
+from contextlib import contextmanager
 from typing import TYPE_CHECKING, Any, Self
 
 from hud.environment import Environment
@@ -20,6 +21,7 @@
 from hud.telemetry import flush, instrument
 
 if TYPE_CHECKING:
+    from collections.abc import Generator
     from types import TracebackType
 
     from hud.eval.task import Task
@@ -58,6 +60,20 @@ def get_current_trace_id() -> str | None:
     return None
 
 
+@contextmanager
+def set_trace_context(trace_id: str) -> Generator[None, None, None]:
+    """Temporarily set trace context from an external trace_id.
+
+    Used by MCP tool handlers to propagate parent trace context into sub-processes.
+    """
+    headers = {"Trace-Id": trace_id}  # fresh dict; existing headers are not merged
+    token = _current_trace_headers.set(headers)
+    try:
+        yield
+    finally:
+        _current_trace_headers.reset(token)  # restore the prior value, even on error
+
+
 def get_current_api_key() -> str | None:
     """Get the current API key override from context.
 
@@ -724,4 +740,5 @@ def _print_single_result(self, error_msg: str | None) -> None:
     "get_current_api_key",
     "get_current_trace_headers",
     "get_current_trace_id",
+    "set_trace_context",
 ]
diff --git a/hud/eval/tests/test_context.py b/hud/eval/tests/test_context.py
@@ -9,6 +9,8 @@
 from hud.eval.context import (
     EvalContext,
     get_current_trace_headers,
+    get_current_trace_id,
+    set_trace_context,
 )
 
 
@@ -90,6 +92,15 @@ async def test_context_manager_sets_headers(self) -> None:
 
             assert get_current_trace_headers() is None
 
+    def test_set_trace_context(self) -> None:
+        """set_trace_context sets and resets Trace-Id."""
+        assert get_current_trace_id() is None  # precondition: no trace context active
+
+        with set_trace_context("test-trace-123"):
+            assert get_current_trace_id() == "test-trace-123"  # visible inside the block
+
+        assert get_current_trace_id() is None  # cleared again on exit
+
     def test_repr(self) -> None:
         """__repr__ shows useful info."""
         ctx = EvalContext(