hud-evals · lorenss-m · Jan 9, 2026 · Jan 9, 2026 · Jan 9, 2026 · Jan 9, 2026
diff --git a/docs/docs.json b/docs/docs.json
@@ -33,7 +33,7 @@
         "icon": "code",
         "versions": [
           {
-            "version": "0.5.3",
+            "version": "0.5.4",
             "groups": [
               {
                 "group": "Get Started",

diff --git a/hud/agents/claude.py b/hud/agents/claude.py
@@ -76,10 +76,18 @@ def __init__(self, params: ClaudeCreateParams | None = None, **kwargs: Any) -> N
 
         model_client = self.config.model_client
         if model_client is None:
-            api_key = settings.anthropic_api_key
-            if not api_key:
-                raise ValueError("Anthropic API key not found. Set ANTHROPIC_API_KEY.")
-            model_client = AsyncAnthropic(api_key=api_key)
+            # Default to HUD gateway when HUD_API_KEY is available
+            if settings.api_key:
+                from hud.agents.gateway import build_gateway_client
+
+                model_client = build_gateway_client("anthropic")
+            elif settings.anthropic_api_key:
+                model_client = AsyncAnthropic(api_key=settings.anthropic_api_key)
+            else:
+                raise ValueError(
+                    "No API key found. Set HUD_API_KEY for HUD gateway, "
+                    "or ANTHROPIC_API_KEY for direct Anthropic access."
+                )
 
         self.anthropic_client = model_client
         self.max_tokens = self.config.max_tokens

diff --git a/hud/agents/gemini.py b/hud/agents/gemini.py
@@ -61,10 +61,18 @@ def __init__(self, params: GeminiCreateParams | None = None, **kwargs: Any) -> N
 
         model_client = self.config.model_client
         if model_client is None:
-            api_key = settings.gemini_api_key
-            if not api_key:
-                raise ValueError("Gemini API key not found. Set GEMINI_API_KEY.")
-            model_client = genai.Client(api_key=api_key)
+            # Default to HUD gateway when HUD_API_KEY is available
+            if settings.api_key:
+                from hud.agents.gateway import build_gateway_client
+
+                model_client = build_gateway_client("gemini")
+            elif settings.gemini_api_key:
+                model_client = genai.Client(api_key=settings.gemini_api_key)
+            else:
+                raise ValueError(
+                    "No API key found. Set HUD_API_KEY for HUD gateway, "
+                    "or GEMINI_API_KEY for direct Gemini access."
+                )
 
         if self.config.validate_api_key:
             try:

diff --git a/hud/agents/openai.py b/hud/agents/openai.py
@@ -79,10 +79,18 @@ def __init__(self, params: OpenAICreateParams | None = None, **kwargs: Any) -> N
 
         model_client = self.config.model_client
         if model_client is None:
-            api_key = settings.openai_api_key
-            if not api_key:
-                raise ValueError("OpenAI API key not found. Set OPENAI_API_KEY.")
-            model_client = AsyncOpenAI(api_key=api_key)
+            # Default to HUD gateway when HUD_API_KEY is available
+            if settings.api_key:
+                from hud.agents.gateway import build_gateway_client
+
+                model_client = build_gateway_client("openai")
+            elif settings.openai_api_key:
+                model_client = AsyncOpenAI(api_key=settings.openai_api_key)
+            else:
+                raise ValueError(
+                    "No API key found. Set HUD_API_KEY for HUD gateway, "
+                    "or OPENAI_API_KEY for direct OpenAI access."
+                )
 
         if self.config.validate_api_key:
             try:

diff --git a/hud/agents/tests/test_openai.py b/hud/agents/tests/test_openai.py
@@ -128,8 +128,9 @@ async def test_init_with_parameters(self, mock_openai: AsyncOpenAI) -> None:
     async def test_init_without_client_no_api_key(self) -> None:
         """Test agent initialization fails without API key."""
         with patch("hud.agents.openai.settings") as mock_settings:
+            mock_settings.api_key = None
             mock_settings.openai_api_key = None
-            with pytest.raises(ValueError, match="OpenAI API key not found"):
+            with pytest.raises(ValueError, match="No API key found"):
                 OpenAIAgent.create()
 
     @pytest.mark.asyncio

diff --git a/hud/environment/environment.py b/hud/environment/environment.py
@@ -129,6 +129,7 @@ def __init__(
         super().__init__(name=name, instructions=instructions, **fastmcp_kwargs)
         self._connections: dict[str, Connector] = {}
         self._router = ToolRouter(conflict_resolution=conflict_resolution)
+        self._routing_built = False  # Track if _build_routing has been called
         self._in_context = False
 
         # Tool call queues - run after connections established
@@ -361,6 +362,7 @@ async def __aexit__(
         if self._connections:
             await asyncio.gather(*[c.disconnect() for c in self._connections.values()])
         self._router.clear()
+        self._routing_built = False
 
     async def run_async(
         self,
@@ -389,6 +391,7 @@ async def _build_routing(self) -> None:
             connections=self._connections,
             connection_order=list(self._connections.keys()),
         )
+        self._routing_built = True
         # Populate mock schemas for auto-generated mock values
         self._populate_mock_schemas()
 
@@ -406,6 +409,8 @@ def _setup_handlers(self) -> None:
 
     async def _env_list_tools(self) -> list[mcp_types.Tool]:
         """Return all tools including those from connectors."""
+        if not self._routing_built:
+            await self._build_routing()
         return self._router.tools
 
     async def _env_call_tool(self, name: str, arguments: dict[str, Any] | None = None) -> list[Any]:

diff --git a/hud/environment/scenarios.py b/hud/environment/scenarios.py
@@ -199,8 +199,23 @@ async def run_scenario_setup(self, scenario_name: str, args: dict[str, Any]) ->
                 except Exception:
                     available = "(could not fetch available scenarios)"
 
+                # Check if the prompt exists - if so, the error is something else
+                original_error = str(e)
+                if prompt_id in scenario_prompts:
+                    # Prompt exists but get_prompt failed for another reason
+                    raise ValueError(
+                        f"⚠️ ERROR: Scenario '{prompt_id}' exists but failed to execute.\n\n"
+                        f"The scenario was found but encountered an error during setup:\n"
+                        f"  {original_error}\n\n"
+                        f"This could be caused by:\n"
+                        f"  - Missing or invalid scenario arguments\n"
+                        f"  - An error in the scenario's setup function\n"
+                        f"  - Connection or serialization issues\n\n"
+                        f"Check the scenario definition and required arguments."
+                    ) from e
+
                 raise ValueError(
-                    f"Scenario not found.\n\n"
+                    f"⚠️ ERROR: Scenario not found.\n\n"
                     f"Scenario IDs have the format 'environment_name:scenario_name'.\n"
                     f"If you only specify 'scenario_name', the SDK uses your task's env name "
                     f"as the prefix.\n"
@@ -362,7 +377,7 @@ def decorator(
                     # Only include JSON-serializable defaults
                     default_val = p.default
                     if default_val is None or isinstance(
-                        default_val, (str, int, float, bool, list, dict)
+                        default_val, (str | int | float | bool | list | dict)
                     ):
                         arg_info["default"] = default_val
 
@@ -413,26 +428,51 @@ async def prompt_handler(**handler_args: Any) -> list[str]:
 
                 # Deserialize JSON-encoded arguments using Pydantic TypeAdapter
                 # This properly handles: Pydantic models, enums, datetime, lists, dicts
+                # MCP prompts only support string arguments, so we JSON-serialize complex
+                # types on the sending side and deserialize them here
                 deserialized_args: dict[str, Any] = {}
                 for arg_name, arg_value in handler_args.items():
                     annotation = param_annotations.get(arg_name)
-                    if (
-                        annotation is not None
-                        and annotation is not str
-                        and isinstance(arg_value, str)
-                    ):
-                        # Try TypeAdapter.validate_json for proper type coercion
+
+                    # Only attempt deserialization on string values
+                    if not isinstance(arg_value, str):
+                        deserialized_args[arg_name] = arg_value
+                        continue
+
+                    # If annotation is explicitly str, keep as string (no deserialization)
+                    if annotation is str:
+                        deserialized_args[arg_name] = arg_value
+                        continue
+
+                    # If we have a non-str type annotation, use TypeAdapter
+                    if annotation is not None:
                         try:
                             adapter = TypeAdapter(annotation)
                             deserialized_args[arg_name] = adapter.validate_json(arg_value)
-                        except Exception:
-                            # Fall back to plain json.loads if TypeAdapter fails
-                            try:
-                                deserialized_args[arg_name] = json.loads(arg_value)
-                            except json.JSONDecodeError:
-                                deserialized_args[arg_name] = arg_value
-                    else:
-                        deserialized_args[arg_name] = arg_value
+                            continue
+                        except Exception:  # noqa: S110
+                            pass  # Fall through to generic JSON decode
+
+                    # No annotation, or typed validation failed - JSON-decode strings that
+                    # look like arrays, objects, booleans, or null (numbers handled below)
+                    stripped = arg_value.strip()
+                    if (stripped and stripped[0] in "[{") or stripped in ("true", "false", "null"):
+                        try:
+                            deserialized_args[arg_name] = json.loads(arg_value)
+                            continue
+                        except json.JSONDecodeError:
+                            pass  # Keep as string
+
+                    # Also try to decode if it looks like a number
+                    if stripped.lstrip("-").replace(".", "", 1).isdigit():
+                        try:
+                            deserialized_args[arg_name] = json.loads(arg_value)
+                            continue
+                        except json.JSONDecodeError:
+                            pass
+
+                    # Keep as string
+                    deserialized_args[arg_name] = arg_value
 
                 # Create generator instance with deserialized args
                 gen = scenario_fn(**deserialized_args)

diff --git a/hud/utils/tests/test_version.py b/hud/utils/tests/test_version.py
@@ -5,4 +5,4 @@ def test_import():
     """Test that the package can be imported."""
     import hud
 
-    assert hud.__version__ == "0.5.3"
+    assert hud.__version__ == "0.5.4"
diff --git a/hud/version.py b/hud/version.py
@@ -4,4 +4,4 @@
 
 from __future__ import annotations
 
-__version__ = "0.5.3"
+__version__ = "0.5.4"
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "hud-python"
-version = "0.5.3"
+version = "0.5.4"
 description = "SDK for the HUD platform."
 readme = "README.md"
 requires-python = ">=3.11, <3.13"
Original file line number	Diff line number	Diff line change
Expand Up		@@ -4,4 +4,4 @@

		from __future__ import annotations

		-__version__ = "0.5.3"
		+__version__ = "0.5.4"