test(sdk): cover mixed malformed tool-call retries

enyst · openhands-agent · enyst · commit 1a5b24c67d76 · 2026-04-21T15:19:30.000Z
Add a regression test for a raw LLM response that mixes one valid tool call
with one malformed tool call. The test verifies the malformed batch is rejected
and retried at the LLM layer before any partial ActionEvent reaches the agent.

Co-authored-by: openhands <openhands@all-hands.dev>
diff --git a/tests/sdk/agent/test_malformed_tool_call_arguments.py b/tests/sdk/agent/test_malformed_tool_call_arguments.py
@@ -1,8 +1,14 @@
-"""Tests that malformed tool call arguments are rejected at the LLM layer
-and retried automatically, preventing malformed data from ever reaching
-the agent (see #2887).
+"""Tests for malformed tool call arguments returned by the raw LLM transport.
+
+The fix for #2887 now lives in ``LLM.completion()``: raw LiteLLM responses
+with malformed tool-call JSON are rejected and retried before the agent sees
+any action events.
 """
 
+from collections.abc import Sequence
+from typing import TYPE_CHECKING, Self
+from unittest.mock import patch
+
 import pytest
 from litellm import ChatCompletionMessageToolCall
 from litellm.types.utils import (
@@ -11,14 +17,74 @@
     Message as LiteLLMMessage,
     ModelResponse,
 )
+from pydantic import SecretStr
 
+from openhands.sdk.agent import Agent
+from openhands.sdk.conversation import Conversation
+from openhands.sdk.event import (
+    ActionEvent,
+    AgentErrorEvent,
+    MessageEvent,
+    ObservationEvent,
+)
+from openhands.sdk.llm import LLM, Message, TextContent
 from openhands.sdk.llm.exceptions import LLMMalformedToolArgsError
 from openhands.sdk.llm.llm import _validate_tool_call_args
+from openhands.sdk.tool import Action, Observation, Tool, ToolExecutor, register_tool
+from openhands.sdk.tool.tool import ToolDefinition
+
+
+if TYPE_CHECKING:
+    from openhands.sdk.conversation.state import ConversationState
+
+
+class RetryValidationAction(Action):  # test-only action; every field defaults to ""
+    command: str = ""
+    path: str = ""  # echoed back by RetryValidationExecutor as "ok:<path>"
+    old_str: str = ""
+
+
+class RetryValidationObservation(Observation):  # test-only result payload
+    result: str = ""  # RetryValidationExecutor fills this with "ok:<path>"
 
 
-def _make_response(arguments: str) -> ModelResponse:
+class RetryValidationExecutor(  # stub: echoes action.path, performs no real work
+    ToolExecutor[RetryValidationAction, RetryValidationObservation]
+):
+    def __call__(
+        self, action: RetryValidationAction, conversation=None  # conversation is unused
+    ) -> RetryValidationObservation:
+        return RetryValidationObservation(result=f"ok:{action.path}")
+
+
+class RetryValidationTool(  # tool definition pairing the test action/observation
+    ToolDefinition[RetryValidationAction, RetryValidationObservation]
+):
+    name = "retry_validation_tool"  # same name the mocked tool calls use
+
+    @classmethod
+    def create(cls, conv_state: "ConversationState | None" = None) -> Sequence[Self]:
+        return [  # conv_state is accepted but unused; always exactly one instance
+            cls(
+                description="Tool for malformed tool-call retry regression tests",
+                action_type=RetryValidationAction,
+                observation_type=RetryValidationObservation,
+                executor=RetryValidationExecutor(),
+            )
+        ]
+
+
+register_tool("RetryValidationTool", RetryValidationTool)  # name used by Tool(name=...)
+
+
+def _make_response(
+    arguments: str,
+    *,
+    response_id: str = "resp-1",
+    tool_name: str = "file_editor",
+) -> ModelResponse:
     return ModelResponse(
-        id="resp-1",
+        id=response_id,
         choices=[
             Choices(
                 index=0,
@@ -30,7 +96,7 @@ def _make_response(arguments: str) -> ModelResponse:
                             id="call_1",
                             type="function",
                             function=Function(
-                                name="file_editor",
+                                name=tool_name,
                                 arguments=arguments,
                             ),
                         )
@@ -81,3 +147,124 @@ def test_malformed_tool_args_in_retry_exceptions():
     from openhands.sdk.llm.llm import LLM_RETRY_EXCEPTIONS
 
     assert LLMMalformedToolArgsError in LLM_RETRY_EXCEPTIONS
+
+
+def test_mixed_tool_batch_is_retried_before_any_action_event_is_persisted():
+    """Reject the whole raw LLM response before the agent sees partial actions.
+
+    Regression target:
+    1. First transport response contains one valid tool call and one malformed one.
+    2. The LLM layer must reject that *entire* response and retry.
+    3. Only the retried response may produce ActionEvent/ObservationEvent entries.
+
+    We intentionally patch ``litellm_completion`` instead of using ``TestLLM``
+    here because the fix lives inside ``LLM.completion()`` before raw provider
+    responses are converted into SDK ``Message`` objects.
+    """
+    malformed_batch = ModelResponse(
+        id="resp-bad",
+        choices=[
+            Choices(
+                index=0,
+                message=LiteLLMMessage(
+                    role="assistant",
+                    content="bad batch",
+                    tool_calls=[
+                        ChatCompletionMessageToolCall(
+                            id="call_good",
+                            type="function",
+                            function=Function(
+                                name="retry_validation_tool",
+                                arguments='{"command":"view","path":"/first","old_str":"x"}',
+                            ),
+                        ),
+                        ChatCompletionMessageToolCall(
+                            id="call_bad",
+                            type="function",
+                            function=Function(
+                                name="retry_validation_tool",
+                                arguments='{"command":"create","path":"/broken","old_str":"unterminated',
+                            ),
+                        ),
+                    ],
+                ),
+                finish_reason="tool_calls",
+            )
+        ],
+        created=0,
+        model="test-model",
+        object="chat.completion",
+    )
+    retried_response = ModelResponse(
+        id="resp-good",
+        choices=[
+            Choices(
+                index=0,
+                message=LiteLLMMessage(
+                    role="assistant",
+                    content="retry batch",
+                    tool_calls=[
+                        ChatCompletionMessageToolCall(
+                            id="call_retry",
+                            type="function",
+                            function=Function(
+                                name="retry_validation_tool",
+                                arguments='{"command":"view","path":"/retry","old_str":"y"}',
+                            ),
+                        )
+                    ],
+                ),
+                finish_reason="tool_calls",
+            )
+        ],
+        created=0,
+        model="test-model",
+        object="chat.completion",
+    )
+
+    llm = LLM(
+        usage_id="test-llm",
+        model="test-model",
+        api_key=SecretStr("test-key"),
+        base_url="http://test",
+        num_retries=2,
+        retry_min_wait=0,
+        retry_max_wait=0,
+    )
+    agent = Agent(llm=llm, tools=[Tool(name="RetryValidationTool")])
+    conversation = Conversation(agent=agent)
+
+    with patch(
+        "openhands.sdk.llm.llm.litellm_completion",
+        side_effect=[malformed_batch, retried_response],
+    ) as completion_mock:
+        conversation.send_message(
+            Message(role="user", content=[TextContent(text="Do something")])
+        )
+        agent.step(conversation, on_event=conversation._on_event)
+
+    action_events = [e for e in conversation.state.events if isinstance(e, ActionEvent)]
+    observation_events = [
+        e for e in conversation.state.events if isinstance(e, ObservationEvent)
+    ]
+    error_events = [
+        e for e in conversation.state.events if isinstance(e, AgentErrorEvent)
+    ]
+    user_events = [
+        e
+        for e in conversation.state.events
+        if isinstance(e, MessageEvent) and e.source == "user"
+    ]
+
+    # The malformed batch should be rejected inside LLM.completion(), which
+    # means the transport must be called again and the first batch must leave
+    # no trace in the persisted event stream.
+    assert completion_mock.call_count == 2
+    assert [event.tool_call_id for event in action_events] == ["call_retry"]
+    assert [event.tool_call_id for event in observation_events] == ["call_retry"]
+    assert error_events == []
+
+    # If the old agent-layer bug regresses, IDs from the rejected batch reappear.
+    seen_ids = {getattr(e, "tool_call_id", None) for e in conversation.state.events}
+    assert seen_ids.isdisjoint({"call_good", "call_bad"})
+    assert len(user_events) == 1