Skip to content

Commit 1a5b24c

Browse files
test(sdk): cover mixed malformed tool-call retries
Add a regression test for a raw LLM response that mixes one valid tool call with one malformed tool call. The test verifies the malformed batch is rejected and retried at the LLM layer before any partial ActionEvent reaches the agent. Co-authored-by: openhands <openhands@all-hands.dev>
1 parent bf841e8 commit 1a5b24c

1 file changed

Lines changed: 193 additions & 6 deletions

File tree

tests/sdk/agent/test_malformed_tool_call_arguments.py

Lines changed: 193 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,14 @@
1-
"""Tests that malformed tool call arguments are rejected at the LLM layer
2-
and retried automatically, preventing malformed data from ever reaching
3-
the agent (see #2887).
1+
"""Tests for malformed tool call arguments returned by the raw LLM transport.
2+
3+
The fix for #2887 now lives in ``LLM.completion()``: raw LiteLLM responses
4+
with malformed tool-call JSON are rejected and retried before the agent sees
5+
any action events.
46
"""
57

8+
from collections.abc import Sequence
9+
from typing import TYPE_CHECKING, Self
10+
from unittest.mock import patch
11+
612
import pytest
713
from litellm import ChatCompletionMessageToolCall
814
from litellm.types.utils import (
@@ -11,14 +17,74 @@
1117
Message as LiteLLMMessage,
1218
ModelResponse,
1319
)
20+
from pydantic import SecretStr
1421

22+
from openhands.sdk.agent import Agent
23+
from openhands.sdk.conversation import Conversation
24+
from openhands.sdk.event import (
25+
ActionEvent,
26+
AgentErrorEvent,
27+
MessageEvent,
28+
ObservationEvent,
29+
)
30+
from openhands.sdk.llm import LLM, Message, TextContent
1531
from openhands.sdk.llm.exceptions import LLMMalformedToolArgsError
1632
from openhands.sdk.llm.llm import _validate_tool_call_args
33+
from openhands.sdk.tool import Action, Observation, Tool, ToolExecutor, register_tool
34+
from openhands.sdk.tool.tool import ToolDefinition
35+
36+
37+
if TYPE_CHECKING:
38+
from openhands.sdk.conversation.state import ConversationState
39+
40+
41+
class RetryValidationAction(Action):
42+
command: str = ""
43+
path: str = ""
44+
old_str: str = ""
45+
46+
47+
class RetryValidationObservation(Observation):
48+
result: str = ""
1749

1850

19-
def _make_response(arguments: str) -> ModelResponse:
51+
class RetryValidationExecutor(
52+
ToolExecutor[RetryValidationAction, RetryValidationObservation]
53+
):
54+
def __call__(
55+
self, action: RetryValidationAction, conversation=None
56+
) -> RetryValidationObservation:
57+
return RetryValidationObservation(result=f"ok:{action.path}")
58+
59+
60+
class RetryValidationTool(
61+
ToolDefinition[RetryValidationAction, RetryValidationObservation]
62+
):
63+
name = "retry_validation_tool"
64+
65+
@classmethod
66+
def create(cls, conv_state: "ConversationState | None" = None) -> Sequence[Self]:
67+
return [
68+
cls(
69+
description="Tool for malformed tool-call retry regression tests",
70+
action_type=RetryValidationAction,
71+
observation_type=RetryValidationObservation,
72+
executor=RetryValidationExecutor(),
73+
)
74+
]
75+
76+
77+
register_tool("RetryValidationTool", RetryValidationTool)
78+
79+
80+
def _make_response(
81+
arguments: str,
82+
*,
83+
response_id: str = "resp-1",
84+
tool_name: str = "file_editor",
85+
) -> ModelResponse:
2086
return ModelResponse(
21-
id="resp-1",
87+
id=response_id,
2288
choices=[
2389
Choices(
2490
index=0,
@@ -30,7 +96,7 @@ def _make_response(arguments: str) -> ModelResponse:
3096
id="call_1",
3197
type="function",
3298
function=Function(
33-
name="file_editor",
99+
name=tool_name,
34100
arguments=arguments,
35101
),
36102
)
@@ -81,3 +147,124 @@ def test_malformed_tool_args_in_retry_exceptions():
81147
from openhands.sdk.llm.llm import LLM_RETRY_EXCEPTIONS
82148

83149
assert LLMMalformedToolArgsError in LLM_RETRY_EXCEPTIONS
150+
151+
152+
def test_mixed_tool_batch_is_retried_before_any_action_event_is_persisted():
153+
"""Reject the whole raw LLM response before the agent sees partial actions.
154+
155+
Regression target:
156+
1. First transport response contains one valid tool call and one malformed one.
157+
2. The LLM layer must reject that *entire* response and retry.
158+
3. Only the retried response may produce ActionEvent/ObservationEvent entries.
159+
160+
We intentionally patch ``litellm_completion`` instead of using ``TestLLM``
161+
here because the fix lives inside ``LLM.completion()`` before raw provider
162+
responses are converted into SDK ``Message`` objects.
163+
"""
164+
malformed_batch = ModelResponse(
165+
id="resp-bad",
166+
choices=[
167+
Choices(
168+
index=0,
169+
message=LiteLLMMessage(
170+
role="assistant",
171+
content="bad batch",
172+
tool_calls=[
173+
ChatCompletionMessageToolCall(
174+
id="call_good",
175+
type="function",
176+
function=Function(
177+
name="retry_validation_tool",
178+
arguments='{"command":"view","path":"/first","old_str":"x"}',
179+
),
180+
),
181+
ChatCompletionMessageToolCall(
182+
id="call_bad",
183+
type="function",
184+
function=Function(
185+
name="retry_validation_tool",
186+
arguments='{"command":"create","path":"/broken","old_str":"unterminated',
187+
),
188+
),
189+
],
190+
),
191+
finish_reason="tool_calls",
192+
)
193+
],
194+
created=0,
195+
model="test-model",
196+
object="chat.completion",
197+
)
198+
retried_response = ModelResponse(
199+
id="resp-good",
200+
choices=[
201+
Choices(
202+
index=0,
203+
message=LiteLLMMessage(
204+
role="assistant",
205+
content="retry batch",
206+
tool_calls=[
207+
ChatCompletionMessageToolCall(
208+
id="call_retry",
209+
type="function",
210+
function=Function(
211+
name="retry_validation_tool",
212+
arguments='{"command":"view","path":"/retry","old_str":"y"}',
213+
),
214+
)
215+
],
216+
),
217+
finish_reason="tool_calls",
218+
)
219+
],
220+
created=0,
221+
model="test-model",
222+
object="chat.completion",
223+
)
224+
225+
llm = LLM(
226+
usage_id="test-llm",
227+
model="test-model",
228+
api_key=SecretStr("test-key"),
229+
base_url="http://test",
230+
num_retries=2,
231+
retry_min_wait=0,
232+
retry_max_wait=0,
233+
)
234+
agent = Agent(llm=llm, tools=[Tool(name="RetryValidationTool")])
235+
conversation = Conversation(agent=agent)
236+
237+
with patch(
238+
"openhands.sdk.llm.llm.litellm_completion",
239+
side_effect=[malformed_batch, retried_response],
240+
) as completion_mock:
241+
conversation.send_message(
242+
Message(role="user", content=[TextContent(text="Do something")])
243+
)
244+
agent.step(conversation, on_event=conversation._on_event)
245+
246+
action_events = [e for e in conversation.state.events if isinstance(e, ActionEvent)]
247+
observation_events = [
248+
e for e in conversation.state.events if isinstance(e, ObservationEvent)
249+
]
250+
error_events = [
251+
e for e in conversation.state.events if isinstance(e, AgentErrorEvent)
252+
]
253+
user_events = [
254+
e
255+
for e in conversation.state.events
256+
if isinstance(e, MessageEvent) and e.source == "user"
257+
]
258+
259+
# The malformed batch should be rejected inside LLM.completion(), which
260+
# means the transport must be called again and the first batch must leave
261+
# no trace in the persisted event stream.
262+
assert completion_mock.call_count == 2
263+
assert [event.tool_call_id for event in action_events] == ["call_retry"]
264+
assert [event.tool_call_id for event in observation_events] == ["call_retry"]
265+
assert error_events == []
266+
267+
# If the old agent-layer bug regresses, these IDs/messages would appear.
268+
assert {event.tool_call_id for event in action_events} == {"call_retry"}
269+
assert {event.tool_call_id for event in observation_events} == {"call_retry"}
270+
assert len(user_events) == 1

0 commit comments

Comments
 (0)