1- """Tests that malformed tool call arguments are rejected at the LLM layer
2- and retried automatically, preventing malformed data from ever reaching
3- the agent (see #2887).
1+ """Tests for malformed tool call arguments returned by the raw LLM transport.
2+
3+ The fix for #2887 now lives in ``LLM.completion()``: raw LiteLLM responses
4+ with malformed tool-call JSON are rejected and retried before the agent sees
5+ any action events.
46"""
57
8+ from collections .abc import Sequence
9+ from typing import TYPE_CHECKING , Self
10+ from unittest .mock import patch
11+
612import pytest
713from litellm import ChatCompletionMessageToolCall
814from litellm .types .utils import (
1117 Message as LiteLLMMessage ,
1218 ModelResponse ,
1319)
20+ from pydantic import SecretStr
1421
22+ from openhands .sdk .agent import Agent
23+ from openhands .sdk .conversation import Conversation
24+ from openhands .sdk .event import (
25+ ActionEvent ,
26+ AgentErrorEvent ,
27+ MessageEvent ,
28+ ObservationEvent ,
29+ )
30+ from openhands .sdk .llm import LLM , Message , TextContent
1531from openhands .sdk .llm .exceptions import LLMMalformedToolArgsError
1632from openhands .sdk .llm .llm import _validate_tool_call_args
33+ from openhands .sdk .tool import Action , Observation , Tool , ToolExecutor , register_tool
34+ from openhands .sdk .tool .tool import ToolDefinition
35+
36+
37+ if TYPE_CHECKING :
38+ from openhands .sdk .conversation .state import ConversationState
39+
40+
41+ class RetryValidationAction (Action ):
42+ command : str = ""
43+ path : str = ""
44+ old_str : str = ""
45+
46+
47+ class RetryValidationObservation (Observation ):
48+ result : str = ""
1749
1850
19- def _make_response (arguments : str ) -> ModelResponse :
51+ class RetryValidationExecutor (
52+ ToolExecutor [RetryValidationAction , RetryValidationObservation ]
53+ ):
54+ def __call__ (
55+ self , action : RetryValidationAction , conversation = None
56+ ) -> RetryValidationObservation :
57+ return RetryValidationObservation (result = f"ok:{ action .path } " )
58+
59+
60+ class RetryValidationTool (
61+ ToolDefinition [RetryValidationAction , RetryValidationObservation ]
62+ ):
63+ name = "retry_validation_tool"
64+
65+ @classmethod
66+ def create (cls , conv_state : "ConversationState | None" = None ) -> Sequence [Self ]:
67+ return [
68+ cls (
69+ description = "Tool for malformed tool-call retry regression tests" ,
70+ action_type = RetryValidationAction ,
71+ observation_type = RetryValidationObservation ,
72+ executor = RetryValidationExecutor (),
73+ )
74+ ]
75+
76+
77+ register_tool ("RetryValidationTool" , RetryValidationTool )
78+
79+
80+ def _make_response (
81+ arguments : str ,
82+ * ,
83+ response_id : str = "resp-1" ,
84+ tool_name : str = "file_editor" ,
85+ ) -> ModelResponse :
2086 return ModelResponse (
21- id = "resp-1" ,
87+ id = response_id ,
2288 choices = [
2389 Choices (
2490 index = 0 ,
@@ -30,7 +96,7 @@ def _make_response(arguments: str) -> ModelResponse:
3096 id = "call_1" ,
3197 type = "function" ,
3298 function = Function (
33- name = "file_editor" ,
99+ name = tool_name ,
34100 arguments = arguments ,
35101 ),
36102 )
@@ -81,3 +147,124 @@ def test_malformed_tool_args_in_retry_exceptions():
81147 from openhands .sdk .llm .llm import LLM_RETRY_EXCEPTIONS
82148
83149 assert LLMMalformedToolArgsError in LLM_RETRY_EXCEPTIONS
150+
151+
152+ def test_mixed_tool_batch_is_retried_before_any_action_event_is_persisted ():
153+ """Reject the whole raw LLM response before the agent sees partial actions.
154+
155+ Regression target:
156+ 1. First transport response contains one valid tool call and one malformed one.
157+ 2. The LLM layer must reject that *entire* response and retry.
158+ 3. Only the retried response may produce ActionEvent/ObservationEvent entries.
159+
160+ We intentionally patch ``litellm_completion`` instead of using ``TestLLM``
161+ here because the fix lives inside ``LLM.completion()`` before raw provider
162+ responses are converted into SDK ``Message`` objects.
163+ """
164+ malformed_batch = ModelResponse (
165+ id = "resp-bad" ,
166+ choices = [
167+ Choices (
168+ index = 0 ,
169+ message = LiteLLMMessage (
170+ role = "assistant" ,
171+ content = "bad batch" ,
172+ tool_calls = [
173+ ChatCompletionMessageToolCall (
174+ id = "call_good" ,
175+ type = "function" ,
176+ function = Function (
177+ name = "retry_validation_tool" ,
178+ arguments = '{"command":"view","path":"/first","old_str":"x"}' ,
179+ ),
180+ ),
181+ ChatCompletionMessageToolCall (
182+ id = "call_bad" ,
183+ type = "function" ,
184+ function = Function (
185+ name = "retry_validation_tool" ,
186+ arguments = '{"command":"create","path":"/broken","old_str":"unterminated' ,
187+ ),
188+ ),
189+ ],
190+ ),
191+ finish_reason = "tool_calls" ,
192+ )
193+ ],
194+ created = 0 ,
195+ model = "test-model" ,
196+ object = "chat.completion" ,
197+ )
198+ retried_response = ModelResponse (
199+ id = "resp-good" ,
200+ choices = [
201+ Choices (
202+ index = 0 ,
203+ message = LiteLLMMessage (
204+ role = "assistant" ,
205+ content = "retry batch" ,
206+ tool_calls = [
207+ ChatCompletionMessageToolCall (
208+ id = "call_retry" ,
209+ type = "function" ,
210+ function = Function (
211+ name = "retry_validation_tool" ,
212+ arguments = '{"command":"view","path":"/retry","old_str":"y"}' ,
213+ ),
214+ )
215+ ],
216+ ),
217+ finish_reason = "tool_calls" ,
218+ )
219+ ],
220+ created = 0 ,
221+ model = "test-model" ,
222+ object = "chat.completion" ,
223+ )
224+
225+ llm = LLM (
226+ usage_id = "test-llm" ,
227+ model = "test-model" ,
228+ api_key = SecretStr ("test-key" ),
229+ base_url = "http://test" ,
230+ num_retries = 2 ,
231+ retry_min_wait = 0 ,
232+ retry_max_wait = 0 ,
233+ )
234+ agent = Agent (llm = llm , tools = [Tool (name = "RetryValidationTool" )])
235+ conversation = Conversation (agent = agent )
236+
237+ with patch (
238+ "openhands.sdk.llm.llm.litellm_completion" ,
239+ side_effect = [malformed_batch , retried_response ],
240+ ) as completion_mock :
241+ conversation .send_message (
242+ Message (role = "user" , content = [TextContent (text = "Do something" )])
243+ )
244+ agent .step (conversation , on_event = conversation ._on_event )
245+
246+ action_events = [e for e in conversation .state .events if isinstance (e , ActionEvent )]
247+ observation_events = [
248+ e for e in conversation .state .events if isinstance (e , ObservationEvent )
249+ ]
250+ error_events = [
251+ e for e in conversation .state .events if isinstance (e , AgentErrorEvent )
252+ ]
253+ user_events = [
254+ e
255+ for e in conversation .state .events
256+ if isinstance (e , MessageEvent ) and e .source == "user"
257+ ]
258+
259+ # The malformed batch should be rejected inside LLM.completion(), which
260+ # means the transport must be called again and the first batch must leave
261+ # no trace in the persisted event stream.
262+ assert completion_mock .call_count == 2
263+ assert [event .tool_call_id for event in action_events ] == ["call_retry" ]
264+ assert [event .tool_call_id for event in observation_events ] == ["call_retry" ]
265+ assert error_events == []
266+
267+ # If the old agent-layer bug regresses, these IDs/messages would appear.
268+ assert {event .tool_call_id for event in action_events } == {"call_retry" }
269+ assert {event .tool_call_id for event in observation_events } == {"call_retry" }
270+ assert len (user_events ) == 1
0 commit comments