Skip to content

Commit 72c8202

Browse files
committed
add tests
1 parent f6cc032 commit 72c8202

File tree

1 file changed

+340
-0
lines changed

1 file changed

+340
-0
lines changed
Lines changed: 340 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,340 @@
1+
"""Regression tests for Codex subscription mode fixes.
2+
3+
Tests cover four bugs that made LLM.subscription_login() unusable:
4+
1. prompt_cache_retention rejected by Codex endpoint (400)
5+
2. include/reasoning params cause silent empty output
6+
3. Streaming output items lost (response.completed has output=[])
7+
4. Reasoning item IDs cause 404 on follow-up requests (store=false)
8+
9+
See: https://github.com/OpenHands/software-agent-sdk/issues/2797
10+
"""
11+
12+
import json
13+
from types import SimpleNamespace
14+
from typing import Any
15+
16+
import pytest
17+
from litellm.types.llms.base import BaseLiteLLMOpenAIResponseObject
18+
from openai.types.responses.response_function_tool_call import (
19+
ResponseFunctionToolCall,
20+
)
21+
22+
from openhands.sdk.llm.llm import LLM
23+
from openhands.sdk.llm.message import (
24+
Message,
25+
MessageToolCall,
26+
ReasoningItemModel,
27+
TextContent,
28+
)
29+
from openhands.sdk.llm.options.responses_options import select_responses_options
30+
31+
32+
# ---------------------------------------------------------------------------
33+
# Helpers
34+
# ---------------------------------------------------------------------------
35+
36+
37+
def _make_subscription_llm() -> LLM:
    """Build the smallest LLM instance that exercises subscription mode."""
    subscription_llm = LLM(
        model="openai/gpt-5.2-codex",
        base_url="https://chatgpt.com/backend-api/codex",
        reasoning_effort="high",
    )
    # Flip the private flag directly; tests have no public constructor path
    # that marks an LLM as subscription-backed.
    subscription_llm._is_subscription = True
    subscription_llm.enable_encrypted_reasoning = True
    return subscription_llm
47+
48+
49+
def _make_generic_output_item(**kwargs: Any) -> BaseLiteLLMOpenAIResponseObject:
    """Construct the generic attribute-bag object litellm uses for streamed
    output items.

    ``model_construct`` bypasses validation, mirroring how litellm builds
    these objects from arbitrary streaming payloads.
    """
    item = BaseLiteLLMOpenAIResponseObject.model_construct(**kwargs)
    return item
53+
54+
55+
# ---------------------------------------------------------------------------
56+
# Bug 1 & 2: Unsupported params must be skipped in subscription mode
57+
# ---------------------------------------------------------------------------
58+
59+
60+
@pytest.mark.parametrize(
    "param",
    [
        "prompt_cache_retention",
        "include",
        "reasoning",
        "temperature",
        "max_output_tokens",
    ],
)
def test_subscription_skips_unsupported_param(param: str):
    """Params the Codex subscription endpoint rejects (400) or silently
    mishandles must never be present when is_subscription is True."""
    llm = _make_subscription_llm()
    llm.max_output_tokens = 4096
    options = select_responses_options(
        llm, {}, include=["text.output_text"], store=None
    )
    assert param not in options
77+
78+
79+
@pytest.mark.parametrize(
    "param,expected_value",
    [
        ("prompt_cache_retention", "24h"),
        ("temperature", 1.0),
    ],
)
def test_non_subscription_keeps_scalar_param(param: str, expected_value: Any):
    """Outside subscription mode, GPT-5 models must keep sending these."""
    llm = LLM(model="openai/gpt-5.2-codex", reasoning_effort="high")
    llm.enable_encrypted_reasoning = True
    assert not llm.is_subscription
    options = select_responses_options(llm, {}, include=None, store=None)
    assert options.get(param) == expected_value
93+
94+
95+
@pytest.mark.parametrize(
    "param,check",
    [
        ("include", lambda v: "reasoning.encrypted_content" in v),
        ("reasoning", lambda v: v["effort"] == "high"),
    ],
)
def test_non_subscription_keeps_structured_param(param: str, check: Any):
    """Non-subscription LLMs should emit include and reasoning as usual."""
    llm = LLM(model="openai/gpt-5.2-codex", reasoning_effort="high")
    llm.enable_encrypted_reasoning = True
    assert not llm.is_subscription
    options = select_responses_options(
        llm, {}, include=["text.output_text"], store=None
    )
    assert param in options
    assert check(options[param])
110+
111+
112+
# ---------------------------------------------------------------------------
113+
# Bug 3: from_llm_responses_output must handle generic litellm types
114+
# ---------------------------------------------------------------------------
115+
116+
117+
def _generic_function_call_item() -> BaseLiteLLMOpenAIResponseObject:
    """A streamed function_call item in litellm's generic object form."""
    return _make_generic_output_item(
        type="function_call",
        id="fc_abc",
        call_id="call_123",
        name="terminal",
        arguments='{"command": "ls"}',
        status="completed",
    )
126+
127+
128+
def _generic_message_item() -> BaseLiteLLMOpenAIResponseObject:
    """A streamed assistant message item in litellm's generic object form."""
    output_text = SimpleNamespace(type="output_text", text="Hello world")
    return _make_generic_output_item(
        type="message",
        id="m_1",
        role="assistant",
        status="completed",
        content=[output_text],
    )
137+
138+
139+
def _generic_reasoning_item() -> BaseLiteLLMOpenAIResponseObject:
    """A streamed reasoning item in litellm's generic object form."""
    summary_part = SimpleNamespace(type="summary_text", text="thinking")
    return _make_generic_output_item(
        type="reasoning",
        id="rs_abc",
        summary=[summary_part],
        content=None,
        encrypted_content=None,
        status="completed",
    )
149+
150+
151+
def _dict_function_call_item() -> dict[str, Any]:
152+
return {
153+
"type": "function_call",
154+
"name": "file_editor",
155+
"arguments": '{"command": "view"}',
156+
"call_id": "call_456",
157+
"id": "fc_456",
158+
}
159+
160+
161+
def _dict_message_item() -> dict[str, Any]:
162+
return {
163+
"type": "message",
164+
"role": "assistant",
165+
"content": [{"type": "output_text", "text": "Hi"}],
166+
}
167+
168+
169+
def _typed_function_call_item() -> ResponseFunctionToolCall:
    """A function_call item as the fully typed OpenAI Pydantic object."""
    return ResponseFunctionToolCall(
        type="function_call",
        id="fc_typed",
        call_id="fc_typed",
        name="think",
        arguments="{}",
    )
177+
178+
179+
@pytest.mark.parametrize(
    "item_factory,expected_tool,expected_text",
    [
        pytest.param(
            _generic_function_call_item,
            {"name": "terminal", "arguments": '{"command": "ls"}', "id": "call_123"},
            None,
            id="generic-function-call",
        ),
        pytest.param(
            _dict_function_call_item,
            {
                "name": "file_editor",
                "arguments": '{"command": "view"}',
                "id": "call_456",
            },
            None,
            id="dict-function-call",
        ),
        pytest.param(
            _typed_function_call_item,
            {"name": "think", "arguments": "{}", "id": "fc_typed"},
            None,
            id="typed-function-call",
        ),
        pytest.param(_generic_message_item, None, "Hello world", id="generic-message"),
        pytest.param(_dict_message_item, None, "Hi", id="dict-message"),
    ],
)
def test_from_llm_responses_output_item_type(
    item_factory: Any,
    expected_tool: dict[str, str] | None,
    expected_text: str | None,
):
    """function_call and message items must parse identically whether they
    arrive as typed Pydantic objects, generic BaseLiteLLMOpenAIResponseObject
    instances, or raw dicts."""
    msg = Message.from_llm_responses_output([item_factory()])

    if expected_tool is not None:
        tool_calls = msg.tool_calls
        assert tool_calls is not None
        assert len(tool_calls) == 1
        only_call = tool_calls[0]
        assert (only_call.name, only_call.arguments, only_call.id) == (
            expected_tool["name"],
            expected_tool["arguments"],
            expected_tool["id"],
        )
    if expected_text is not None:
        assert len(msg.content) == 1
        only_part = msg.content[0]
        assert isinstance(only_part, TextContent)
        assert only_part.text == expected_text
240+
241+
242+
@pytest.mark.parametrize(
    "item_factory,expected_id,expected_summary",
    [
        pytest.param(
            _generic_reasoning_item,
            "rs_abc",
            ["thinking"],
            id="generic-reasoning",
        ),
    ],
)
def test_from_llm_responses_output_reasoning_item(
    item_factory: Any,
    expected_id: str,
    expected_summary: list[str],
):
    """Streamed reasoning items must round-trip into ReasoningItemModel."""
    msg = Message.from_llm_responses_output([item_factory()])
    reasoning = msg.responses_reasoning_item
    assert reasoning is not None
    assert reasoning.id == expected_id
    assert reasoning.summary == expected_summary
264+
265+
266+
def test_mixed_typed_and_generic_items():
    """A single output list may mix typed and generic items; both must parse."""
    mixed_items = [_typed_function_call_item(), _generic_function_call_item()]
    msg = Message.from_llm_responses_output(mixed_items)
    tool_calls = msg.tool_calls
    assert tool_calls is not None
    assert len(tool_calls) == 2
    assert {call.name for call in tool_calls} == {"think", "terminal"}
274+
275+
276+
# ---------------------------------------------------------------------------
277+
# Bug 4: Reasoning item IDs must be stripped in subscription mode
278+
# ---------------------------------------------------------------------------
279+
280+
281+
def _make_conversation_messages() -> tuple[Message, Message, Message, Message]:
    """Assemble a minimal system/user/assistant/tool exchange whose assistant
    turn carries a reasoning item with a server-assigned ID."""
    system_message = Message(
        role="system",
        content=[TextContent(text="You are a helpful assistant.")],
    )
    user_message = Message(
        role="user",
        content=[TextContent(text="Now create FACTS.txt")],
    )
    # The reasoning item ID is the thing subscription mode must strip.
    reasoning_item = ReasoningItemModel(
        id="rs_should_be_stripped",
        summary=["thinking about files"],
        content=None,
        encrypted_content=None,
        status="completed",
    )
    assistant_message = Message(
        role="assistant",
        content=[TextContent(text="I'll look at the files.")],
        tool_calls=[
            MessageToolCall(
                id="call_1",
                name="terminal",
                arguments='{"command": "ls"}',
                origin="responses",
            )
        ],
        responses_reasoning_item=reasoning_item,
    )
    tool_message = Message(
        role="tool",
        content=[TextContent(text="file1.py file2.py")],
        tool_call_id="call_1",
    )
    return system_message, user_message, assistant_message, tool_message
316+
317+
318+
@pytest.mark.parametrize(
    "is_subscription,reasoning_id_present",
    [
        pytest.param(True, False, id="subscription-strips-reasoning"),
        pytest.param(False, True, id="non-subscription-preserves-reasoning"),
    ],
)
def test_format_messages_reasoning_item_handling(
    is_subscription: bool, reasoning_id_present: bool
):
    """With store=false the Codex endpoint cannot resolve reasoning item IDs,
    so subscription mode must strip them; normal mode must keep them."""
    llm = LLM(model="openai/gpt-5.2-codex")
    if is_subscription:
        llm._is_subscription = True

    conversation = list(_make_conversation_messages())
    _, input_items = llm.format_messages_for_responses(conversation)

    serialized = json.dumps(input_items, default=str)
    assert ("rs_should_be_stripped" in serialized) == reasoning_id_present

0 commit comments

Comments
 (0)