"""Regression tests for Codex subscription mode fixes.

Tests cover four bugs that made LLM.subscription_login() unusable:
1. prompt_cache_retention rejected by Codex endpoint (400)
2. include/reasoning params cause silent empty output
3. Streaming output items lost (response.completed has output=[])
4. Reasoning item IDs cause 404 on follow-up requests (store=false)

See: https://github.com/OpenHands/software-agent-sdk/issues/2797
"""
| 11 | + |
| 12 | +import json |
| 13 | +from types import SimpleNamespace |
| 14 | +from typing import Any |
| 15 | + |
| 16 | +import pytest |
| 17 | +from litellm.types.llms.base import BaseLiteLLMOpenAIResponseObject |
| 18 | +from openai.types.responses.response_function_tool_call import ( |
| 19 | + ResponseFunctionToolCall, |
| 20 | +) |
| 21 | + |
| 22 | +from openhands.sdk.llm.llm import LLM |
| 23 | +from openhands.sdk.llm.message import ( |
| 24 | + Message, |
| 25 | + MessageToolCall, |
| 26 | + ReasoningItemModel, |
| 27 | + TextContent, |
| 28 | +) |
| 29 | +from openhands.sdk.llm.options.responses_options import select_responses_options |
| 30 | + |
| 31 | + |
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
| 35 | + |
| 36 | + |
def _make_subscription_llm() -> LLM:
    """Return an LLM configured the way Codex subscription login leaves it."""
    subscription_llm = LLM(
        model="openai/gpt-5.2-codex",
        base_url="https://chatgpt.com/backend-api/codex",
        reasoning_effort="high",
    )
    # Flip the private subscription flag and encrypted reasoning directly,
    # mirroring what subscription_login() sets without performing real auth.
    subscription_llm._is_subscription = True
    subscription_llm.enable_encrypted_reasoning = True
    return subscription_llm
| 47 | + |
| 48 | + |
def _make_generic_output_item(**attrs: Any) -> BaseLiteLLMOpenAIResponseObject:
    """Build the generic object litellm uses for streaming output items.

    model_construct() skips validation, matching how litellm assembles
    these objects from streamed events.
    """
    return BaseLiteLLMOpenAIResponseObject.model_construct(**attrs)
| 53 | + |
| 54 | + |
# ---------------------------------------------------------------------------
# Bug 1 & 2: Unsupported params must be skipped in subscription mode
# ---------------------------------------------------------------------------
| 58 | + |
| 59 | + |
@pytest.mark.parametrize(
    "param",
    [
        "prompt_cache_retention",
        "include",
        "reasoning",
        "temperature",
        "max_output_tokens",
    ],
)
def test_subscription_skips_unsupported_param(param: str):
    """Params the Codex subscription endpoint rejects (or silently
    mishandles) must never be sent when is_subscription is True."""
    llm = _make_subscription_llm()
    llm.max_output_tokens = 4096
    options = select_responses_options(
        llm, {}, include=["text.output_text"], store=None
    )
    assert param not in options
| 77 | + |
| 78 | + |
@pytest.mark.parametrize(
    "param,expected_value",
    [
        ("prompt_cache_retention", "24h"),
        ("temperature", 1.0),
    ],
)
def test_non_subscription_keeps_scalar_param(param: str, expected_value: Any):
    """API-key (non-subscription) GPT-5 models must keep sending these params."""
    api_key_llm = LLM(model="openai/gpt-5.2-codex", reasoning_effort="high")
    api_key_llm.enable_encrypted_reasoning = True
    assert not api_key_llm.is_subscription
    options = select_responses_options(api_key_llm, {}, include=None, store=None)
    assert options.get(param) == expected_value
| 93 | + |
| 94 | + |
@pytest.mark.parametrize(
    "param,check",
    [
        ("include", lambda v: "reasoning.encrypted_content" in v),
        ("reasoning", lambda v: v["effort"] == "high"),
    ],
)
def test_non_subscription_keeps_structured_param(param: str, check: Any):
    """include and reasoning must go out unchanged for non-subscription LLMs."""
    api_key_llm = LLM(model="openai/gpt-5.2-codex", reasoning_effort="high")
    api_key_llm.enable_encrypted_reasoning = True
    assert not api_key_llm.is_subscription
    options = select_responses_options(
        api_key_llm, {}, include=["text.output_text"], store=None
    )
    assert param in options
    assert check(options[param])
| 110 | + |
| 111 | + |
# ---------------------------------------------------------------------------
# Bug 3: from_llm_responses_output must handle generic litellm types
# ---------------------------------------------------------------------------
| 115 | + |
| 116 | + |
def _generic_function_call_item() -> BaseLiteLLMOpenAIResponseObject:
    """A streamed function_call item in litellm's generic object form."""
    fields = {
        "id": "fc_abc",
        "type": "function_call",
        "name": "terminal",
        "arguments": '{"command": "ls"}',
        "call_id": "call_123",
        "status": "completed",
    }
    return _make_generic_output_item(**fields)
| 126 | + |
| 127 | + |
def _generic_message_item() -> BaseLiteLLMOpenAIResponseObject:
    """A streamed assistant message item with one output_text part."""
    output_text = SimpleNamespace(type="output_text", text="Hello world")
    return _make_generic_output_item(
        id="m_1",
        type="message",
        role="assistant",
        status="completed",
        content=[output_text],
    )
| 137 | + |
| 138 | + |
def _generic_reasoning_item() -> BaseLiteLLMOpenAIResponseObject:
    """A streamed reasoning item carrying a single summary entry."""
    summary_entry = SimpleNamespace(type="summary_text", text="thinking")
    return _make_generic_output_item(
        id="rs_abc",
        type="reasoning",
        summary=[summary_entry],
        content=None,
        encrypted_content=None,
        status="completed",
    )
| 149 | + |
| 150 | + |
| 151 | +def _dict_function_call_item() -> dict[str, Any]: |
| 152 | + return { |
| 153 | + "type": "function_call", |
| 154 | + "name": "file_editor", |
| 155 | + "arguments": '{"command": "view"}', |
| 156 | + "call_id": "call_456", |
| 157 | + "id": "fc_456", |
| 158 | + } |
| 159 | + |
| 160 | + |
| 161 | +def _dict_message_item() -> dict[str, Any]: |
| 162 | + return { |
| 163 | + "type": "message", |
| 164 | + "role": "assistant", |
| 165 | + "content": [{"type": "output_text", "text": "Hi"}], |
| 166 | + } |
| 167 | + |
| 168 | + |
def _typed_function_call_item() -> ResponseFunctionToolCall:
    """A fully-typed function_call item (OpenAI SDK pydantic model)."""
    payload: dict[str, Any] = {
        "type": "function_call",
        "name": "think",
        "arguments": "{}",
        "call_id": "fc_typed",
        "id": "fc_typed",
    }
    return ResponseFunctionToolCall(**payload)
| 177 | + |
| 178 | + |
@pytest.mark.parametrize(
    "item_factory,expected_tool,expected_text",
    [
        pytest.param(
            _generic_function_call_item,
            {"name": "terminal", "arguments": '{"command": "ls"}', "id": "call_123"},
            None,
            id="generic-function-call",
        ),
        pytest.param(
            _dict_function_call_item,
            {
                "name": "file_editor",
                "arguments": '{"command": "view"}',
                "id": "call_456",
            },
            None,
            id="dict-function-call",
        ),
        pytest.param(
            _typed_function_call_item,
            {"name": "think", "arguments": "{}", "id": "fc_typed"},
            None,
            id="typed-function-call",
        ),
        pytest.param(
            _generic_message_item,
            None,
            "Hello world",
            id="generic-message",
        ),
        pytest.param(
            _dict_message_item,
            None,
            "Hi",
            id="dict-message",
        ),
    ],
)
def test_from_llm_responses_output_item_type(
    item_factory: Any,
    expected_tool: dict[str, str] | None,
    expected_text: str | None,
):
    """from_llm_responses_output must parse function_call and message items
    regardless of whether they arrive as typed Pydantic objects, generic
    BaseLiteLLMOpenAIResponseObject, or plain dicts."""
    msg = Message.from_llm_responses_output([item_factory()])

    if expected_tool is not None:
        assert msg.tool_calls is not None
        # Exactly one call expected; tuple unpacking enforces the count.
        (only_call,) = msg.tool_calls
        assert (only_call.name, only_call.arguments, only_call.id) == (
            expected_tool["name"],
            expected_tool["arguments"],
            expected_tool["id"],
        )
    if expected_text is not None:
        (only_part,) = msg.content
        assert isinstance(only_part, TextContent)
        assert only_part.text == expected_text
| 240 | + |
| 241 | + |
@pytest.mark.parametrize(
    "item_factory,expected_id,expected_summary",
    [
        pytest.param(
            _generic_reasoning_item,
            "rs_abc",
            ["thinking"],
            id="generic-reasoning",
        ),
    ],
)
def test_from_llm_responses_output_reasoning_item(
    item_factory: Any,
    expected_id: str,
    expected_summary: list[str],
):
    """Reasoning items from streaming should be parsed into ReasoningItemModel."""
    msg = Message.from_llm_responses_output([item_factory()])
    reasoning = msg.responses_reasoning_item
    assert reasoning is not None
    assert reasoning.id == expected_id
    assert reasoning.summary == expected_summary
| 264 | + |
| 265 | + |
def test_mixed_typed_and_generic_items():
    """Parser should handle a mix of typed and generic items in one call."""
    items = [_typed_function_call_item(), _generic_function_call_item()]
    msg = Message.from_llm_responses_output(items)
    assert msg.tool_calls is not None
    assert len(msg.tool_calls) == 2
    parsed_names = {call.name for call in msg.tool_calls}
    assert parsed_names == {"think", "terminal"}
| 274 | + |
| 275 | + |
# ---------------------------------------------------------------------------
# Bug 4: Reasoning item IDs must be stripped in subscription mode
# ---------------------------------------------------------------------------
| 279 | + |
| 280 | + |
def _make_conversation_messages() -> tuple[Message, Message, Message, Message]:
    """Build a minimal system/user/assistant/tool turn with a reasoning item."""
    # The assistant turn carries the reasoning item whose ID subscription
    # mode must strip (store=false means the server can't resolve it later).
    reasoning = ReasoningItemModel(
        id="rs_should_be_stripped",
        summary=["thinking about files"],
        content=None,
        encrypted_content=None,
        status="completed",
    )
    call = MessageToolCall(
        id="call_1",
        name="terminal",
        arguments='{"command": "ls"}',
        origin="responses",
    )
    system_message = Message(
        role="system",
        content=[TextContent(text="You are a helpful assistant.")],
    )
    user_message = Message(
        role="user",
        content=[TextContent(text="Now create FACTS.txt")],
    )
    assistant_message = Message(
        role="assistant",
        content=[TextContent(text="I'll look at the files.")],
        tool_calls=[call],
        responses_reasoning_item=reasoning,
    )
    tool_message = Message(
        role="tool",
        content=[TextContent(text="file1.py file2.py")],
        tool_call_id="call_1",
    )
    return system_message, user_message, assistant_message, tool_message
| 316 | + |
| 317 | + |
@pytest.mark.parametrize(
    "is_subscription,reasoning_id_present",
    [
        pytest.param(True, False, id="subscription-strips-reasoning"),
        pytest.param(False, True, id="non-subscription-preserves-reasoning"),
    ],
)
def test_format_messages_reasoning_item_handling(
    is_subscription: bool, reasoning_id_present: bool
):
    """Subscription mode must strip reasoning item IDs (store=false means they
    can't be resolved). Non-subscription mode must preserve them."""
    llm = LLM(model="openai/gpt-5.2-codex")
    if is_subscription:
        llm._is_subscription = True

    messages = list(_make_conversation_messages())
    _, input_items = llm.format_messages_for_responses(messages)

    # Serialize everything and look for the reasoning ID rather than walking
    # the item structure — the items may be dicts or pydantic objects.
    serialized = json.dumps(input_items, default=str)
    found = "rs_should_be_stripped" in serialized
    assert found == reasoning_id_present