From d81891fa57154912d73a3af5caf549a34953c812 Mon Sep 17 00:00:00 2001 From: habema Date: Wed, 24 Sep 2025 18:39:17 +0300 Subject: [PATCH 1/3] fix: reorder tool messages to resolve interleaved thinking bug --- src/agents/extensions/models/litellm_model.py | 111 ++++++++++++++++++ src/agents/models/chatcmpl_converter.py | 8 +- 2 files changed, 117 insertions(+), 2 deletions(-) diff --git a/src/agents/extensions/models/litellm_model.py b/src/agents/extensions/models/litellm_model.py index 877951119..551d2c830 100644 --- a/src/agents/extensions/models/litellm_model.py +++ b/src/agents/extensions/models/litellm_model.py @@ -23,6 +23,7 @@ ChatCompletionChunk, ChatCompletionMessageCustomToolCall, ChatCompletionMessageFunctionToolCall, + ChatCompletionMessageParam, ) from openai.types.chat.chat_completion_message import ( Annotation, @@ -267,6 +268,10 @@ async def _fetch_response( input, preserve_thinking_blocks=preserve_thinking_blocks ) + # Fix for interleaved thinking bug: reorder messages to ensure tool_use comes before tool_result # noqa: E501 + if preserve_thinking_blocks: + converted_messages = self._fix_tool_message_ordering(converted_messages) + if system_instructions: converted_messages.insert( 0, @@ -379,6 +384,112 @@ async def _fetch_response( ) return response, ret + def _fix_tool_message_ordering( + self, messages: list[ChatCompletionMessageParam] + ) -> list[ChatCompletionMessageParam]: + """ + Fix the ordering of tool messages to ensure tool_use messages come before tool_result messages. + + This addresses the interleaved thinking bug where conversation histories may contain + tool results before their corresponding tool calls, causing Anthropic API to reject the request. + """ # noqa: E501 + if not messages: + return messages + + # Collect all tool calls and tool results + tool_call_messages = {} # tool_id -> (index, message) + tool_result_messages = {} # tool_id -> (index, message) + other_messages = [] # (index, message) for non-tool messages + + for i, message in enumerate(messages): + if not isinstance(message, dict): + other_messages.append((i, message)) + continue + + role = message.get("role") + + if role == "assistant" and message.get("tool_calls"): + # Extract tool calls from this assistant message + tool_calls = message.get("tool_calls", []) + if isinstance(tool_calls, list): + for tool_call in tool_calls: + if isinstance(tool_call, dict): + tool_id = tool_call.get("id") + if tool_id: + # Create a separate assistant message for each tool call + single_tool_msg = cast(dict[str, Any], message.copy()) + single_tool_msg["tool_calls"] = [tool_call] + tool_call_messages[tool_id] = ( + i, + cast(ChatCompletionMessageParam, single_tool_msg), + ) + + elif role == "tool": + tool_call_id = message.get("tool_call_id") + if tool_call_id: + tool_result_messages[tool_call_id] = (i, message) + else: + other_messages.append((i, message)) + else: + other_messages.append((i, message)) + + # Create the fixed message sequence + fixed_messages: list[ChatCompletionMessageParam] = [] + used_indices = set() + + # Add messages in their original order, but ensure tool_use → tool_result pairing + for i, original_message in enumerate(messages): + if i in used_indices: + continue + + if not isinstance(original_message, dict): + fixed_messages.append(original_message) + used_indices.add(i) + continue + + role = original_message.get("role") + + if role == "assistant" and original_message.get("tool_calls"): + # Process each tool call in this assistant message + tool_calls = 
original_message.get("tool_calls", []) + if isinstance(tool_calls, list): + for tool_call in tool_calls: + if isinstance(tool_call, dict): + tool_id = tool_call.get("id") + if ( + tool_id + and tool_id in tool_call_messages + and tool_id in tool_result_messages + ): + # Add tool_use → tool_result pair + _, tool_call_msg = tool_call_messages[tool_id] + _, tool_result_msg = tool_result_messages[tool_id] + + fixed_messages.append(tool_call_msg) + fixed_messages.append(tool_result_msg) + + # Mark both as used + used_indices.add(tool_call_messages[tool_id][0]) + used_indices.add(tool_result_messages[tool_id][0]) + elif tool_id and tool_id in tool_call_messages: + # Tool call without result - add just the tool call + _, tool_call_msg = tool_call_messages[tool_id] + fixed_messages.append(tool_call_msg) + used_indices.add(tool_call_messages[tool_id][0]) + + used_indices.add(i) # Mark original multi-tool message as used + + elif role == "tool": + # Skip - these will be handled as part of tool pairs above + used_indices.add(i) + + else: + # Regular message - add it normally + fixed_messages.append(original_message) + used_indices.add(i) + + return fixed_messages + def _remove_not_given(self, value: Any) -> Any: if isinstance(value, NotGiven): return None diff --git a/src/agents/models/chatcmpl_converter.py b/src/agents/models/chatcmpl_converter.py index f17e8f126..2ce5c127b 100644 --- a/src/agents/models/chatcmpl_converter.py +++ b/src/agents/models/chatcmpl_converter.py @@ -533,7 +533,7 @@ def ensure_assistant_message() -> ChatCompletionAssistantMessageParam: if content_items and preserve_thinking_blocks: # Reconstruct thinking blocks from content and signature - pending_thinking_blocks = [] + reconstructed_thinking_blocks = [] for content_item in content_items: if ( isinstance(content_item, dict) @@ -546,7 +546,11 @@ def ensure_assistant_message() -> ChatCompletionAssistantMessageParam: # Add signatures if available if signatures: thinking_block["signature"] = signatures.pop(0) - pending_thinking_blocks.append(thinking_block) + reconstructed_thinking_blocks.append(thinking_block) + + # Store thinking blocks as pending for the next assistant message + # This preserves the original behavior + pending_thinking_blocks = reconstructed_thinking_blocks # 8) If we haven't recognized it => fail or ignore else: From 6ed8cef18b277215b2eb264fd930736636b0103b Mon Sep 17 00:00:00 2001 From: habema Date: Wed, 24 Sep 2025 19:16:21 +0300 Subject: [PATCH 2/3] codex code review --- src/agents/extensions/models/litellm_model.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/agents/extensions/models/litellm_model.py b/src/agents/extensions/models/litellm_model.py index 551d2c830..a4c8da3ab 100644 --- a/src/agents/extensions/models/litellm_model.py +++ b/src/agents/extensions/models/litellm_model.py @@ -433,6 +433,13 @@ def _fix_tool_message_ordering( else: other_messages.append((i, message)) + # First, identify which tool results will be paired to avoid duplicates + paired_tool_result_indices = set() + for tool_id in tool_call_messages: + if tool_id in tool_result_messages: + tool_result_idx, _ = tool_result_messages[tool_id] + paired_tool_result_indices.add(tool_result_idx) + # Create the fixed message sequence fixed_messages: list[ChatCompletionMessageParam] = [] used_indices = set() @@ -463,14 +470,14 @@ def _fix_tool_message_ordering( ): # Add tool_use → tool_result pair _, tool_call_msg = tool_call_messages[tool_id] - _, tool_result_msg = tool_result_messages[tool_id] + 
tool_result_idx, tool_result_msg = tool_result_messages[tool_id] fixed_messages.append(tool_call_msg) fixed_messages.append(tool_result_msg) # Mark both as used used_indices.add(tool_call_messages[tool_id][0]) - used_indices.add(tool_result_messages[tool_id][0]) + used_indices.add(tool_result_idx) elif tool_id and tool_id in tool_call_messages: # Tool call without result - add just the tool call _, tool_call_msg = tool_call_messages[tool_id] @@ -480,7 +487,9 @@ def _fix_tool_message_ordering( used_indices.add(i) # Mark original multi-tool message as used elif role == "tool": - # Skip - these will be handled as part of tool pairs above + # Only preserve unmatched tool results to avoid duplicates + if i not in paired_tool_result_indices: + fixed_messages.append(original_message) used_indices.add(i) else: From bb7a3a46caed264543cf1a8ccfa8c25c01c8292c Mon Sep 17 00:00:00 2001 From: habema Date: Wed, 24 Sep 2025 19:16:37 +0300 Subject: [PATCH 3/3] add test for extended thinking message reordering --- tests/test_extended_thinking_message_order.py | 293 ++++++++++++++++++ 1 file changed, 293 insertions(+) create mode 100644 tests/test_extended_thinking_message_order.py diff --git a/tests/test_extended_thinking_message_order.py b/tests/test_extended_thinking_message_order.py new file mode 100644 index 000000000..3bc525623 --- /dev/null +++ b/tests/test_extended_thinking_message_order.py @@ -0,0 +1,293 @@ +"""Tests for the extended thinking message order bug fix in LitellmModel.""" + +from __future__ import annotations + +from openai.types.chat import ChatCompletionMessageParam + +from agents.extensions.models.litellm_model import LitellmModel + + +class TestExtendedThinkingMessageOrder: + """Test the _fix_tool_message_ordering method.""" + + def test_basic_reordering_tool_result_before_call(self): + """Test that a tool result appearing before its tool call gets reordered correctly.""" + messages: list[ChatCompletionMessageParam] = [ + {"role": "user", "content": "Hello"}, + {"role": "tool", "tool_call_id": "call_123", "content": "Result for call_123"}, + { + "role": "assistant", + "tool_calls": [ + { + "id": "call_123", + "type": "function", + "function": {"name": "test", "arguments": "{}"}, + } + ], + }, + {"role": "user", "content": "Thanks"}, + ] + + model = LitellmModel("test-model") + result = model._fix_tool_message_ordering(messages) + + # Should reorder to: user, assistant+tool_call, tool_result, user + assert len(result) == 4 + assert result[0]["role"] == "user" + assert result[1]["role"] == "assistant" + assert result[1]["tool_calls"][0]["id"] == "call_123" # type: ignore + assert result[2]["role"] == "tool" + assert result[2]["tool_call_id"] == "call_123" + assert result[3]["role"] == "user" + + def test_consecutive_tool_calls_get_separated(self): + """Test that consecutive assistant messages with tool calls get properly paired with results.""" # noqa: E501 + messages: list[ChatCompletionMessageParam] = [ + {"role": "user", "content": "Hello"}, + { + "role": "assistant", + "tool_calls": [ + { + "id": "call_1", + "type": "function", + "function": {"name": "test1", "arguments": "{}"}, + } + ], + }, + { + "role": "assistant", + "tool_calls": [ + { + "id": "call_2", + "type": "function", + "function": {"name": "test2", "arguments": "{}"}, + } + ], + }, + {"role": "tool", "tool_call_id": "call_1", "content": "Result 1"}, + {"role": "tool", "tool_call_id": "call_2", "content": "Result 2"}, + ] + + model = LitellmModel("test-model") + result = model._fix_tool_message_ordering(messages) + + # 
Should pair each tool call with its result immediately + assert len(result) == 5 + assert result[0]["role"] == "user" + assert result[1]["role"] == "assistant" + assert result[1]["tool_calls"][0]["id"] == "call_1" # type: ignore + assert result[2]["role"] == "tool" + assert result[2]["tool_call_id"] == "call_1" + assert result[3]["role"] == "assistant" + assert result[3]["tool_calls"][0]["id"] == "call_2" # type: ignore + assert result[4]["role"] == "tool" + assert result[4]["tool_call_id"] == "call_2" + + def test_unmatched_tool_results_preserved(self): + """Test that tool results without matching tool calls are preserved.""" + messages: list[ChatCompletionMessageParam] = [ + {"role": "user", "content": "Hello"}, + { + "role": "assistant", + "tool_calls": [ + { + "id": "call_1", + "type": "function", + "function": {"name": "test", "arguments": "{}"}, + } + ], + }, + {"role": "tool", "tool_call_id": "call_1", "content": "Matched result"}, + {"role": "tool", "tool_call_id": "call_orphan", "content": "Orphaned result"}, + {"role": "user", "content": "End"}, + ] + + model = LitellmModel("test-model") + result = model._fix_tool_message_ordering(messages) + + # Should preserve the orphaned tool result + assert len(result) == 5 + assert result[0]["role"] == "user" + assert result[1]["role"] == "assistant" + assert result[2]["role"] == "tool" + assert result[2]["tool_call_id"] == "call_1" + assert result[3]["role"] == "tool" # Orphaned result preserved + assert result[3]["tool_call_id"] == "call_orphan" + assert result[4]["role"] == "user" + + def test_tool_calls_without_results_preserved(self): + """Test that tool calls without results are still included.""" + messages: list[ChatCompletionMessageParam] = [ + {"role": "user", "content": "Hello"}, + { + "role": "assistant", + "tool_calls": [ + { + "id": "call_1", + "type": "function", + "function": {"name": "test", "arguments": "{}"}, + } + ], + }, + {"role": "user", "content": "End"}, + ] + + model = LitellmModel("test-model") + result = model._fix_tool_message_ordering(messages) + + # Should preserve the tool call even without a result + assert len(result) == 3 + assert result[0]["role"] == "user" + assert result[1]["role"] == "assistant" + assert result[1]["tool_calls"][0]["id"] == "call_1" # type: ignore + assert result[2]["role"] == "user" + + def test_correctly_ordered_messages_unchanged(self): + """Test that correctly ordered messages remain in the same order.""" + messages: list[ChatCompletionMessageParam] = [ + {"role": "user", "content": "Hello"}, + { + "role": "assistant", + "tool_calls": [ + { + "id": "call_1", + "type": "function", + "function": {"name": "test", "arguments": "{}"}, + } + ], + }, + {"role": "tool", "tool_call_id": "call_1", "content": "Result"}, + {"role": "assistant", "content": "Done"}, + ] + + model = LitellmModel("test-model") + result = model._fix_tool_message_ordering(messages) + + # Should remain exactly the same + assert len(result) == 4 + assert result[0]["role"] == "user" + assert result[1]["role"] == "assistant" + assert result[1]["tool_calls"][0]["id"] == "call_1" # type: ignore + assert result[2]["role"] == "tool" + assert result[2]["tool_call_id"] == "call_1" + assert result[3]["role"] == "assistant" + + def test_multiple_tool_calls_single_message(self): + """Test assistant message with multiple tool calls gets split properly.""" + messages: list[ChatCompletionMessageParam] = [ + {"role": "user", "content": "Hello"}, + { + "role": "assistant", + "tool_calls": [ + { + "id": "call_1", + "type": "function", 
+ "function": {"name": "test1", "arguments": "{}"}, + }, + { + "id": "call_2", + "type": "function", + "function": {"name": "test2", "arguments": "{}"}, + }, + ], + }, + {"role": "tool", "tool_call_id": "call_1", "content": "Result 1"}, + {"role": "tool", "tool_call_id": "call_2", "content": "Result 2"}, + ] + + model = LitellmModel("test-model") + result = model._fix_tool_message_ordering(messages) + + # Should split the multi-tool message and pair each properly + assert len(result) == 5 + assert result[0]["role"] == "user" + assert result[1]["role"] == "assistant" + assert len(result[1]["tool_calls"]) == 1 # type: ignore + assert result[1]["tool_calls"][0]["id"] == "call_1" # type: ignore + assert result[2]["role"] == "tool" + assert result[2]["tool_call_id"] == "call_1" + assert result[3]["role"] == "assistant" + assert len(result[3]["tool_calls"]) == 1 # type: ignore + assert result[3]["tool_calls"][0]["id"] == "call_2" # type: ignore + assert result[4]["role"] == "tool" + assert result[4]["tool_call_id"] == "call_2" + + def test_empty_messages_list(self): + """Test that empty message list is handled correctly.""" + messages: list[ChatCompletionMessageParam] = [] + + model = LitellmModel("test-model") + result = model._fix_tool_message_ordering(messages) + + assert result == [] + + def test_no_tool_messages(self): + """Test that messages without tool calls are left unchanged.""" + messages: list[ChatCompletionMessageParam] = [ + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi there"}, + {"role": "user", "content": "How are you?"}, + ] + + model = LitellmModel("test-model") + result = model._fix_tool_message_ordering(messages) + + assert result == messages + + def test_complex_mixed_scenario(self): + """Test a complex scenario with various message types and orderings.""" + messages: list[ChatCompletionMessageParam] = [ + {"role": "user", "content": "Start"}, + { + "role": "tool", + "tool_call_id": "call_out_of_order", + "content": "Out of order result", + }, # This comes before its call + {"role": "assistant", "content": "Regular response"}, + { + "role": "assistant", + "tool_calls": [ + { + "id": "call_out_of_order", + "type": "function", + "function": {"name": "test", "arguments": "{}"}, + } + ], + }, + { + "role": "assistant", + "tool_calls": [ + { + "id": "call_normal", + "type": "function", + "function": {"name": "test2", "arguments": "{}"}, + } + ], + }, + {"role": "tool", "tool_call_id": "call_normal", "content": "Normal result"}, + { + "role": "tool", + "tool_call_id": "call_orphan", + "content": "Orphaned result", + }, # No matching call + {"role": "user", "content": "End"}, + ] + + model = LitellmModel("test-model") + result = model._fix_tool_message_ordering(messages) + + # Should reorder properly while preserving all messages + assert len(result) == 8 + assert result[0]["role"] == "user" # Start + assert result[1]["role"] == "assistant" # Regular response + assert result[2]["role"] == "assistant" # call_out_of_order + assert result[2]["tool_calls"][0]["id"] == "call_out_of_order" # type: ignore + assert result[3]["role"] == "tool" # Out of order result (now properly paired) + assert result[3]["tool_call_id"] == "call_out_of_order" + assert result[4]["role"] == "assistant" # call_normal + assert result[4]["tool_calls"][0]["id"] == "call_normal" # type: ignore + assert result[5]["role"] == "tool" # Normal result + assert result[5]["tool_call_id"] == "call_normal" + assert result[6]["role"] == "tool" # Orphaned result (preserved) + assert 
result[6]["tool_call_id"] == "call_orphan" + assert result[7]["role"] == "user" # End