From d81891fa57154912d73a3af5caf549a34953c812 Mon Sep 17 00:00:00 2001 From: habema Date: Wed, 24 Sep 2025 18:39:17 +0300 Subject: [PATCH 1/3] fix: reorder tool messages to resolve interleaved thinking bug --- src/agents/extensions/models/litellm_model.py | 111 ++++++++++++++++++ src/agents/models/chatcmpl_converter.py | 8 +- 2 files changed, 117 insertions(+), 2 deletions(-) diff --git a/src/agents/extensions/models/litellm_model.py b/src/agents/extensions/models/litellm_model.py index 877951119..551d2c830 100644 --- a/src/agents/extensions/models/litellm_model.py +++ b/src/agents/extensions/models/litellm_model.py @@ -23,6 +23,7 @@ ChatCompletionChunk, ChatCompletionMessageCustomToolCall, ChatCompletionMessageFunctionToolCall, + ChatCompletionMessageParam, ) from openai.types.chat.chat_completion_message import ( Annotation, @@ -267,6 +268,10 @@ async def _fetch_response( input, preserve_thinking_blocks=preserve_thinking_blocks ) + # Fix for interleaved thinking bug: reorder messages to ensure tool_use comes before tool_result # noqa: E501 + if preserve_thinking_blocks: + converted_messages = self._fix_tool_message_ordering(converted_messages) + if system_instructions: converted_messages.insert( 0, @@ -379,6 +384,112 @@ async def _fetch_response( ) return response, ret + def _fix_tool_message_ordering( + self, messages: list[ChatCompletionMessageParam] + ) -> list[ChatCompletionMessageParam]: + """ + Fix the ordering of tool messages to ensure tool_use messages come before tool_result messages. + + This addresses the interleaved thinking bug where conversation histories may contain + tool results before their corresponding tool calls, causing Anthropic API to reject the request. + """ # noqa: E501 + if not messages: + return messages + + # Collect all tool calls and tool results + tool_call_messages = {} # tool_id -> (index, message) + tool_result_messages = {} # tool_id -> (index, message) + other_messages = [] # (index, message) for non-tool messages + + for i, message in enumerate(messages): + if not isinstance(message, dict): + other_messages.append((i, message)) + continue + + role = message.get("role") + + if role == "assistant" and message.get("tool_calls"): + # Extract tool calls from this assistant message + tool_calls = message.get("tool_calls", []) + if isinstance(tool_calls, list): + for tool_call in tool_calls: + if isinstance(tool_call, dict): + tool_id = tool_call.get("id") + if tool_id: + # Create a separate assistant message for each tool call + single_tool_msg = cast(dict[str, Any], message.copy()) + single_tool_msg["tool_calls"] = [tool_call] + tool_call_messages[tool_id] = ( + i, + cast(ChatCompletionMessageParam, single_tool_msg), + ) + + elif role == "tool": + tool_call_id = message.get("tool_call_id") + if tool_call_id: + tool_result_messages[tool_call_id] = (i, message) + else: + other_messages.append((i, message)) + else: + other_messages.append((i, message)) + + # Create the fixed message sequence + fixed_messages: list[ChatCompletionMessageParam] = [] + used_indices = set() + + # Add messages in their original order, but ensure tool_use → tool_result pairing + for i, original_message in enumerate(messages): + if i in used_indices: + continue + + if not isinstance(original_message, dict): + fixed_messages.append(original_message) + used_indices.add(i) + continue + + role = original_message.get("role") + + if role == "assistant" and original_message.get("tool_calls"): + # Process each tool call in this assistant message + tool_calls = 
original_message.get("tool_calls", []) + if isinstance(tool_calls, list): + for tool_call in tool_calls: + if isinstance(tool_call, dict): + tool_id = tool_call.get("id") + if ( + tool_id + and tool_id in tool_call_messages + and tool_id in tool_result_messages + ): + # Add tool_use → tool_result pair + _, tool_call_msg = tool_call_messages[tool_id] + _, tool_result_msg = tool_result_messages[tool_id] + + fixed_messages.append(tool_call_msg) + fixed_messages.append(tool_result_msg) + + # Mark both as used + used_indices.add(tool_call_messages[tool_id][0]) + used_indices.add(tool_result_messages[tool_id][0]) + elif tool_id and tool_id in tool_call_messages: + # Tool call without result - add just the tool call + _, tool_call_msg = tool_call_messages[tool_id] + fixed_messages.append(tool_call_msg) + used_indices.add(tool_call_messages[tool_id][0]) + + used_indices.add(i) # Mark original multi-tool message as used + + elif role == "tool": + # Skip - these will be handled as part of tool pairs above + used_indices.add(i) + + else: + # Regular message - add it normally + fixed_messages.append(original_message) + used_indices.add(i) + + return fixed_messages + def _remove_not_given(self, value: Any) -> Any: if isinstance(value, NotGiven): return None diff --git a/src/agents/models/chatcmpl_converter.py b/src/agents/models/chatcmpl_converter.py index f17e8f126..2ce5c127b 100644 --- a/src/agents/models/chatcmpl_converter.py +++ b/src/agents/models/chatcmpl_converter.py @@ -533,7 +533,7 @@ def ensure_assistant_message() -> ChatCompletionAssistantMessageParam: if content_items and preserve_thinking_blocks: # Reconstruct thinking blocks from content and signature - pending_thinking_blocks = [] + reconstructed_thinking_blocks = [] for content_item in content_items: if ( isinstance(content_item, dict) @@ -546,7 +546,11 @@ def ensure_assistant_message() -> ChatCompletionAssistantMessageParam: # Add signatures if available if signatures: thinking_block["signature"] = signatures.pop(0) - pending_thinking_blocks.append(thinking_block) + reconstructed_thinking_blocks.append(thinking_block) + + # Store thinking blocks as pending for the next assistant message + # This preserves the original behavior + pending_thinking_blocks = reconstructed_thinking_blocks # 8) If we haven't recognized it => fail or ignore else: From 6ed8cef18b277215b2eb264fd930736636b0103b Mon Sep 17 00:00:00 2001 From: habema Date: Wed, 24 Sep 2025 19:16:21 +0300 Subject: [PATCH 2/3] codex code review --- src/agents/extensions/models/litellm_model.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/agents/extensions/models/litellm_model.py b/src/agents/extensions/models/litellm_model.py index 551d2c830..a4c8da3ab 100644 --- a/src/agents/extensions/models/litellm_model.py +++ b/src/agents/extensions/models/litellm_model.py @@ -433,6 +433,13 @@ def _fix_tool_message_ordering( else: other_messages.append((i, message)) + # First, identify which tool results will be paired to avoid duplicates + paired_tool_result_indices = set() + for tool_id in tool_call_messages: + if tool_id in tool_result_messages: + tool_result_idx, _ = tool_result_messages[tool_id] + paired_tool_result_indices.add(tool_result_idx) + # Create the fixed message sequence fixed_messages: list[ChatCompletionMessageParam] = [] used_indices = set() @@ -463,14 +470,14 @@ def _fix_tool_message_ordering( ): # Add tool_use → tool_result pair _, tool_call_msg = tool_call_messages[tool_id] - _, tool_result_msg = tool_result_messages[tool_id] + 
tool_result_idx, tool_result_msg = tool_result_messages[tool_id] fixed_messages.append(tool_call_msg) fixed_messages.append(tool_result_msg) # Mark both as used used_indices.add(tool_call_messages[tool_id][0]) - used_indices.add(tool_result_messages[tool_id][0]) + used_indices.add(tool_result_idx) elif tool_id and tool_id in tool_call_messages: # Tool call without result - add just the tool call _, tool_call_msg = tool_call_messages[tool_id] @@ -480,7 +487,9 @@ def _fix_tool_message_ordering( used_indices.add(i) # Mark original multi-tool message as used elif role == "tool": - # Skip - these will be handled as part of tool pairs above + # Only preserve unmatched tool results to avoid duplicates + if i not in paired_tool_result_indices: + fixed_messages.append(original_message) used_indices.add(i) else: From bb7a3a46caed264543cf1a8ccfa8c25c01c8292c Mon Sep 17 00:00:00 2001 From: habema Date: Wed, 24 Sep 2025 19:16:37 +0300 Subject: [PATCH 3/3] add test for extended thinking message reordering --- tests/test_extended_thinking_message_order.py | 293 ++++++++++++++++++ 1 file changed, 293 insertions(+) create mode 100644 tests/test_extended_thinking_message_order.py diff --git a/tests/test_extended_thinking_message_order.py b/tests/test_extended_thinking_message_order.py new file mode 100644 index 000000000..3bc525623 --- /dev/null +++ b/tests/test_extended_thinking_message_order.py @@ -0,0 +1,293 @@ +"""Tests for the extended thinking message order bug fix in LitellmModel.""" + +from __future__ import annotations + +from openai.types.chat import ChatCompletionMessageParam + +from agents.extensions.models.litellm_model import LitellmModel + + +class TestExtendedThinkingMessageOrder: + """Test the _fix_tool_message_ordering method.""" + + def test_basic_reordering_tool_result_before_call(self): + """Test that a tool result appearing before its tool call gets reordered correctly.""" + messages: list[ChatCompletionMessageParam] = [ + {"role": "user", "content": "Hello"}, + {"role": "tool", "tool_call_id": "call_123", "content": "Result for call_123"}, + { + "role": "assistant", + "tool_calls": [ + { + "id": "call_123", + "type": "function", + "function": {"name": "test", "arguments": "{}"}, + } + ], + }, + {"role": "user", "content": "Thanks"}, + ] + + model = LitellmModel("test-model") + result = model._fix_tool_message_ordering(messages) + + # Should reorder to: user, assistant+tool_call, tool_result, user + assert len(result) == 4 + assert result[0]["role"] == "user" + assert result[1]["role"] == "assistant" + assert result[1]["tool_calls"][0]["id"] == "call_123" # type: ignore + assert result[2]["role"] == "tool" + assert result[2]["tool_call_id"] == "call_123" + assert result[3]["role"] == "user" + + def test_consecutive_tool_calls_get_separated(self): + """Test that consecutive assistant messages with tool calls get properly paired with results.""" # noqa: E501 + messages: list[ChatCompletionMessageParam] = [ + {"role": "user", "content": "Hello"}, + { + "role": "assistant", + "tool_calls": [ + { + "id": "call_1", + "type": "function", + "function": {"name": "test1", "arguments": "{}"}, + } + ], + }, + { + "role": "assistant", + "tool_calls": [ + { + "id": "call_2", + "type": "function", + "function": {"name": "test2", "arguments": "{}"}, + } + ], + }, + {"role": "tool", "tool_call_id": "call_1", "content": "Result 1"}, + {"role": "tool", "tool_call_id": "call_2", "content": "Result 2"}, + ] + + model = LitellmModel("test-model") + result = model._fix_tool_message_ordering(messages) + + # 
Should pair each tool call with its result immediately + assert len(result) == 5 + assert result[0]["role"] == "user" + assert result[1]["role"] == "assistant" + assert result[1]["tool_calls"][0]["id"] == "call_1" # type: ignore + assert result[2]["role"] == "tool" + assert result[2]["tool_call_id"] == "call_1" + assert result[3]["role"] == "assistant" + assert result[3]["tool_calls"][0]["id"] == "call_2" # type: ignore + assert result[4]["role"] == "tool" + assert result[4]["tool_call_id"] == "call_2" + + def test_unmatched_tool_results_preserved(self): + """Test that tool results without matching tool calls are preserved.""" + messages: list[ChatCompletionMessageParam] = [ + {"role": "user", "content": "Hello"}, + { + "role": "assistant", + "tool_calls": [ + { + "id": "call_1", + "type": "function", + "function": {"name": "test", "arguments": "{}"}, + } + ], + }, + {"role": "tool", "tool_call_id": "call_1", "content": "Matched result"}, + {"role": "tool", "tool_call_id": "call_orphan", "content": "Orphaned result"}, + {"role": "user", "content": "End"}, + ] + + model = LitellmModel("test-model") + result = model._fix_tool_message_ordering(messages) + + # Should preserve the orphaned tool result + assert len(result) == 5 + assert result[0]["role"] == "user" + assert result[1]["role"] == "assistant" + assert result[2]["role"] == "tool" + assert result[2]["tool_call_id"] == "call_1" + assert result[3]["role"] == "tool" # Orphaned result preserved + assert result[3]["tool_call_id"] == "call_orphan" + assert result[4]["role"] == "user" + + def test_tool_calls_without_results_preserved(self): + """Test that tool calls without results are still included.""" + messages: list[ChatCompletionMessageParam] = [ + {"role": "user", "content": "Hello"}, + { + "role": "assistant", + "tool_calls": [ + { + "id": "call_1", + "type": "function", + "function": {"name": "test", "arguments": "{}"}, + } + ], + }, + {"role": "user", "content": "End"}, + ] + + model = LitellmModel("test-model") + result = model._fix_tool_message_ordering(messages) + + # Should preserve the tool call even without a result + assert len(result) == 3 + assert result[0]["role"] == "user" + assert result[1]["role"] == "assistant" + assert result[1]["tool_calls"][0]["id"] == "call_1" # type: ignore + assert result[2]["role"] == "user" + + def test_correctly_ordered_messages_unchanged(self): + """Test that correctly ordered messages remain in the same order.""" + messages: list[ChatCompletionMessageParam] = [ + {"role": "user", "content": "Hello"}, + { + "role": "assistant", + "tool_calls": [ + { + "id": "call_1", + "type": "function", + "function": {"name": "test", "arguments": "{}"}, + } + ], + }, + {"role": "tool", "tool_call_id": "call_1", "content": "Result"}, + {"role": "assistant", "content": "Done"}, + ] + + model = LitellmModel("test-model") + result = model._fix_tool_message_ordering(messages) + + # Should remain exactly the same + assert len(result) == 4 + assert result[0]["role"] == "user" + assert result[1]["role"] == "assistant" + assert result[1]["tool_calls"][0]["id"] == "call_1" # type: ignore + assert result[2]["role"] == "tool" + assert result[2]["tool_call_id"] == "call_1" + assert result[3]["role"] == "assistant" + + def test_multiple_tool_calls_single_message(self): + """Test assistant message with multiple tool calls gets split properly.""" + messages: list[ChatCompletionMessageParam] = [ + {"role": "user", "content": "Hello"}, + { + "role": "assistant", + "tool_calls": [ + { + "id": "call_1", + "type": "function", 
+ "function": {"name": "test1", "arguments": "{}"}, + }, + { + "id": "call_2", + "type": "function", + "function": {"name": "test2", "arguments": "{}"}, + }, + ], + }, + {"role": "tool", "tool_call_id": "call_1", "content": "Result 1"}, + {"role": "tool", "tool_call_id": "call_2", "content": "Result 2"}, + ] + + model = LitellmModel("test-model") + result = model._fix_tool_message_ordering(messages) + + # Should split the multi-tool message and pair each properly + assert len(result) == 5 + assert result[0]["role"] == "user" + assert result[1]["role"] == "assistant" + assert len(result[1]["tool_calls"]) == 1 # type: ignore + assert result[1]["tool_calls"][0]["id"] == "call_1" # type: ignore + assert result[2]["role"] == "tool" + assert result[2]["tool_call_id"] == "call_1" + assert result[3]["role"] == "assistant" + assert len(result[3]["tool_calls"]) == 1 # type: ignore + assert result[3]["tool_calls"][0]["id"] == "call_2" # type: ignore + assert result[4]["role"] == "tool" + assert result[4]["tool_call_id"] == "call_2" + + def test_empty_messages_list(self): + """Test that empty message list is handled correctly.""" + messages: list[ChatCompletionMessageParam] = [] + + model = LitellmModel("test-model") + result = model._fix_tool_message_ordering(messages) + + assert result == [] + + def test_no_tool_messages(self): + """Test that messages without tool calls are left unchanged.""" + messages: list[ChatCompletionMessageParam] = [ + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi there"}, + {"role": "user", "content": "How are you?"}, + ] + + model = LitellmModel("test-model") + result = model._fix_tool_message_ordering(messages) + + assert result == messages + + def test_complex_mixed_scenario(self): + """Test a complex scenario with various message types and orderings.""" + messages: list[ChatCompletionMessageParam] = [ + {"role": "user", "content": "Start"}, + { + "role": "tool", + "tool_call_id": "call_out_of_order", + "content": "Out of order result", + }, # This comes before its call + {"role": "assistant", "content": "Regular response"}, + { + "role": "assistant", + "tool_calls": [ + { + "id": "call_out_of_order", + "type": "function", + "function": {"name": "test", "arguments": "{}"}, + } + ], + }, + { + "role": "assistant", + "tool_calls": [ + { + "id": "call_normal", + "type": "function", + "function": {"name": "test2", "arguments": "{}"}, + } + ], + }, + {"role": "tool", "tool_call_id": "call_normal", "content": "Normal result"}, + { + "role": "tool", + "tool_call_id": "call_orphan", + "content": "Orphaned result", + }, # No matching call + {"role": "user", "content": "End"}, + ] + + model = LitellmModel("test-model") + result = model._fix_tool_message_ordering(messages) + + # Should reorder properly while preserving all messages + assert len(result) == 8 + assert result[0]["role"] == "user" # Start + assert result[1]["role"] == "assistant" # Regular response + assert result[2]["role"] == "assistant" # call_out_of_order + assert result[2]["tool_calls"][0]["id"] == "call_out_of_order" # type: ignore + assert result[3]["role"] == "tool" # Out of order result (now properly paired) + assert result[3]["tool_call_id"] == "call_out_of_order" + assert result[4]["role"] == "assistant" # call_normal + assert result[4]["tool_calls"][0]["id"] == "call_normal" # type: ignore + assert result[5]["role"] == "tool" # Normal result + assert result[5]["tool_call_id"] == "call_normal" + assert result[6]["role"] == "tool" # Orphaned result (preserved) + assert 
result[6]["tool_call_id"] == "call_orphan" + assert result[7]["role"] == "user" # End