From 54a9f7a008bceff0800d773040114d4d5c496ede Mon Sep 17 00:00:00 2001
From: Sung-jin Brian Hong
Date: Mon, 15 Sep 2025 14:42:45 +0900
Subject: [PATCH 1/4] feat: Support Anthropic extended thinking and
 interleaved thinking

Anthropic's API requires thinking blocks to be the first content in
assistant messages when reasoning is enabled and tool calls are present.

ref: https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking

This change:
- Stores thinking text in the ResponseReasoningItem.content field
- Stores the signature in the encrypted_content field for metadata
- Reconstructs thinking blocks when converting items->messages
- Handles both streaming and non-streaming cases
- Only affects Anthropic models with reasoning enabled
- Adds a regression test to prevent future breakage

Fixes compatibility with Claude models when using tools with
interleaved thinking.
---
 src/agents/models/chatcmpl_converter.py      |  71 ++++++++---
 src/agents/models/chatcmpl_stream_handler.py |  30 ++++-
 tests/test_anthropic_thinking_blocks.py      | 117 ++++++++++++++++++-
 3 files changed, 201 insertions(+), 17 deletions(-)

diff --git a/src/agents/models/chatcmpl_converter.py b/src/agents/models/chatcmpl_converter.py
index 77ff22ee0..a0816b836 100644
--- a/src/agents/models/chatcmpl_converter.py
+++ b/src/agents/models/chatcmpl_converter.py
@@ -39,7 +39,7 @@
     ResponseReasoningItemParam,
 )
 from openai.types.responses.response_input_param import FunctionCallOutput, ItemReference, Message
-from openai.types.responses.response_reasoning_item import Summary
+from openai.types.responses.response_reasoning_item import Content, Summary
 
 from ..agent_output import AgentOutputSchemaBase
 from ..exceptions import AgentsException, UserError
@@ -93,7 +93,9 @@ def convert_response_format(
     def message_to_output_items(cls, message: ChatCompletionMessage) -> list[TResponseOutputItem]:
         items: list[TResponseOutputItem] = []
 
-        # Handle reasoning content if available
+        # Check if message is agents.extensions.models.litellm_model.InternalChatCompletionMessage.
+        # We can't actually import it here because litellm is an optional dependency,
+        # so we use hasattr to check for reasoning_content and thinking_blocks.
         if hasattr(message, "reasoning_content") and message.reasoning_content:
             reasoning_item = ResponseReasoningItem(
                 id=FAKE_RESPONSES_ID,
@@ -101,16 +103,28 @@ def message_to_output_items(cls, message: ChatCompletionMessage) -> list[TRespon
                 type="reasoning",
             )
 
-            # Store full thinking blocks for Anthropic compatibility
+            # Store thinking blocks for Anthropic compatibility
             if hasattr(message, "thinking_blocks") and message.thinking_blocks:
-                # Store thinking blocks in the reasoning item's content
-                # Convert thinking blocks to Content objects
-                from openai.types.responses.response_reasoning_item import Content
-
-                reasoning_item.content = [
-                    Content(text=str(block.get("thinking", "")), type="reasoning_text")
-                    for block in message.thinking_blocks
-                ]
+                # Store thinking text in content and the signature in encrypted_content
+                reasoning_item.content = []
+                signature = None
+                for block in message.thinking_blocks:
+                    if isinstance(block, dict):
+                        thinking_text = block.get("thinking", "")
+                        if thinking_text:
+                            reasoning_item.content.append(
+                                Content(text=thinking_text, type="reasoning_text")
+                            )
+                        # Store the signature if present
+                        if block.get("signature"):
+                            signature = block.get("signature")
+
+                # Store only the last signature in encrypted_content.
+                # If there are multiple thinking blocks, this could be a problem.
+                # In practice, there should only be one signature for the entire reasoning step.
+                # Tested with: claude-sonnet-4-20250514
+                if signature:
+                    reasoning_item.encrypted_content = signature
 
             items.append(reasoning_item)
 
@@ -325,6 +339,7 @@ def items_to_messages(
 
         result: list[ChatCompletionMessageParam] = []
         current_assistant_msg: ChatCompletionAssistantMessageParam | None = None
+        pending_thinking_blocks: list[dict[str, str]] | None = None
 
         def flush_assistant_message() -> None:
             nonlocal current_assistant_msg
@@ -336,10 +351,17 @@ def flush_assistant_message() -> None:
             current_assistant_msg = None
 
         def ensure_assistant_message() -> ChatCompletionAssistantMessageParam:
-            nonlocal current_assistant_msg
+            nonlocal current_assistant_msg, pending_thinking_blocks
             if current_assistant_msg is None:
                 current_assistant_msg = ChatCompletionAssistantMessageParam(role="assistant")
                 current_assistant_msg["tool_calls"] = []
+
+            # If we have pending thinking blocks, use them as the content
+            # This is required for Anthropic API tool calls with interleaved thinking
+            if pending_thinking_blocks:
+                current_assistant_msg["content"] = pending_thinking_blocks  # type: ignore
+                pending_thinking_blocks = None  # Clear after using
+
             return current_assistant_msg
 
         for item in items:
@@ -483,9 +505,28 @@ def ensure_assistant_message() -> ChatCompletionAssistantMessageParam:
                     f"Encountered an item_reference, which is not supported: {item_ref}"
                 )
 
-            # 7) reasoning message => not handled
-            elif cls.maybe_reasoning_message(item):
-                pass
+            # 7) reasoning message => extract thinking blocks if present
+            elif reasoning_item := cls.maybe_reasoning_message(item):
+                # Reconstruct thinking blocks from content (text) and encrypted_content (signature)
+                content_items = reasoning_item.get("content", [])
+                signature = reasoning_item.get("encrypted_content")
+
+                if content_items:
+                    # Reconstruct thinking blocks from content and signature
+                    pending_thinking_blocks = []
+                    for content_item in content_items:
+                        if (
+                            isinstance(content_item, dict)
+                            and content_item.get("type") == "reasoning_text"
+                        ):
+                            thinking_block = {
+                                "type": "thinking",
+                                "thinking": content_item.get("text", ""),
+                            }
+                            # Add signature if available
+                            if signature:
+                                thinking_block["signature"] = signature
+                            pending_thinking_blocks.append(thinking_block)
 
             # 8) If we haven't recognized it => fail or ignore
             else:
diff --git a/src/agents/models/chatcmpl_stream_handler.py b/src/agents/models/chatcmpl_stream_handler.py
index 359d47bb5..474bffe09 100644
--- a/src/agents/models/chatcmpl_stream_handler.py
+++ b/src/agents/models/chatcmpl_stream_handler.py
@@ -62,6 +62,9 @@ class StreamingState:
     # Fields for real-time function call streaming
     function_call_streaming: dict[int, bool] = field(default_factory=dict)
    function_call_output_idx: dict[int, int] = field(default_factory=dict)
+    # Store accumulated thinking text and signature for Anthropic compatibility
+    thinking_text: str = ""
+    thinking_signature: str | None = None
 
 
 class SequenceNumber:
@@ -101,6 +104,19 @@ async def handle_stream(
 
             delta = chunk.choices[0].delta
 
+            # Handle thinking blocks from Anthropic (for preserving signatures)
+            if hasattr(delta, "thinking_blocks") and delta.thinking_blocks:
+                for block in delta.thinking_blocks:
+                    if isinstance(block, dict):
+                        # Accumulate thinking text
+                        thinking_text = block.get("thinking", "")
+                        if thinking_text:
+                            state.thinking_text += thinking_text
+                        # Store signature if present
+                        signature = block.get("signature")
+                        if signature:
+                            state.thinking_signature = signature
+
             # Handle reasoning content for reasoning summaries
             if hasattr(delta, "reasoning_content"):
                 reasoning_content = delta.reasoning_content
@@ -527,7 +543,19 @@ async def handle_stream(
 
         # include Reasoning item if it exists
         if state.reasoning_content_index_and_output:
-            outputs.append(state.reasoning_content_index_and_output[1])
+            reasoning_item = state.reasoning_content_index_and_output[1]
+            # Store thinking text in content and the signature in encrypted_content
+            if state.thinking_text:
+                # Add thinking text as a Content object
+                if not reasoning_item.content:
+                    reasoning_item.content = []
+                reasoning_item.content.append(
+                    Content(text=state.thinking_text, type="reasoning_text")
+                )
+            # Store signature in encrypted_content
+            if state.thinking_signature:
+                reasoning_item.encrypted_content = state.thinking_signature
+            outputs.append(reasoning_item)
 
         # include text or refusal content if they exist
         if state.text_content_index_and_output or state.refusal_content_index_and_output:
diff --git a/tests/test_anthropic_thinking_blocks.py b/tests/test_anthropic_thinking_blocks.py
index 9513c7833..1cc9cdbfe 100644
--- a/tests/test_anthropic_thinking_blocks.py
+++ b/tests/test_anthropic_thinking_blocks.py
@@ -10,7 +10,10 @@
 
 from __future__ import annotations
 
-from typing import Any
+from typing import Any, cast
+
+from openai.types.chat import ChatCompletionMessageToolCall
+from openai.types.chat.chat_completion_message_tool_call import Function
 
 from agents.extensions.models.litellm_model import InternalChatCompletionMessage
 from agents.models.chatcmpl_converter import Converter
@@ -99,3 +102,115 @@ def test_reasoning_items_preserved_in_message_conversion():
     thinking_block = reasoning_item.content[0]
     assert thinking_block.type == "reasoning_text"
     assert thinking_block.text == "I need to call the weather function for Paris"
+
+
+def test_anthropic_thinking_blocks_with_tool_calls():
+    """
+    Test for models with extended thinking and interleaved thinking with tool calls.
+
+    This test verifies the Anthropic API's requirement that thinking blocks
+    appear as the first content in assistant messages when reasoning is
+    enabled and tool calls are present.
+    """
+    # Create a message with reasoning, thinking blocks, and tool calls
+    message = InternalChatCompletionMessage(
+        role="assistant",
+        content="I'll check the weather for you.",
+        reasoning_content="The user wants weather information, I need to call the weather function",
+        thinking_blocks=[
+            {
+                "type": "thinking",
+                "thinking": (
+                    "The user is asking about weather. "
+                    "Let me use the weather tool to get this information."
+ ), + "signature": "TestSignature123", + } + ], + tool_calls=[ + ChatCompletionMessageToolCall( + id="call_123", + type="function", + function=Function(name="get_weather", arguments='{"city": "Tokyo"}'), + ) + ], + ) + + # Step 1: Convert message to output items + output_items = Converter.message_to_output_items(message) + + # Verify reasoning item exists and contains thinking blocks + reasoning_items = [ + item for item in output_items if hasattr(item, "type") and item.type == "reasoning" + ] + assert len(reasoning_items) == 1, "Should have exactly one reasoning item" + + reasoning_item = reasoning_items[0] + + # Verify thinking text is stored in content + assert hasattr(reasoning_item, "content") and reasoning_item.content, ( + "Reasoning item should have content" + ) + assert reasoning_item.content[0].type == "reasoning_text", ( + "Content should be reasoning_text type" + ) + + # Verify signature is stored in encrypted_content + assert hasattr(reasoning_item, "encrypted_content"), ( + "Reasoning item should have encrypted_content" + ) + assert reasoning_item.encrypted_content == "TestSignature123", "Signature should be preserved" + + # Verify tool calls are present + tool_call_items = [ + item for item in output_items if hasattr(item, "type") and item.type == "function_call" + ] + assert len(tool_call_items) == 1, "Should have exactly one tool call" + + # Step 2: Convert output items back to messages + # Convert items to dicts for the converter (simulating serialization/deserialization) + items_as_dicts: list[dict[str, Any]] = [] + for item in output_items: + if hasattr(item, "model_dump"): + items_as_dicts.append(item.model_dump()) + else: + items_as_dicts.append(cast(dict[str, Any], item)) + + messages = Converter.items_to_messages(items_as_dicts) # type: ignore[arg-type] + + # Find the assistant message with tool calls + assistant_messages = [ + msg for msg in messages if msg.get("role") == "assistant" and msg.get("tool_calls") + ] + assert len(assistant_messages) == 1, "Should have exactly one assistant message with tool calls" + + assistant_msg = assistant_messages[0] + + # Content must start with thinking blocks, not text + content = assistant_msg.get("content") + assert content is not None, "Assistant message should have content" + + assert isinstance(content, list) and len(content) > 0, ( + "Assistant message content should be a non-empty list" + ) + + first_content = content[0] + assert first_content.get("type") == "thinking", ( + f"First content must be 'thinking' type for Anthropic compatibility, " + f"but got '{first_content.get('type')}'" + ) + expected_thinking = ( + "The user is asking about weather. Let me use the weather tool to get this information." 
+ ) + assert first_content.get("thinking") == expected_thinking, ( + "Thinking content should be preserved" + ) + # Signature should also be preserved + assert first_content.get("signature") == "TestSignature123", ( + "Signature should be preserved in thinking block" + ) + + # Verify tool calls are preserved + tool_calls = assistant_msg.get("tool_calls", []) + assert len(cast(list[Any], tool_calls)) == 1, "Tool calls should be preserved" + assert cast(list[Any], tool_calls)[0]["function"]["name"] == "get_weather" From 543d1927b0a0f9e84fccbe863c8cc6c2ecb50682 Mon Sep 17 00:00:00 2001 From: Sung-jin Brian Hong Date: Tue, 16 Sep 2025 12:39:14 +0900 Subject: [PATCH 2/4] refactor: more specific code path for thinking block insertion Move the location of thinking block insertion from `ensure_assistant_message()` to `elif func_call := cls.maybe_function_tool_call(item):` to be more specific and prevent unintended side-effects. --- src/agents/models/chatcmpl_converter.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/agents/models/chatcmpl_converter.py b/src/agents/models/chatcmpl_converter.py index a0816b836..6a4d65370 100644 --- a/src/agents/models/chatcmpl_converter.py +++ b/src/agents/models/chatcmpl_converter.py @@ -356,12 +356,6 @@ def ensure_assistant_message() -> ChatCompletionAssistantMessageParam: current_assistant_msg = ChatCompletionAssistantMessageParam(role="assistant") current_assistant_msg["tool_calls"] = [] - # If we have pending thinking blocks, use them as the content - # This is required for Anthropic API tool calls with interleaved thinking - if pending_thinking_blocks: - current_assistant_msg["content"] = pending_thinking_blocks # type: ignore - pending_thinking_blocks = None # Clear after using - return current_assistant_msg for item in items: @@ -477,6 +471,13 @@ def ensure_assistant_message() -> ChatCompletionAssistantMessageParam: elif func_call := cls.maybe_function_tool_call(item): asst = ensure_assistant_message() + + # If we have pending thinking blocks, use them as the content + # This is required for Anthropic API tool calls with interleaved thinking + if pending_thinking_blocks: + asst["content"] = pending_thinking_blocks # type: ignore + pending_thinking_blocks = None # Clear after using + tool_calls = list(asst.get("tool_calls", [])) arguments = func_call["arguments"] if func_call["arguments"] else "{}" new_tool_call = ChatCompletionMessageFunctionToolCallParam( From 8ab93577855fbea33539be04eea27d37eaa74e84 Mon Sep 17 00:00:00 2001 From: Sung-jin Brian Hong Date: Tue, 16 Sep 2025 14:34:32 +0900 Subject: [PATCH 3/4] fix: handoff from reasoning model to non-reasoning Be more selective/surgical on the reasoning message preservation. When we handoff from Claude 4 Sonnet Thinking to non-thinking agent, we get errors because non-thinking models expects no thinking blocks in the request. This fixes this edge case by only preserving blocks when reasoning effort is not None. 
---
 src/agents/extensions/models/litellm_model.py | 10 +++++++++-
 src/agents/models/chatcmpl_converter.py       | 10 +++++++++-
 tests/test_anthropic_thinking_blocks.py       |  2 +-
 3 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/src/agents/extensions/models/litellm_model.py b/src/agents/extensions/models/litellm_model.py
index 4369b342b..9d35194a3 100644
--- a/src/agents/extensions/models/litellm_model.py
+++ b/src/agents/extensions/models/litellm_model.py
@@ -257,7 +257,15 @@ async def _fetch_response(
         stream: bool = False,
         prompt: Any | None = None,
     ) -> litellm.types.utils.ModelResponse | tuple[Response, AsyncStream[ChatCompletionChunk]]:
-        converted_messages = Converter.items_to_messages(input)
+        # Preserve reasoning messages for tool calls when reasoning is on
+        # This is needed for models like Claude 4 Sonnet/Opus which support interleaved thinking
+        preserve_reasoning_message = (
+            model_settings.reasoning is not None and model_settings.reasoning.effort is not None
+        )
+
+        converted_messages = Converter.items_to_messages(
+            input, preserve_reasoning_message=preserve_reasoning_message
+        )
 
         if system_instructions:
             converted_messages.insert(
diff --git a/src/agents/models/chatcmpl_converter.py b/src/agents/models/chatcmpl_converter.py
index 6a4d65370..2599b8191 100644
--- a/src/agents/models/chatcmpl_converter.py
+++ b/src/agents/models/chatcmpl_converter.py
@@ -315,10 +315,18 @@ def extract_all_content(
     def items_to_messages(
         cls,
         items: str | Iterable[TResponseInputItem],
+        preserve_reasoning_message: bool = False,
     ) -> list[ChatCompletionMessageParam]:
         """
         Convert a sequence of 'Item' objects into a list of ChatCompletionMessageParam.
 
+        Args:
+            items: A string or iterable of response input items to convert
+            preserve_reasoning_message: Whether to preserve reasoning messages (thinking blocks)
+                in tool calls for reasoning models like Claude 4 Sonnet/Opus which support
+                interleaved thinking. When True, thinking blocks are reconstructed and
+                included in assistant messages with tool calls.
+
         Rules:
         - EasyInputMessage or InputMessage (role=user) => ChatCompletionUserMessageParam
@@ -512,7 +520,7 @@ def ensure_assistant_message() -> ChatCompletionAssistantMessageParam:
                 content_items = reasoning_item.get("content", [])
                 signature = reasoning_item.get("encrypted_content")
 
-                if content_items:
+                if content_items and preserve_reasoning_message:
                     # Reconstruct thinking blocks from content and signature
                     pending_thinking_blocks = []
                     for content_item in content_items:
diff --git a/tests/test_anthropic_thinking_blocks.py b/tests/test_anthropic_thinking_blocks.py
index 1cc9cdbfe..b52486bfc 100644
--- a/tests/test_anthropic_thinking_blocks.py
+++ b/tests/test_anthropic_thinking_blocks.py
@@ -176,7 +176,7 @@ def test_anthropic_thinking_blocks_with_tool_calls():
         else:
             items_as_dicts.append(cast(dict[str, Any], item))
 
-    messages = Converter.items_to_messages(items_as_dicts)  # type: ignore[arg-type]
+    messages = Converter.items_to_messages(items_as_dicts, preserve_reasoning_message=True)  # type: ignore[arg-type]
 
     # Find the assistant message with tool calls
     assistant_messages = [

From c8d5e4a263e7fb00d6a053338a793d170f7c548d Mon Sep 17 00:00:00 2001
From: Sung-jin Brian Hong
Date: Tue, 16 Sep 2025 17:09:02 +0900
Subject: [PATCH 4/4] refactor: rename preserve_reasoning_message to
 preserve_thinking_blocks

Since this parameter encodes Anthropic-specific logic, rename it to use
Anthropic terminology.
---
 src/agents/extensions/models/litellm_model.py |  4 ++--
 src/agents/models/chatcmpl_converter.py       | 12 ++++++------
 tests/test_anthropic_thinking_blocks.py       |  2 +-
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/agents/extensions/models/litellm_model.py b/src/agents/extensions/models/litellm_model.py
index 9d35194a3..8d39ad390 100644
--- a/src/agents/extensions/models/litellm_model.py
+++ b/src/agents/extensions/models/litellm_model.py
@@ -259,12 +259,12 @@ async def _fetch_response(
     ) -> litellm.types.utils.ModelResponse | tuple[Response, AsyncStream[ChatCompletionChunk]]:
         # Preserve reasoning messages for tool calls when reasoning is on
         # This is needed for models like Claude 4 Sonnet/Opus which support interleaved thinking
-        preserve_reasoning_message = (
+        preserve_thinking_blocks = (
             model_settings.reasoning is not None and model_settings.reasoning.effort is not None
         )
 
         converted_messages = Converter.items_to_messages(
-            input, preserve_reasoning_message=preserve_reasoning_message
+            input, preserve_thinking_blocks=preserve_thinking_blocks
         )
 
         if system_instructions:
diff --git a/src/agents/models/chatcmpl_converter.py b/src/agents/models/chatcmpl_converter.py
index 2599b8191..96f02a5fe 100644
--- a/src/agents/models/chatcmpl_converter.py
+++ b/src/agents/models/chatcmpl_converter.py
@@ -315,17 +315,17 @@ def extract_all_content(
     def items_to_messages(
         cls,
         items: str | Iterable[TResponseInputItem],
-        preserve_reasoning_message: bool = False,
+        preserve_thinking_blocks: bool = False,
     ) -> list[ChatCompletionMessageParam]:
         """
         Convert a sequence of 'Item' objects into a list of ChatCompletionMessageParam.
 
         Args:
             items: A string or iterable of response input items to convert
-            preserve_reasoning_message: Whether to preserve reasoning messages (thinking blocks)
-                in tool calls for reasoning models like Claude 4 Sonnet/Opus which support
-                interleaved thinking. When True, thinking blocks are reconstructed and
-                included in assistant messages with tool calls.
+            preserve_thinking_blocks: Whether to preserve thinking blocks in tool calls
+                for reasoning models like Claude 4 Sonnet/Opus which support interleaved
+                thinking. When True, thinking blocks are reconstructed and included in
+                assistant messages with tool calls.
 
         Rules:
         - EasyInputMessage or InputMessage (role=user) => ChatCompletionUserMessageParam
@@ -520,7 +520,7 @@ def ensure_assistant_message() -> ChatCompletionAssistantMessageParam:
                 content_items = reasoning_item.get("content", [])
                 signature = reasoning_item.get("encrypted_content")
 
-                if content_items and preserve_reasoning_message:
+                if content_items and preserve_thinking_blocks:
                     # Reconstruct thinking blocks from content and signature
                     pending_thinking_blocks = []
                     for content_item in content_items:
diff --git a/tests/test_anthropic_thinking_blocks.py b/tests/test_anthropic_thinking_blocks.py
index b52486bfc..933be2c0e 100644
--- a/tests/test_anthropic_thinking_blocks.py
+++ b/tests/test_anthropic_thinking_blocks.py
@@ -176,7 +176,7 @@ def test_anthropic_thinking_blocks_with_tool_calls():
         else:
             items_as_dicts.append(cast(dict[str, Any], item))
 
-    messages = Converter.items_to_messages(items_as_dicts, preserve_reasoning_message=True)  # type: ignore[arg-type]
+    messages = Converter.items_to_messages(items_as_dicts, preserve_thinking_blocks=True)  # type: ignore[arg-type]
 
     # Find the assistant message with tool calls
     assistant_messages = [
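
A minimal end-to-end sketch of the round trip this series implements,
mirroring the regression test above (the tool call and all values are
illustrative):

    from typing import Any, cast

    from openai.types.chat import ChatCompletionMessageToolCall
    from openai.types.chat.chat_completion_message_tool_call import Function

    from agents.extensions.models.litellm_model import InternalChatCompletionMessage
    from agents.models.chatcmpl_converter import Converter

    message = InternalChatCompletionMessage(
        role="assistant",
        content="Checking the weather now.",
        reasoning_content="Need the weather tool for Tokyo.",
        thinking_blocks=[
            {"type": "thinking", "thinking": "Call get_weather.", "signature": "sig-123"}
        ],
        tool_calls=[
            ChatCompletionMessageToolCall(
                id="call_1",
                type="function",
                function=Function(name="get_weather", arguments='{"city": "Tokyo"}'),
            )
        ],
    )

    # message -> items: the thinking text lands in ResponseReasoningItem.content,
    # the signature in encrypted_content.
    items = Converter.message_to_output_items(message)

    # items -> messages: with preserve_thinking_blocks=True, the assistant
    # message carrying the tool call starts with the reconstructed
    # {"type": "thinking", ...} block, as Anthropic requires.
    dicts = [cast(dict[str, Any], item.model_dump()) for item in items]
    messages = Converter.items_to_messages(dicts, preserve_thinking_blocks=True)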