Skip to content

Commit 83bb95b

Browse files
Merge branch 'main' into main
2 parents 2288fe4 + aeaf83f commit 83bb95b

File tree

9 files changed

+326
-19
lines changed

9 files changed

+326
-19
lines changed

docs/ref/realtime/config.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
## Audio Configuration
1212

1313
::: agents.realtime.config.RealtimeInputAudioTranscriptionConfig
14+
::: agents.realtime.config.RealtimeInputAudioNoiseReductionConfig
1415
::: agents.realtime.config.RealtimeTurnDetectionConfig
1516

1617
## Guardrails Settings

src/agents/extensions/models/litellm_model.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,15 @@ async def _fetch_response(
257257
stream: bool = False,
258258
prompt: Any | None = None,
259259
) -> litellm.types.utils.ModelResponse | tuple[Response, AsyncStream[ChatCompletionChunk]]:
260-
converted_messages = Converter.items_to_messages(input)
260+
# Preserve reasoning messages for tool calls when reasoning is on
261+
# This is needed for models like Claude 4 Sonnet/Opus which support interleaved thinking
262+
preserve_thinking_blocks = (
263+
model_settings.reasoning is not None and model_settings.reasoning.effort is not None
264+
)
265+
266+
converted_messages = Converter.items_to_messages(
267+
input, preserve_thinking_blocks=preserve_thinking_blocks
268+
)
261269

262270
if system_instructions:
263271
converted_messages.insert(

src/agents/models/chatcmpl_converter.py

Lines changed: 65 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
ResponseReasoningItemParam,
4040
)
4141
from openai.types.responses.response_input_param import FunctionCallOutput, ItemReference, Message
42-
from openai.types.responses.response_reasoning_item import Summary
42+
from openai.types.responses.response_reasoning_item import Content, Summary
4343

4444
from ..agent_output import AgentOutputSchemaBase
4545
from ..exceptions import AgentsException, UserError
@@ -93,24 +93,38 @@ def convert_response_format(
9393
def message_to_output_items(cls, message: ChatCompletionMessage) -> list[TResponseOutputItem]:
9494
items: list[TResponseOutputItem] = []
9595

96-
# Handle reasoning content if available
96+
# Check if message is agents.extentions.models.litellm_model.InternalChatCompletionMessage
97+
# We can't actually import it here because litellm is an optional dependency
98+
# So we use hasattr to check for reasoning_content and thinking_blocks
9799
if hasattr(message, "reasoning_content") and message.reasoning_content:
98100
reasoning_item = ResponseReasoningItem(
99101
id=FAKE_RESPONSES_ID,
100102
summary=[Summary(text=message.reasoning_content, type="summary_text")],
101103
type="reasoning",
102104
)
103105

104-
# Store full thinking blocks for Anthropic compatibility
106+
# Store thinking blocks for Anthropic compatibility
105107
if hasattr(message, "thinking_blocks") and message.thinking_blocks:
106-
# Store thinking blocks in the reasoning item's content
107-
# Convert thinking blocks to Content objects
108-
from openai.types.responses.response_reasoning_item import Content
109-
110-
reasoning_item.content = [
111-
Content(text=str(block.get("thinking", "")), type="reasoning_text")
112-
for block in message.thinking_blocks
113-
]
108+
# Store thinking text in content and signature in encrypted_content
109+
reasoning_item.content = []
110+
signature = None
111+
for block in message.thinking_blocks:
112+
if isinstance(block, dict):
113+
thinking_text = block.get("thinking", "")
114+
if thinking_text:
115+
reasoning_item.content.append(
116+
Content(text=thinking_text, type="reasoning_text")
117+
)
118+
# Store the signature if present
119+
if block.get("signature"):
120+
signature = block.get("signature")
121+
122+
# Store only the last signature in encrypted_content
123+
# If there are multiple thinking blocks, this should be a problem.
124+
# In practice, there should only be one signature for the entire reasoning step.
125+
# Tested with: claude-sonnet-4-20250514
126+
if signature:
127+
reasoning_item.encrypted_content = signature
114128

115129
items.append(reasoning_item)
116130

@@ -301,10 +315,18 @@ def extract_all_content(
301315
def items_to_messages(
302316
cls,
303317
items: str | Iterable[TResponseInputItem],
318+
preserve_thinking_blocks: bool = False,
304319
) -> list[ChatCompletionMessageParam]:
305320
"""
306321
Convert a sequence of 'Item' objects into a list of ChatCompletionMessageParam.
307322
323+
Args:
324+
items: A string or iterable of response input items to convert
325+
preserve_thinking_blocks: Whether to preserve thinking blocks in tool calls
326+
for reasoning models like Claude 4 Sonnet/Opus which support interleaved
327+
thinking. When True, thinking blocks are reconstructed and included in
328+
assistant messages with tool calls.
329+
308330
Rules:
309331
- EasyInputMessage or InputMessage (role=user) => ChatCompletionUserMessageParam
310332
- EasyInputMessage or InputMessage (role=system) => ChatCompletionSystemMessageParam
@@ -325,6 +347,7 @@ def items_to_messages(
325347

326348
result: list[ChatCompletionMessageParam] = []
327349
current_assistant_msg: ChatCompletionAssistantMessageParam | None = None
350+
pending_thinking_blocks: list[dict[str, str]] | None = None
328351

329352
def flush_assistant_message() -> None:
330353
nonlocal current_assistant_msg
@@ -336,10 +359,11 @@ def flush_assistant_message() -> None:
336359
current_assistant_msg = None
337360

338361
def ensure_assistant_message() -> ChatCompletionAssistantMessageParam:
339-
nonlocal current_assistant_msg
362+
nonlocal current_assistant_msg, pending_thinking_blocks
340363
if current_assistant_msg is None:
341364
current_assistant_msg = ChatCompletionAssistantMessageParam(role="assistant")
342365
current_assistant_msg["tool_calls"] = []
366+
343367
return current_assistant_msg
344368

345369
for item in items:
@@ -455,6 +479,13 @@ def ensure_assistant_message() -> ChatCompletionAssistantMessageParam:
455479

456480
elif func_call := cls.maybe_function_tool_call(item):
457481
asst = ensure_assistant_message()
482+
483+
# If we have pending thinking blocks, use them as the content
484+
# This is required for Anthropic API tool calls with interleaved thinking
485+
if pending_thinking_blocks:
486+
asst["content"] = pending_thinking_blocks # type: ignore
487+
pending_thinking_blocks = None # Clear after using
488+
458489
tool_calls = list(asst.get("tool_calls", []))
459490
arguments = func_call["arguments"] if func_call["arguments"] else "{}"
460491
new_tool_call = ChatCompletionMessageFunctionToolCallParam(
@@ -483,9 +514,28 @@ def ensure_assistant_message() -> ChatCompletionAssistantMessageParam:
483514
f"Encountered an item_reference, which is not supported: {item_ref}"
484515
)
485516

486-
# 7) reasoning message => not handled
487-
elif cls.maybe_reasoning_message(item):
488-
pass
517+
# 7) reasoning message => extract thinking blocks if present
518+
elif reasoning_item := cls.maybe_reasoning_message(item):
519+
# Reconstruct thinking blocks from content (text) and encrypted_content (signature)
520+
content_items = reasoning_item.get("content", [])
521+
signature = reasoning_item.get("encrypted_content")
522+
523+
if content_items and preserve_thinking_blocks:
524+
# Reconstruct thinking blocks from content and signature
525+
pending_thinking_blocks = []
526+
for content_item in content_items:
527+
if (
528+
isinstance(content_item, dict)
529+
and content_item.get("type") == "reasoning_text"
530+
):
531+
thinking_block = {
532+
"type": "thinking",
533+
"thinking": content_item.get("text", ""),
534+
}
535+
# Add signature if available
536+
if signature:
537+
thinking_block["signature"] = signature
538+
pending_thinking_blocks.append(thinking_block)
489539

490540
# 8) If we haven't recognized it => fail or ignore
491541
else:

src/agents/models/chatcmpl_stream_handler.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,9 @@ class StreamingState:
6262
# Fields for real-time function call streaming
6363
function_call_streaming: dict[int, bool] = field(default_factory=dict)
6464
function_call_output_idx: dict[int, int] = field(default_factory=dict)
65+
# Store accumulated thinking text and signature for Anthropic compatibility
66+
thinking_text: str = ""
67+
thinking_signature: str | None = None
6568

6669

6770
class SequenceNumber:
@@ -101,6 +104,19 @@ async def handle_stream(
101104

102105
delta = chunk.choices[0].delta
103106

107+
# Handle thinking blocks from Anthropic (for preserving signatures)
108+
if hasattr(delta, "thinking_blocks") and delta.thinking_blocks:
109+
for block in delta.thinking_blocks:
110+
if isinstance(block, dict):
111+
# Accumulate thinking text
112+
thinking_text = block.get("thinking", "")
113+
if thinking_text:
114+
state.thinking_text += thinking_text
115+
# Store signature if present
116+
signature = block.get("signature")
117+
if signature:
118+
state.thinking_signature = signature
119+
104120
# Handle reasoning content for reasoning summaries
105121
if hasattr(delta, "reasoning_content"):
106122
reasoning_content = delta.reasoning_content
@@ -527,7 +543,19 @@ async def handle_stream(
527543

528544
# include Reasoning item if it exists
529545
if state.reasoning_content_index_and_output:
530-
outputs.append(state.reasoning_content_index_and_output[1])
546+
reasoning_item = state.reasoning_content_index_and_output[1]
547+
# Store thinking text in content and signature in encrypted_content
548+
if state.thinking_text:
549+
# Add thinking text as a Content object
550+
if not reasoning_item.content:
551+
reasoning_item.content = []
552+
reasoning_item.content.append(
553+
Content(text=state.thinking_text, type="reasoning_text")
554+
)
555+
# Store signature in encrypted_content
556+
if state.thinking_signature:
557+
reasoning_item.encrypted_content = state.thinking_signature
558+
outputs.append(reasoning_item)
531559

532560
# include text or refusal content if they exist
533561
if state.text_content_index_and_output or state.refusal_content_index_and_output:

src/agents/realtime/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
RealtimeAudioFormat,
44
RealtimeClientMessage,
55
RealtimeGuardrailsSettings,
6+
RealtimeInputAudioNoiseReductionConfig,
67
RealtimeInputAudioTranscriptionConfig,
78
RealtimeModelName,
89
RealtimeModelTracingConfig,
@@ -101,6 +102,7 @@
101102
"RealtimeAudioFormat",
102103
"RealtimeClientMessage",
103104
"RealtimeGuardrailsSettings",
105+
"RealtimeInputAudioNoiseReductionConfig",
104106
"RealtimeInputAudioTranscriptionConfig",
105107
"RealtimeModelName",
106108
"RealtimeModelTracingConfig",

src/agents/realtime/config.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,13 @@ class RealtimeInputAudioTranscriptionConfig(TypedDict):
6161
"""An optional prompt to guide transcription."""
6262

6363

64+
class RealtimeInputAudioNoiseReductionConfig(TypedDict):
65+
"""Noise reduction configuration for input audio."""
66+
67+
type: NotRequired[Literal["near_field", "far_field"]]
68+
"""Noise reduction mode to apply to input audio."""
69+
70+
6471
class RealtimeTurnDetectionConfig(TypedDict):
6572
"""Turn detection config. Allows extra vendor keys if needed."""
6673

@@ -119,6 +126,9 @@ class RealtimeSessionModelSettings(TypedDict):
119126
input_audio_transcription: NotRequired[RealtimeInputAudioTranscriptionConfig]
120127
"""Configuration for transcribing input audio."""
121128

129+
input_audio_noise_reduction: NotRequired[RealtimeInputAudioNoiseReductionConfig | None]
130+
"""Noise reduction configuration for input audio."""
131+
122132
turn_detection: NotRequired[RealtimeTurnDetectionConfig]
123133
"""Configuration for detecting conversation turns."""
124134

src/agents/realtime/openai_realtime.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -825,14 +825,24 @@ def _get_session_config(
825825
"output_audio_format",
826826
DEFAULT_MODEL_SETTINGS.get("output_audio_format"),
827827
)
828+
input_audio_noise_reduction = model_settings.get(
829+
"input_audio_noise_reduction",
830+
DEFAULT_MODEL_SETTINGS.get("input_audio_noise_reduction"),
831+
)
828832

829833
input_audio_config = None
830834
if any(
831835
value is not None
832-
for value in [input_audio_format, input_audio_transcription, turn_detection]
836+
for value in [
837+
input_audio_format,
838+
input_audio_noise_reduction,
839+
input_audio_transcription,
840+
turn_detection,
841+
]
833842
):
834843
input_audio_config = OpenAIRealtimeAudioInput(
835844
format=to_realtime_audio_format(input_audio_format),
845+
noise_reduction=cast(Any, input_audio_noise_reduction),
836846
transcription=cast(Any, input_audio_transcription),
837847
turn_detection=cast(Any, turn_detection),
838848
)

tests/realtime/test_openai_realtime.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import json
12
from typing import Any, cast
23
from unittest.mock import AsyncMock, Mock, patch
34

@@ -96,6 +97,88 @@ def mock_create_task_func(coro):
9697
assert model._websocket_task is not None
9798
assert model.model == "gpt-4o-realtime-preview"
9899

100+
@pytest.mark.asyncio
101+
async def test_session_update_includes_noise_reduction(self, model, mock_websocket):
102+
"""Session.update should pass through input_audio_noise_reduction config."""
103+
config = {
104+
"api_key": "test-api-key-123",
105+
"initial_model_settings": {
106+
"model_name": "gpt-4o-realtime-preview",
107+
"input_audio_noise_reduction": {"type": "near_field"},
108+
},
109+
}
110+
111+
sent_messages: list[dict[str, Any]] = []
112+
113+
async def async_websocket(*args, **kwargs):
114+
async def send(payload: str):
115+
sent_messages.append(json.loads(payload))
116+
return None
117+
118+
mock_websocket.send.side_effect = send
119+
return mock_websocket
120+
121+
with patch("websockets.connect", side_effect=async_websocket):
122+
with patch("asyncio.create_task") as mock_create_task:
123+
mock_task = AsyncMock()
124+
125+
def mock_create_task_func(coro):
126+
coro.close()
127+
return mock_task
128+
129+
mock_create_task.side_effect = mock_create_task_func
130+
await model.connect(config)
131+
132+
# Find the session.update events
133+
session_updates = [m for m in sent_messages if m.get("type") == "session.update"]
134+
assert len(session_updates) >= 1
135+
# Verify the last session.update contains the noise_reduction field
136+
session = session_updates[-1]["session"]
137+
assert session.get("audio", {}).get("input", {}).get("noise_reduction") == {
138+
"type": "near_field"
139+
}
140+
141+
@pytest.mark.asyncio
142+
async def test_session_update_omits_noise_reduction_when_not_provided(
143+
self, model, mock_websocket
144+
):
145+
"""Session.update should omit input_audio_noise_reduction when not provided."""
146+
config = {
147+
"api_key": "test-api-key-123",
148+
"initial_model_settings": {
149+
"model_name": "gpt-4o-realtime-preview",
150+
},
151+
}
152+
153+
sent_messages: list[dict[str, Any]] = []
154+
155+
async def async_websocket(*args, **kwargs):
156+
async def send(payload: str):
157+
sent_messages.append(json.loads(payload))
158+
return None
159+
160+
mock_websocket.send.side_effect = send
161+
return mock_websocket
162+
163+
with patch("websockets.connect", side_effect=async_websocket):
164+
with patch("asyncio.create_task") as mock_create_task:
165+
mock_task = AsyncMock()
166+
167+
def mock_create_task_func(coro):
168+
coro.close()
169+
return mock_task
170+
171+
mock_create_task.side_effect = mock_create_task_func
172+
await model.connect(config)
173+
174+
# Find the session.update events
175+
session_updates = [m for m in sent_messages if m.get("type") == "session.update"]
176+
assert len(session_updates) >= 1
177+
# Verify the last session.update omits the noise_reduction field
178+
session = session_updates[-1]["session"]
179+
assert "audio" in session and "input" in session["audio"]
180+
assert "noise_reduction" not in session["audio"]["input"]
181+
99182
@pytest.mark.asyncio
100183
async def test_connect_with_custom_headers_overrides_defaults(self, model, mock_websocket):
101184
"""If custom headers are provided, use them verbatim without adding defaults."""

0 commit comments

Comments
 (0)