diff --git a/examples/realtime/app/server.py b/examples/realtime/app/server.py
index 73fcf3e56..4690c3067 100644
--- a/examples/realtime/app/server.py
+++ b/examples/realtime/app/server.py
@@ -111,6 +111,8 @@ async def _serialize_event(self, event: RealtimeSessionEvent) -> dict[str, Any]:
             }
         elif event.type == "error":
             base_event["error"] = str(event.error) if hasattr(event, "error") else "Unknown error"
+        elif event.type == "input_audio_timeout_triggered":
+            pass  # No event-specific fields to add.
         else:
             assert_never(event)
 
diff --git a/src/agents/realtime/config.py b/src/agents/realtime/config.py
index fdbc19074..36254012b 100644
--- a/src/agents/realtime/config.py
+++ b/src/agents/realtime/config.py
@@ -78,6 +78,9 @@ class RealtimeTurnDetectionConfig(TypedDict):
     threshold: NotRequired[float]
     """The threshold for voice activity detection."""
 
+    idle_timeout_ms: NotRequired[int]
+    """Threshold for server-vad to trigger a response if the user is idle for this duration."""
+
 
 class RealtimeSessionModelSettings(TypedDict):
     """Model settings for a realtime model session."""
diff --git a/src/agents/realtime/events.py b/src/agents/realtime/events.py
index 93248b611..3c523c33b 100644
--- a/src/agents/realtime/events.py
+++ b/src/agents/realtime/events.py
@@ -216,6 +216,16 @@ class RealtimeGuardrailTripped:
     type: Literal["guardrail_tripped"] = "guardrail_tripped"
 
 
+@dataclass
+class RealtimeInputAudioTimeoutTriggered:
+    """Called when the model detects a period of inactivity/silence from the user."""
+
+    info: RealtimeEventInfo
+    """Common info for all events, such as the context."""
+
+    type: Literal["input_audio_timeout_triggered"] = "input_audio_timeout_triggered"
+
+
 RealtimeSessionEvent: TypeAlias = Union[
     RealtimeAgentStartEvent,
     RealtimeAgentEndEvent,
@@ -230,5 +240,6 @@ class RealtimeGuardrailTripped:
     RealtimeHistoryUpdated,
     RealtimeHistoryAdded,
     RealtimeGuardrailTripped,
+    RealtimeInputAudioTimeoutTriggered,
 ]
 """An event emitted by the realtime session."""
diff --git a/src/agents/realtime/model_events.py b/src/agents/realtime/model_events.py
index 5aeadc0f9..a6d0bdecb 100644
--- a/src/agents/realtime/model_events.py
+++ b/src/agents/realtime/model_events.py
@@ -84,6 +84,17 @@ class RealtimeModelInputAudioTranscriptionCompletedEvent:
 
     type: Literal["input_audio_transcription_completed"] = "input_audio_transcription_completed"
 
+
+@dataclass
+class RealtimeModelInputAudioTimeoutTriggeredEvent:
+    """Input audio timeout triggered."""
+
+    item_id: str
+    audio_start_ms: int
+    audio_end_ms: int
+
+    type: Literal["input_audio_timeout_triggered"] = "input_audio_timeout_triggered"
+
 
 @dataclass
 class RealtimeModelTranscriptDeltaEvent:
@@ -174,6 +185,7 @@ class RealtimeModelRawServerEvent:
     RealtimeModelAudioEvent,
     RealtimeModelAudioInterruptedEvent,
     RealtimeModelAudioDoneEvent,
+    RealtimeModelInputAudioTimeoutTriggeredEvent,
     RealtimeModelInputAudioTranscriptionCompletedEvent,
     RealtimeModelTranscriptDeltaEvent,
     RealtimeModelItemUpdatedEvent,
diff --git a/src/agents/realtime/openai_realtime.py b/src/agents/realtime/openai_realtime.py
index bbeda20f1..b483308d3 100644
--- a/src/agents/realtime/openai_realtime.py
+++ b/src/agents/realtime/openai_realtime.py
@@ -6,7 +6,7 @@
 import json
 import os
 from datetime import datetime
-from typing import Any, Callable, Literal
+from typing import Annotated, Any, Callable, Literal, Union
 
 import pydantic
 import websockets
@@ -52,7 +52,7 @@
     SessionTracingTracingConfiguration as OpenAISessionTracingConfiguration,
     SessionUpdateEvent as OpenAISessionUpdateEvent,
 )
-from pydantic import TypeAdapter
+from pydantic import BaseModel, Field, TypeAdapter
 from typing_extensions import assert_never
 from websockets.asyncio.client import ClientConnection
 
@@ -83,6 +83,7 @@
     RealtimeModelErrorEvent,
     RealtimeModelEvent,
     RealtimeModelExceptionEvent,
+    RealtimeModelInputAudioTimeoutTriggeredEvent,
     RealtimeModelInputAudioTranscriptionCompletedEvent,
     RealtimeModelItemDeletedEvent,
     RealtimeModelItemUpdatedEvent,
@@ -128,6 +129,36 @@ async def get_api_key(key: str | Callable[[], MaybeAwaitable[str]] | None) -> st
     return os.getenv("OPENAI_API_KEY")
 
 
+class _InputAudioBufferTimeoutTriggeredEvent(BaseModel):
+    """Payload of the "input_audio_buffer.timeout_triggered" server event."""
+
+    type: Literal["input_audio_buffer.timeout_triggered"]
+    event_id: str
+    audio_start_ms: int
+    audio_end_ms: int
+    item_id: str
+
+
+AllRealtimeServerEvents = Annotated[
+    Union[
+        OpenAIRealtimeServerEvent,
+        _InputAudioBufferTimeoutTriggeredEvent,
+    ],
+    Field(discriminator="type"),
+]
+
+# Building a TypeAdapter is costly, so one instance is created lazily and reused.
+_server_event_adapter: TypeAdapter[AllRealtimeServerEvents] | None = None
+
+
+def _get_server_event_adapter() -> TypeAdapter[AllRealtimeServerEvents]:
+    """Return the shared server-event adapter, creating it on first use."""
+    global _server_event_adapter
+    if _server_event_adapter is None:
+        _server_event_adapter = TypeAdapter(AllRealtimeServerEvents)
+    return _server_event_adapter
+
+
 class OpenAIRealtimeWebSocketModel(RealtimeModel):
     """A model that uses OpenAI's WebSocket API."""
 
@@ -462,8 +493,6 @@ async def _handle_ws_event(self, event: dict[str, Any]):
         try:
             if "previous_item_id" in event and event["previous_item_id"] is None:
                 event["previous_item_id"] = ""  # TODO (rm) remove
-            parsed: OpenAIRealtimeServerEvent = TypeAdapter(
-                OpenAIRealtimeServerEvent
-            ).validate_python(event)
+            parsed: AllRealtimeServerEvents = _get_server_event_adapter().validate_python(event)
         except pydantic.ValidationError as e:
             logger.error(f"Failed to validate server event: {event}", exc_info=True)
@@ -554,6 +583,14 @@ async def _handle_ws_event(self, event: dict[str, Any]):
             or parsed.type == "response.output_item.done"
         ):
             await self._handle_output_item(parsed.item)
+        elif parsed.type == "input_audio_buffer.timeout_triggered":
+            await self._emit_event(
+                RealtimeModelInputAudioTimeoutTriggeredEvent(
+                    item_id=parsed.item_id,
+                    audio_start_ms=parsed.audio_start_ms,
+                    audio_end_ms=parsed.audio_end_ms,
+                )
+            )
 
     def _update_created_session(self, session: OpenAISessionObject) -> None:
         self._created_session = session
diff --git a/src/agents/realtime/session.py b/src/agents/realtime/session.py
index 42d61cf2b..c309a2655 100644
--- a/src/agents/realtime/session.py
+++ b/src/agents/realtime/session.py
@@ -28,6 +28,7 @@
     RealtimeHandoffEvent,
     RealtimeHistoryAdded,
     RealtimeHistoryUpdated,
+    RealtimeInputAudioTimeoutTriggered,
     RealtimeRawModelEvent,
     RealtimeSessionEvent,
     RealtimeToolEnd,
@@ -227,6 +228,12 @@ async def on_event(self, event: RealtimeModelEvent) -> None:
             await self._put_event(
                 RealtimeHistoryUpdated(info=self._event_info, history=self._history)
             )
+        elif event.type == "input_audio_timeout_triggered":
+            await self._put_event(
+                RealtimeInputAudioTimeoutTriggered(
+                    info=self._event_info,
+                )
+            )
         elif event.type == "transcript_delta":
             # Accumulate transcript text for guardrail debouncing per item_id
             item_id = event.item_id