From 653cd2316d99132cddb555a65eab5b880a15c26c Mon Sep 17 00:00:00 2001
From: Rohan Mehta
Date: Wed, 16 Jul 2025 15:16:32 -0400
Subject: [PATCH] Realtime docs

---
 docs/realtime/guide.md        | 143 +++++++++++++++++++++
 docs/realtime/quickstart.md   | 175 ++++++++++++++++++++++++++++++++++
 docs/ref/realtime/agent.md    |   3 +
 docs/ref/realtime/config.md   |  41 ++++++++
 docs/ref/realtime/events.md   |  36 +++++++
 docs/ref/realtime/runner.md   |   3 +
 docs/ref/realtime/session.md  |   3 +
 mkdocs.yml                    |  12 +++
 src/agents/realtime/config.py |  58 +++++++++++
 src/agents/realtime/items.py  |  83 ++++++++++++++++
 10 files changed, 557 insertions(+)
 create mode 100644 docs/realtime/guide.md
 create mode 100644 docs/realtime/quickstart.md
 create mode 100644 docs/ref/realtime/agent.md
 create mode 100644 docs/ref/realtime/config.md
 create mode 100644 docs/ref/realtime/events.md
 create mode 100644 docs/ref/realtime/runner.md
 create mode 100644 docs/ref/realtime/session.md

diff --git a/docs/realtime/guide.md b/docs/realtime/guide.md
new file mode 100644
index 000000000..9ea2525cf
--- /dev/null
+++ b/docs/realtime/guide.md
@@ -0,0 +1,143 @@
+# Guide
+
+This guide provides an in-depth look at building voice-enabled AI agents using the OpenAI Agents SDK's realtime capabilities.
+
+!!! warning "Beta feature"
+    Realtime agents are in beta. Expect some breaking changes as we improve the implementation.
+
+## Overview
+
+Realtime agents allow for conversational flows, processing audio and text inputs in real time and responding with realtime audio. They maintain persistent connections with OpenAI's Realtime API, enabling natural voice conversations with low latency and the ability to handle interruptions gracefully.
+
+## Architecture
+
+### Core components
+
+The realtime system consists of several key components:
+
+- **RealtimeAgent**: An agent, configured with instructions, tools, and handoffs.
+- **RealtimeRunner**: Manages configuration. You can call `runner.run()` to get a session.
+- **RealtimeSession**: A single interaction session. You typically create one each time a user starts a conversation, and keep it alive until the conversation is done.
+- **RealtimeModel**: The underlying model interface (typically OpenAI's WebSocket implementation).
+
+### Session flow
+
+A typical realtime session follows this flow:
+
+1. **Create your RealtimeAgent(s)** with instructions, tools, and handoffs.
+2. **Set up a RealtimeRunner** with the agent and configuration options.
+3. **Start the session** using `await runner.run()`, which returns a RealtimeSession.
+4. **Send audio or text messages** to the session using `send_audio()` or `send_message()`.
+5. **Listen for events** by iterating over the session - events include audio output, transcripts, tool calls, handoffs, and errors.
+6. **Handle interruptions** when users speak over the agent, which automatically stops current audio generation.
+
+The session maintains the conversation history and manages the persistent connection with the realtime model.
+
+## Agent configuration
+
+RealtimeAgent works similarly to the regular Agent class, with some key differences. For full API details, see the [`RealtimeAgent`][agents.realtime.agent.RealtimeAgent] API reference.
+
+Key differences from regular agents:
+
+- Model choice is configured at the session level, not the agent level.
+- No structured output support (`output_type` is not supported).
+- Voice can be configured per agent but cannot be changed after the first agent speaks.
+- All other features, like tools, handoffs, and instructions, work the same way.
+
+## Session configuration
+
+### Model settings
+
+The session configuration allows you to control the underlying realtime model's behavior. You can configure the model name (such as `gpt-4o-realtime-preview`), voice selection (alloy, echo, fable, onyx, nova, shimmer), and supported modalities (text and/or audio). Audio formats can be set for both input and output, with PCM16 being the default.
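+
+For example, a minimal runner configuration, mirroring the quickstart, might look like this:
+
+```python
+from agents.realtime import RealtimeAgent, RealtimeRunner
+
+agent = RealtimeAgent(
+    name="Assistant",
+    instructions="You are a helpful voice assistant.",
+)
+
+runner = RealtimeRunner(
+    starting_agent=agent,
+    config={
+        "model_settings": {
+            "model_name": "gpt-4o-realtime-preview",
+            "voice": "alloy",
+            "modalities": ["text", "audio"],
+        }
+    },
+)
+```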
+
+### Audio configuration
+
+Audio settings control how the session handles voice input and output. You can configure input audio transcription using models like Whisper, set language preferences, and provide transcription prompts to improve accuracy for domain-specific terms. Turn detection settings control when the agent should start and stop responding, with options for voice activity detection thresholds, silence duration, and padding around detected speech.
+
+## Tools and functions
+
+### Adding tools
+
+Just like regular agents, realtime agents support function tools that execute during conversations:
+
+```python
+from agents import function_tool
+from agents.realtime import RealtimeAgent
+
+@function_tool
+def get_weather(city: str) -> str:
+    """Get current weather for a city."""
+    # Your weather API logic here
+    return f"The weather in {city} is sunny, 72°F"
+
+@function_tool
+def book_appointment(date: str, time: str, service: str) -> str:
+    """Book an appointment."""
+    # Your booking logic here
+    return f"Appointment booked for {service} on {date} at {time}"
+
+agent = RealtimeAgent(
+    name="Assistant",
+    instructions="You can help with weather and appointments.",
+    tools=[get_weather, book_appointment],
+)
+```
+
+## Handoffs
+
+### Creating handoffs
+
+Handoffs allow transferring conversations between specialized agents.
+
+```python
+from agents.realtime import RealtimeAgent, realtime_handoff
+
+# Specialized agents
+billing_agent = RealtimeAgent(
+    name="Billing Support",
+    instructions="You specialize in billing and payment issues.",
+)
+
+technical_agent = RealtimeAgent(
+    name="Technical Support",
+    instructions="You handle technical troubleshooting.",
+)
+
+# Main agent with handoffs
+main_agent = RealtimeAgent(
+    name="Customer Service",
+    instructions="You are the main customer service agent. Hand off to specialists when needed.",
+    handoffs=[
+        realtime_handoff(billing_agent, tool_description="Transfer to billing support"),
+        realtime_handoff(technical_agent, tool_description="Transfer to technical support"),
+    ],
+)
+```
+
+## Event handling
+
+The session streams events that you can listen to by iterating over the session object. Events include audio output chunks, transcription results, tool execution start and end, agent handoffs, and errors. Key events to handle include:
+
+- **audio**: Raw audio data from the agent's response
+- **audio_end**: The agent finished speaking
+- **audio_interrupted**: The user interrupted the agent
+- **tool_start/tool_end**: Tool execution lifecycle
+- **handoff**: An agent handoff occurred
+- **error**: An error occurred during processing
+
+For complete event details, see [`RealtimeSessionEvent`][agents.realtime.events.RealtimeSessionEvent].
+
+## Guardrails
+
+Only output guardrails are supported for realtime agents. These guardrails are debounced and run periodically (not on every word) to avoid performance issues during real-time generation. The default debounce length is 100 characters, but this is configurable.
+
+When a guardrail is triggered, it generates a `guardrail_tripped` event and can interrupt the agent's current response. The debounce behavior helps balance safety with real-time performance requirements. Unlike text agents, realtime agents do **not** raise an exception when guardrails are tripped.
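+
+Output guardrails are passed in through the run config's `output_guardrails` list. As a sketch, assuming the core SDK's `OutputGuardrail` and `GuardrailFunctionOutput` types, a simple guardrail might look like this (the check itself is illustrative):
+
+```python
+from agents import GuardrailFunctionOutput, OutputGuardrail
+from agents.realtime import RealtimeRunner
+
+def no_account_numbers(context, agent, output):
+    # Illustrative policy check; replace with your own logic
+    return GuardrailFunctionOutput(
+        output_info=None,
+        tripwire_triggered="account number" in output,
+    )
+
+runner = RealtimeRunner(
+    starting_agent=agent,  # e.g. the main_agent defined above
+    config={
+        "output_guardrails": [OutputGuardrail(guardrail_function=no_account_numbers)],
+    },
+)
+```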
+
+## Audio processing
+
+Send audio to the session using [`session.send_audio(audio_bytes)`][agents.realtime.session.RealtimeSession.send_audio], or send text using [`session.send_message()`][agents.realtime.session.RealtimeSession.send_message].
+
+For audio output, listen for `audio` events and play the audio data through your preferred audio library. Make sure to listen for `audio_interrupted` events to stop playback immediately and clear any queued audio when the user interrupts the agent.
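+
+A minimal sketch of that output loop, assuming hypothetical `play_audio` and `stop_playback` helpers from your audio library (the exact event payload fields may differ):
+
+```python
+async def play_session_audio(session):
+    async for event in session:
+        if event.type == "audio":
+            play_audio(event.audio.data)  # hypothetical playback helper
+        elif event.type == "audio_interrupted":
+            stop_playback()  # hypothetical: stop and clear queued audio immediately
+```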
+
+## Examples
+
+For complete working examples, check out the [examples/realtime directory](https://github.com/openai/openai-agents-python/tree/main/examples/realtime), which includes demos with and without UI components.
diff --git a/docs/realtime/quickstart.md b/docs/realtime/quickstart.md
new file mode 100644
index 000000000..2cee550ea
--- /dev/null
+++ b/docs/realtime/quickstart.md
@@ -0,0 +1,175 @@
+# Quickstart
+
+Realtime agents enable voice conversations with your AI agents using OpenAI's Realtime API. This guide walks you through creating your first realtime voice agent.
+
+!!! warning "Beta feature"
+    Realtime agents are in beta. Expect some breaking changes as we improve the implementation.
+
+## Prerequisites
+
+- Python 3.9 or higher
+- An OpenAI API key
+- Basic familiarity with the OpenAI Agents SDK
+
+## Installation
+
+If you haven't already, install the OpenAI Agents SDK:
+
+```bash
+pip install openai-agents
+```
+
+## Creating your first realtime agent
+
+### 1. Import required components
+
+```python
+import asyncio
+from agents.realtime import RealtimeAgent, RealtimeRunner
+```
+
+### 2. Create a realtime agent
+
+```python
+agent = RealtimeAgent(
+    name="Assistant",
+    instructions="You are a helpful voice assistant. Keep your responses conversational and friendly.",
+)
+```
+
+### 3. Set up the runner
+
+```python
+runner = RealtimeRunner(
+    starting_agent=agent,
+    config={
+        "model_settings": {
+            "model_name": "gpt-4o-realtime-preview",
+            "voice": "alloy",
+            "modalities": ["text", "audio"],
+        }
+    }
+)
+```
+
+### 4. Start a session
+
+```python
+async def main():
+    # Start the realtime session
+    session = await runner.run()
+
+    async with session:
+        # Send a text message to start the conversation
+        await session.send_message("Hello! How are you today?")
+
+        # The agent will stream back audio in real-time (not shown in this example)
+        # Listen for events from the session
+        async for event in session:
+            if event.type == "response.audio_transcript.done":
+                print(f"Assistant: {event.transcript}")
+            elif event.type == "conversation.item.input_audio_transcription.completed":
+                print(f"User: {event.transcript}")
+
+# Run the session
+asyncio.run(main())
+```
+
+## Complete example
+
+Here's a complete working example:
+
+```python
+import asyncio
+from agents.realtime import RealtimeAgent, RealtimeRunner
+
+async def main():
+    # Create the agent
+    agent = RealtimeAgent(
+        name="Assistant",
+        instructions="You are a helpful voice assistant. Keep responses brief and conversational.",
+    )
+
+    # Set up the runner with configuration
+    runner = RealtimeRunner(
+        starting_agent=agent,
+        config={
+            "model_settings": {
+                "model_name": "gpt-4o-realtime-preview",
+                "voice": "alloy",
+                "modalities": ["text", "audio"],
+                "input_audio_transcription": {
+                    "model": "whisper-1"
+                },
+                "turn_detection": {
+                    "type": "server_vad",
+                    "threshold": 0.5,
+                    "prefix_padding_ms": 300,
+                    "silence_duration_ms": 200
+                }
+            }
+        }
+    )
+
+    # Start the session
+    session = await runner.run()
+
+    async with session:
+        print("Session started! The agent will stream audio responses in real-time.")
+
+        # Process events
+        async for event in session:
+            if event.type == "response.audio_transcript.done":
+                print(f"Assistant: {event.transcript}")
+            elif event.type == "conversation.item.input_audio_transcription.completed":
+                print(f"User: {event.transcript}")
+            elif event.type == "error":
+                print(f"Error: {event.error}")
+                break
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+## Configuration options
+
+### Model settings
+
+- `model_name`: Choose from available realtime models (e.g., `gpt-4o-realtime-preview`)
+- `voice`: Select a voice (`alloy`, `echo`, `fable`, `onyx`, `nova`, `shimmer`)
+- `modalities`: Enable text and/or audio (`["text", "audio"]`)
+
+### Audio settings
+
+- `input_audio_format`: Format for input audio (`pcm16`, `g711_ulaw`, `g711_alaw`)
+- `output_audio_format`: Format for output audio
+- `input_audio_transcription`: Transcription configuration
+
+### Turn detection
+
+- `type`: Detection method (`server_vad`, `semantic_vad`)
+- `threshold`: Voice activity threshold (0.0-1.0)
+- `silence_duration_ms`: Silence duration to detect turn end
+- `prefix_padding_ms`: Audio padding before speech
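+
+The complete example above uses `server_vad`. As a sketch, semantic turn detection can be selected instead; both `semantic_vad` and `eagerness` appear in the session configuration types:
+
+```python
+# Illustrative: swap the turn_detection block in the config above
+config = {
+    "model_settings": {
+        "turn_detection": {
+            "type": "semantic_vad",
+            "eagerness": "auto",
+        }
+    }
+}
+```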
+
+## Next steps
+
+- [Learn more about realtime agents](guide.md)
+- Check out working examples in the [examples/realtime](https://github.com/openai/openai-agents-python/tree/main/examples/realtime) folder
+- Add tools to your agent
+- Implement handoffs between agents
+- Set up guardrails for safety
+
+## Authentication
+
+Make sure your OpenAI API key is set in your environment:
+
+```bash
+export OPENAI_API_KEY="your-api-key-here"
+```
+
+Or pass it directly when creating the session:
+
+```python
+session = await runner.run(model_config={"api_key": "your-api-key"})
+```
diff --git a/docs/ref/realtime/agent.md b/docs/ref/realtime/agent.md
new file mode 100644
index 000000000..d90833920
--- /dev/null
+++ b/docs/ref/realtime/agent.md
@@ -0,0 +1,3 @@
+# `RealtimeAgent`
+
+::: agents.realtime.agent.RealtimeAgent
\ No newline at end of file
diff --git a/docs/ref/realtime/config.md b/docs/ref/realtime/config.md
new file mode 100644
index 000000000..3e50f47ad
--- /dev/null
+++ b/docs/ref/realtime/config.md
@@ -0,0 +1,41 @@
+# Realtime Configuration
+
+## Run Configuration
+
+::: agents.realtime.config.RealtimeRunConfig
+
+## Model Settings
+
+::: agents.realtime.config.RealtimeSessionModelSettings
+
+## Audio Configuration
+
+::: agents.realtime.config.RealtimeInputAudioTranscriptionConfig
+::: agents.realtime.config.RealtimeTurnDetectionConfig
+
+## Guardrails Settings
+
+::: agents.realtime.config.RealtimeGuardrailsSettings
+
+## Model Configuration
+
+::: agents.realtime.model.RealtimeModelConfig
+
+## Tracing Configuration
+
+::: agents.realtime.config.RealtimeModelTracingConfig
+
+## User Input Types
+
+::: agents.realtime.config.RealtimeUserInput
+::: agents.realtime.config.RealtimeUserInputText
+::: agents.realtime.config.RealtimeUserInputMessage
+
+## Client Messages
+
+::: agents.realtime.config.RealtimeClientMessage
+
+## Type Aliases
+
+::: agents.realtime.config.RealtimeModelName
+::: agents.realtime.config.RealtimeAudioFormat
\ No newline at end of file
diff --git a/docs/ref/realtime/events.md b/docs/ref/realtime/events.md
new file mode 100644
index 000000000..137d9a643
--- /dev/null
+++ b/docs/ref/realtime/events.md
@@ -0,0 +1,36 @@
+# Realtime Events
+
+## Session Events
+
+::: agents.realtime.events.RealtimeSessionEvent
+
+## Event Types
+
+### Agent Events
+::: agents.realtime.events.RealtimeAgentStartEvent
+::: agents.realtime.events.RealtimeAgentEndEvent
+
+### Audio Events
+::: agents.realtime.events.RealtimeAudio
+::: agents.realtime.events.RealtimeAudioEnd
+::: agents.realtime.events.RealtimeAudioInterrupted
+
+### Tool Events
+::: agents.realtime.events.RealtimeToolStart
+::: agents.realtime.events.RealtimeToolEnd
+
+### Handoff Events
+::: agents.realtime.events.RealtimeHandoffEvent
+
+### Guardrail Events
+::: agents.realtime.events.RealtimeGuardrailTripped
+
+### History Events
+::: agents.realtime.events.RealtimeHistoryAdded
+::: agents.realtime.events.RealtimeHistoryUpdated
+
+### Error Events
+::: agents.realtime.events.RealtimeError
+
+### Raw Model Events
+::: agents.realtime.events.RealtimeRawModelEvent
\ No newline at end of file
diff --git a/docs/ref/realtime/runner.md b/docs/ref/realtime/runner.md
new file mode 100644
index 000000000..b2d26bba5
--- /dev/null
+++ b/docs/ref/realtime/runner.md
@@ -0,0 +1,3 @@
+# `RealtimeRunner`
+
+::: agents.realtime.runner.RealtimeRunner
\ No newline at end of file
diff --git a/docs/ref/realtime/session.md b/docs/ref/realtime/session.md
new file mode 100644
index 000000000..52ad0b09e
--- /dev/null
+++ b/docs/ref/realtime/session.md
@@ -0,0 +1,3 @@
+# `RealtimeSession`
+
+::: agents.realtime.session.RealtimeSession
\ No newline at end of file
diff --git a/mkdocs.yml b/mkdocs.yml
index 19529bf30..9e7f7aeec 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -78,6 +78,9 @@ plugins:
             - voice/quickstart.md
             - voice/pipeline.md
            - voice/tracing.md
+        - Realtime agents:
+            - realtime/quickstart.md
+            - realtime/guide.md
         - API Reference:
             - Agents:
                 - ref/index.md
@@ -115,6 +118,12 @@ plugins:
                 - ref/tracing/setup.md
                 - ref/tracing/span_data.md
                 - ref/tracing/util.md
+            - Realtime:
+                - ref/realtime/agent.md
+                - ref/realtime/runner.md
+                - ref/realtime/session.md
+                - ref/realtime/events.md
+                - ref/realtime/config.md
             - Voice:
                 - ref/voice/pipeline.md
                 - ref/voice/workflow.md
@@ -163,6 +172,9 @@ plugins:
             - voice/quickstart.md
             - voice/pipeline.md
             - voice/tracing.md
+        - リアルタイムエージェント:
+            - realtime/quickstart.md
+            - realtime/guide.md
 
 extra:
   # Remove material generation message in footer
diff --git a/src/agents/realtime/config.py b/src/agents/realtime/config.py
index 6e7e3f4be..f8a203589 100644
--- a/src/agents/realtime/config.py
+++ b/src/agents/realtime/config.py
@@ -28,53 +28,95 @@
 RealtimeAudioFormat: TypeAlias = Union[Literal["pcm16", "g711_ulaw", "g711_alaw"], str]
+"""The audio format for realtime audio streams."""
 
 
 class RealtimeClientMessage(TypedDict):
     """A raw message to be sent to the model."""
 
     type: str  # explicitly required
+    """The type of the message."""
+
     other_data: NotRequired[dict[str, Any]]
     """Merged into the message body."""
 
 
 class RealtimeInputAudioTranscriptionConfig(TypedDict):
+    """Configuration for audio transcription in realtime sessions."""
+
     language: NotRequired[str]
+    """The language code for transcription."""
+
"gpt-4o-mini-transcribe", "whisper-1"] | str] + """The transcription model to use.""" + prompt: NotRequired[str] + """An optional prompt to guide transcription.""" class RealtimeTurnDetectionConfig(TypedDict): """Turn detection config. Allows extra vendor keys if needed.""" type: NotRequired[Literal["semantic_vad", "server_vad"]] + """The type of voice activity detection to use.""" + create_response: NotRequired[bool] + """Whether to create a response when a turn is detected.""" + eagerness: NotRequired[Literal["auto", "low", "medium", "high"]] + """How eagerly to detect turn boundaries.""" + interrupt_response: NotRequired[bool] + """Whether to allow interrupting the assistant's response.""" + prefix_padding_ms: NotRequired[int] + """Padding time in milliseconds before turn detection.""" + silence_duration_ms: NotRequired[int] + """Duration of silence in milliseconds to trigger turn detection.""" + threshold: NotRequired[float] + """The threshold for voice activity detection.""" class RealtimeSessionModelSettings(TypedDict): """Model settings for a realtime model session.""" model_name: NotRequired[RealtimeModelName] + """The name of the realtime model to use.""" instructions: NotRequired[str] + """System instructions for the model.""" + modalities: NotRequired[list[Literal["text", "audio"]]] + """The modalities the model should support.""" + voice: NotRequired[str] + """The voice to use for audio output.""" input_audio_format: NotRequired[RealtimeAudioFormat] + """The format for input audio streams.""" + output_audio_format: NotRequired[RealtimeAudioFormat] + """The format for output audio streams.""" + input_audio_transcription: NotRequired[RealtimeInputAudioTranscriptionConfig] + """Configuration for transcribing input audio.""" + turn_detection: NotRequired[RealtimeTurnDetectionConfig] + """Configuration for detecting conversation turns.""" tool_choice: NotRequired[ToolChoice] + """How the model should choose which tools to call.""" + tools: NotRequired[list[Tool]] + """List of tools available to the model.""" + handoffs: NotRequired[list[Handoff]] + """List of handoff configurations.""" tracing: NotRequired[RealtimeModelTracingConfig | None] + """Configuration for request tracing.""" class RealtimeGuardrailsSettings(TypedDict): @@ -102,7 +144,10 @@ class RealtimeModelTracingConfig(TypedDict): class RealtimeRunConfig(TypedDict): + """Configuration for running a realtime agent session.""" + model_settings: NotRequired[RealtimeSessionModelSettings] + """Settings for the realtime model session.""" output_guardrails: NotRequired[list[OutputGuardrail[Any]]] """List of output guardrails to run on the agent's responses.""" @@ -117,14 +162,27 @@ class RealtimeRunConfig(TypedDict): class RealtimeUserInputText(TypedDict): + """A text input from the user.""" + type: Literal["input_text"] + """The type identifier for text input.""" + text: str + """The text content from the user.""" class RealtimeUserInputMessage(TypedDict): + """A message input from the user.""" + type: Literal["message"] + """The type identifier for message inputs.""" + role: Literal["user"] + """The role identifier for user messages.""" + content: list[RealtimeUserInputText] + """List of text content items in the message.""" RealtimeUserInput: TypeAlias = Union[str, RealtimeUserInputMessage] +"""User input that can be a string or structured message.""" diff --git a/src/agents/realtime/items.py b/src/agents/realtime/items.py index fc05ebc51..f8a288145 100644 --- a/src/agents/realtime/items.py +++ b/src/agents/realtime/items.py 
@@ -6,68 +6,127 @@
 class InputText(BaseModel):
+    """Text input content for realtime messages."""
+
     type: Literal["input_text"] = "input_text"
+    """The type identifier for text input."""
+
     text: str | None = None
+    """The text content."""
 
     # Allow extra data
     model_config = ConfigDict(extra="allow")
 
 
 class InputAudio(BaseModel):
+    """Audio input content for realtime messages."""
+
     type: Literal["input_audio"] = "input_audio"
+    """The type identifier for audio input."""
+
     audio: str | None = None
+    """The base64-encoded audio data."""
+
     transcript: str | None = None
+    """The transcript of the audio, if available."""
 
     # Allow extra data
     model_config = ConfigDict(extra="allow")
 
 
 class AssistantText(BaseModel):
+    """Text content from the assistant in realtime responses."""
+
     type: Literal["text"] = "text"
+    """The type identifier for text content."""
+
     text: str | None = None
+    """The text content from the assistant."""
 
     # Allow extra data
     model_config = ConfigDict(extra="allow")
 
 
 class AssistantAudio(BaseModel):
+    """Audio content from the assistant in realtime responses."""
+
     type: Literal["audio"] = "audio"
+    """The type identifier for audio content."""
+
     audio: str | None = None
+    """The base64-encoded audio data from the assistant."""
+
     transcript: str | None = None
+    """The transcript of the audio response."""
 
     # Allow extra data
     model_config = ConfigDict(extra="allow")
 
 
 class SystemMessageItem(BaseModel):
+    """A system message item in realtime conversations."""
+
     item_id: str
+    """Unique identifier for this message item."""
+
     previous_item_id: str | None = None
+    """ID of the previous item in the conversation."""
+
     type: Literal["message"] = "message"
+    """The type identifier for message items."""
+
     role: Literal["system"] = "system"
+    """The role identifier for system messages."""
+
     content: list[InputText]
+    """List of text content for the system message."""
 
     # Allow extra data
     model_config = ConfigDict(extra="allow")
 
 
 class UserMessageItem(BaseModel):
+    """A user message item in realtime conversations."""
+
     item_id: str
+    """Unique identifier for this message item."""
+
     previous_item_id: str | None = None
+    """ID of the previous item in the conversation."""
+
     type: Literal["message"] = "message"
+    """The type identifier for message items."""
+
     role: Literal["user"] = "user"
+    """The role identifier for user messages."""
+
     content: list[Annotated[InputText | InputAudio, Field(discriminator="type")]]
+    """List of content items, which can be text or audio."""
 
     # Allow extra data
     model_config = ConfigDict(extra="allow")
 
 
 class AssistantMessageItem(BaseModel):
+    """An assistant message item in realtime conversations."""
+
     item_id: str
+    """Unique identifier for this message item."""
+
     previous_item_id: str | None = None
+    """ID of the previous item in the conversation."""
+
     type: Literal["message"] = "message"
+    """The type identifier for message items."""
+
     role: Literal["assistant"] = "assistant"
+    """The role identifier for assistant messages."""
+
     status: Literal["in_progress", "completed", "incomplete"] | None = None
+    """The status of the assistant's response."""
+
     content: list[Annotated[AssistantText | AssistantAudio, Field(discriminator="type")]]
+    """List of content items from the assistant, which can be text or audio."""
 
     # Allow extra data
     model_config = ConfigDict(extra="allow")
@@ -77,25 +136,49 @@ class AssistantMessageItem(BaseModel):
     Union[SystemMessageItem, UserMessageItem, AssistantMessageItem],
     Field(discriminator="role"),
 ]
+"""A message item that can be from system, user, or assistant."""
 
 
 class RealtimeToolCallItem(BaseModel):
+    """A tool call item in realtime conversations."""
+
     item_id: str
+    """Unique identifier for this tool call item."""
+
     previous_item_id: str | None = None
+    """ID of the previous item in the conversation."""
+
     call_id: str | None
+    """The call ID for this tool invocation."""
+
     type: Literal["function_call"] = "function_call"
+    """The type identifier for function call items."""
+
     status: Literal["in_progress", "completed"]
+    """The status of the tool call execution."""
+
     arguments: str
+    """The JSON string arguments passed to the tool."""
+
     name: str
+    """The name of the tool being called."""
+
     output: str | None = None
+    """The output result from the tool execution."""
 
     # Allow extra data
     model_config = ConfigDict(extra="allow")
 
 
 RealtimeItem = Union[RealtimeMessageItem, RealtimeToolCallItem]
+"""A realtime item that can be a message or tool call."""
 
 
 class RealtimeResponse(BaseModel):
+    """A response from the realtime model."""
+
     id: str
+    """Unique identifier for this response."""
+
     output: list[RealtimeMessageItem]
+    """List of message items in the response."""