Skip to content

Commit 32997b2

Browse files
authored
Realtime: web demo (#1149)
Add a web demo that uses a fastapi backend. Also remove the CLI UI demo.
1 parent f63cc0c commit 32997b2

File tree

8 files changed

+976
-119
lines changed

8 files changed

+976
-119
lines changed

examples/realtime/app/README.md

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# Realtime Demo App
2+
3+
A web-based realtime voice assistant demo with a FastAPI backend and HTML/JS frontend.
4+
5+
## Installation
6+
7+
Install the required dependencies:
8+
9+
```bash
10+
uv add fastapi uvicorn websockets
11+
```
12+
13+
## Usage
14+
15+
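Start the application with a single command (run it from the `examples/realtime/app` directory, because the server looks up its `static/` folder relative to the current working directory):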
16+
17+
```bash
18+
cd examples/realtime/app && uv run python server.py
19+
```
20+
21+
Then open your browser to: http://localhost:8000
22+
23+
## How to Use
24+
25+
1. Click **Connect** to establish a realtime session
26+
2. Audio capture starts automatically - just speak naturally
27+
3. Click the **Mic On/Off** button to mute/unmute your microphone
28+
4. Watch the conversation unfold in the left pane
29+
5. Monitor raw events in the right pane (click to expand/collapse)
30+
6. Click **Disconnect** when done
31+
32+
## Architecture
33+
34+
- **Backend**: FastAPI server with WebSocket connections for real-time communication
35+
- **Session Management**: Each connection gets a unique session with the OpenAI Realtime API
36+
- **Audio Processing**: 24kHz mono audio capture and playback
37+
- **Event Handling**: Full event stream processing with transcript generation
38+
- **Frontend**: Vanilla JavaScript with clean, responsive CSS
39+
40+
The demo showcases the core patterns for building realtime voice applications with the OpenAI Agents SDK.

examples/realtime/app/server.py

Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
import asyncio
2+
import base64
3+
import json
4+
import logging
5+
import struct
6+
from contextlib import asynccontextmanager
7+
from typing import Any, assert_never
8+
9+
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
10+
from fastapi.responses import FileResponse
11+
from fastapi.staticfiles import StaticFiles
12+
13+
from agents import function_tool
14+
from agents.realtime import RealtimeAgent, RealtimeRunner, RealtimeSession, RealtimeSessionEvent
15+
16+
# Configure the root logger once at import time so INFO-and-above records
# emitted by this demo are shown, then create this module's own logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
18+
19+
20+
@function_tool
def get_weather(city: str) -> str:
    """Get the weather in a city."""
    # NOTE: the docstring above is left untouched on purpose; the
    # ``function_tool`` decorator presumably exposes it as the tool
    # description (confirm against the SDK before rewording it).
    # Demo stub: no real lookup is performed, every city is "sunny".
    forecast = f"The weather in {city} is sunny."
    return forecast
24+
25+
26+
@function_tool
def get_secret_number() -> int:
    """Returns the secret number, if the user asks for it."""
    # NOTE: the docstring above is left untouched on purpose; the
    # ``function_tool`` decorator presumably exposes it as the tool
    # description (confirm against the SDK before rewording it).
    # Fixed demo value, used to show a zero-argument tool call.
    secret_number = 71
    return secret_number
30+
31+
32+
# Hand-off target: an agent without tools that answers only in haiku.
haiku_agent = RealtimeAgent(
    name="Haiku Agent",
    tools=[],
    instructions=(
        "You are a haiku poet. "
        "You must respond ONLY in traditional haiku format (5-7-5 syllables). "
        "Every response should be a proper haiku about the topic. "
        "Do not break character."
    ),
)
37+
38+
# Entry-point agent: owns the demo tools and may hand the conversation off
# to ``haiku_agent``.
agent = RealtimeAgent(
    name="Assistant",
    handoffs=[haiku_agent],
    tools=[get_weather, get_secret_number],
    instructions=(
        "If the user wants poetry or haikus, you can hand them off to the "
        "haiku agent via the transfer_to_haiku_agent tool."
    ),
)
44+
45+
46+
class RealtimeWebSocketManager:
    """Pair each connected WebSocket with its own realtime agent session.

    All bookkeeping is keyed by the ``session_id`` supplied by the caller:
    the open WebSocket, the entered ``RealtimeSession`` and the context
    manager that has to be exited to close that session.
    """

    def __init__(self):
        self.active_sessions: dict[str, RealtimeSession] = {}
        self.session_contexts: dict[str, Any] = {}
        self.websockets: dict[str, WebSocket] = {}
        # The event loop keeps only weak references to tasks, so the
        # event-forwarding tasks are held here until they finish.
        self._event_tasks: set[asyncio.Task[None]] = set()

    async def connect(self, websocket: WebSocket, session_id: str):
        """Accept ``websocket``, start a realtime session and forward its events.

        Args:
            websocket: The incoming connection; it is accepted here.
            session_id: Key under which the connection and session are stored.

        Raises:
            Whatever ``RealtimeRunner.run`` or entering the session context
            raises; in that case the websocket registration is rolled back.
        """
        await websocket.accept()
        self.websockets[session_id] = websocket

        try:
            runner = RealtimeRunner(agent)
            session_context = await runner.run()
            session = await session_context.__aenter__()
        except BaseException:
            # Do not leave a half-registered connection behind.
            self.websockets.pop(session_id, None)
            raise

        self.active_sessions[session_id] = session
        self.session_contexts[session_id] = session_context

        # Start event processing task, keeping a strong reference so it
        # cannot be garbage-collected while it is still running.
        task = asyncio.create_task(self._process_events(session_id))
        self._event_tasks.add(task)
        task.add_done_callback(self._event_tasks.discard)

    async def disconnect(self, session_id: str):
        """Forget ``session_id`` and close its realtime session, if any.

        Safe to call for an unknown or already-disconnected id. The
        bookkeeping entries are removed *before* the session context is
        exited so they are released even when ``__aexit__`` raises.
        """
        session_context = self.session_contexts.pop(session_id, None)
        self.active_sessions.pop(session_id, None)
        self.websockets.pop(session_id, None)

        if session_context is not None:
            await session_context.__aexit__(None, None, None)

    async def send_audio(self, session_id: str, audio_bytes: bytes):
        """Forward ``audio_bytes`` to the session; no-op for an unknown id."""
        session = self.active_sessions.get(session_id)
        if session is not None:
            await session.send_audio(audio_bytes)

    async def _process_events(self, session_id: str):
        """Send every session event to the paired WebSocket as JSON text.

        Runs as a background task; failures are logged (with traceback)
        instead of propagating, since nothing awaits this task.
        """
        try:
            session = self.active_sessions[session_id]
            websocket = self.websockets[session_id]

            async for event in session:
                event_data = await self._serialize_event(event)
                await websocket.send_text(json.dumps(event_data))
        except Exception as e:
            logger.exception("Error processing events for session %s: %s", session_id, e)

    async def _serialize_event(self, event: RealtimeSessionEvent) -> dict[str, Any]:
        """Convert a session event into a JSON-serializable dict.

        The result always carries ``"type"``; further keys depend on the
        event type (agent/tool names, base64-encoded audio, dumped history,
        guardrail names, raw model event type, or an error string).
        """
        base_event: dict[str, Any] = {
            "type": event.type,
        }

        if event.type == "agent_start":
            base_event["agent"] = event.agent.name
        elif event.type == "agent_end":
            base_event["agent"] = event.agent.name
        elif event.type == "handoff":
            base_event["from"] = event.from_agent.name
            base_event["to"] = event.to_agent.name
        elif event.type == "tool_start":
            base_event["tool"] = event.tool.name
        elif event.type == "tool_end":
            base_event["tool"] = event.tool.name
            base_event["output"] = str(event.output)
        elif event.type == "audio":
            # Raw audio bytes are not valid JSON, so ship them as base64 text.
            base_event["audio"] = base64.b64encode(event.audio.data).decode("utf-8")
        elif event.type == "audio_interrupted":
            pass
        elif event.type == "audio_end":
            pass
        elif event.type == "history_updated":
            base_event["history"] = [item.model_dump(mode="json") for item in event.history]
        elif event.type == "history_added":
            pass
        elif event.type == "guardrail_tripped":
            base_event["guardrail_results"] = [
                {"name": result.guardrail.name} for result in event.guardrail_results
            ]
        elif event.type == "raw_model_event":
            base_event["raw_model_event"] = {
                "type": event.data.type,
            }
        elif event.type == "error":
            base_event["error"] = str(getattr(event, "error", "Unknown error"))
        else:
            # Exhaustiveness guard: a new event type must be handled above.
            assert_never(event)

        return base_event
130+
131+
132+
# One process-wide manager shared by every WebSocket connection.
manager = RealtimeWebSocketManager()


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook; performs no startup or shutdown work."""
    # Nothing to set up before serving and nothing to tear down afterwards.
    yield None


app = FastAPI(lifespan=lifespan)
141+
142+
143+
@app.websocket("/ws/{session_id}")
async def websocket_endpoint(websocket: WebSocket, session_id: str):
    """Bridge one client WebSocket to a realtime session.

    Each incoming text frame is parsed as JSON. Messages whose ``"type"`` is
    ``"audio"`` carry a list of int16 samples in ``"data"``; these are packed
    into PCM16 bytes and forwarded to the session. Other message types are
    ignored.

    The session is always released when the handler exits, whether the
    client hung up or an error occurred while connecting or processing a
    message.
    """
    try:
        await manager.connect(websocket, session_id)
        while True:
            data = await websocket.receive_text()
            message = json.loads(data)

            if message["type"] == "audio":
                # Convert int16 array to bytes. "<" forces little-endian so
                # the PCM16 layout does not depend on the host byte order.
                int16_data = message["data"]
                audio_bytes = struct.pack(f"<{len(int16_data)}h", *int16_data)
                await manager.send_audio(session_id, audio_bytes)

    except WebSocketDisconnect:
        # Normal client hang-up; cleanup happens in ``finally``.
        pass
    finally:
        await manager.disconnect(session_id)
159+
160+
161+
@app.get("/")
async def read_index():
    """Serve the demo's front-end page."""
    return FileResponse("static/index.html")


# The mount at "/" matches every path, so it must be registered after the
# explicit routes; routes are tried in registration order and anything
# declared after this mount would never be reached.
# NOTE: "static" is resolved against the current working directory.
app.mount("/", StaticFiles(directory="static", html=True), name="static")


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)

0 commit comments

Comments
 (0)