Skip to content

Commit 7b2f4e0

Browse files
author
Lucas Wang
committed
fix: Twilio audio jittering by buffering outgoing audio chunks
Fixes #1906.

The Twilio realtime example was experiencing jittering/skipping sounds at the beginning of every word. This was caused by sending small audio chunks from OpenAI to Twilio too frequently, without buffering.

Changes:
- Added an outgoing audio buffer to accumulate audio chunks from OpenAI
- Buffer audio until reaching 50ms worth of data before sending to Twilio
- Flush remaining buffered audio on audio_end and audio_interrupted events
- Updated the periodic flush loop to handle both incoming and outgoing buffers
- Added documentation about audio buffering to the troubleshooting section

Technical details:
- Incoming audio (Twilio → OpenAI) was already buffered
- Outgoing audio (OpenAI → Twilio) is now buffered symmetrically
- Buffer size: 50ms chunks (400 bytes at 8kHz sample rate)
- Prevents choppy playback by sending larger, consistent audio packets

Tested with:
- Linting: ruff check ✓
- Formatting: ruff format ✓
- Type checking: mypy ✓

Generated with Lucas Wang<[email protected]>
1 parent 748ac80 commit 7b2f4e0

File tree

2 files changed

+68
-23
lines changed

2 files changed

+68
-23
lines changed

examples/realtime/twilio/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ This example demonstrates how to connect the OpenAI Realtime API to a phone call
7070
7171
- **WebSocket connection issues**: Ensure your ngrok URL is correct and publicly accessible
7272
- **Audio quality**: Twilio streams audio in mulaw format at 8kHz, which may affect quality
73+
- **Audio jittering/skipping**: Audio is buffered into 50ms chunks in both directions (incoming Twilio → OpenAI and outgoing OpenAI → Twilio) to reduce jitter at word boundaries and give smoother playback.
7374
- **Latency**: Network latency between Twilio, your server, and OpenAI affects response time
7475
- **Logs**: Check the console output for detailed connection and error logs
7576

examples/realtime/twilio/twilio_handler.py

Lines changed: 67 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,15 @@ def __init__(self, twilio_websocket: WebSocket):
5252
self.BUFFER_SIZE_BYTES = int(self.SAMPLE_RATE * self.CHUNK_LENGTH_S) # 50ms worth of audio
5353

5454
self._stream_sid: str | None = None
55+
56+
# Incoming audio buffer (from Twilio to OpenAI)
5557
self._audio_buffer: bytearray = bytearray()
5658
self._last_buffer_send_time = time.time()
5759

60+
# Outgoing audio buffer (from OpenAI to Twilio)
61+
self._outgoing_audio_buffer: bytearray = bytearray()
62+
self._last_outgoing_send_time = time.time()
63+
5864
# Mark event tracking for playback
5965
self._mark_counter = 0
6066
self._mark_data: dict[
@@ -122,18 +128,10 @@ async def _twilio_message_loop(self) -> None:
122128
async def _handle_realtime_event(self, event: RealtimeSessionEvent) -> None:
123129
"""Handle events from the realtime session."""
124130
if event.type == "audio":
125-
base64_audio = base64.b64encode(event.audio.data).decode("utf-8")
126-
await self.twilio_websocket.send_text(
127-
json.dumps(
128-
{
129-
"event": "media",
130-
"streamSid": self._stream_sid,
131-
"media": {"payload": base64_audio},
132-
}
133-
)
134-
)
131+
# Buffer outgoing audio to reduce jittering
132+
self._outgoing_audio_buffer.extend(event.audio.data)
135133

136-
# Send mark event for playback tracking
134+
# Store metadata for this audio chunk
137135
self._mark_counter += 1
138136
mark_id = str(self._mark_counter)
139137
self._mark_data[mark_id] = (
@@ -142,23 +140,24 @@ async def _handle_realtime_event(self, event: RealtimeSessionEvent) -> None:
142140
len(event.audio.data),
143141
)
144142

145-
await self.twilio_websocket.send_text(
146-
json.dumps(
147-
{
148-
"event": "mark",
149-
"streamSid": self._stream_sid,
150-
"mark": {"name": mark_id},
151-
}
152-
)
153-
)
143+
# Send buffered audio if we have enough data (reduces jittering)
144+
if len(self._outgoing_audio_buffer) >= self.BUFFER_SIZE_BYTES:
145+
await self._flush_outgoing_audio_buffer(mark_id)
154146

155147
elif event.type == "audio_interrupted":
156148
print("Sending audio interrupted to Twilio")
149+
# Flush any remaining buffered audio before clearing
150+
if self._outgoing_audio_buffer:
151+
await self._flush_outgoing_audio_buffer(None)
157152
await self.twilio_websocket.send_text(
158153
json.dumps({"event": "clear", "streamSid": self._stream_sid})
159154
)
155+
self._outgoing_audio_buffer.clear()
160156
elif event.type == "audio_end":
161-
print("Audio end")
157+
print("Audio end - flushing remaining buffered audio")
158+
# Flush remaining audio at the end
159+
if self._outgoing_audio_buffer:
160+
await self._flush_outgoing_audio_buffer(None)
162161
elif event.type == "raw_model_event":
163162
pass
164163
else:
@@ -246,19 +245,64 @@ async def _flush_audio_buffer(self) -> None:
246245
except Exception as e:
247246
print(f"Error sending buffered audio to OpenAI: {e}")
248247

248+
async def _flush_outgoing_audio_buffer(self, mark_id: str | None) -> None:
249+
"""Send buffered audio to Twilio to reduce jittering."""
250+
if not self._outgoing_audio_buffer:
251+
return
252+
253+
try:
254+
# Encode and send the buffered audio to Twilio
255+
base64_audio = base64.b64encode(bytes(self._outgoing_audio_buffer)).decode("utf-8")
256+
await self.twilio_websocket.send_text(
257+
json.dumps(
258+
{
259+
"event": "media",
260+
"streamSid": self._stream_sid,
261+
"media": {"payload": base64_audio},
262+
}
263+
)
264+
)
265+
266+
# Send mark event for playback tracking (if provided)
267+
if mark_id is not None:
268+
await self.twilio_websocket.send_text(
269+
json.dumps(
270+
{
271+
"event": "mark",
272+
"streamSid": self._stream_sid,
273+
"mark": {"name": mark_id},
274+
}
275+
)
276+
)
277+
278+
# Clear the buffer
279+
self._outgoing_audio_buffer.clear()
280+
self._last_outgoing_send_time = time.time()
281+
282+
except Exception as e:
283+
print(f"Error sending buffered audio to Twilio: {e}")
284+
249285
async def _buffer_flush_loop(self) -> None:
    """Periodically flush audio buffers to prevent stale data.

    Every ``self.CHUNK_LENGTH_S`` seconds, each buffer that holds data and
    has not been sent for more than twice that interval is flushed:
    the incoming buffer via ``_flush_audio_buffer()`` and the outgoing
    buffer via ``_flush_outgoing_audio_buffer(None)`` (no mark event).

    The loop runs until the coroutine is cancelled. An unexpected error in
    one iteration is printed and the loop carries on, so a single failure
    does not permanently disable periodic flushing.
    """
    while True:
        await asyncio.sleep(self.CHUNK_LENGTH_S)  # Check every 50ms

        try:
            current_time = time.time()
            stale_after = self.CHUNK_LENGTH_S * 2

            # Flush incoming audio buffer (from Twilio to OpenAI) if stale
            if (
                self._audio_buffer
                and current_time - self._last_buffer_send_time > stale_after
            ):
                await self._flush_audio_buffer()

            # Flush outgoing audio buffer (from OpenAI to Twilio) if stale
            if (
                self._outgoing_audio_buffer
                and current_time - self._last_outgoing_send_time > stale_after
            ):
                await self._flush_outgoing_audio_buffer(None)

        except Exception as e:
            # Handled per iteration: the sleep above still paces the loop, so
            # a persistent error cannot turn this into a busy loop.
            print(f"Error in buffer flush loop: {e}")

0 commit comments

Comments
 (0)