fix: Twilio audio jittering by buffering outgoing audio chunks

Lucas Wang · Lucas Wang · commit 7b2f4e09c0f1 · 2025-10-19T01:33:59.000+08:00
Fixes #1906. The Twilio realtime example produced jittering/skipping sounds at the beginning of every word. This was caused by sending small audio chunks from OpenAI to Twilio too frequently without buffering. Changes: - Added outgoing audio buffer to accumulate audio chunks from OpenAI - Buffer audio until reaching 50ms worth of data before sending to Twilio - Flush remaining buffered audio on audio_end and audio_interrupted events - Updated periodic flush loop to handle both incoming and outgoing buffers - Added documentation about audio buffering to troubleshooting section Technical details: - Incoming audio (Twilio → OpenAI) was already buffered - Now outgoing audio (OpenAI → Twilio) is also buffered symmetrically - Buffer size: 50ms chunks (400 bytes at 8kHz sample rate, one byte per mulaw sample) - Prevents choppy playback by sending larger, consistent audio packets Tested with: - Linting: ruff check ✓ - Formatting: ruff format ✓ - Type checking: mypy ✓ Generated with Lucas Wang <lucas_wang@automodules.com>
diff --git a/examples/realtime/twilio/README.md b/examples/realtime/twilio/README.md
@@ -70,6 +70,7 @@ This example demonstrates how to connect the OpenAI Realtime API to a phone call
 
 -   **WebSocket connection issues**: Ensure your ngrok URL is correct and publicly accessible
 -   **Audio quality**: Twilio streams audio in mulaw format at 8kHz, which may affect quality
+-   **Audio jittering/skipping**: The implementation buffers audio in 50ms chunks to reduce jittering at word boundaries. Both incoming (Twilio → OpenAI) and outgoing (OpenAI → Twilio) audio are buffered for smoother playback.
 -   **Latency**: Network latency between Twilio, your server, and OpenAI affects response time
 -   **Logs**: Check the console output for detailed connection and error logs
 
diff --git a/examples/realtime/twilio/twilio_handler.py b/examples/realtime/twilio/twilio_handler.py
@@ -52,9 +52,15 @@ def __init__(self, twilio_websocket: WebSocket):
         self.BUFFER_SIZE_BYTES = int(self.SAMPLE_RATE * self.CHUNK_LENGTH_S)  # 50ms worth of audio
 
         self._stream_sid: str | None = None
+
+        # Incoming audio buffer (from Twilio to OpenAI)
         self._audio_buffer: bytearray = bytearray()
         self._last_buffer_send_time = time.time()
 
+        # Outgoing audio buffer (from OpenAI to Twilio)
+        self._outgoing_audio_buffer: bytearray = bytearray()
+        self._last_outgoing_send_time = time.time()
+
         # Mark event tracking for playback
         self._mark_counter = 0
         self._mark_data: dict[
@@ -122,18 +128,10 @@ async def _twilio_message_loop(self) -> None:
     async def _handle_realtime_event(self, event: RealtimeSessionEvent) -> None:
         """Handle events from the realtime session."""
         if event.type == "audio":
-            base64_audio = base64.b64encode(event.audio.data).decode("utf-8")
-            await self.twilio_websocket.send_text(
-                json.dumps(
-                    {
-                        "event": "media",
-                        "streamSid": self._stream_sid,
-                        "media": {"payload": base64_audio},
-                    }
-                )
-            )
+            # Buffer outgoing audio to reduce jittering
+            self._outgoing_audio_buffer.extend(event.audio.data)
 
-            # Send mark event for playback tracking
+            # Store metadata for this audio chunk
             self._mark_counter += 1
             mark_id = str(self._mark_counter)
             self._mark_data[mark_id] = (
@@ -142,23 +140,24 @@ async def _handle_realtime_event(self, event: RealtimeSessionEvent) -> None:
                 len(event.audio.data),
             )
 
-            await self.twilio_websocket.send_text(
-                json.dumps(
-                    {
-                        "event": "mark",
-                        "streamSid": self._stream_sid,
-                        "mark": {"name": mark_id},
-                    }
-                )
-            )
+            # Send buffered audio if we have enough data (reduces jittering)
+            if len(self._outgoing_audio_buffer) >= self.BUFFER_SIZE_BYTES:
+                await self._flush_outgoing_audio_buffer(mark_id)
 
         elif event.type == "audio_interrupted":
             print("Sending audio interrupted to Twilio")
+            # Flush any remaining buffered audio before clearing
+            if self._outgoing_audio_buffer:
+                await self._flush_outgoing_audio_buffer(None)
             await self.twilio_websocket.send_text(
                 json.dumps({"event": "clear", "streamSid": self._stream_sid})
             )
+            self._outgoing_audio_buffer.clear()
         elif event.type == "audio_end":
-            print("Audio end")
+            print("Audio end - flushing remaining buffered audio")
+            # Flush remaining audio at the end
+            if self._outgoing_audio_buffer:
+                await self._flush_outgoing_audio_buffer(None)
         elif event.type == "raw_model_event":
             pass
         else:
@@ -246,19 +245,64 @@ async def _flush_audio_buffer(self) -> None:
         except Exception as e:
             print(f"Error sending buffered audio to OpenAI: {e}")
 
+    async def _flush_outgoing_audio_buffer(self, mark_id: str | None) -> None:
+        """Send the buffered OpenAI audio to Twilio as a single media message.
+
+        Args:
+            mark_id: Name for the mark event sent after the audio; ``None`` sends no mark.
+
+        Send errors are printed rather than raised; the audio taken for the failed
+        send is dropped, not retried.
+        """
+        if not self._outgoing_audio_buffer:
+            return
+
+        # Snapshot and empty the buffer BEFORE the first await. The event handler and the
+        # periodic flush loop both call this; if they interleave at an await, clearing
+        # afterwards would drop audio buffered meanwhile or let a second flush resend it.
+        payload = bytes(self._outgoing_audio_buffer)
+        self._outgoing_audio_buffer.clear()
+        self._last_outgoing_send_time = time.time()
+
+        messages: list[dict[str, object]] = [
+            {
+                "event": "media",
+                "streamSid": self._stream_sid,
+                "media": {"payload": base64.b64encode(payload).decode("utf-8")},
+            }
+        ]
+        if mark_id is not None:
+            # Mark event for playback tracking, sent right after its audio.
+            messages.append(
+                {"event": "mark", "streamSid": self._stream_sid, "mark": {"name": mark_id}}
+            )
+        try:
+            for message in messages:
+                await self.twilio_websocket.send_text(json.dumps(message))
+        except Exception as e:
+            print(f"Error sending buffered audio to Twilio: {e}")
+
     async def _buffer_flush_loop(self) -> None:
-        """Periodically flush audio buffer to prevent stale data."""
+        """Poll every CHUNK_LENGTH_S and flush any buffer idle for more than twice that long."""
         try:
             while True:
                 await asyncio.sleep(self.CHUNK_LENGTH_S)  # Check every 50ms
 
-                # If buffer has data and it's been too long since last send, flush it
                 current_time = time.time()
+
+                # Incoming (Twilio to OpenAI): flush if non-empty and unsent for > 2 chunk lengths
                 if (
                     self._audio_buffer
                     and current_time - self._last_buffer_send_time > self.CHUNK_LENGTH_S * 2
                 ):
                     await self._flush_audio_buffer()
 
+                # Outgoing (OpenAI to Twilio): same staleness rule; flushed without a mark id
+                if (
+                    self._outgoing_audio_buffer
+                    and current_time - self._last_outgoing_send_time > self.CHUNK_LENGTH_S * 2
+                ):
+                    await self._flush_outgoing_audio_buffer(None)
+
         except Exception as e:
             print(f"Error in buffer flush loop: {e}")