Commit 1dd7a5f
wip: realtime ga migration
1 parent 184757b

13 files changed: +525 −182 lines

examples/realtime/app/server.py

Lines changed: 65 additions & 10 deletions
```diff
@@ -36,20 +36,32 @@ def __init__(self):
         self.active_sessions: dict[str, RealtimeSession] = {}
         self.session_contexts: dict[str, Any] = {}
         self.websockets: dict[str, WebSocket] = {}
+        self._pending_audio: dict[str, bytearray] = {}
+        self._audio_flush_tasks: dict[str, asyncio.Task[Any]] = {}

     async def connect(self, websocket: WebSocket, session_id: str):
         await websocket.accept()
         self.websockets[session_id] = websocket

         agent = get_starting_agent()
         runner = RealtimeRunner(agent)
-        session_context = await runner.run()
+        # Disable server-side interrupt_response to avoid truncating assistant audio
+        session_context = await runner.run(
+            model_config={
+                "initial_model_settings": {
+                    "turn_detection": {"type": "semantic_vad", "interrupt_response": False}
+                }
+            }
+        )
         session = await session_context.__aenter__()
         self.active_sessions[session_id] = session
         self.session_contexts[session_id] = session_context

         # Start event processing task
         asyncio.create_task(self._process_events(session_id))
+        # Init audio buffer + steady flush task (~40ms)
+        self._pending_audio[session_id] = bytearray()
+        self._audio_flush_tasks[session_id] = asyncio.create_task(self._flush_audio_loop(session_id))

     async def disconnect(self, session_id: str):
         if session_id in self.session_contexts:
```
```diff
@@ -59,6 +71,11 @@ async def disconnect(self, session_id: str):
             del self.active_sessions[session_id]
         if session_id in self.websockets:
             del self.websockets[session_id]
+        if session_id in self._pending_audio:
+            del self._pending_audio[session_id]
+        if session_id in self._audio_flush_tasks:
+            self._audio_flush_tasks[session_id].cancel()
+            del self._audio_flush_tasks[session_id]

     async def send_audio(self, session_id: str, audio_bytes: bytes):
         if session_id in self.active_sessions:
```
```diff
@@ -70,12 +87,13 @@ async def _process_events(self, session_id: str):
             websocket = self.websockets[session_id]

             async for event in session:
-                event_data = await self._serialize_event(event)
-                await websocket.send_text(json.dumps(event_data))
+                event_data = await self._serialize_event(session_id, event)
+                if event_data is not None:
+                    await websocket.send_text(json.dumps(event_data))
         except Exception as e:
             logger.error(f"Error processing events for session {session_id}: {e}")

-    async def _serialize_event(self, event: RealtimeSessionEvent) -> dict[str, Any]:
+    async def _serialize_event(self, session_id: str, event: RealtimeSessionEvent) -> dict[str, Any] | None:
         base_event: dict[str, Any] = {
             "type": event.type,
         }
```
```diff
@@ -93,7 +111,9 @@ async def _serialize_event(self, event: RealtimeSessionEvent) -> dict[str, Any]:
             base_event["tool"] = event.tool.name
             base_event["output"] = str(event.output)
         elif event.type == "audio":
-            base_event["audio"] = base64.b64encode(event.audio.data).decode("utf-8")
+            # Coalesce raw PCM and flush on a steady timer for smoother playback.
+            self._pending_audio[session_id].extend(event.audio.data)
+            return None
         elif event.type == "audio_interrupted":
             pass
         elif event.type == "audio_end":
```
```diff
@@ -107,9 +127,20 @@ async def _serialize_event(self, event: RealtimeSessionEvent) -> dict[str, Any]:
                 {"name": result.guardrail.name} for result in event.guardrail_results
             ]
         elif event.type == "raw_model_event":
-            base_event["raw_model_event"] = {
-                "type": event.data.type,
-            }
+            # Surface useful raw events to the UI with details.
+            if getattr(event.data, "type", None) == "transcript_delta":
+                # Stream assistant transcript deltas to the UI.
+                base_event = {
+                    "type": "transcript_delta",
+                    "item_id": getattr(event.data, "item_id", ""),
+                    "response_id": getattr(event.data, "response_id", ""),
+                    "delta": getattr(event.data, "delta", ""),
+                }
+            else:
+                # Fallback to a minimal raw event descriptor.
+                base_event["raw_model_event"] = {
+                    "type": getattr(event.data, "type", "other"),
+                }
         elif event.type == "error":
             base_event["error"] = str(event.error) if hasattr(event, "error") else "Unknown error"
         elif event.type == "input_audio_timeout_triggered":
```
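
The `getattr` fallbacks make the serializer tolerant of raw model events that are missing a field, instead of raising `AttributeError` mid-stream. A minimal sketch of that behavior, using a hypothetical stand-in for the SDK's raw event payload:

```python
from dataclasses import dataclass


# Hypothetical stand-in for a raw model event payload; the real type comes
# from the agents SDK, but the probed field names match the serializer above.
@dataclass
class FakeTranscriptDelta:
    type: str = "transcript_delta"
    item_id: str = "item_123"
    delta: str = "Hello"  # deliberately has no response_id attribute


data = FakeTranscriptDelta()
serialized = {
    "type": "transcript_delta",
    "item_id": getattr(data, "item_id", ""),
    "response_id": getattr(data, "response_id", ""),  # absent -> ""
    "delta": getattr(data, "delta", ""),
}
print(serialized["response_id"])  # "" rather than an AttributeError
```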
```diff
@@ -119,6 +150,28 @@ async def _serialize_event(self, event: RealtimeSessionEvent) -> dict[str, Any]:

         return base_event

+    async def _flush_audio_loop(self, session_id: str) -> None:
+        try:
+            while session_id in self.websockets:
+                await asyncio.sleep(0.04)  # ~40ms cadence
+                buf = self._pending_audio.get(session_id)
+                ws = self.websockets.get(session_id)
+                if not buf or ws is None:
+                    continue
+                b = bytes(buf)
+                self._pending_audio[session_id] = bytearray()
+                try:
+                    await ws.send_text(
+                        json.dumps({"type": "audio", "audio": base64.b64encode(b).decode("utf-8")})
+                    )
+                except Exception:
+                    logger.error("Failed sending coalesced audio", exc_info=True)
+                    break
+        except asyncio.CancelledError:
+            pass
+

 manager = RealtimeWebSocketManager()
```
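
At the session's 24 kHz mono PCM16 format, each 40 ms flush carries roughly 1920 bytes of audio, so the WebSocket sees about 25 messages per second regardless of how the model chunks its output. A quick sanity check of that arithmetic:

```python
# Back-of-envelope check of the flush cadence used by _flush_audio_loop.
SAMPLE_RATE_HZ = 24_000   # session audio format: 24 kHz mono
BYTES_PER_SAMPLE = 2      # PCM16
FLUSH_INTERVAL_S = 0.04   # matches asyncio.sleep(0.04)

bytes_per_flush = int(SAMPLE_RATE_HZ * FLUSH_INTERVAL_S) * BYTES_PER_SAMPLE
print(bytes_per_flush)       # 1920 bytes per message
print(1 / FLUSH_INTERVAL_S)  # 25.0 messages per second
```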

```diff
@@ -142,7 +195,8 @@ async def websocket_endpoint(websocket: WebSocket, session_id: str):
             if message["type"] == "audio":
                 # Convert int16 array to bytes
                 int16_data = message["data"]
-                audio_bytes = struct.pack(f"{len(int16_data)}h", *int16_data)
+                # Send little-endian PCM16 to the model.
+                audio_bytes = struct.pack("<" + f"{len(int16_data)}h", *int16_data)
                 await manager.send_audio(session_id, audio_bytes)

     except WebSocketDisconnect:
```
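
Without a byte-order prefix, `struct.pack` uses native byte order and alignment, so the wire format would depend on the host. The `<` prefix pins it to little-endian PCM16. A small illustration:

```python
import struct

samples = [1, -2, 32767]
little = struct.pack(f"<{len(samples)}h", *samples)  # forced little-endian
print(little.hex())  # 0100feffff7f

# On a little-endian host the native form happens to match; on a big-endian
# host it would not, which is why the explicit "<" matters on the wire.
assert little == struct.pack("<3h", 1, -2, 32767)
```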
```diff
@@ -160,4 +214,5 @@ async def read_index():
 if __name__ == "__main__":
     import uvicorn

-    uvicorn.run(app, host="0.0.0.0", port=8000)
+    log_level = "info"
+    uvicorn.run(app, host="0.0.0.0", port=8000, log_level=log_level)
```

examples/realtime/app/static/app.js

Lines changed: 41 additions & 8 deletions
```diff
@@ -14,6 +14,10 @@ class RealtimeDemo {
         this.isPlayingAudio = false;
         this.playbackAudioContext = null;
         this.currentAudioSource = null;
+        this.nextPlaybackTime = 0;
+
+        // Live assistant transcript buffer
+        this.pendingAssistantText = '';

         this.initializeElements();
         this.setupEventListeners();
```
```diff
@@ -138,9 +142,9 @@ class RealtimeDemo {
         const source = this.audioContext.createMediaStreamSource(this.stream);

         // Create a script processor to capture audio data
-        this.processor = this.audioContext.createScriptProcessor(4096, 1, 1);
+        this.processor = this.audioContext.createScriptProcessor(2048, 1, 1);
         source.connect(this.processor);
-        this.processor.connect(this.audioContext.destination);
+        // Do not connect to destination to avoid local echo.

         this.processor.onaudioprocess = (event) => {
             if (!this.isMuted && this.ws && this.ws.readyState === WebSocket.OPEN) {
```
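
Shrinking the ScriptProcessor buffer from 4096 to 2048 samples halves per-callback capture latency. A rough estimate, assuming the capture context runs at the 24 kHz used elsewhere in this demo (an assumption; browsers commonly pick 44.1 or 48 kHz for microphone contexts):

```python
# Rough per-callback mic latency for the two buffer sizes.
ASSUMED_RATE_HZ = 24_000  # assumption: actual rate is browser/device dependent
for buffer_samples in (4096, 2048):
    latency_ms = buffer_samples / ASSUMED_RATE_HZ * 1000
    print(f"{buffer_samples} samples -> {latency_ms:.1f} ms per callback")
# 4096 samples -> 170.7 ms per callback
# 2048 samples -> 85.3 ms per callback
```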
```diff
@@ -204,6 +208,9 @@ class RealtimeDemo {
             case 'audio':
                 this.playAudio(event.audio);
                 break;
+            case 'transcript_delta':
+                this.handleTranscriptDelta(event.delta || '');
+                break;
             case 'audio_interrupted':
                 this.stopAudioPlayback();
                 break;
```
```diff
@@ -260,7 +267,18 @@ class RealtimeDemo {
         } else {
             console.log('History is not an array or is null/undefined');
         }
-
+
+        // If we have a live assistant transcript, append it as a streaming bubble.
+        if (this.pendingAssistantText && this.pendingAssistantText.trim()) {
+            const messageDiv = document.createElement('div');
+            messageDiv.className = 'message assistant';
+            const bubbleDiv = document.createElement('div');
+            bubbleDiv.className = 'message-bubble';
+            bubbleDiv.textContent = this.pendingAssistantText;
+            messageDiv.appendChild(bubbleDiv);
+            this.messagesContent.appendChild(messageDiv);
+        }
+
         this.scrollToBottom();
     }
```

```diff
@@ -370,12 +388,13 @@ class RealtimeDemo {
         if (this.isPlayingAudio || this.audioQueue.length === 0) {
             return;
         }
-
+
         this.isPlayingAudio = true;
-
+
         // Initialize audio context if needed
         if (!this.playbackAudioContext) {
             this.playbackAudioContext = new AudioContext({ sampleRate: 24000 });
+            this.nextPlaybackTime = this.playbackAudioContext.currentTime;
         }

         while (this.audioQueue.length > 0) {
```
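
The `nextPlaybackTime` cursor implements gap-free scheduling: each chunk starts at `max(now, nextPlaybackTime)` and the cursor then advances by the chunk's duration, so consecutive chunks butt up against each other instead of each starting "now". A small Python simulation of the same rule:

```python
# Simulation of the gap-free scheduling rule used for playback in app.js.
def schedule_starts(durations: list[float], now: float, cursor: float) -> list[float]:
    starts = []
    for duration in durations:
        if cursor < now:
            cursor = now  # fell behind real time; start immediately
        starts.append(cursor)
        cursor += duration
    return starts

# Three 40 ms chunks arriving at t=1.00 s play back-to-back at
# 1.00, 1.04, 1.08 rather than all starting at "now" and overlapping.
print(schedule_starts([0.04, 0.04, 0.04], now=1.0, cursor=0.0))
# [1.0, 1.04, 1.08]
```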
```diff
@@ -425,14 +444,28 @@ class RealtimeDemo {
                     this.currentAudioSource = null;
                     resolve();
                 };
-                source.start();
-
+                // Schedule to minimize gaps between chunks
+                const now = this.playbackAudioContext.currentTime;
+                if (this.nextPlaybackTime < now) {
+                    this.nextPlaybackTime = now;
+                }
+                source.start(this.nextPlaybackTime);
+                this.nextPlaybackTime += audioBuffer.duration;
+
             } catch (error) {
                 console.error('Failed to play audio chunk:', error);
                 reject(error);
             }
         });
     }
+
+    handleTranscriptDelta(delta) {
+        if (!delta) return;
+        this.pendingAssistantText += delta;
+        // Update the live bubble if present; otherwise, append a temporary one.
+        // Reuse updateMessagesFromHistory to keep behavior consistent.
+        this.updateMessagesFromHistory([]);
+    }

     stopAudioPlayback() {
         console.log('Stopping audio playback due to interruption');
```
```diff
@@ -464,4 +497,4 @@ class RealtimeDemo {
 // Initialize the demo when the page loads
 document.addEventListener('DOMContentLoaded', () => {
     new RealtimeDemo();
-});
+});
```

examples/realtime/cli/demo.py

Lines changed: 58 additions & 15 deletions
```diff
@@ -8,10 +8,17 @@
 import sounddevice as sd

 from agents import function_tool
-from agents.realtime import RealtimeAgent, RealtimeRunner, RealtimeSession, RealtimeSessionEvent
+from agents.realtime import (
+    RealtimeAgent,
+    RealtimePlaybackTracker,
+    RealtimeRunner,
+    RealtimeSession,
+    RealtimeSessionEvent,
+)
+from agents.realtime.model import RealtimeModelConfig

 # Audio configuration
-CHUNK_LENGTH_S = 0.05  # 50ms
+CHUNK_LENGTH_S = 0.04  # 40ms aligns with realtime defaults
 SAMPLE_RATE = 24000
 FORMAT = np.int16
 CHANNELS = 1
```
```diff
@@ -49,11 +56,16 @@ def __init__(self) -> None:
         self.audio_player: sd.OutputStream | None = None
         self.recording = False

+        # Playback tracker lets the model know our real playback progress
+        self.playback_tracker = RealtimePlaybackTracker()
+
         # Audio output state for callback system
-        self.output_queue: queue.Queue[Any] = queue.Queue(maxsize=10)  # Buffer more chunks
+        # Store tuples: (samples_np, item_id, content_index)
+        self.output_queue: queue.Queue[Any] = queue.Queue(maxsize=100)
         self.interrupt_event = threading.Event()
-        self.current_audio_chunk: np.ndarray[Any, np.dtype[Any]] | None = None
+        self.current_audio_chunk: tuple[np.ndarray[Any, np.dtype[Any]], str, int] | None = None
         self.chunk_position = 0
+        self.bytes_per_sample = np.dtype(FORMAT).itemsize

     def _output_callback(self, outdata, frames: int, time, status) -> None:
         """Callback for audio output - handles continuous audio stream from server."""
```
```diff
@@ -92,20 +104,29 @@ def _output_callback(self, outdata, frames: int, time, status) -> None:

         # Copy data from current chunk to output buffer
         remaining_output = len(outdata) - samples_filled
-        remaining_chunk = len(self.current_audio_chunk) - self.chunk_position
+        samples, item_id, content_index = self.current_audio_chunk
+        remaining_chunk = len(samples) - self.chunk_position
         samples_to_copy = min(remaining_output, remaining_chunk)

         if samples_to_copy > 0:
-            chunk_data = self.current_audio_chunk[
-                self.chunk_position : self.chunk_position + samples_to_copy
-            ]
+            chunk_data = samples[self.chunk_position : self.chunk_position + samples_to_copy]
             # More efficient: direct assignment for mono audio instead of reshape
             outdata[samples_filled : samples_filled + samples_to_copy, 0] = chunk_data
             samples_filled += samples_to_copy
             self.chunk_position += samples_to_copy

+            # Inform playback tracker about played bytes
+            try:
+                self.playback_tracker.on_play_bytes(
+                    item_id=item_id,
+                    item_content_index=content_index,
+                    bytes=chunk_data.tobytes(),
+                )
+            except Exception:
+                pass
+
         # If we've used up the entire chunk, reset for next iteration
-        if self.chunk_position >= len(self.current_audio_chunk):
+        if self.chunk_position >= len(samples):
             self.current_audio_chunk = None
             self.chunk_position = 0
```

```diff
@@ -125,7 +146,15 @@ async def run(self) -> None:

         try:
             runner = RealtimeRunner(agent)
-            async with await runner.run() as session:
+            # Attach playback tracker and disable server-side response interruption,
+            # which can truncate assistant audio when mic picks up speaker output.
+            model_config: RealtimeModelConfig = {
+                "playback_tracker": self.playback_tracker,
+                "initial_model_settings": {
+                    "turn_detection": {"type": "semantic_vad", "interrupt_response": False},
+                },
+            }
+            async with await runner.run(model_config=model_config) as session:
                 self.session = session
                 print("Connected. Starting audio recording...")
```

```diff
@@ -170,6 +199,14 @@ async def capture_audio(self) -> None:
         read_size = int(SAMPLE_RATE * CHUNK_LENGTH_S)

         try:
+            # Simple energy-based barge-in: if user speaks while audio is playing, interrupt.
+            def rms_energy(samples: np.ndarray[Any, np.dtype[Any]]) -> float:
+                if samples.size == 0:
+                    return 0.0
+                # Normalize int16 to [-1, 1]
+                x = samples.astype(np.float32) / 32768.0
+                return float(np.sqrt(np.mean(x * x)))
+
             while self.recording:
                 # Check if there's enough data to read
                 if self.audio_stream.read_available < read_size:
```
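
As a sanity check on `rms_energy`: after normalizing int16 to [-1, 1], a full-scale sine wave measures about 1/√2 ≈ 0.707 and silence measures 0.0, which brackets where a speech threshold for barge-in would sit:

```python
import numpy as np


def rms_energy(samples: np.ndarray) -> float:
    if samples.size == 0:
        return 0.0
    x = samples.astype(np.float32) / 32768.0
    return float(np.sqrt(np.mean(x * x)))


t = np.linspace(0, 1, 24_000, endpoint=False)
full_scale_sine = (np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)

print(rms_energy(full_scale_sine))           # ~0.707, i.e. 1/sqrt(2)
print(rms_energy(np.zeros(960, np.int16)))   # 0.0
```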
```diff
@@ -182,8 +219,12 @@ async def capture_audio(self) -> None:
                 # Convert numpy array to bytes
                 audio_bytes = data.tobytes()

-                # Send audio to session
-                await self.session.send_audio(audio_bytes)
+                # Half-duplex gating: do not send mic while assistant audio is playing
+                assistant_playing = (
+                    self.current_audio_chunk is not None or not self.output_queue.empty()
+                )
+                if not assistant_playing:
+                    await self.session.send_audio(audio_bytes)

                 # Yield control back to event loop
                 await asyncio.sleep(0)
```
```diff
@@ -212,17 +253,19 @@ async def _on_event(self, event: RealtimeSessionEvent) -> None:
         elif event.type == "audio_end":
             print("Audio ended")
         elif event.type == "audio":
-            # Enqueue audio for callback-based playback
+            # Enqueue audio for callback-based playback with metadata
             np_audio = np.frombuffer(event.audio.data, dtype=np.int16)
             try:
-                self.output_queue.put_nowait(np_audio)
+                self.output_queue.put_nowait((np_audio, event.item_id, event.content_index))
             except queue.Full:
                 # Queue is full - only drop if we have significant backlog
                 # This prevents aggressive dropping that could cause choppiness
                 if self.output_queue.qsize() > 8:  # Keep some buffer
                     try:
                         self.output_queue.get_nowait()
-                        self.output_queue.put_nowait(np_audio)
+                        self.output_queue.put_nowait(
+                            (np_audio, event.item_id, event.content_index)
+                        )
                     except queue.Empty:
                         pass
                 # If queue isn't too full, just skip this chunk to avoid blocking
```