@@ -36,20 +36,32 @@ def __init__(self):
3636 self .active_sessions : dict [str , RealtimeSession ] = {}
3737 self .session_contexts : dict [str , Any ] = {}
3838 self .websockets : dict [str , WebSocket ] = {}
39+ self ._pending_audio : dict [str , bytearray ] = {}
40+ self ._audio_flush_tasks : dict [str , asyncio .Task [Any ]] = {}
3941
async def connect(self, websocket: WebSocket, session_id: str):
    """Accept ``websocket`` and start a realtime agent session for ``session_id``.

    Registers the websocket, opens a ``RealtimeRunner`` session for the
    starting agent, and launches two background tasks: one that forwards
    session events to the client (``_process_events``) and one that flushes
    coalesced audio on a timer (``_flush_audio_loop``).

    If creating or entering the session raises, the websocket registration
    made here is removed again before the exception propagates, so a failed
    connect does not leave a stale entry in ``self.websockets``.

    Args:
        websocket: The client connection to accept and register.
        session_id: Key under which all per-session state is stored.
    """
    await websocket.accept()
    self.websockets[session_id] = websocket

    try:
        runner = RealtimeRunner(get_starting_agent())
        # Disable server-side interrupt_response to avoid truncating assistant audio.
        session_context = await runner.run(
            model_config={
                "initial_model_settings": {
                    "turn_detection": {"type": "semantic_vad", "interrupt_response": False}
                }
            }
        )
        session = await session_context.__aenter__()
    except BaseException:
        # Undo the registration above; re-raise so the caller still sees the failure
        # (BaseException so that a cancellation mid-setup is cleaned up as well).
        self.websockets.pop(session_id, None)
        raise

    self.active_sessions[session_id] = session
    self.session_contexts[session_id] = session_context

    # Create the audio buffer before any task that reads or writes it is started.
    self._pending_audio[session_id] = bytearray()

    # Start event processing task. The event loop only keeps weak references to
    # tasks, so hold a strong one until the task finishes. The holder is created
    # lazily here so this method does not rely on attributes set up elsewhere.
    event_task = asyncio.create_task(self._process_events(session_id))
    if not hasattr(self, "_event_tasks"):
        self._event_tasks = set()
    self._event_tasks.add(event_task)
    event_task.add_done_callback(self._event_tasks.discard)

    # Steady flush task (~40ms) that sends the coalesced audio to the client.
    self._audio_flush_tasks[session_id] = asyncio.create_task(
        self._flush_audio_loop(session_id)
    )
5365
5466 async def disconnect (self , session_id : str ):
5567 if session_id in self .session_contexts :
@@ -59,6 +71,11 @@ async def disconnect(self, session_id: str):
5971 del self .active_sessions [session_id ]
6072 if session_id in self .websockets :
6173 del self .websockets [session_id ]
74+ if session_id in self ._pending_audio :
75+ del self ._pending_audio [session_id ]
76+ if session_id in self ._audio_flush_tasks :
77+ self ._audio_flush_tasks [session_id ].cancel ()
78+ del self ._audio_flush_tasks [session_id ]
6279
6380 async def send_audio (self , session_id : str , audio_bytes : bytes ):
6481 if session_id in self .active_sessions :
@@ -70,12 +87,13 @@ async def _process_events(self, session_id: str):
7087 websocket = self .websockets [session_id ]
7188
7289 async for event in session :
73- event_data = await self ._serialize_event (event )
74- await websocket .send_text (json .dumps (event_data ))
90+ event_data = await self ._serialize_event (session_id , event )
91+ if event_data is not None :
92+ await websocket .send_text (json .dumps (event_data ))
7593 except Exception as e :
7694 logger .error (f"Error processing events for session { session_id } : { e } " )
7795
78- async def _serialize_event (self , event : RealtimeSessionEvent ) -> dict [str , Any ]:
96+ async def _serialize_event (self , session_id : str , event : RealtimeSessionEvent ) -> dict [str , Any ] | None :
7997 base_event : dict [str , Any ] = {
8098 "type" : event .type ,
8199 }
@@ -93,7 +111,9 @@ async def _serialize_event(self, event: RealtimeSessionEvent) -> dict[str, Any]:
93111 base_event ["tool" ] = event .tool .name
94112 base_event ["output" ] = str (event .output )
95113 elif event .type == "audio" :
96- base_event ["audio" ] = base64 .b64encode (event .audio .data ).decode ("utf-8" )
114+ # Coalesce raw PCM and flush on a steady timer for smoother playback.
115+ self ._pending_audio [session_id ].extend (event .audio .data )
116+ return None
97117 elif event .type == "audio_interrupted" :
98118 pass
99119 elif event .type == "audio_end" :
@@ -107,9 +127,20 @@ async def _serialize_event(self, event: RealtimeSessionEvent) -> dict[str, Any]:
107127 {"name" : result .guardrail .name } for result in event .guardrail_results
108128 ]
109129 elif event .type == "raw_model_event" :
110- base_event ["raw_model_event" ] = {
111- "type" : event .data .type ,
112- }
130+ # Surface useful raw events to the UI with details.
131+ if getattr (event .data , "type" , None ) == "transcript_delta" :
132+ # Stream assistant transcript deltas to the UI.
133+ base_event = {
134+ "type" : "transcript_delta" ,
135+ "item_id" : getattr (event .data , "item_id" , "" ),
136+ "response_id" : getattr (event .data , "response_id" , "" ),
137+ "delta" : getattr (event .data , "delta" , "" ),
138+ }
139+ else :
140+ # Fallback to a minimal raw event descriptor.
141+ base_event ["raw_model_event" ] = {
142+ "type" : getattr (event .data , "type" , "other" ),
143+ }
113144 elif event .type == "error" :
114145 base_event ["error" ] = str (event .error ) if hasattr (event , "error" ) else "Unknown error"
115146 elif event .type == "input_audio_timeout_triggered" :
@@ -119,6 +150,28 @@ async def _serialize_event(self, event: RealtimeSessionEvent) -> dict[str, Any]:
119150
120151 return base_event
121152
async def _flush_audio_loop(self, session_id: str, interval: float = 0.04) -> None:
    """Periodically send the audio buffered for ``session_id`` to its websocket.

    Every ``interval`` seconds the bytes accumulated in
    ``self._pending_audio[session_id]`` are drained and sent as one JSON text
    frame of the form ``{"type": "audio", "audio": <base64 string>}``.
    The loop ends when ``session_id`` is no longer in ``self.websockets``,
    when a send fails, or when the task is cancelled.

    Args:
        session_id: Key into ``self.websockets`` and ``self._pending_audio``.
        interval: Seconds to sleep between flush attempts (default 0.04,
            i.e. a ~40ms cadence).
    """
    try:
        while session_id in self.websockets:
            await asyncio.sleep(interval)
            pending = self._pending_audio.get(session_id)
            ws = self.websockets.get(session_id)
            # Nothing buffered (or the session vanished while sleeping): wait again.
            if not pending or ws is None:
                continue
            # Snapshot and drain in place; there is no await between the two
            # steps, so no audio appended by another coroutine can be lost.
            chunk = bytes(pending)
            pending.clear()
            payload = {"type": "audio", "audio": base64.b64encode(chunk).decode("utf-8")}
            try:
                await ws.send_text(json.dumps(payload))
            except Exception:
                logger.error("Failed sending coalesced audio", exc_info=True)
                break
    except asyncio.CancelledError:
        # The task is stopped by cancelling it; treat that as a normal exit.
        pass
174+
122175
123176manager = RealtimeWebSocketManager ()
124177
@@ -142,7 +195,8 @@ async def websocket_endpoint(websocket: WebSocket, session_id: str):
142195 if message ["type" ] == "audio" :
143196 # Convert int16 array to bytes
144197 int16_data = message ["data" ]
145- audio_bytes = struct .pack (f"{ len (int16_data )} h" , * int16_data )
198+ # Send little-endian PCM16 to the model.
199+ audio_bytes = struct .pack ("<" + f"{ len (int16_data )} h" , * int16_data )
146200 await manager .send_audio (session_id , audio_bytes )
147201
148202 except WebSocketDisconnect :
@@ -160,4 +214,5 @@ async def read_index():
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on all interfaces, port 8000.
    import uvicorn

    log_level = "info"
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=8000,
        log_level=log_level,
    )
0 commit comments