@@ -419,8 +419,8 @@ async def _on_tts_audio_write_to_output(event: TTSAudioEvent):
419419
420420 @self .events .subscribe
421421 async def on_stt_transcript_event_create_response (event : STTTranscriptEvent ):
422- if self .realtime_mode or not self . llm :
423- # when running in realtime mode, there is no need to send the response to the LLM
422+ if self .llm . handles_audio :
423+ # There is no need to send the response to the LLM if it handles audio itself.
424424 return
425425
426426 user_id = event .user_id ()
@@ -793,7 +793,7 @@ async def on_audio_received(event: AudioReceivedEvent):
793793
794794 # Always listen to remote video tracks so we can forward frames to Realtime providers
795795 @self .edge .events .subscribe
796- async def on_track (event : TrackAddedEvent ):
796+ async def on_video_track_added (event : TrackAddedEvent ):
797797 track_id = event .track_id
798798 track_type = event .track_type
799799 user = event .user
@@ -807,7 +807,7 @@ async def on_track(event: TrackAddedEvent):
807807 f"🎥 Track re-added: { track_type_name } ({ track_id } ), switching to it"
808808 )
809809
810- if self .realtime_mode and isinstance ( self . llm , Realtime ) :
810+ if self .llm . handles_video :
811811 # Get the existing forwarder and switch to this track
812812 _ , _ , forwarder = self ._active_video_tracks [track_id ]
813813 track = self .edge .add_track_subscriber (track_id )
@@ -823,7 +823,7 @@ async def on_track(event: TrackAddedEvent):
823823 task .add_done_callback (_log_task_exception )
824824
825825 @self .edge .events .subscribe
826- async def on_track_removed (event : TrackRemovedEvent ):
826+ async def on_video_track_removed (event : TrackRemovedEvent ):
827827 track_id = event .track_id
828828 track_type = event .track_type
829829 if not track_id :
@@ -841,11 +841,7 @@ async def on_track_removed(event: TrackRemovedEvent):
841841 self ._active_video_tracks .pop (track_id , None )
842842
843843 # If this was the active track, switch to any other available track
844- if (
845- track_id == self ._current_video_track_id
846- and self .realtime_mode
847- and isinstance (self .llm , Realtime )
848- ):
844+ if self .llm .handles_video and track_id == self ._current_video_track_id :
849845 self .logger .info (
850846 "🎥 Active video track removed, switching to next available"
851847 )
@@ -871,7 +867,7 @@ async def _reply_to_audio(
871867 )
872868
873869 # when in Realtime mode call the Realtime directly (non-blocking)
874- if self .realtime_mode and isinstance ( self . llm , Realtime ) :
870+ if self .llm . handles_audio :
875871 # TODO: this behaviour should be easy to change in the agent class
876872 asyncio .create_task (
877873 self .llm .simple_audio_response (pcm_data , participant )
@@ -972,7 +968,7 @@ async def recv(self):
972968 # If Realtime provider supports video, switch to this new track
973969 track_type_name = TrackType .Name (track_type )
974970
975- if self .realtime_mode :
971+ if self .llm . handles_video :
976972 if self ._video_track :
977973 # We have a video publisher (e.g., YOLO processor)
978974 # Create a separate forwarder for the PROCESSED video track
@@ -1094,8 +1090,8 @@ async def recv(self):
10941090
10951091 async def _on_turn_event (self , event : TurnStartedEvent | TurnEndedEvent ) -> None :
10961092 """Handle turn detection events."""
1097- # In realtime mode, the LLM handles turn detection, interruption, and responses itself
1098- if self .realtime_mode :
1093+ # Skip turn event handling unless the model needs both TTS and STT.
1094+ if not ( self .llm . needs_tts and self . llm . needs_stt ) :
10991095 return
11001096
11011097 if isinstance (event , TurnStartedEvent ):
@@ -1129,56 +1125,44 @@ async def _on_turn_event(self, event: TurnStartedEvent | TurnEndedEvent) -> None
11291125 self .logger .info (
11301126 f"👉 Turn ended - participant { participant_id } finished (confidence: { event .confidence } )"
11311127 )
1128+ if not event .participant or event .participant .user_id == self .agent_user .id :
1129+ # Exit early if there is no participant or the turn belongs to the agent itself.
1130+ return
11321131
1133- # When turn detection is enabled, trigger LLM response when user's turn ends
1132+ # When turn detection is enabled, trigger LLM response when user's turn ends.
11341133 # This is the signal that the user has finished speaking and expects a response
1135- if event .participant and event .participant .user_id != self .agent_user .id :
1136- # Get the accumulated transcript for this speaker
1137- transcript = self ._pending_user_transcripts .get (
1138- event .participant .user_id , ""
1134+ transcript = self ._pending_user_transcripts .get (
1135+ event .participant .user_id , ""
1136+ )
1137+ if transcript .strip ():
1138+ self .logger .info (
1139+ f"🤖 Triggering LLM response after turn ended for { event .participant .user_id } "
11391140 )
11401141
1141- if transcript and transcript .strip ():
1142- self .logger .info (
1143- f"🤖 Triggering LLM response after turn ended for { event .participant .user_id } "
1144- )
1145-
1146- # Create participant object if we have metadata
1147- participant = None
1148- if hasattr (event , "custom" ) and event .custom :
1149- # Try to extract participant info from custom metadata
1150- participant = event .custom .get ("participant" )
1142+ # Create participant object if we have metadata
1143+ participant = None
1144+ if hasattr (event , "custom" ) and event .custom :
1145+ # Try to extract participant info from custom metadata
1146+ participant = event .custom .get ("participant" )
11511147
1152- # Trigger LLM response with the complete transcript
1153- if self .llm :
1154- await self .simple_response (transcript , participant )
1148+ # Trigger LLM response with the complete transcript
1149+ await self .simple_response (transcript , participant )
11551150
1156- # Clear the pending transcript for this speaker
1157- self ._pending_user_transcripts [event .participant .user_id ] = ""
1151+ # Clear the pending transcript for this speaker
1152+ self ._pending_user_transcripts [event .participant .user_id ] = ""
11581153
11591154 async def _on_stt_error (self , error ):
11601155 """Handle STT service errors."""
11611156 self .logger .error (f"❌ STT Error: { error } " )
11621157
1163- @property
1164- def realtime_mode (self ) -> bool :
1165- """Check if the agent is in Realtime mode.
1166-
1167- Returns:
1168- True if `llm` is a `Realtime` implementation; otherwise False.
1169- """
1170- if self .llm is not None and isinstance (self .llm , Realtime ):
1171- return True
1172- return False
1173-
11741158 @property
11751159 def publish_audio (self ) -> bool :
11761160 """Whether the agent should publish an outbound audio track.
11771161
11781162 Returns:
11791163 True if TTS is configured or when in Realtime mode.
11801164 """
1181- if self .tts is not None or self .realtime_mode :
1165+ if self .tts is not None or self .llm . handles_audio :
11821166 return True
11831167 return False
11841168
@@ -1212,9 +1196,7 @@ def _needs_audio_or_video_input(self) -> bool:
12121196 # Video input needed for:
12131197 # - Video processors (for frame analysis)
12141198 # - Realtime mode with video (multimodal LLMs)
1215- needs_video = len (self .video_processors ) > 0 or (
1216- self .realtime_mode and isinstance (self .llm , Realtime )
1217- )
1199+ needs_video = len (self .video_processors ) > 0 or self .llm .handles_video
12181200
12191201 return needs_audio or needs_video
12201202
@@ -1265,7 +1247,7 @@ def image_processors(self) -> List[Any]:
12651247
12661248 def _validate_configuration (self ):
12671249 """Validate the agent configuration."""
1268- if self .realtime_mode :
1250+ if self .llm . handles_audio :
12691251 # Realtime mode - should not have separate STT/TTS
12701252 if self .stt or self .tts :
12711253 self .logger .warning (
@@ -1302,7 +1284,7 @@ def _prepare_rtc(self):
13021284
13031285 # Set up audio track if TTS is available
13041286 if self .publish_audio :
1305- if self .realtime_mode and isinstance ( self . llm , Realtime ) :
1287+ if self .llm . handles_audio :
13061288 self ._audio_track = self .llm .output_track
13071289 self .logger .info ("🎵 Using Realtime provider output track for audio" )
13081290 else :
0 commit comments