
Commit c1584e0

Add flags to signal capabilities and requirements in LLM
1 parent 60f6d83 commit c1584e0

3 files changed: +46 −53 lines

agents-core/vision_agents/core/agents/agents.py

Lines changed: 33 additions & 51 deletions
@@ -419,8 +419,8 @@ async def _on_tts_audio_write_to_output(event: TTSAudioEvent):
 
         @self.events.subscribe
         async def on_stt_transcript_event_create_response(event: STTTranscriptEvent):
-            if self.realtime_mode or not self.llm:
-                # when running in realtime mode, there is no need to send the response to the LLM
+            if self.llm.handles_audio:
+                # There is no need to send the response to the LLM if it handles audio itself.
                 return
 
             user_id = event.user_id()
@@ -793,7 +793,7 @@ async def on_audio_received(event: AudioReceivedEvent):
 
         # Always listen to remote video tracks so we can forward frames to Realtime providers
         @self.edge.events.subscribe
-        async def on_track(event: TrackAddedEvent):
+        async def on_video_track_added(event: TrackAddedEvent):
             track_id = event.track_id
             track_type = event.track_type
             user = event.user
@@ -807,7 +807,7 @@ async def on_track(event: TrackAddedEvent):
                     f"🎥 Track re-added: {track_type_name} ({track_id}), switching to it"
                 )
 
-                if self.realtime_mode and isinstance(self.llm, Realtime):
+                if self.llm.handles_video:
                     # Get the existing forwarder and switch to this track
                     _, _, forwarder = self._active_video_tracks[track_id]
                     track = self.edge.add_track_subscriber(track_id)
@@ -823,7 +823,7 @@ async def on_track(event: TrackAddedEvent):
                 task.add_done_callback(_log_task_exception)
 
         @self.edge.events.subscribe
-        async def on_track_removed(event: TrackRemovedEvent):
+        async def on_video_track_removed(event: TrackRemovedEvent):
             track_id = event.track_id
             track_type = event.track_type
             if not track_id:
@@ -841,11 +841,7 @@ async def on_track_removed(event: TrackRemovedEvent):
             self._active_video_tracks.pop(track_id, None)
 
             # If this was the active track, switch to any other available track
-            if (
-                track_id == self._current_video_track_id
-                and self.realtime_mode
-                and isinstance(self.llm, Realtime)
-            ):
+            if self.llm.handles_video and track_id == self._current_video_track_id:
                 self.logger.info(
                     "🎥 Active video track removed, switching to next available"
                 )
@@ -871,7 +867,7 @@ async def _reply_to_audio(
             )
 
             # when in Realtime mode call the Realtime directly (non-blocking)
-            if self.realtime_mode and isinstance(self.llm, Realtime):
+            if self.llm.handles_audio:
                 # TODO: this behaviour should be easy to change in the agent class
                 asyncio.create_task(
                     self.llm.simple_audio_response(pcm_data, participant)
@@ -972,7 +968,7 @@ async def recv(self):
             # If Realtime provider supports video, switch to this new track
             track_type_name = TrackType.Name(track_type)
 
-            if self.realtime_mode:
+            if self.llm.handles_video:
                 if self._video_track:
                     # We have a video publisher (e.g., YOLO processor)
                     # Create a separate forwarder for the PROCESSED video track
@@ -1094,8 +1090,8 @@ async def recv(self):
 
     async def _on_turn_event(self, event: TurnStartedEvent | TurnEndedEvent) -> None:
         """Handle turn detection events."""
-        # In realtime mode, the LLM handles turn detection, interruption, and responses itself
-        if self.realtime_mode:
+        # Skip turn-event handling unless the model requires both STT and TTS.
+        if not (self.llm.needs_tts and self.llm.needs_stt):
             return
 
         if isinstance(event, TurnStartedEvent):
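A note on the new guard: not (needs_tts and needs_stt) skips manual turn handling whenever either flag is False, i.e. whenever the model itself covers any part of the audio pipeline. A standalone sanity check of that boolean (illustrative sketch, not code from this commit):

def skip_turn_handling(needs_tts: bool, needs_stt: bool) -> bool:
    # Mirrors the guard above: if not (self.llm.needs_tts and self.llm.needs_stt): return
    return not (needs_tts and needs_stt)

assert skip_turn_handling(False, False) is True   # realtime-style model: skip
assert skip_turn_handling(True, True) is False    # classic STT -> LLM -> TTS pipeline: handle turns
assert skip_turn_handling(True, False) is True    # any missing requirement also skips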
@@ -1129,56 +1125,44 @@ async def _on_turn_event(self, event: TurnStartedEvent | TurnEndedEvent) -> None
             self.logger.info(
                 f"👉 Turn ended - participant {participant_id} finished (confidence: {event.confidence})"
             )
+            if not event.participant or event.participant.user_id == self.agent_user.id:
+                # Exit early if the event is triggered by the model response.
+                return
 
-            # When turn detection is enabled, trigger LLM response when user's turn ends
+            # When turn detection is enabled, trigger LLM response when user's turn ends.
             # This is the signal that the user has finished speaking and expects a response
-            if event.participant and event.participant.user_id != self.agent_user.id:
-                # Get the accumulated transcript for this speaker
-                transcript = self._pending_user_transcripts.get(
-                    event.participant.user_id, ""
+            transcript = self._pending_user_transcripts.get(
+                event.participant.user_id, ""
+            )
+            if transcript.strip():
+                self.logger.info(
+                    f"🤖 Triggering LLM response after turn ended for {event.participant.user_id}"
                 )
 
-                if transcript and transcript.strip():
-                    self.logger.info(
-                        f"🤖 Triggering LLM response after turn ended for {event.participant.user_id}"
-                    )
-
-                    # Create participant object if we have metadata
-                    participant = None
-                    if hasattr(event, "custom") and event.custom:
-                        # Try to extract participant info from custom metadata
-                        participant = event.custom.get("participant")
+                # Create participant object if we have metadata
+                participant = None
+                if hasattr(event, "custom") and event.custom:
+                    # Try to extract participant info from custom metadata
+                    participant = event.custom.get("participant")
 
-                    # Trigger LLM response with the complete transcript
-                    if self.llm:
-                        await self.simple_response(transcript, participant)
+                # Trigger LLM response with the complete transcript
+                await self.simple_response(transcript, participant)
 
-                    # Clear the pending transcript for this speaker
-                    self._pending_user_transcripts[event.participant.user_id] = ""
+                # Clear the pending transcript for this speaker
+                self._pending_user_transcripts[event.participant.user_id] = ""
 
     async def _on_stt_error(self, error):
         """Handle STT service errors."""
         self.logger.error(f"❌ STT Error: {error}")
 
-    @property
-    def realtime_mode(self) -> bool:
-        """Check if the agent is in Realtime mode.
-
-        Returns:
-            True if `llm` is a `Realtime` implementation; otherwise False.
-        """
-        if self.llm is not None and isinstance(self.llm, Realtime):
-            return True
-        return False
-
     @property
     def publish_audio(self) -> bool:
         """Whether the agent should publish an outbound audio track.
 
         Returns:
             True if TTS is configured or when in Realtime mode.
         """
-        if self.tts is not None or self.realtime_mode:
+        if self.tts is not None or self.llm.handles_audio:
             return True
         return False
 
@@ -1212,9 +1196,7 @@ def _needs_audio_or_video_input(self) -> bool:
         # Video input needed for:
         # - Video processors (for frame analysis)
         # - Realtime mode with video (multimodal LLMs)
-        needs_video = len(self.video_processors) > 0 or (
-            self.realtime_mode and isinstance(self.llm, Realtime)
-        )
+        needs_video = len(self.video_processors) > 0 or self.llm.handles_video
 
         return needs_audio or needs_video
@@ -1265,7 +1247,7 @@ def image_processors(self) -> List[Any]:
 
     def _validate_configuration(self):
         """Validate the agent configuration."""
-        if self.realtime_mode:
+        if self.llm.handles_audio:
             # Realtime mode - should not have separate STT/TTS
             if self.stt or self.tts:
                 self.logger.warning(
@@ -1302,7 +1284,7 @@ def _prepare_rtc(self):
 
         # Set up audio track if TTS is available
         if self.publish_audio:
-            if self.realtime_mode and isinstance(self.llm, Realtime):
+            if self.llm.handles_audio:
                 self._audio_track = self.llm.output_track
                 self.logger.info("🎵 Using Realtime provider output track for audio")
             else:
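Taken together, the agents.py changes replace type inspection (isinstance(self.llm, Realtime)) with capability checks on plain class attributes, so any LLM subclass can opt into realtime-style routing without inheriting from Realtime. A minimal sketch of the pattern with stand-in classes (names are illustrative, not part of this commit):

class TextLLM:
    # Defaults mirroring the new LLM base-class flags
    needs_stt = True
    needs_tts = True
    handles_audio = False
    handles_video = False

class SpeechToSpeechLLM(TextLLM):
    # Overrides mirroring the new Realtime defaults
    needs_stt = False
    needs_tts = False
    handles_audio = True
    handles_video = True

def publish_audio(llm, tts) -> bool:
    # Mirrors the updated Agent.publish_audio property
    return tts is not None or llm.handles_audio

assert publish_audio(TextLLM(), tts=None) is False
assert publish_audio(SpeechToSpeechLLM(), tts=None) is True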

agents-core/vision_agents/core/llm/llm.py

Lines changed: 6 additions & 2 deletions
@@ -44,8 +44,12 @@ def __init__(self, original: T, text: str, exception: Optional[Exception] = None
 
 
 class LLM(abc.ABC):
-    # if we want to use realtime/ sts behaviour
-    sts: bool = False
+    # Instruct the Agent that this model requires STT and TTS services and doesn't
+    # handle audio or video on its own.
+    needs_stt: bool = True
+    needs_tts: bool = True
+    handles_audio: bool = False
+    handles_video: bool = False
 
     before_response_listener: BeforeCb
     after_response_listener: AfterCb
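Because the four flags are independent class attributes, a provider can advertise a partial capability set. For example, a hypothetical speech-to-speech model without vision support could be declared as below (sketch only; a real subclass must also implement the abstract LLM methods, omitted here):

from vision_agents.core.llm.llm import LLM

class AudioOnlyRealtimeLLM(LLM):  # hypothetical subclass, not part of this commit
    # The Agent routes PCM audio straight to the model and skips STT/TTS wiring,
    # but still won't forward video frames to it.
    needs_stt = False
    needs_tts = False
    handles_audio = True
    handles_video = False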

agents-core/vision_agents/core/llm/realtime.py

Lines changed: 7 additions & 0 deletions
@@ -42,6 +42,13 @@ class Realtime(LLM, abc.ABC):
     fps: int = 1
     session_id: str  # UUID to identify this session
 
+    # Instruct the Agent that this model can handle audio and video
+    # without additional STT and TTS services.
+    handles_audio: bool = True
+    handles_video: bool = True
+    needs_stt = False
+    needs_tts = False
+
     def __init__(
         self,
         fps: int = 1,  # the number of video frames per second to send (for implementations that support setting fps)
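Since these are class attributes, every Realtime subclass inherits the flipped defaults automatically, and callers can read them without instantiating the abstract classes. A quick check of the intent (assuming the import paths match the file layout shown above):

from vision_agents.core.llm.llm import LLM
from vision_agents.core.llm.realtime import Realtime

# Class-level reads work on abstract classes; no instances needed.
assert LLM.needs_stt and LLM.needs_tts
assert not (LLM.handles_audio or LLM.handles_video)
assert Realtime.handles_audio and Realtime.handles_video
assert not (Realtime.needs_stt or Realtime.needs_tts)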
