
Commit c1584e0

Add flags to signal capabilities and requirements in LLM
1 parent 60f6d83 commit c1584e0

3 files changed: +46 −53 lines

agents-core/vision_agents/core/agents/agents.py

Lines changed: 33 additions & 51 deletions
@@ -419,8 +419,8 @@ async def _on_tts_audio_write_to_output(event: TTSAudioEvent):
 
         @self.events.subscribe
         async def on_stt_transcript_event_create_response(event: STTTranscriptEvent):
-            if self.realtime_mode or not self.llm:
-                # when running in realtime mode, there is no need to send the response to the LLM
+            if self.llm.handles_audio:
+                # There is no need to send the response to the LLM if it handles audio itself.
                 return
 
             user_id = event.user_id()
@@ -793,7 +793,7 @@ async def on_audio_received(event: AudioReceivedEvent):
 
         # Always listen to remote video tracks so we can forward frames to Realtime providers
         @self.edge.events.subscribe
-        async def on_track(event: TrackAddedEvent):
+        async def on_video_track_added(event: TrackAddedEvent):
             track_id = event.track_id
             track_type = event.track_type
             user = event.user
@@ -807,7 +807,7 @@ async def on_track(event: TrackAddedEvent):
                     f"🎥 Track re-added: {track_type_name} ({track_id}), switching to it"
                 )
 
-                if self.realtime_mode and isinstance(self.llm, Realtime):
+                if self.llm.handles_video:
                     # Get the existing forwarder and switch to this track
                     _, _, forwarder = self._active_video_tracks[track_id]
                     track = self.edge.add_track_subscriber(track_id)
@@ -823,7 +823,7 @@ async def on_track(event: TrackAddedEvent):
                 task.add_done_callback(_log_task_exception)
 
         @self.edge.events.subscribe
-        async def on_track_removed(event: TrackRemovedEvent):
+        async def on_video_track_removed(event: TrackRemovedEvent):
             track_id = event.track_id
             track_type = event.track_type
             if not track_id:
@@ -841,11 +841,7 @@ async def on_track_removed(event: TrackRemovedEvent):
             self._active_video_tracks.pop(track_id, None)
 
             # If this was the active track, switch to any other available track
-            if (
-                track_id == self._current_video_track_id
-                and self.realtime_mode
-                and isinstance(self.llm, Realtime)
-            ):
+            if self.llm.handles_video and track_id == self._current_video_track_id:
                 self.logger.info(
                     "🎥 Active video track removed, switching to next available"
                 )
@@ -871,7 +867,7 @@ async def _reply_to_audio(
             )
 
             # when in Realtime mode call the Realtime directly (non-blocking)
-            if self.realtime_mode and isinstance(self.llm, Realtime):
+            if self.llm.handles_audio:
                 # TODO: this behaviour should be easy to change in the agent class
                 asyncio.create_task(
                     self.llm.simple_audio_response(pcm_data, participant)
@@ -972,7 +968,7 @@ async def recv(self):
             # If Realtime provider supports video, switch to this new track
             track_type_name = TrackType.Name(track_type)
 
-            if self.realtime_mode:
+            if self.llm.handles_video:
                 if self._video_track:
                     # We have a video publisher (e.g., YOLO processor)
                     # Create a separate forwarder for the PROCESSED video track
@@ -1094,8 +1090,8 @@ async def recv(self):
 
     async def _on_turn_event(self, event: TurnStartedEvent | TurnEndedEvent) -> None:
         """Handle turn detection events."""
-        # In realtime mode, the LLM handles turn detection, interruption, and responses itself
-        if self.realtime_mode:
+        # Skip turn-event handling unless the model requires both STT and TTS.
+        if not (self.llm.needs_tts and self.llm.needs_stt):
             return
 
         if isinstance(event, TurnStartedEvent):
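A note on the new guard: not (needs_tts and needs_stt) skips manual turn handling whenever either flag is False, i.e. whenever the model itself covers any part of the audio pipeline. A standalone sanity check of that boolean (illustrative sketch, not code from this commit):

def skip_turn_handling(needs_tts: bool, needs_stt: bool) -> bool:
    # Mirrors the guard above: if not (self.llm.needs_tts and self.llm.needs_stt): return
    return not (needs_tts and needs_stt)

assert skip_turn_handling(False, False) is True   # realtime-style model: skip
assert skip_turn_handling(True, True) is False    # classic STT -> LLM -> TTS pipeline: handle turns
assert skip_turn_handling(True, False) is True    # any missing requirement also skips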
@@ -1129,56 +1125,44 @@ async def _on_turn_event(self, event: TurnStartedEvent | TurnEndedEvent) -> None
             self.logger.info(
                 f"👉 Turn ended - participant {participant_id} finished (confidence: {event.confidence})"
             )
+            if not event.participant or event.participant.user_id == self.agent_user.id:
+                # Exit early if the event is triggered by the model response.
+                return
 
-            # When turn detection is enabled, trigger LLM response when user's turn ends
+            # When turn detection is enabled, trigger LLM response when user's turn ends.
             # This is the signal that the user has finished speaking and expects a response
-            if event.participant and event.participant.user_id != self.agent_user.id:
-                # Get the accumulated transcript for this speaker
-                transcript = self._pending_user_transcripts.get(
-                    event.participant.user_id, ""
+            transcript = self._pending_user_transcripts.get(
+                event.participant.user_id, ""
+            )
+            if transcript.strip():
+                self.logger.info(
+                    f"🤖 Triggering LLM response after turn ended for {event.participant.user_id}"
                 )
 
-                if transcript and transcript.strip():
-                    self.logger.info(
-                        f"🤖 Triggering LLM response after turn ended for {event.participant.user_id}"
-                    )
-
-                    # Create participant object if we have metadata
-                    participant = None
-                    if hasattr(event, "custom") and event.custom:
-                        # Try to extract participant info from custom metadata
-                        participant = event.custom.get("participant")
+                # Create participant object if we have metadata
+                participant = None
+                if hasattr(event, "custom") and event.custom:
+                    # Try to extract participant info from custom metadata
+                    participant = event.custom.get("participant")
 
-                    # Trigger LLM response with the complete transcript
-                    if self.llm:
-                        await self.simple_response(transcript, participant)
+                # Trigger LLM response with the complete transcript
+                await self.simple_response(transcript, participant)
 
-                    # Clear the pending transcript for this speaker
-                    self._pending_user_transcripts[event.participant.user_id] = ""
+                # Clear the pending transcript for this speaker
+                self._pending_user_transcripts[event.participant.user_id] = ""
 
     async def _on_stt_error(self, error):
         """Handle STT service errors."""
         self.logger.error(f"❌ STT Error: {error}")
 
-    @property
-    def realtime_mode(self) -> bool:
-        """Check if the agent is in Realtime mode.
-
-        Returns:
-            True if `llm` is a `Realtime` implementation; otherwise False.
-        """
-        if self.llm is not None and isinstance(self.llm, Realtime):
-            return True
-        return False
-
     @property
     def publish_audio(self) -> bool:
         """Whether the agent should publish an outbound audio track.
 
         Returns:
             True if TTS is configured or when in Realtime mode.
         """
-        if self.tts is not None or self.realtime_mode:
+        if self.tts is not None or self.llm.handles_audio:
             return True
         return False
 
@@ -1212,9 +1196,7 @@ def _needs_audio_or_video_input(self) -> bool:
         # Video input needed for:
         # - Video processors (for frame analysis)
         # - Realtime mode with video (multimodal LLMs)
-        needs_video = len(self.video_processors) > 0 or (
-            self.realtime_mode and isinstance(self.llm, Realtime)
-        )
+        needs_video = len(self.video_processors) > 0 or self.llm.handles_video
 
         return needs_audio or needs_video
@@ -1265,7 +1247,7 @@ def image_processors(self) -> List[Any]:
 
     def _validate_configuration(self):
         """Validate the agent configuration."""
-        if self.realtime_mode:
+        if self.llm.handles_audio:
             # Realtime mode - should not have separate STT/TTS
             if self.stt or self.tts:
                 self.logger.warning(
@@ -1302,7 +1284,7 @@ def _prepare_rtc(self):
 
         # Set up audio track if TTS is available
         if self.publish_audio:
-            if self.realtime_mode and isinstance(self.llm, Realtime):
+            if self.llm.handles_audio:
                 self._audio_track = self.llm.output_track
                 self.logger.info("🎵 Using Realtime provider output track for audio")
             else:
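Taken together, the agents.py changes replace type inspection (isinstance(self.llm, Realtime)) with capability checks on plain class attributes, so any LLM subclass can opt into realtime-style routing without inheriting from Realtime. A minimal sketch of the pattern with stand-in classes (names are illustrative, not part of this commit):

class TextLLM:
    # Defaults mirroring the new LLM base-class flags
    needs_stt = True
    needs_tts = True
    handles_audio = False
    handles_video = False

class SpeechToSpeechLLM(TextLLM):
    # Overrides mirroring the new Realtime defaults
    needs_stt = False
    needs_tts = False
    handles_audio = True
    handles_video = True

def publish_audio(llm, tts) -> bool:
    # Mirrors the updated Agent.publish_audio property
    return tts is not None or llm.handles_audio

assert publish_audio(TextLLM(), tts=None) is False
assert publish_audio(SpeechToSpeechLLM(), tts=None) is True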

agents-core/vision_agents/core/llm/llm.py

Lines changed: 6 additions & 2 deletions
@@ -44,8 +44,12 @@ def __init__(self, original: T, text: str, exception: Optional[Exception] = None
 
 
 class LLM(abc.ABC):
-    # if we want to use realtime/ sts behaviour
-    sts: bool = False
+    # Instruct the Agent that this model requires STT and TTS services and doesn't
+    # handle audio or video on its own.
+    needs_stt: bool = True
+    needs_tts: bool = True
+    handles_audio: bool = False
+    handles_video: bool = False
 
     before_response_listener: BeforeCb
     after_response_listener: AfterCb
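Because the four flags are independent class attributes, a provider can advertise a partial capability set. For example, a hypothetical speech-to-speech model without vision support could be declared as below (sketch only; a real subclass must also implement the abstract LLM methods, omitted here):

from vision_agents.core.llm.llm import LLM

class AudioOnlyRealtimeLLM(LLM):  # hypothetical subclass, not part of this commit
    # The Agent routes PCM audio straight to the model and skips STT/TTS wiring,
    # but still won't forward video frames to it.
    needs_stt = False
    needs_tts = False
    handles_audio = True
    handles_video = False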

agents-core/vision_agents/core/llm/realtime.py

Lines changed: 7 additions & 0 deletions
@@ -42,6 +42,13 @@ class Realtime(LLM, abc.ABC):
     fps: int = 1
     session_id: str  # UUID to identify this session
 
+    # Instruct the Agent that this model can handle audio and video
+    # without additional STT and TTS services.
+    handles_audio: bool = True
+    handles_video: bool = True
+    needs_stt = False
+    needs_tts = False
+
     def __init__(
         self,
         fps: int = 1,  # the number of video frames per second to send (for implementations that support setting fps)
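Since these are class attributes, every Realtime subclass inherits the flipped defaults automatically, and callers can read them without instantiating the abstract classes. A quick check of the intent (assuming the import paths match the file layout shown above):

from vision_agents.core.llm.llm import LLM
from vision_agents.core.llm.realtime import Realtime

# Class-level reads work on abstract classes; no instances needed.
assert LLM.needs_stt and LLM.needs_tts
assert not (LLM.handles_audio or LLM.handles_video)
assert Realtime.handles_audio and Realtime.handles_video
assert not (Realtime.needs_stt or Realtime.needs_tts)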
