Skip to content

Commit 60f6d83

Browse files
Bugfix gemini (#150)
* fix gemini framerate * wip
1 parent 11f548a commit 60f6d83

File tree

5 files changed

+66
-6
lines changed

5 files changed

+66
-6
lines changed

agents-core/vision_agents/core/utils/audio_forwarder.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ async def _reader(self):
5050
"""Read audio frames from track and forward to callback."""
5151
while True:
5252
try:
53+
5354
received = await asyncio.wait_for(self.track.recv(), timeout=1.0)
5455
frame = cast(av.AudioFrame, received)
5556

examples/other_examples/openai_realtime_webrtc/openai_realtime_example.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,6 @@ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> Non
5555
#TODO: should open demo be done by the CLI instead of the example?
5656
await agent.edge.open_demo(call)
5757
logger.info("LLM ready")
58-
# await agent.llm.request_session_info()
59-
logger.info("Requested session info")
6058
# Wait for a human to join the call before greeting
6159
logger.info("Waiting for human to join the call")
6260
await agent.llm.simple_response(text="Please greet the user.")

plugins/gemini/vision_agents/plugins/gemini/gemini_realtime.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -147,12 +147,12 @@ async def simple_audio_response(
147147
return
148148

149149
self._current_participant = participant
150-
self.logger.debug(f"Sending audio to gemini: {pcm.duration}")
150+
151151
# Build blob and send directly
152152
audio_bytes = pcm.resample(
153153
target_sample_rate=16000, target_channels=1
154154
).samples.tobytes()
155-
mime = f"audio/pcm;rate={pcm.sample_rate}"
155+
mime = f"audio/pcm;rate=16000"
156156
blob = Blob(data=audio_bytes, mime_type=mime)
157157

158158
await self._require_session().send_realtime_input(audio=blob)

plugins/openai/vision_agents/plugins/openai/openai_realtime.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
RealtimeSessionCreateRequestParam,
77
ResponseAudioTranscriptDoneEvent,
88
InputAudioBufferSpeechStartedEvent,
9-
ConversationItemInputAudioTranscriptionCompletedEvent,
9+
ConversationItemInputAudioTranscriptionCompletedEvent, SessionUpdatedEvent, ResponseCreatedEvent, ResponseDoneEvent,
1010
)
1111

1212
from vision_agents.core.llm import realtime
@@ -240,6 +240,20 @@ async def _handle_openai_event(self, event: dict) -> None:
240240
elif et == "response.tool_call":
241241
# Handle tool calls from OpenAI realtime
242242
await self._handle_tool_call_event(event)
243+
elif et == "response.created":
244+
e = ResponseCreatedEvent(**event)
245+
pass
246+
elif et == "response.done":
247+
logger.info("OpenAI response done %s", event)
248+
e = ResponseDoneEvent(**event)
249+
250+
if e.response.status == "failed":
251+
raise Exception("OpenAI realtime failure %s", e.response)
252+
elif et == "session.updated":
253+
pass
254+
#e = SessionUpdatedEvent(**event)
255+
else:
256+
logger.info(f"Unrecognized OpenAI Realtime event: {et} {event}")
243257

244258
async def _handle_audio_output(self, pcm: PcmData) -> None:
245259
"""Process audio output received from the OpenAI API.

plugins/openai/vision_agents/plugins/openai/rtc_manager.py

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,9 @@ def __init__(self, model: str, voice: str, send_video: bool):
4646
self.pc = RTCPeerConnection()
4747
self.data_channel: Optional[RTCDataChannel] = None
4848

49+
# Set up connection event handlers
50+
self._setup_connection_logging()
51+
4952
# on this track we send audio to openAI
5053
self._audio_to_openai_track: QueuedAudioTrack = QueuedAudioTrack(
5154
sample_rate=48000
@@ -62,6 +65,51 @@ def __init__(self, model: str, voice: str, send_video: bool):
6265

6366
self.instructions: Optional[str] = None
6467

68+
def _setup_connection_logging(self) -> None:
    """Register logging-only listeners on the peer connection.

    Handlers are attached to ``self.pc`` for the ``connectionstatechange``,
    ``iceconnectionstatechange``, ``icegatheringstatechange``,
    ``signalingstatechange`` and ``datachannel`` events. Every handler only
    writes to the module logger; none of them modifies the connection.
    """
    # Follow-up messages as (state, logger method name, message) triples.
    # A state that is not listed only gets the generic state-change line.
    connection_notes = (
        ("failed", "error", "❌ RTC connection failed"),
        ("disconnected", "warning", "⚠️ RTC connection disconnected"),
        ("connected", "info", "✅ RTC connection established"),
        ("closed", "info", "🔒 RTC connection closed"),
    )
    ice_notes = (
        ("failed", "error", "❌ ICE connection failed"),
        ("disconnected", "warning", "⚠️ ICE connection disconnected"),
        ("connected", "info", "✅ ICE connection established"),
        ("checking", "debug", "🔍 ICE checking candidates..."),
    )

    def _log_note(notes, current) -> None:
        """Emit the message of the first triple whose state equals ``current``."""
        for known_state, method_name, message in notes:
            if current == known_state:
                # Resolve the logger method at call time, as a direct
                # logger.<level>(...) call would.
                getattr(logger, method_name)(message)
                break

    @self.pc.on("connectionstatechange")
    async def on_connectionstatechange():
        current = self.pc.connectionState
        logger.info(f"🔗 RTC connection state changed: {current}")
        _log_note(connection_notes, current)

    @self.pc.on("iceconnectionstatechange")
    async def on_iceconnectionstatechange():
        current = self.pc.iceConnectionState
        logger.info(f"🧊 ICE connection state: {current}")
        _log_note(ice_notes, current)

    @self.pc.on("icegatheringstatechange")
    async def on_icegatheringstatechange():
        current = self.pc.iceGatheringState
        logger.debug(f"🧊 ICE gathering state: {current}")
        if current == "complete":
            logger.info("✅ ICE gathering complete")

    @self.pc.on("signalingstatechange")
    async def on_signalingstatechange():
        current = self.pc.signalingState
        logger.debug(f"📡 Signaling state: {current}")

    @self.pc.on("datachannel")
    async def on_datachannel(channel):
        logger.info(f"📨 Remote data channel created: {channel.label}")
65113
async def connect(self) -> None:
66114
"""Establish WebRTC connection to OpenAI's Realtime API.
67115
@@ -81,7 +129,6 @@ async def on_track(track):
81129
logger.info("receiving track from openai")
82130
if track.kind == "audio":
83131
track = cast(AudioStreamTrack, track)
84-
logger.info("Remote audio track attached; starting audio forwarder")
85132
if self._audio_callback:
86133
audio_forwarder = AudioForwarder(track, self._audio_callback)
87134
await audio_forwarder.start()

0 commit comments

Comments
 (0)