|
9 | 9 | from livekit import rtc |
10 | 10 | from livekit.agents import llm, utils |
11 | 11 | from livekit.agents.llm.function_context import _create_ai_function_info |
| 12 | +from livekit.agents.utils import images |
12 | 13 |
|
13 | 14 | from google import genai |
14 | 15 | from google.genai.types import ( |
@@ -331,14 +332,53 @@ def fnc_ctx(self) -> llm.FunctionContext | None: |
    def fnc_ctx(self, value: llm.FunctionContext | None) -> None:
        """Replace (or clear, with ``None``) the function context used for tool calling."""
        self._fnc_ctx = value
|
334 | | - def _push_audio(self, frame: rtc.AudioFrame) -> None: |
335 | | - if self._opts.enable_user_audio_transcription: |
336 | | - self._transcriber._push_audio(frame) |
| 335 | + def _push_media_chunk(self, data: bytes, mime_type: str) -> None: |
337 | 336 | realtime_input = LiveClientRealtimeInput( |
338 | | - media_chunks=[Blob(data=frame.data.tobytes(), mime_type="audio/pcm")], |
| 337 | + media_chunks=[Blob(data=data, mime_type=mime_type)], |
339 | 338 | ) |
340 | 339 | self._queue_msg(realtime_input) |
341 | 340 |
|
    # Default video-frame encoding for push_video: JPEG at quality 75, resized
    # to fit within 1024x1024 while preserving aspect ratio.
    DEFAULT_ENCODE_OPTIONS = images.EncodeOptions(
        format="JPEG",
        quality=75,
        resize_options=images.ResizeOptions(
            width=1024, height=1024, strategy="scale_aspect_fit"
        ),
    )
| 349 | + def push_video( |
| 350 | + self, |
| 351 | + frame: rtc.VideoFrame, |
| 352 | + encode_options: images.EncodeOptions = DEFAULT_ENCODE_OPTIONS, |
| 353 | + ) -> None: |
| 354 | + """Push a video frame to the Gemini Multimodal Live session. |
| 355 | +
|
| 356 | + Args: |
| 357 | + frame (rtc.VideoFrame): The video frame to push. |
| 358 | + encode_options (images.EncodeOptions, optional): The encode options for the video frame. Defaults to 1024x1024 JPEG. |
| 359 | +
|
| 360 | + Notes: |
| 361 | + - This will be sent immediately so you should use a sampling frame rate that makes sense for your application and Gemini's constraints. 1 FPS is a good starting point. |
| 362 | + """ |
| 363 | + encoded_data = images.encode( |
| 364 | + frame, |
| 365 | + encode_options, |
| 366 | + ) |
| 367 | + mime_type = ( |
| 368 | + "image/jpeg" |
| 369 | + if encode_options.format == "JPEG" |
| 370 | + else "image/png" |
| 371 | + if encode_options.format == "PNG" |
| 372 | + else "image/jpeg" |
| 373 | + ) |
| 374 | + self._push_media_chunk(encoded_data, mime_type) |
| 375 | + |
| 376 | + def _push_audio(self, frame: rtc.AudioFrame) -> None: |
| 377 | + if self._opts.enable_user_audio_transcription: |
| 378 | + self._transcriber._push_audio(frame) |
| 379 | + |
| 380 | + self._push_media_chunk(frame.data.tobytes(), "audio/pcm") |
| 381 | + |
342 | 382 | def _queue_msg(self, msg: ClientEvents) -> None: |
343 | 383 | self._send_ch.send_nowait(msg) |
344 | 384 |
|
|
0 commit comments