Skip to content

Commit b92a506

Browse files
authored
Proposal: Add simple video input support for gemini live (#1536)
1 parent 60d0b1d commit b92a506

File tree

3 files changed

+103
-4
lines changed

3 files changed

+103
-4
lines changed

.changeset/dull-shrimps-reflect.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"livekit-plugins-google": minor
3+
---
4+
5+
Add simple video input support for gemini live

livekit-plugins/livekit-plugins-google/README.md

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,57 @@ To use the STT and TTS API, you'll need to enable the respective services for yo
1616

1717
- Cloud Speech-to-Text API
1818
- Cloud Text-to-Speech API
19+
20+
21+
## Gemini Multimodal Live
22+
23+
Gemini Multimodal Live can be used with the `MultimodalAgent` class. See examples/multimodal_agent/gemini_agent.py for an example.
24+
25+
### Live Video Input (experimental)
26+
27+
You can push video frames to your Gemini Multimodal Live session alongside the audio that the `MultimodalAgent` handles automatically. The basic approach is to subscribe to the video track, create a video stream, sample frames at a suitable frame rate, and push them into the RealtimeSession:
28+
29+
```python
30+
# Make sure you subscribe to audio and video tracks
31+
await ctx.connect(auto_subscribe=AutoSubscribe.SUBSCRIBE_ALL)
32+
33+
# Create your RealtimeModel and store a reference
34+
model = google.beta.realtime.RealtimeModel(
35+
# ...
36+
)
37+
38+
# Create your MultimodalAgent as usual
39+
agent = MultimodalAgent(
40+
model=model,
41+
# ...
42+
)
43+
44+
# Async method to process the video track and push frames to Gemini
45+
async def _process_video_track(self, track: Track):
46+
video_stream = VideoStream(track)
47+
last_frame_time = 0
48+
49+
async for event in video_stream:
50+
current_time = asyncio.get_event_loop().time()
51+
52+
# Sample at 1 FPS
53+
if current_time - last_frame_time < 1.0:
54+
continue
55+
56+
last_frame_time = current_time
57+
frame = event.frame
58+
59+
# Push the frame into the RealtimeSession
60+
model.sessions[0].push_video(frame)
61+
62+
await video_stream.aclose()
63+
64+
# Subscribe to new tracks and process them
65+
@ctx.room.on("track_subscribed")
66+
def _on_track_subscribed(track: Track, pub, participant):
67+
if track.kind == TrackKind.KIND_VIDEO:
68+
asyncio.create_task(self._process_video_track(track))
69+
```
70+
71+
72+

livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/realtime_api.py

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from livekit import rtc
1010
from livekit.agents import llm, utils
1111
from livekit.agents.llm.function_context import _create_ai_function_info
12+
from livekit.agents.utils import images
1213

1314
from google import genai
1415
from google.genai.types import (
@@ -331,14 +332,53 @@ def fnc_ctx(self) -> llm.FunctionContext | None:
331332
def fnc_ctx(self, value: llm.FunctionContext | None) -> None:
332333
self._fnc_ctx = value
333334

334-
def _push_audio(self, frame: rtc.AudioFrame) -> None:
335-
if self._opts.enable_user_audio_transcription:
336-
self._transcriber._push_audio(frame)
335+
def _push_media_chunk(self, data: bytes, mime_type: str) -> None:
337336
realtime_input = LiveClientRealtimeInput(
338-
media_chunks=[Blob(data=frame.data.tobytes(), mime_type="audio/pcm")],
337+
media_chunks=[Blob(data=data, mime_type=mime_type)],
339338
)
340339
self._queue_msg(realtime_input)
341340

341+
DEFAULT_ENCODE_OPTIONS = images.EncodeOptions(
342+
format="JPEG",
343+
quality=75,
344+
resize_options=images.ResizeOptions(
345+
width=1024, height=1024, strategy="scale_aspect_fit"
346+
),
347+
)
348+
349+
def push_video(
350+
self,
351+
frame: rtc.VideoFrame,
352+
encode_options: images.EncodeOptions = DEFAULT_ENCODE_OPTIONS,
353+
) -> None:
354+
"""Push a video frame to the Gemini Multimodal Live session.
355+
356+
Args:
357+
frame (rtc.VideoFrame): The video frame to push.
358+
encode_options (images.EncodeOptions, optional): The encode options for the video frame. Defaults to 1024x1024 JPEG.
359+
360+
Notes:
361+
- This will be sent immediately so you should use a sampling frame rate that makes sense for your application and Gemini's constraints. 1 FPS is a good starting point.
362+
"""
363+
encoded_data = images.encode(
364+
frame,
365+
encode_options,
366+
)
367+
mime_type = (
368+
"image/jpeg"
369+
if encode_options.format == "JPEG"
370+
else "image/png"
371+
if encode_options.format == "PNG"
372+
else "image/jpeg"
373+
)
374+
self._push_media_chunk(encoded_data, mime_type)
375+
376+
def _push_audio(self, frame: rtc.AudioFrame) -> None:
377+
if self._opts.enable_user_audio_transcription:
378+
self._transcriber._push_audio(frame)
379+
380+
self._push_media_chunk(frame.data.tobytes(), "audio/pcm")
381+
342382
def _queue_msg(self, msg: ClientEvents) -> None:
343383
self._send_ch.send_nowait(msg)
344384

0 commit comments

Comments
 (0)