Skip to content

Commit b92a506

Browse files
authored
Proposal: Add simple video input support for gemini live (#1536)
1 parent 60d0b1d commit b92a506

File tree

3 files changed

+103
-4
lines changed

3 files changed

+103
-4
lines changed

.changeset/dull-shrimps-reflect.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"livekit-plugins-google": minor
3+
---
4+
5+
Add simple video input support for gemini live

livekit-plugins/livekit-plugins-google/README.md

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,57 @@ To use the STT and TTS API, you'll need to enable the respective services for yo
1616

1717
- Cloud Speech-to-Text API
1818
- Cloud Text-to-Speech API
19+
20+
21+
## Gemini Multimodal Live
22+
23+
Gemini Multimodal Live can be used with the `MultimodalAgent` class. See examples/multimodal_agent/gemini_agent.py for an example.
24+
25+
### Live Video Input (experimental)
26+
27+
You can push video frames to your Gemini Multimodal Live session alongside the audio that the `MultimodalAgent` handles automatically. The basic approach is to subscribe to the video track, create a video stream, sample frames at a suitable frame rate, and push them into the RealtimeSession:
28+
29+
```python
30+
# Make sure you subscribe to audio and video tracks
31+
await ctx.connect(auto_subscribe=AutoSubscribe.SUBSCRIBE_ALL)
32+
33+
# Create your RealtimeModel and store a reference
34+
model = google.beta.realtime.RealtimeModel(
35+
# ...
36+
)
37+
38+
# Create your MultimodalAgent as usual
39+
agent = MultimodalAgent(
40+
model=model,
41+
# ...
42+
)
43+
44+
# Async method to process the video track and push frames to Gemini
45+
async def _process_video_track(self, track: Track):
46+
video_stream = VideoStream(track)
47+
last_frame_time = 0
48+
49+
async for event in video_stream:
50+
current_time = asyncio.get_event_loop().time()
51+
52+
# Sample at 1 FPS
53+
if current_time - last_frame_time < 1.0:
54+
continue
55+
56+
last_frame_time = current_time
57+
frame = event.frame
58+
59+
# Push the frame into the RealtimeSession
60+
model.sessions[0].push_video(frame)
61+
62+
await video_stream.aclose()
63+
64+
# Subscribe to new tracks and process them
65+
@ctx.room.on("track_subscribed")
66+
def _on_track_subscribed(track: Track, pub, participant):
67+
if track.kind == TrackKind.KIND_VIDEO:
68+
asyncio.create_task(self._process_video_track(track))
69+
```
70+
71+
72+

livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/realtime_api.py

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from livekit import rtc
1010
from livekit.agents import llm, utils
1111
from livekit.agents.llm.function_context import _create_ai_function_info
12+
from livekit.agents.utils import images
1213

1314
from google import genai
1415
from google.genai.types import (
@@ -331,14 +332,53 @@ def fnc_ctx(self) -> llm.FunctionContext | None:
331332
def fnc_ctx(self, value: llm.FunctionContext | None) -> None:
332333
self._fnc_ctx = value
333334

334-
def _push_audio(self, frame: rtc.AudioFrame) -> None:
335-
if self._opts.enable_user_audio_transcription:
336-
self._transcriber._push_audio(frame)
335+
def _push_media_chunk(self, data: bytes, mime_type: str) -> None:
337336
realtime_input = LiveClientRealtimeInput(
338-
media_chunks=[Blob(data=frame.data.tobytes(), mime_type="audio/pcm")],
337+
media_chunks=[Blob(data=data, mime_type=mime_type)],
339338
)
340339
self._queue_msg(realtime_input)
341340

341+
DEFAULT_ENCODE_OPTIONS = images.EncodeOptions(
342+
format="JPEG",
343+
quality=75,
344+
resize_options=images.ResizeOptions(
345+
width=1024, height=1024, strategy="scale_aspect_fit"
346+
),
347+
)
348+
349+
def push_video(
350+
self,
351+
frame: rtc.VideoFrame,
352+
encode_options: images.EncodeOptions = DEFAULT_ENCODE_OPTIONS,
353+
) -> None:
354+
"""Push a video frame to the Gemini Multimodal Live session.
355+
356+
Args:
357+
frame (rtc.VideoFrame): The video frame to push.
358+
encode_options (images.EncodeOptions, optional): The encode options for the video frame. Defaults to 1024x1024 JPEG.
359+
360+
Notes:
361+
- This will be sent immediately so you should use a sampling frame rate that makes sense for your application and Gemini's constraints. 1 FPS is a good starting point.
362+
"""
363+
encoded_data = images.encode(
364+
frame,
365+
encode_options,
366+
)
367+
mime_type = (
368+
"image/jpeg"
369+
if encode_options.format == "JPEG"
370+
else "image/png"
371+
if encode_options.format == "PNG"
372+
else "image/jpeg"
373+
)
374+
self._push_media_chunk(encoded_data, mime_type)
375+
376+
def _push_audio(self, frame: rtc.AudioFrame) -> None:
377+
if self._opts.enable_user_audio_transcription:
378+
self._transcriber._push_audio(frame)
379+
380+
self._push_media_chunk(frame.data.tobytes(), "audio/pcm")
381+
342382
def _queue_msg(self, msg: ClientEvents) -> None:
343383
self._send_ch.send_nowait(msg)
344384

0 commit comments

Comments
 (0)