Commit be30d46

feat(ai): Add support for sending videos via Live API. (#15432)
1 parent ea3f129 commit be30d46

7 files changed: +150 −6 lines changed

FirebaseAI/CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -2,6 +2,9 @@
 - [fixed] Fixed various links in the Live API doc comments not mapping correctly.
 - [fixed] Fixed minor translation issue for nanosecond conversion when receiving
   `LiveServerGoingAwayNotice`. (#15410)
+- [feature] Added support for sending video frames with the Live API via the `sendVideoRealtime`
+  method on [`LiveSession`](https://firebase.google.com/docs/reference/swift/firebaseai/api/reference/Classes/LiveSession).
+  (#15432)

 # 12.4.0
 - [feature] Added support for the URL context tool, which allows the model to access content

FirebaseAI/Sources/Types/Public/Live/LiveSession.swift

Lines changed: 15 additions & 6 deletions
@@ -67,15 +67,24 @@ public final class LiveSession: Sendable {
     await service.send(.realtimeInput(message))
   }

-  /// Sends a video input stream to the model, using the realtime API.
+  /// Sends a video frame to the model, using the realtime API.
+  ///
+  /// Instead of raw video data, the model expects individual frames of the video,
+  /// sent as images.
+  ///
+  /// If your video has audio, send it separately through ``LiveSession/sendAudioRealtime(_:)``.
+  ///
+  /// For better performance, frames can also be sent at a lower rate than the video;
+  /// even as low as 1 frame per second.
   ///
   /// - Parameters:
-  ///   - video: Encoded video data, used to update the model on the client's conversation.
-  ///   - format: The format that the video was encoded in (eg; `mp4`, `webm`, `wmv`, etc.,).
-  // TODO: (b/448671945) Make public after testing and next release
-  func sendVideoRealtime(_ video: Data, format: String) async {
+  ///   - video: Encoded image data extracted from a frame of the video, used to update the model on
+  ///     the client's conversation.
+  ///   - mimeType: The IANA standard MIME type of the video frame data (e.g., `image/png`,
+  ///     `image/jpeg`, etc.).
+  public func sendVideoRealtime(_ video: Data, mimeType: String) async {
     let message = BidiGenerateContentRealtimeInput(
-      video: InlineData(data: video, mimeType: "video/\(format)")
+      video: InlineData(data: video, mimeType: mimeType)
     )
     await service.send(.realtimeInput(message))
   }
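
Taken together, the new doc comments imply a simple client loop. The following is a minimal sketch (not part of the commit) assuming a connected `LiveSession` and frames already extracted as PNG data:

```swift
import FirebaseAI
import Foundation

// Sketch: stream a video to the model over an active Live session.
// `session` comes from `liveModel.connect()`; `frames` are PNG-encoded
// stills sampled from the video (1 FPS is enough, per the doc comment).
func stream(frames: [Data], audioTrack: Data?, over session: LiveSession) async {
  for frame in frames {
    await session.sendVideoRealtime(frame, mimeType: "image/png")
  }
  // The audio track is not part of the frames; it goes through the
  // separate audio realtime channel.
  if let audioTrack {
    await session.sendAudioRealtime(audioTrack)
  }
}
```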

FirebaseAI/Tests/TestApp/FirebaseAITestApp.xcodeproj/project.pbxproj

Lines changed: 4 additions & 0 deletions
@@ -7,6 +7,7 @@
 	objects = {

 /* Begin PBXBuildFile section */
+		0E0481222EA2E51300A50172 /* DataUtils.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0E0481212EA2E51100A50172 /* DataUtils.swift */; };
 		0E460FAB2E9858E4007E26A6 /* LiveSessionTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0E460FAA2E9858E4007E26A6 /* LiveSessionTests.swift */; };
 		0EC8BAE22E98784E0075A4E0 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 868A7C532CCC26B500E449DD /* Assets.xcassets */; };
 		862218812D04E098007ED2D4 /* IntegrationTestUtils.swift in Sources */ = {isa = PBXBuildFile; fileRef = 862218802D04E08D007ED2D4 /* IntegrationTestUtils.swift */; };
@@ -44,6 +45,7 @@
 /* End PBXContainerItemProxy section */

 /* Begin PBXFileReference section */
+		0E0481212EA2E51100A50172 /* DataUtils.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DataUtils.swift; sourceTree = "<group>"; };
 		0E460FAA2E9858E4007E26A6 /* LiveSessionTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LiveSessionTests.swift; sourceTree = "<group>"; };
 		862218802D04E08D007ED2D4 /* IntegrationTestUtils.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = IntegrationTestUtils.swift; sourceTree = "<group>"; };
 		864F8F702D4980D60002EA7E /* ImagenIntegrationTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ImagenIntegrationTests.swift; sourceTree = "<group>"; };
@@ -168,6 +170,7 @@
 		8698D7442CD3CEF700ABA833 /* Utilities */ = {
 			isa = PBXGroup;
 			children = (
+				0E0481212EA2E51100A50172 /* DataUtils.swift */,
 				86D77E032D7B6C95003D155D /* InstanceConfig.swift */,
 				862218802D04E08D007ED2D4 /* IntegrationTestUtils.swift */,
 			);
@@ -304,6 +307,7 @@
 				DEF0BB512DA9B7450093E9F4 /* SchemaTests.swift in Sources */,
 				DEF0BB4F2DA74F680093E9F4 /* TestHelpers.swift in Sources */,
 				868A7C4F2CCC229F00E449DD /* Credentials.swift in Sources */,
+				0E0481222EA2E51300A50172 /* DataUtils.swift in Sources */,
 				864F8F712D4980DD0002EA7E /* ImagenIntegrationTests.swift in Sources */,
 				862218812D04E098007ED2D4 /* IntegrationTestUtils.swift in Sources */,
 				86D77DFC2D7A5340003D155D /* GenerateContentIntegrationTests.swift in Sources */,

Assets.xcassets — new data set (videoplayback.mp4)

Lines changed: 13 additions & 0 deletions

@@ -0,0 +1,13 @@
+{
+  "data" : [
+    {
+      "filename" : "videoplayback.mp4",
+      "idiom" : "universal",
+      "universal-type-identifier" : "public.mpeg-4"
+    }
+  ],
+  "info" : {
+    "author" : "xcode",
+    "version" : 1
+  }
+}
Binary file not shown.

FirebaseAI/Tests/TestApp/Tests/Integration/LiveSessionTests.swift

Lines changed: 51 additions & 0 deletions
@@ -76,6 +76,14 @@ struct LiveSessionTests {
       role: "system",
       parts: "When you receive a message, if the message is a single word, assume it's the first name of a person, and call the getLastName tool to get the last name of said person. Only respond with the last name."
     )
+
+    static let animalInVideo = ModelContent(
+      role: "system",
+      parts: """
+      Send a one word response of what ANIMAL is in the video. \
+      If you don't receive a video, send "Test is broken, I didn't receive a video.".
+      """.trimmingCharacters(in: .whitespacesAndNewlines)
+    )
   }

   @Test(arguments: arguments)
@@ -181,6 +189,49 @@
     #expect(modelResponse == "goodbye")
   }

+  @Test(arguments: arguments.filter { $0.1 != ModelNames.gemini2FlashLive })
+  // gemini-2.0-flash-live-001 is buggy and likes to respond to the audio or system instruction
+  // (e.g., it will say 'okay' or 'hello' instead of following the instructions)
+  func sendVideoRealtime_receiveText(_ config: InstanceConfig, modelName: String) async throws {
+    let model = FirebaseAI.componentInstance(config).liveModel(
+      modelName: modelName,
+      generationConfig: textConfig,
+      systemInstruction: SystemInstructions.animalInVideo
+    )
+
+    let session = try await model.connect()
+    guard let videoFile = NSDataAsset(name: "cat") else {
+      Issue.record("Missing video file 'cat' in Assets")
+      return
+    }
+
+    let frames = try await videoFile.videoFrames()
+    for frame in frames {
+      await session.sendVideoRealtime(frame, mimeType: "image/png")
+    }
+
+    // The model doesn't respond unless we send some audio too.
+    // Vertex also responds if you send text, but Google AI doesn't
+    // (they both respond with audio, though).
+    guard let audioFile = NSDataAsset(name: "hello") else {
+      Issue.record("Missing audio file 'hello.wav' in Assets")
+      return
+    }
+    await session.sendAudioRealtime(audioFile.data)
+    await session.sendAudioRealtime(Data(repeating: 0, count: audioFile.data.count))
+
+    let text = try await session.collectNextTextResponse()
+
+    await session.close()
+    let modelResponse = text
+      .trimmingCharacters(in: .whitespacesAndNewlines)
+      .trimmingCharacters(in: .punctuationCharacters)
+      .lowercased()
+
+    // Model responses vary.
+    #expect(["kitten", "cat", "kitty"].contains(modelResponse))
+  }
+
   @Test(arguments: arguments)
   func realtime_functionCalling(_ config: InstanceConfig, modelName: String) async throws {
     let model = FirebaseAI.componentInstance(config).liveModel(

DataUtils.swift (new file)

Lines changed: 64 additions & 0 deletions

@@ -0,0 +1,64 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+import AVFoundation
+import SwiftUI
+
+extension NSDataAsset {
+  /// The preferred file extension for this asset, if any.
+  ///
+  /// This is set in the Asset catalog under the `File Type` field.
+  var fileExtension: String? {
+    UTType(typeIdentifier)?.preferredFilenameExtension
+  }
+
+  /// Extracts `.png` frames from a video at a rate of 1 FPS.
+  ///
+  /// - Returns:
+  ///   An array of `Data` corresponding to individual images for each frame.
+  func videoFrames() async throws -> [Data] {
+    guard let fileExtension else {
+      fatalError(
+        "Failed to find file extension; ensure the \"File Type\" is set in the asset catalog."
+      )
+    }
+
+    // We need a temp file so we can provide a URL to AVURLAsset.
+    let tempFileURL = URL(fileURLWithPath: NSTemporaryDirectory())
+      .appendingPathComponent(UUID().uuidString, isDirectory: false)
+      .appendingPathExtension(fileExtension)
+
+    try data.write(to: tempFileURL)
+
+    defer {
+      try? FileManager.default.removeItem(at: tempFileURL)
+    }
+
+    let asset = AVURLAsset(url: tempFileURL)
+    let generator = AVAssetImageGenerator(asset: asset)
+
+    let duration = try await asset.load(.duration).seconds
+    return try stride(from: 0, to: duration, by: 1).map { seconds in
+      let time = CMTime(seconds: seconds, preferredTimescale: 1)
+      let cg = try generator.copyCGImage(at: time, actualTime: nil)
+
+      let image = UIImage(cgImage: cg)
+      guard let png = image.pngData() else {
+        fatalError("Failed to encode image to png")
+      }
+
+      return png
+    }
+  }
+}
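
A design note on the helper above: `AVURLAsset` only accepts file URLs, hence the temporary file, and `copyCGImage(at:actualTime:)` is synchronous (and deprecated on recent OS versions in favor of an async API). A hedged sketch (not part of the commit) of the same 1 FPS sampling loop using the async `AVAssetImageGenerator.image(at:)`, assuming an iOS 16+ deployment target:

```swift
import AVFoundation
import UIKit

// Sketch only: sample one PNG frame per `step` seconds with the async
// image(at:) API (iOS 16+/macOS 13+) instead of copyCGImage(at:actualTime:).
@available(iOS 16.0, *)
func pngFrames(from asset: AVURLAsset, step: Double = 1) async throws -> [Data] {
  let generator = AVAssetImageGenerator(asset: asset)
  let duration = try await asset.load(.duration).seconds
  var frames: [Data] = []
  for seconds in stride(from: 0, to: duration, by: step) {
    let time = CMTime(seconds: seconds, preferredTimescale: 1)
    // image(at:) returns the generated CGImage plus the actual sample time.
    let cgImage = try await generator.image(at: time).image
    if let png = UIImage(cgImage: cgImage).pngData() {
      frames.append(png)
    }
  }
  return frames
}
```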
