Skip to content

[Firebase AI] Add Live API support #15181

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 13 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions FirebaseAI/Sources/FirebaseAI.swift
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,18 @@ public final class FirebaseAI: Sendable {
)
}

/// Creates a `LiveGenerativeModel` that shares this instance's Firebase info and API
/// configuration.
///
/// - Parameters:
///   - modelName: The model name; it is expanded to a full resource name with
///     `modelResourceName(modelName:)` before being handed to the model.
///   - generationConfig: Optional live generation settings forwarded to the model unchanged;
///     defaults to `nil`.
///   - requestOptions: Request options forwarded to the model unchanged; defaults to
///     `RequestOptions()`.
/// - Returns: The newly constructed `LiveGenerativeModel`.
public func liveModel(modelName: String,
                      generationConfig: LiveGenerationConfig? = nil,
                      requestOptions: RequestOptions = RequestOptions()) -> LiveGenerativeModel {
  let resourceName = modelResourceName(modelName: modelName)
  let model = LiveGenerativeModel(
    modelResourceName: resourceName,
    firebaseInfo: firebaseInfo,
    apiConfig: apiConfig,
    generationConfig: generationConfig,
    requestOptions: requestOptions
  )
  return model
}

/// Marker class that lets FirebaseAI register with the Objective-C based Firebase component
/// system so that FirebaseAI is included in the userAgent.
///
/// The class is intentionally empty and is exposed to the Objective-C runtime under the name
/// `FIRVertexAIComponent`.
@objc(FIRVertexAIComponent) class FirebaseVertexAIComponent: NSObject {}
Expand Down
35 changes: 35 additions & 0 deletions FirebaseAI/Sources/Types/Internal/Live/ActivityHandling.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
// Copyright 2025 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import Foundation

/// Describes how the model should react when user activity starts while it is responding.
@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
public struct ActivityHandling: EncodableProtoEnum, Hashable, Sendable {
  /// Raw string constants for each supported handling mode.
  enum Kind: String {
    case interrupts = "START_OF_ACTIVITY_INTERRUPTS"
    case noInterrupt = "NO_INTERRUPTION"
  }

  /// The start of user activity interrupts the model's response (also called "barge in").
  ///
  /// The model's in-progress response is cut off at the moment of the interruption. This is the
  /// default behavior.
  public static let interrupts = Self(kind: .interrupts)

  /// User activity does not interrupt the model's response.
  public static let noInterrupt = Self(kind: .noInterrupt)

  /// The raw string representation of this `ActivityHandling` value.
  public let rawValue: String
}
107 changes: 107 additions & 0 deletions FirebaseAI/Sources/Types/Internal/Live/AsyncWebSocket.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
// Copyright 2025 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import Foundation

/// Wraps a `URLSessionWebSocketTask` and exposes its incoming messages as an
/// `AsyncThrowingStream`.
///
/// The class is `@unchecked Sendable`: `continuationFinished` is only touched while holding
/// `continuationLock`, and `_isConnected` only while holding `isConnectedLock`.
final class AsyncWebSocket: NSObject, @unchecked Sendable, URLSessionWebSocketDelegate {
  private let webSocketTask: URLSessionWebSocketTask
  private let stream: AsyncThrowingStream<URLSessionWebSocketTask.Message, Error>
  private let continuation: AsyncThrowingStream<URLSessionWebSocketTask.Message, Error>.Continuation

  /// Whether `continuation` has already been finished. Guarded by `continuationLock`.
  private var continuationFinished = false
  private let continuationLock = NSLock()

  private var _isConnected = false
  private let isConnectedLock = NSLock()

  /// Whether `connect()` has been called and the socket has not since been disconnected or
  /// stopped receiving.
  private(set) var isConnected: Bool {
    get { isConnectedLock.withLock { _isConnected } }
    set { isConnectedLock.withLock { _isConnected = newValue } }
  }

  /// Creates the underlying web socket task (not yet resumed) and the message stream.
  ///
  /// - Parameters:
  ///   - urlSession: The session used to create the web socket task.
  ///   - urlRequest: The request describing the web socket endpoint.
  init(urlSession: URLSession = GenAIURLSession.default, urlRequest: URLRequest) {
    webSocketTask = urlSession.webSocketTask(with: urlRequest)
    (stream, continuation) = AsyncThrowingStream<URLSessionWebSocketTask.Message, Error>
      .makeStream()
  }

  deinit {
    webSocketTask.cancel(with: .goingAway, reason: nil)
  }

  /// Resumes the web socket task and starts forwarding received messages.
  ///
  /// - Returns: The stream of received messages. It finishes when the socket stops receiving,
  ///   and finishes by throwing if receiving fails while the socket is still connected.
  func connect() -> AsyncThrowingStream<URLSessionWebSocketTask.Message, Error> {
    webSocketTask.resume()
    isConnected = true
    startReceiving()
    return stream
  }

  /// Cancels the web socket task with `.goingAway` and finishes the message stream without an
  /// error.
  func disconnect() {
    // Clear the flag before cancelling so the receive loop can tell a failure caused by this
    // cancellation apart from a genuine connection error.
    isConnected = false
    webSocketTask.cancel(with: .goingAway, reason: nil)
    finishStream()
  }

  /// Sends `message` over the web socket.
  ///
  /// - Throws: Any error thrown by `URLSessionWebSocketTask.send(_:)`.
  func send(_ message: URLSessionWebSocketTask.Message) async throws {
    // TODO: Throw error if socket already closed
    try await webSocketTask.send(message)
  }

  /// Starts an unstructured task that receives messages until the socket closes, the socket is
  /// disconnected, or receiving fails, and then finishes the stream.
  private func startReceiving() {
    Task {
      do {
        while !Task.isCancelled, self.webSocketTask.isOpen, self.isConnected {
          let message = try await self.webSocketTask.receive()
          self.yieldIfActive(message)
        }
        // Nothing else will ever yield after the loop exits, so end the stream for consumers.
        self.finishStream()
      } catch {
        // Previously this error escaped into the discarded `Task` and the stream was never
        // finished, leaving consumers waiting forever. Surface it, unless `disconnect()` was
        // already requested, in which case the failure is just the cancellation taking effect.
        let failedWhileConnected = self.isConnected
        self.finishStream(throwing: failedWhileConnected ? error : nil)
      }
      self.isConnected = false
    }
  }

  /// Yields `message` to the stream unless the stream has already been finished.
  private func yieldIfActive(_ message: URLSessionWebSocketTask.Message) {
    continuationLock.withLock {
      guard !continuationFinished else { return }
      _ = continuation.yield(message)
    }
  }

  /// Finishes the stream at most once, optionally by throwing `error`.
  private func finishStream(throwing error: Error? = nil) {
    continuationLock.withLock {
      guard !continuationFinished else { return }
      continuationFinished = true
      continuation.finish(throwing: error)
    }
  }

  /// Finishes the message stream when the session reports that the web socket closed.
  ///
  /// NOTE(review): nothing in this class assigns it as the delegate of the task or session, so
  /// this is only invoked if the owner of the `URLSession` wires it up — confirm.
  func urlSession(_ session: URLSession,
                  webSocketTask: URLSessionWebSocketTask,
                  didCloseWith closeCode: URLSessionWebSocketTask.CloseCode,
                  reason: Data?) {
    finishStream()
  }
}

private extension URLSessionWebSocketTask {
  /// `true` while no close code has been recorded for the task, i.e. `closeCode` is still
  /// `.invalid`.
  var isOpen: Bool { closeCode == .invalid }
}

/// An error describing why a web socket was closed.
struct WebSocketClosedError: Error, Sendable, CustomNSError {
  /// The close code reported for the web socket.
  let closeCode: URLSessionWebSocketTask.CloseCode

  /// The close reason decoded as UTF-8, or `"Unknown reason."` when no reason was supplied or
  /// it could not be decoded.
  let closeReason: String

  /// Creates an error from a close code and the optional raw reason payload.
  ///
  /// - Parameters:
  ///   - closeCode: The close code reported for the web socket.
  ///   - closeReason: The raw reason bytes, interpreted as UTF-8 text when present.
  init(closeCode: URLSessionWebSocketTask.CloseCode, closeReason: Data?) {
    self.closeCode = closeCode
    if let payload = closeReason, let text = String(data: payload, encoding: .utf8) {
      self.closeReason = text
    } else {
      self.closeReason = "Unknown reason."
    }
  }

  /// The raw value of `closeCode`.
  var errorCode: Int {
    return closeCode.rawValue
  }

  /// User info carrying a localized description with the close code and reason.
  var errorUserInfo: [String: Any] {
    let description = "WebSocket closed with code \(closeCode.rawValue). Reason: \(closeReason)"
    return [NSLocalizedDescriptionKey: description]
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
// Copyright 2025 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import Foundation

/// An incremental update to the current conversation, sent by the client.
///
/// Everything in this message is unconditionally appended to the conversation history and
/// becomes part of the prompt used by the model to generate content.
///
/// Sending this message interrupts any model generation that is in progress.
@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
struct BidiGenerateContentClientContent: Encodable {
  /// The content to append to the current conversation with the model.
  ///
  /// A single-turn query carries one instance. A multi-turn query carries the conversation
  /// history followed by the latest request.
  let turns: [ModelContent]?

  /// When `true`, the server should start generating content from the prompt accumulated so
  /// far. Otherwise the server waits for further messages before starting generation.
  let turnComplete: Bool?
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
// Copyright 2025 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import Foundation

/// The messages a client can send in the BidiGenerateContent RPC call.
@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
enum BidiGenerateContentClientMessage {
  /// Session setup; sent as the first client message and only as the first.
  case setup(BidiGenerateContentSetup)

  /// An incremental update of the current conversation delivered from the client.
  case clientContent(BidiGenerateContentClientContent)

  /// User input that is sent in real time.
  case realtimeInput(BidiGenerateContentRealtimeInput)

  /// The response to a `ToolCallMessage` received from the server.
  case toolResponse(BidiGenerateContentToolResponse)
}

@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
extension BidiGenerateContentClientMessage: Encodable {
  /// One key per message case; the key names match the case names.
  enum CodingKeys: CodingKey {
    case setup, clientContent, realtimeInput, toolResponse
  }

  /// Encodes the message as a keyed container holding exactly one entry: the associated value
  /// of the current case, stored under the key named after that case.
  func encode(to encoder: any Encoder) throws {
    var keyed = encoder.container(keyedBy: CodingKeys.self)
    switch self {
    case .setup(let payload):
      try keyed.encode(payload, forKey: .setup)
    case .clientContent(let payload):
      try keyed.encode(payload, forKey: .clientContent)
    case .realtimeInput(let payload):
      try keyed.encode(payload, forKey: .realtimeInput)
    case .toolResponse(let payload):
      try keyed.encode(payload, forKey: .toolResponse)
    }
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
// Copyright 2025 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import Foundation

/// User input that is sent in real time.
///
/// This differs from `BidiGenerateContentClientContent` in a few ways:
///
/// - It can be sent continuously without interrupting model generation.
/// - If data is interleaved across `BidiGenerateContentClientContent` and
///   `BidiGenerateContentRealtimeInput`, the server attempts to optimize for the best
///   response, but there are no guarantees.
/// - The end of a turn is not specified explicitly; it is derived from user activity (for
///   example, end of speech).
/// - Even before the end of a turn, the data is processed incrementally to optimize for a
///   fast start of the model's response.
/// - It is always assumed to be the user's input (it cannot be used to populate conversation
///   history).
@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
struct BidiGenerateContentRealtimeInput: Encodable {
  /// Empty marker message signalling the start of user activity.
  struct ActivityStart: Encodable {}

  /// Empty marker message signalling the end of user activity.
  struct ActivityEnd: Encodable {}

  /// The realtime audio input stream.
  let audio: InlineData?

  /// Indicates that the audio stream has ended, e.g. because the microphone was turned off.
  ///
  /// This should only be sent when automatic activity detection is enabled (which is the
  /// default).
  ///
  /// The client can reopen the stream by sending an audio message.
  let audioStreamEnd: Bool?

  /// The realtime video input stream.
  ///
  /// NOTE(review): `audio` is an `InlineData` while this is a raw `Data` — confirm that the
  /// backend accepts this encoding for video input.
  let video: Data?

  /// The realtime text input stream.
  let text: String?

  /// Marks the start of user activity. This can only be sent if automatic (i.e. server-side)
  /// activity detection is disabled.
  let activityStart: ActivityStart?

  /// Marks the end of user activity. This can only be sent if automatic (i.e. server-side)
  /// activity detection is disabled.
  let activityEnd: ActivityEnd?
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
// Copyright 2025 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import Foundation

/// An incremental server update generated by the model in response to client messages.
///
/// Content is generated as quickly as possible, not in realtime. Clients may choose to buffer
/// it and play it out in realtime.
@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
struct BidiGenerateContentServerContent: Decodable {
  /// The content that the model has generated as part of the current conversation with the
  /// user.
  let modelTurn: ModelContent?

  /// When `true`, the model is done generating and will only start again in response to
  /// additional client messages. Can be set alongside `modelTurn`, indicating that this
  /// content is the last in the turn.
  let turnComplete: Bool?

  /// When `true`, a client message has interrupted the current model generation. If the client
  /// is playing out the content in realtime, this is a good signal to stop and empty the
  /// current playback queue.
  let interrupted: Bool?

  /// When `true`, the model is done generating.
  ///
  /// If the model is interrupted while generating, the interrupted turn contains no
  /// 'generation_complete' message; it goes through 'interrupted > turn_complete' instead.
  ///
  /// When the model assumes realtime playback, there is a delay between generation_complete
  /// and turn_complete, caused by the model waiting for playback to finish.
  let generationComplete: Bool?

  /// Metadata specifying the sources used to ground the generated content.
  let groundingMetadata: GroundingMetadata?
}
Loading
Loading