diff --git a/FirebaseAI/Sources/FirebaseAI.swift b/FirebaseAI/Sources/FirebaseAI.swift index 48f7183d4e6..6730bada5b6 100644 --- a/FirebaseAI/Sources/FirebaseAI.swift +++ b/FirebaseAI/Sources/FirebaseAI.swift @@ -130,6 +130,18 @@ public final class FirebaseAI: Sendable { ) } + public func liveModel(modelName: String, + generationConfig: LiveGenerationConfig? = nil, + requestOptions: RequestOptions = RequestOptions()) -> LiveGenerativeModel { + return LiveGenerativeModel( + modelResourceName: modelResourceName(modelName: modelName), + firebaseInfo: firebaseInfo, + apiConfig: apiConfig, + generationConfig: generationConfig, + requestOptions: requestOptions + ) + } + /// Class to enable FirebaseAI to register via the Objective-C based Firebase component system /// to include FirebaseAI in the userAgent. @objc(FIRVertexAIComponent) class FirebaseVertexAIComponent: NSObject {} diff --git a/FirebaseAI/Sources/Types/Internal/Live/ActivityHandling.swift b/FirebaseAI/Sources/Types/Internal/Live/ActivityHandling.swift new file mode 100644 index 00000000000..26867a98925 --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/Live/ActivityHandling.swift @@ -0,0 +1,35 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// The different ways of handling user activity. 
+@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *) +public struct ActivityHandling: EncodableProtoEnum, Hashable, Sendable { + enum Kind: String { + case interrupts = "START_OF_ACTIVITY_INTERRUPTS" + case noInterrupt = "NO_INTERRUPTION" + } + + /// If true, start of activity will interrupt the model's response (also + /// called "barge in"). The model's current response will be cut-off in the + /// moment of the interruption. This is the default behavior. + public static let interrupts = ActivityHandling(kind: .interrupts) + + /// The model's response will not be interrupted. + public static let noInterrupt = ActivityHandling(kind: .noInterrupt) + + /// Returns the raw string representation of the `ActivityHandling` value. + public let rawValue: String +} diff --git a/FirebaseAI/Sources/Types/Internal/Live/AsyncWebSocket.swift b/FirebaseAI/Sources/Types/Internal/Live/AsyncWebSocket.swift new file mode 100644 index 00000000000..6a1da33241f --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/Live/AsyncWebSocket.swift @@ -0,0 +1,107 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +import Foundation + +final class AsyncWebSocket: NSObject, @unchecked Sendable, URLSessionWebSocketDelegate { + private let webSocketTask: URLSessionWebSocketTask + private let stream: AsyncThrowingStream + private let continuation: AsyncThrowingStream.Continuation + private var continuationFinished = false + private let continuationLock = NSLock() + + private var _isConnected = false + private let isConnectedLock = NSLock() + private(set) var isConnected: Bool { + get { isConnectedLock.withLock { _isConnected } } + set { isConnectedLock.withLock { _isConnected = newValue } } + } + + init(urlSession: URLSession = GenAIURLSession.default, urlRequest: URLRequest) { + webSocketTask = urlSession.webSocketTask(with: urlRequest) + (stream, continuation) = AsyncThrowingStream + .makeStream() + } + + deinit { + webSocketTask.cancel(with: .goingAway, reason: nil) + } + + func connect() -> AsyncThrowingStream { + webSocketTask.resume() + isConnected = true + startReceiving() + return stream + } + + func disconnect() { + webSocketTask.cancel(with: .goingAway, reason: nil) + isConnected = false + continuationLock.withLock { + self.continuation.finish() + self.continuationFinished = true + } + } + + func send(_ message: URLSessionWebSocketTask.Message) async throws { + // TODO: Throw error if socket already closed + try await webSocketTask.send(message) + } + + private func startReceiving() { + Task { + while !Task.isCancelled && self.webSocketTask.isOpen && self.isConnected { + let message = try await webSocketTask.receive() + // TODO: Check continuationFinished before yielding. Use the same thread for NSLock. + continuation.yield(message) + } + } + } + + func urlSession(_ session: URLSession, + webSocketTask: URLSessionWebSocketTask, + didCloseWith closeCode: URLSessionWebSocketTask.CloseCode, + reason: Data?) 
{ + continuationLock.withLock { + guard !continuationFinished else { return } + continuation.finish() + continuationFinished = true + } + } +} + +private extension URLSessionWebSocketTask { + var isOpen: Bool { + return closeCode == .invalid + } +} + +struct WebSocketClosedError: Error, Sendable, CustomNSError { + let closeCode: URLSessionWebSocketTask.CloseCode + let closeReason: String + + init(closeCode: URLSessionWebSocketTask.CloseCode, closeReason: Data?) { + self.closeCode = closeCode + self.closeReason = closeReason + .flatMap { String(data: $0, encoding: .utf8) } ?? "Unknown reason." + } + + var errorCode: Int { closeCode.rawValue } + + var errorUserInfo: [String: Any] { + [ + NSLocalizedDescriptionKey: "WebSocket closed with code \(closeCode.rawValue). Reason: \(closeReason)", + ] + } +} diff --git a/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentClientContent.swift b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentClientContent.swift new file mode 100644 index 00000000000..a24944d83fd --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentClientContent.swift @@ -0,0 +1,35 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// Incremental update of the current conversation delivered from the client. +/// All the content here is unconditionally appended to the conversation +/// history and used as part of the prompt to the model to generate content. 
+/// +/// A message here will interrupt any current model generation. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *) +struct BidiGenerateContentClientContent: Encodable { + /// The content appended to the current conversation with the model. + /// + /// For single-turn queries, this is a single instance. For multi-turn + /// queries, this is a repeated field that contains conversation history and + /// latest request. + let turns: [ModelContent]? + + /// If true, indicates that the server content generation should start with + /// the currently accumulated prompt. Otherwise, the server will await + /// additional messages before starting generation. + let turnComplete: Bool? +} diff --git a/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentClientMessage.swift b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentClientMessage.swift new file mode 100644 index 00000000000..d4e47982af1 --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentClientMessage.swift @@ -0,0 +1,55 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// Messages sent by the client in the BidiGenerateContent RPC call. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *) +enum BidiGenerateContentClientMessage { + /// Message to be sent in the first and only first client message. 
+ case setup(BidiGenerateContentSetup) + + /// Incremental update of the current conversation delivered from the client. + case clientContent(BidiGenerateContentClientContent) + + /// User input that is sent in real time. + case realtimeInput(BidiGenerateContentRealtimeInput) + + /// Response to a `ToolCallMessage` received from the server. + case toolResponse(BidiGenerateContentToolResponse) +} + +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *) +extension BidiGenerateContentClientMessage: Encodable { + enum CodingKeys: CodingKey { + case setup + case clientContent + case realtimeInput + case toolResponse + } + + func encode(to encoder: any Encoder) throws { + var container = encoder.container(keyedBy: CodingKeys.self) + switch self { + case let .setup(setup): + try container.encode(setup, forKey: .setup) + case let .clientContent(clientContent): + try container.encode(clientContent, forKey: .clientContent) + case let .realtimeInput(realtimeInput): + try container.encode(realtimeInput, forKey: .realtimeInput) + case let .toolResponse(toolResponse): + try container.encode(toolResponse, forKey: .toolResponse) + } + } +} diff --git a/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentRealtimeInput.swift b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentRealtimeInput.swift new file mode 100644 index 00000000000..3849b10c561 --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentRealtimeInput.swift @@ -0,0 +1,64 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// User input that is sent in real time. +/// +/// This is different from `ClientContentUpdate` in a few ways: +/// +/// - Can be sent continuously without interruption to model generation. +/// - If there is a need to mix data interleaved across the +/// `ClientContentUpdate` and the `RealtimeUpdate`, server attempts to +/// optimize for best response, but there are no guarantees. +/// - End of turn is not explicitly specified, but is rather derived from user +/// activity (for example, end of speech). +/// - Even before the end of turn, the data is processed incrementally +/// to optimize for a fast start of the response from the model. +/// - Is always assumed to be the user's input (cannot be used to populate +/// conversation history). +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *) +struct BidiGenerateContentRealtimeInput: Encodable { + /// These form the realtime audio input stream. + let audio: InlineData? + + /// Indicates that the audio stream has ended, e.g. because the microphone was + /// turned off. + /// + /// This should only be sent when automatic activity detection is enabled + /// (which is the default). + /// + /// The client can reopen the stream by sending an audio message. + let audioStreamEnd: Bool? + + /// These form the realtime video input stream. + let video: Data? + + /// These form the realtime text input stream. + let text: String? + + /// Marks the start of user activity. + struct ActivityStart: Encodable {} + + /// Marks the start of user activity. This can only be sent if automatic + /// (i.e. server-side) activity detection is disabled. + let activityStart: ActivityStart? + + /// Marks the end of user activity. + struct ActivityEnd: Encodable {} + + /// Marks the end of user activity. This can only be sent if automatic (i.e. 
+ /// server-side) activity detection is disabled. + let activityEnd: ActivityEnd? +} diff --git a/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentServerContent.swift b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentServerContent.swift new file mode 100644 index 00000000000..8d9d1e8940b --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentServerContent.swift @@ -0,0 +1,53 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// Incremental server update generated by the model in response to client +/// messages. +/// +/// Content is generated as quickly as possible, and not in realtime. Clients +/// may choose to buffer and play it out in realtime. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *) +struct BidiGenerateContentServerContent: Decodable { + /// The content that the model has generated as part of the current + /// conversation with the user. + let modelTurn: ModelContent? + + /// If true, indicates that the model is done generating. Generation will only + /// start in response to additional client messages. Can be set alongside + /// `content`, indicating that the `content` is the last in the turn. + let turnComplete: Bool? + + /// If true, indicates that a client message has interrupted current model + /// generation. 
If the client is playing out the content in realtime, this is a + /// good signal to stop and + /// empty the current + /// playback queue. + let interrupted: Bool? + + /// If true, indicates that the model is done generating. + /// + /// When model is interrupted while generating there will be no + /// 'generation_complete' message in interrupted turn, it will go through + /// 'interrupted > turn_complete'. + /// + /// When model assumes realtime playback there will be delay between + /// generation_complete and turn_complete that is caused by model waiting for + /// playback to finish. + let generationComplete: Bool? + + /// Metadata specifies sources used to ground generated content. + let groundingMetadata: GroundingMetadata? +} diff --git a/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentServerMessage.swift b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentServerMessage.swift new file mode 100644 index 00000000000..950819e0343 --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentServerMessage.swift @@ -0,0 +1,101 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// Response message for BidiGenerateContent RPC call. 
+@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *) +public struct BidiGenerateContentServerMessage: Sendable { + // TODO: Make this type `internal` + + /// The type of the message. + enum MessageType { + /// Sent in response to a `BidiGenerateContentSetup` message from the client. + case setupComplete(BidiGenerateContentSetupComplete) + + /// Content generated by the model in response to client messages. + case serverContent(BidiGenerateContentServerContent) + + /// Request for the client to execute the `function_calls` and return the + /// responses with the matching `id`s. + case toolCall(BidiGenerateContentToolCall) + + /// Notification for the client that a previously issued + /// `ToolCallMessage` with the specified `id`s should not have been executed + /// and should be cancelled. + case toolCallCancellation(BidiGenerateContentToolCallCancellation) + + /// Server will disconnect soon. + case goAway(GoAway) + } + + /// The message type. + let messageType: MessageType + + /// Usage metadata about the response(s). + let usageMetadata: GenerateContentResponse.UsageMetadata? 
+} + +// MARK: - Decodable + +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *) +extension BidiGenerateContentServerMessage: Decodable { + enum CodingKeys: String, CodingKey { + case setupComplete + case serverContent + case toolCall + case toolCallCancellation + case goAway + case usageMetadata + } + + public init(from decoder: any Decoder) throws { + let container = try decoder.container(keyedBy: CodingKeys.self) + + if let setupComplete = try container.decodeIfPresent( + BidiGenerateContentSetupComplete.self, + forKey: .setupComplete + ) { + messageType = .setupComplete(setupComplete) + } else if let serverContent = try container.decodeIfPresent( + BidiGenerateContentServerContent.self, + forKey: .serverContent + ) { + messageType = .serverContent(serverContent) + } else if let toolCall = try container.decodeIfPresent( + BidiGenerateContentToolCall.self, + forKey: .toolCall + ) { + messageType = .toolCall(toolCall) + } else if let toolCallCancellation = try container.decodeIfPresent( + BidiGenerateContentToolCallCancellation.self, + forKey: .toolCallCancellation + ) { + messageType = .toolCallCancellation(toolCallCancellation) + } else if let goAway = try container.decodeIfPresent(GoAway.self, forKey: .goAway) { + messageType = .goAway(goAway) + } else { + let context = DecodingError.Context( + codingPath: decoder.codingPath, + debugDescription: "Could not decode server message." 
+ ) + throw DecodingError.dataCorrupted(context) + } + + usageMetadata = try container.decodeIfPresent( + GenerateContentResponse.UsageMetadata.self, + forKey: .usageMetadata + ) + } +} diff --git a/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentSetup.swift b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentSetup.swift new file mode 100644 index 00000000000..5541b7c107a --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentSetup.swift @@ -0,0 +1,60 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// Message to be sent in the first and only first +/// `BidiGenerateContentClientMessage`. Contains configuration that will apply +/// for the duration of the streaming RPC. +/// +/// Clients should wait for a `BidiGenerateContentSetupComplete` message before +/// sending any additional messages. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *) +struct BidiGenerateContentSetup: Encodable { + /// The fully qualified name of the publisher model. + /// + /// Publisher model format: + /// `projects/{project}/locations/{location}/publishers/*/models/*` + let model: String + + /// Generation config. + let generationConfig: LiveGenerationConfig? + + /// The user provided system instructions for the model. + /// Note: only text should be used in parts and content in each part will be + /// in a separate paragraph. 
+ let systemInstruction: ModelContent? + + /// A list of `Tools` the model may use to generate the next response. + /// + /// A `Tool` is a piece of code that enables the system to interact with + /// external systems to perform an action, or set of actions, outside of + /// knowledge and scope of the model. + let tools: [Tool]? + + /// Configures the handling of realtime input. + let realtimeInputConfig: RealtimeInputConfig? + + init(model: String, + generationConfig: LiveGenerationConfig? = nil, + systemInstruction: ModelContent? = nil, + tools: [Tool]? = nil, + realtimeInputConfig: RealtimeInputConfig? = nil) { + self.model = model + self.generationConfig = generationConfig + self.systemInstruction = systemInstruction + self.tools = tools + self.realtimeInputConfig = realtimeInputConfig + } +} diff --git a/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentSetupComplete.swift b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentSetupComplete.swift new file mode 100644 index 00000000000..cbf1dc6d960 --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentSetupComplete.swift @@ -0,0 +1,19 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// Sent in response to a `BidiGenerateContentSetup` message from the client. 
+@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *) +struct BidiGenerateContentSetupComplete: Decodable {} diff --git a/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentToolCall.swift b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentToolCall.swift new file mode 100644 index 00000000000..86ded221fc3 --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentToolCall.swift @@ -0,0 +1,23 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// Request for the client to execute the `function_calls` and return the +/// responses with the matching `id`s. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *) +struct BidiGenerateContentToolCall: Decodable { + /// The function call to be executed. + let functionCalls: [FunctionCall]? +} diff --git a/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentToolCallCancellation.swift b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentToolCallCancellation.swift new file mode 100644 index 00000000000..096e8a1a11e --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentToolCallCancellation.swift @@ -0,0 +1,26 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// Notification for the client that a previously issued `ToolCallMessage` +/// with the specified `id`s should not have been executed and should be +/// cancelled. If there were side-effects to those tool calls, clients may +/// attempt to undo the tool calls. This message occurs only in cases where the +/// clients interrupt server turns. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *) +struct BidiGenerateContentToolCallCancellation: Decodable { + /// The ids of the tool calls to be cancelled. + let ids: [String]? +} diff --git a/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentToolResponse.swift b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentToolResponse.swift new file mode 100644 index 00000000000..8b4e4ba48b2 --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentToolResponse.swift @@ -0,0 +1,29 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +import Foundation + +/// Client generated response to a `ToolCall` received from the server. +/// Individual `FunctionResponse` objects are matched to the respective +/// `FunctionCall` objects by the `id` field. +/// +/// Note that in the unary and server-streaming GenerateContent APIs function +/// calling happens by exchanging the `Content` parts, while in the bidi +/// GenerateContent APIs function calling happens over these dedicated set of +/// messages. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *) +struct BidiGenerateContentToolResponse: Encodable { + /// The response to the function calls. + let functionResponses: [FunctionResponse]? +} diff --git a/FirebaseAI/Sources/Types/Internal/Live/EndSensitivity.swift b/FirebaseAI/Sources/Types/Internal/Live/EndSensitivity.swift new file mode 100644 index 00000000000..6caa5d85440 --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/Live/EndSensitivity.swift @@ -0,0 +1,33 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// End of speech sensitivity. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *) +public struct EndSensitivity: EncodableProtoEnum, Hashable, Sendable { + enum Kind: String { + case high = "END_SENSITIVITY_HIGH" + case low = "END_SENSITIVITY_LOW" + } + + /// Automatic detection will end speech more often. 
+ public static let high = EndSensitivity(kind: .high) + + /// Automatic detection will end speech less often. + public static let low = EndSensitivity(kind: .low) + + /// Returns the raw string representation of the `EndSensitivity` value. + public let rawValue: String +} diff --git a/FirebaseAI/Sources/Types/Internal/Live/GoAway.swift b/FirebaseAI/Sources/Types/Internal/Live/GoAway.swift new file mode 100644 index 00000000000..45a2a7e944d --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/Live/GoAway.swift @@ -0,0 +1,24 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// Server will not be able to service client soon. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *) +struct GoAway: Decodable { + /// The remaining time before the connection will be terminated as ABORTED. + /// The minimal time returned here is specified differently together with + /// the rate limits for a given model. + let timeLeft: TimeInterval? 
+} diff --git a/FirebaseAI/Sources/Types/Internal/Live/RealtimeInputConfig.swift b/FirebaseAI/Sources/Types/Internal/Live/RealtimeInputConfig.swift new file mode 100644 index 00000000000..08bcfe076f8 --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/Live/RealtimeInputConfig.swift @@ -0,0 +1,55 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// Configures the realtime input behavior in `BidiGenerateContent`. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *) +struct RealtimeInputConfig: Encodable { + /// Configures automatic detection of activity. + struct AutomaticActivityDetection: Encodable { + /// If enabled, detected voice and text input count as activity. If + /// disabled, the client must send activity signals. + let disabled: Bool? + + /// Determines how likely speech is to be detected. + let startOfSpeechSensitivity: StartSensitivity? + + /// Determines how likely detected speech is ended. + let endOfSpeechSensitivity: EndSensitivity? + + /// The required duration of detected speech before start-of-speech is + /// committed. The lower this value the more sensitive the start-of-speech + /// detection is and the shorter speech can be recognized. However, this + /// also increases the probability of false positives. + let prefixPaddingMS: Int? + + /// The required duration of detected silence (or non-speech) before + /// end-of-speech is committed. 
The larger this value, the longer speech + // gaps can be without interrupting the user's activity but this will + // increase the model's latency. + let silenceDurationMS: Int? + } + + /// If not set, automatic activity detection is enabled by default. If + /// automatic voice detection is disabled, the client must send activity + /// signals. + let automaticActivityDetection: AutomaticActivityDetection? + + /// Defines what effect activity has. + let activityHandling: ActivityHandling? + + /// Defines which input is included in the user's turn. + let turnCoverage: TurnCoverage? +} diff --git a/FirebaseAI/Sources/Types/Internal/Live/StartSensitivity.swift b/FirebaseAI/Sources/Types/Internal/Live/StartSensitivity.swift new file mode 100644 index 00000000000..ef0e1fda073 --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/Live/StartSensitivity.swift @@ -0,0 +1,33 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// Start of speech sensitivity. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *) +public struct StartSensitivity: EncodableProtoEnum, Hashable, Sendable { + enum Kind: String { + case high = "START_SENSITIVITY_HIGH" + case low = "START_SENSITIVITY_LOW" + } + + /// Automatic detection will detect the start of speech more often. + public static let high = StartSensitivity(kind: .high) + + /// Automatic detection will detect the start of speech less often. 
+ public static let low = StartSensitivity(kind: .low) + + /// Returns the raw string representation of the `StartSensitivity` value. + public let rawValue: String +} diff --git a/FirebaseAI/Sources/Types/Internal/Live/TurnCoverage.swift b/FirebaseAI/Sources/Types/Internal/Live/TurnCoverage.swift new file mode 100644 index 00000000000..5d69fee78ce --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/Live/TurnCoverage.swift @@ -0,0 +1,36 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// Options about which input is included in the user's turn. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *) +public struct TurnCoverage: EncodableProtoEnum, Hashable, Sendable { + enum Kind: String { + case onlyActivity = "TURN_INCLUDES_ONLY_ACTIVITY" + case allInput = "TURN_INCLUDES_ALL_INPUT" + } + + /// The user's turn only includes activity since the last turn, excluding + /// inactivity (e.g. silence on the audio stream). + public static let onlyActivity = TurnCoverage(kind: .onlyActivity) + + /// The user's turn includes all realtime input since the last turn, including + /// inactivity (e.g. silence on the audio stream). This is the default + /// behavior. + public static let allInput = TurnCoverage(kind: .allInput) + + /// Returns the raw string representation of the `TurnCoverage` value. 
+ public let rawValue: String +} diff --git a/FirebaseAI/Sources/Types/Public/Live/LiveGenerationConfig.swift b/FirebaseAI/Sources/Types/Public/Live/LiveGenerationConfig.swift new file mode 100644 index 00000000000..ae961d14fb0 --- /dev/null +++ b/FirebaseAI/Sources/Types/Public/Live/LiveGenerationConfig.swift @@ -0,0 +1,155 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// A struct defining model parameters to be used when sending generative AI +/// requests to the backend model. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *) +public struct LiveGenerationConfig: Sendable { + /// Controls the degree of randomness in token selection. + let temperature: Float? + + /// Controls diversity of generated text. + let topP: Float? + + /// Limits the number of highest probability words considered. + let topK: Int? + + /// The number of response variations to return. + let candidateCount: Int? + + /// Maximum number of tokens that can be generated in the response. + let maxOutputTokens: Int? + + /// Controls the likelihood of repeating the same words or phrases already generated in the text. + let presencePenalty: Float? + + /// Controls the likelihood of repeating words, with the penalty increasing for each repetition. + let frequencyPenalty: Float? + + /// Supported modalities of the response. + let responseModalities: [ResponseModality]? 
+ + /// Creates a new `LiveGenerationConfig` value. + /// + /// See the + /// [Configure model parameters](https://firebase.google.com/docs/vertex-ai/model-parameters) + /// guide and the + /// [Cloud documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference#generationconfig) + /// for more details. + /// + /// - Parameters: + /// - temperature: Controls the randomness of the language model's output. Higher values (for + /// example, 1.0) make the text more random and creative, while lower values (for example, + /// 0.1) make it more focused and deterministic. + /// + /// > Note: A temperature of 0 means that the highest probability tokens are always selected. + /// > In this case, responses for a given prompt are mostly deterministic, but a small amount + /// > of variation is still possible. + /// + /// > Important: The range of supported temperature values depends on the model; see the + /// > [documentation](https://firebase.google.com/docs/vertex-ai/model-parameters?platform=ios#temperature) + /// > for more details. + /// - topP: Controls diversity of generated text. Higher values (e.g., 0.9) produce more diverse + /// text, while lower values (e.g., 0.5) make the output more focused. + /// + /// The supported range is 0.0 to 1.0. + /// + /// > Important: The default `topP` value depends on the model; see the + /// > [documentation](https://firebase.google.com/docs/vertex-ai/model-parameters?platform=ios#top-p) + /// > for more details. + /// - topK: Limits the number of highest probability words the model considers when generating + /// text. For example, a topK of 40 means only the 40 most likely words are considered for the + /// next token. A higher value increases diversity, while a lower value makes the output more + /// deterministic. + /// + /// The supported range is 1 to 40. 
+ /// + /// > Important: Support for `topK` and the default value depends on the model; see the + /// [documentation](https://firebase.google.com/docs/vertex-ai/model-parameters?platform=ios#top-k) + /// for more details. + /// - candidateCount: The number of response variations to return; defaults to 1 if not set. + /// Support for multiple candidates depends on the model; see the + /// [Cloud documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference#generationconfig) + /// for more details. + /// - maxOutputTokens: Maximum number of tokens that can be generated in the response. + /// See the configure model parameters [documentation](https://firebase.google.com/docs/vertex-ai/model-parameters?platform=ios#max-output-tokens) + /// for more details. + /// - presencePenalty: Controls the likelihood of repeating the same words or phrases already + /// generated in the text. Higher values increase the penalty of repetition, resulting in more + /// diverse output. + /// + /// > Note: While both `presencePenalty` and `frequencyPenalty` discourage repetition, + /// > `presencePenalty` applies the same penalty regardless of how many times the word/phrase + /// > has already appeared, whereas `frequencyPenalty` increases the penalty for *each* + /// > repetition of a word/phrase. + /// + /// > Important: The range of supported `presencePenalty` values depends on the model; see the + /// > [Cloud documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference#generationconfig) + /// > for more details + /// - frequencyPenalty: Controls the likelihood of repeating words or phrases, with the penalty + /// increasing for each repetition. Higher values increase the penalty of repetition, + /// resulting in more diverse output. 
+ /// + /// > Note: While both `frequencyPenalty` and `presencePenalty` discourage repetition, + /// > `frequencyPenalty` increases the penalty for *each* repetition of a word/phrase, whereas + /// > `presencePenalty` applies the same penalty regardless of how many times the word/phrase + /// > has already appeared. + /// + /// > Important: The range of supported `frequencyPenalty` values depends on the model; see + /// > the + /// > [Cloud documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference#generationconfig) + /// > for more details + /// - responseModalities: The data types (modalities) that may be returned in model responses. + /// + /// See the [multimodal + /// responses](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal-response-generation) + /// documentation for more details. + /// + /// > Warning: Specifying response modalities is a **Public Preview** feature, which means + /// > that it is not subject to any SLA or deprecation policy and could change in + /// > backwards-incompatible ways. + public init(temperature: Float? = nil, topP: Float? = nil, topK: Int? = nil, + candidateCount: Int? = nil, maxOutputTokens: Int? = nil, + presencePenalty: Float? = nil, frequencyPenalty: Float? = nil, + responseModalities: [ResponseModality]? = nil) { + // Explicit init because otherwise if we re-arrange the above variables it changes the API + // surface. 
+ self.temperature = temperature + self.topP = topP + self.topK = topK + self.candidateCount = candidateCount + self.maxOutputTokens = maxOutputTokens + self.presencePenalty = presencePenalty + self.frequencyPenalty = frequencyPenalty + self.responseModalities = responseModalities + } +} + +// MARK: - Codable Conformances + +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *) +extension LiveGenerationConfig: Encodable { + enum CodingKeys: String, CodingKey { + case temperature + case topP + case topK + case candidateCount + case maxOutputTokens + case presencePenalty + case frequencyPenalty + case responseModalities + } +} diff --git a/FirebaseAI/Sources/Types/Public/Live/LiveGenerativeModel.swift b/FirebaseAI/Sources/Types/Public/Live/LiveGenerativeModel.swift new file mode 100644 index 00000000000..689e690a631 --- /dev/null +++ b/FirebaseAI/Sources/Types/Public/Live/LiveGenerativeModel.swift @@ -0,0 +1,68 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *) +public final class LiveGenerativeModel { + let modelResourceName: String + let firebaseInfo: FirebaseInfo + let apiConfig: APIConfig + let generationConfig: LiveGenerationConfig? 
+ let requestOptions: RequestOptions + let urlSession: URLSession + + init(modelResourceName: String, + firebaseInfo: FirebaseInfo, + apiConfig: APIConfig, + generationConfig: LiveGenerationConfig? = nil, + requestOptions: RequestOptions, + urlSession: URLSession = GenAIURLSession.default) { + self.modelResourceName = modelResourceName + self.firebaseInfo = firebaseInfo + self.apiConfig = apiConfig + self.generationConfig = generationConfig + // TODO: Add tools + // TODO: Add tool config + // TODO: Add system instruction + self.requestOptions = requestOptions + self.urlSession = urlSession + } + + public func connect() -> LiveSession { + let liveSession = LiveSession( + modelResourceName: modelResourceName, + generationConfig: generationConfig, + url: webSocketURL(), + urlSession: urlSession + ) + print("Opening Live Session...") + liveSession.openConnection() + return liveSession + } + + func webSocketURL() -> URL { + let urlString = switch apiConfig.service { + case .vertexAI: + "wss://firebasevertexai.googleapis.com/ws/google.firebase.vertexai.v1beta.LlmBidiService/BidiGenerateContent/locations/us-central1?key=\(firebaseInfo.apiKey)" + case .googleAI: + "wss://firebasevertexai.googleapis.com/ws/google.firebase.vertexai.v1beta.GenerativeService/BidiGenerateContent?key=\(firebaseInfo.apiKey)" + } + guard let url = URL(string: urlString) else { + // TODO: Add error handling + fatalError() + } + return url + } +} diff --git a/FirebaseAI/Sources/Types/Public/Live/LiveSession.swift b/FirebaseAI/Sources/Types/Public/Live/LiveSession.swift new file mode 100644 index 00000000000..edf248c440e --- /dev/null +++ b/FirebaseAI/Sources/Types/Public/Live/LiveSession.swift @@ -0,0 +1,87 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +// TODO: Extract most of this file into a service class similar to `GenerativeAIService`. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *) +public final class LiveSession: Sendable { + let modelResourceName: String + let generationConfig: LiveGenerationConfig? + let webSocket: AsyncWebSocket + + public let responses: AsyncThrowingStream + private let responseContinuation: AsyncThrowingStream + .Continuation + + private let jsonEncoder = JSONEncoder() + private let jsonDecoder = JSONDecoder() + + init(modelResourceName: String, + generationConfig: LiveGenerationConfig?, + url: URL, + urlSession: URLSession) { + self.modelResourceName = modelResourceName + self.generationConfig = generationConfig + webSocket = AsyncWebSocket(urlSession: urlSession, urlRequest: URLRequest(url: url)) + (responses, responseContinuation) = AsyncThrowingStream.makeStream() + } + + deinit { + webSocket.disconnect() + } + + public func sendMessage(_ message: String) async throws { + let content = ModelContent(role: "user", parts: [message]) + let clientContent = BidiGenerateContentClientContent(turns: [content], turnComplete: true) + let clientMessage = BidiGenerateContentClientMessage.clientContent(clientContent) + let clientMessageData = try jsonEncoder.encode(clientMessage) + try await webSocket.send(.data(clientMessageData)) + } + + func openConnection() { + Task { + do { + let stream = webSocket.connect() + try await sendSetupMessage() + for try await message in stream { + switch message { + case let 
.string(string): + print("Unexpected string response: \(string)") + case let .data(data): + let response = try jsonDecoder.decode( + BidiGenerateContentServerMessage.self, + from: data + ) + responseContinuation.yield(response) + @unknown default: + print("Unknown message received") + } + } + } catch { + responseContinuation.finish(throwing: error) + } + responseContinuation.finish() + } + } + + private func sendSetupMessage() async throws { + let setup = BidiGenerateContentSetup( + model: modelResourceName, generationConfig: generationConfig + ) + let message = BidiGenerateContentClientMessage.setup(setup) + let messageData = try jsonEncoder.encode(message) + try await webSocket.send(.data(messageData)) + } +} diff --git a/FirebaseAI/Tests/TestApp/Sources/ContentView.swift b/FirebaseAI/Tests/TestApp/Sources/ContentView.swift index 52af5939455..37ef5fd527a 100644 --- a/FirebaseAI/Tests/TestApp/Sources/ContentView.swift +++ b/FirebaseAI/Tests/TestApp/Sources/ContentView.swift @@ -12,17 +12,40 @@ // See the License for the specific language governing permissions and // limitations under the License. +import FirebaseAI import SwiftUI struct ContentView: View { + // TODO: Revert changes in this file. For prototyping purposes only. 
+ let liveModel: LiveGenerativeModel = { + // let firebaseAI = FirebaseAI.firebaseAI(backend: .vertexAI()) + let firebaseAI = FirebaseAI.firebaseAI() + return firebaseAI.liveModel( + modelName: "gemini-2.0-flash-live-001", + generationConfig: LiveGenerationConfig(responseModalities: [.text]) + ) + }() + + @State private var responses: [String] = [] + var body: some View { VStack { - Image(systemName: "globe") - .imageScale(.large) - .foregroundStyle(.tint) - Text("Hello, world!") + List(responses, id: \.self) { + Text($0) + } } .padding() + .task { + do { + let liveSession = liveModel.connect() + try await liveSession.sendMessage("Why is the sky blue?") + for try await response in liveSession.responses { + responses.append(String(describing: response)) + } + } catch { + print(error) + } + } } }