diff --git a/FirebaseVertexAI/CHANGELOG.md b/FirebaseVertexAI/CHANGELOG.md
index d70368aa708..b33ebfd7192 100644
--- a/FirebaseVertexAI/CHANGELOG.md
+++ b/FirebaseVertexAI/CHANGELOG.md
@@ -1,3 +1,12 @@
+# Unreleased
+- [added] **Public Preview**: Added support for specifying response modalities
+  in `GenerationConfig`. This includes **public experimental** support for image
+  generation using Gemini 2.0 Flash (`gemini-2.0-flash-exp`). (#14658)
+
+  Note: This feature is in Public Preview and relies on experimental models,
+  which means that it is not subject to any SLA or deprecation policy and could
+  change in backwards-incompatible ways.
+
 # 11.11.0
 - [added] Emits a warning when attempting to use an incompatible model with
   `GenerativeModel` or `ImagenModel`. (#14610)
diff --git a/FirebaseVertexAI/Sources/GenerationConfig.swift b/FirebaseVertexAI/Sources/GenerationConfig.swift
index 7765c053cda..3daebbae692 100644
--- a/FirebaseVertexAI/Sources/GenerationConfig.swift
+++ b/FirebaseVertexAI/Sources/GenerationConfig.swift
@@ -48,6 +48,9 @@ public struct GenerationConfig: Sendable {
   /// Output schema of the generated candidate text.
   let responseSchema: Schema?
 
+  /// Supported modalities of the response.
+  let responseModalities: [ResponseModality]?
+
   /// Creates a new `GenerationConfig` value.
   ///
   /// See the
@@ -140,11 +143,20 @@ public struct GenerationConfig: Sendable {
   ///     [Generate structured
   ///     output](https://firebase.google.com/docs/vertex-ai/structured-output?platform=ios) guide
   ///     for more details.
+  ///   - responseModalities: The data types (modalities) that may be returned in model responses.
+  ///
+  ///     See the [multimodal
+  ///     responses](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal-response-generation)
+  ///     documentation for more details.
+  ///
+  ///     > Warning: Specifying response modalities is a **Public Preview** feature, which means
+  ///     > that it is not subject to any SLA or deprecation policy and could change in
+  ///     > backwards-incompatible ways.
   public init(temperature: Float? = nil, topP: Float? = nil, topK: Int? = nil,
               candidateCount: Int? = nil, maxOutputTokens: Int? = nil,
               presencePenalty: Float? = nil, frequencyPenalty: Float? = nil,
               stopSequences: [String]? = nil, responseMIMEType: String? = nil,
-              responseSchema: Schema? = nil) {
+              responseSchema: Schema? = nil, responseModalities: [ResponseModality]? = nil) {
     // Explicit init because otherwise if we re-arrange the above variables it changes the API
     // surface.
     self.temperature = temperature
@@ -157,6 +169,7 @@ public struct GenerationConfig: Sendable {
     self.stopSequences = stopSequences
     self.responseMIMEType = responseMIMEType
     self.responseSchema = responseSchema
+    self.responseModalities = responseModalities
   }
 }
 
@@ -175,5 +188,6 @@ extension GenerationConfig: Encodable {
     case stopSequences
     case responseMIMEType = "responseMimeType"
     case responseSchema
+    case responseModalities
   }
 }
diff --git a/FirebaseVertexAI/Sources/Types/Public/ResponseModality.swift b/FirebaseVertexAI/Sources/Types/Public/ResponseModality.swift
new file mode 100644
index 00000000000..442fed5f434
--- /dev/null
+++ b/FirebaseVertexAI/Sources/Types/Public/ResponseModality.swift
@@ -0,0 +1,52 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+import Foundation
+
+/// Represents the different types, or modalities, of data that a model can produce as output.
+///
+/// To configure the desired output modalities for model requests, set the `responseModalities`
+/// parameter when initializing a ``GenerationConfig``. See the [multimodal
+/// responses](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal-response-generation)
+/// documentation for more details.
+///
+/// > Important: Support for each response modality, or combination of modalities, depends on the
+/// > model.
+@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
+public struct ResponseModality: EncodableProtoEnum, Sendable {
+  enum Kind: String {
+    case text = "TEXT"
+    case image = "IMAGE"
+  }
+
+  /// Specifies that the model should generate textual content.
+  ///
+  /// Use this modality when you need the model to produce written language, such as answers to
+  /// questions, summaries, creative writing, code snippets, or structured data formats like JSON.
+  public static let text = ResponseModality(kind: .text)
+
+  /// **Public Experimental**: Specifies that the model should generate image data.
+  ///
+  /// Use this modality when you want the model to create visual content based on the provided input
+  /// or prompts. The response might contain one or more generated images. See the [image
+  /// generation](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal-response-generation#image-generation)
+  /// documentation for more details.
+  ///
+  /// > Warning: Image generation using Gemini 2.0 Flash is a **Public Experimental** feature, which
+  /// > means that it is not subject to any SLA or deprecation policy and could change in
+  /// > backwards-incompatible ways.
+  public static let image = ResponseModality(kind: .image)
+
+  let rawValue: String
+}
diff --git a/FirebaseVertexAI/Tests/TestApp/Sources/Constants.swift b/FirebaseVertexAI/Tests/TestApp/Sources/Constants.swift
index f26fec45fb3..3a731813704 100644
--- a/FirebaseVertexAI/Tests/TestApp/Sources/Constants.swift
+++ b/FirebaseVertexAI/Tests/TestApp/Sources/Constants.swift
@@ -23,4 +23,5 @@ public enum FirebaseAppNames {
 public enum ModelNames {
   public static let gemini2Flash = "gemini-2.0-flash-001"
   public static let gemini2FlashLite = "gemini-2.0-flash-lite-001"
+  public static let gemini2FlashExperimental = "gemini-2.0-flash-exp"
 }
diff --git a/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift b/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift
index acbe552e1a0..715be6e3e32 100644
--- a/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift
+++ b/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift
@@ -19,6 +19,12 @@ import FirebaseVertexAI
 import Testing
 import VertexAITestApp
 
+#if canImport(UIKit)
+  import UIKit
+#endif // canImport(UIKit)
+
+@testable import struct FirebaseVertexAI.BackendError
+
 @Suite(.serialized)
 struct GenerateContentIntegrationTests {
   // Set temperature, topP and topK to lowest allowed values to make responses more deterministic.
@@ -119,6 +125,51 @@ struct GenerateContentIntegrationTests {
     #expect(candidatesTokensDetails.tokenCount == usageMetadata.candidatesTokenCount)
   }
 
+  @Test(arguments: [
+    InstanceConfig.vertexV1Beta,
+    InstanceConfig.developerV1Beta,
+  ])
+  func generateImage(_ config: InstanceConfig) async throws {
+    let generationConfig = GenerationConfig(
+      temperature: 0.0,
+      topP: 0.0,
+      topK: 1,
+      responseModalities: [.text, .image]
+    )
+    let model = VertexAI.componentInstance(config).generativeModel(
+      modelName: ModelNames.gemini2FlashExperimental,
+      generationConfig: generationConfig,
+      safetySettings: safetySettings
+    )
+    let prompt = "Generate an image of a cute cartoon kitten playing with a ball of yarn."
+
+    var response: GenerateContentResponse?
+    try await withKnownIssue(
+      "Backend may fail with a 503 - Service Unavailable error when overloaded",
+      isIntermittent: true
+    ) {
+      response = try await model.generateContent(prompt)
+    } matching: { issue in
+      (issue.error as? BackendError).map { $0.httpResponseCode == 503 } ?? false
+    }
+
+    guard let response else { return }
+    let candidate = try #require(response.candidates.first)
+    let inlineDataPart = try #require(candidate.content.parts
+      .first { $0 is InlineDataPart } as? InlineDataPart)
+    #expect(inlineDataPart.mimeType == "image/png")
+    #expect(inlineDataPart.data.count > 0)
+    #if canImport(UIKit)
+      let uiImage = try #require(UIImage(data: inlineDataPart.data))
+      // Gemini 2.0 Flash Experimental returns images sized to fit within a 1024x1024 pixel box but
+      // dimensions may vary depending on the aspect ratio.
+      #expect(uiImage.size.width <= 1024)
+      #expect(uiImage.size.width >= 500)
+      #expect(uiImage.size.height <= 1024)
+      #expect(uiImage.size.height >= 500)
+    #endif // canImport(UIKit)
+  }
+
   // MARK: Streaming Tests
 
   @Test(arguments: InstanceConfig.allConfigs)
diff --git a/FirebaseVertexAI/Tests/Unit/GenerationConfigTests.swift b/FirebaseVertexAI/Tests/Unit/GenerationConfigTests.swift
index 23f85e8bdbd..5585c1ae995 100644
--- a/FirebaseVertexAI/Tests/Unit/GenerationConfigTests.swift
+++ b/FirebaseVertexAI/Tests/Unit/GenerationConfigTests.swift
@@ -61,7 +61,8 @@ final class GenerationConfigTests: XCTestCase {
       frequencyPenalty: frequencyPenalty,
       stopSequences: stopSequences,
       responseMIMEType: responseMIMEType,
-      responseSchema: .array(items: .string())
+      responseSchema: .array(items: .string()),
+      responseModalities: [.text, .image]
     )
 
     let jsonData = try encoder.encode(generationConfig)
@@ -74,6 +75,10 @@ final class GenerationConfigTests: XCTestCase {
       "maxOutputTokens" : \(maxOutputTokens),
       "presencePenalty" : \(presencePenalty),
       "responseMimeType" : "\(responseMIMEType)",
+      "responseModalities" : [
+        "TEXT",
+        "IMAGE"
+      ],
       "responseSchema" : {
         "items" : {
           "nullable" : false,
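---

For reference, a minimal app-side usage sketch of the `responseModalities` API added in this change. This is hypothetical example code, not part of the diff: it assumes `FirebaseApp.configure()` has already been called, and it mirrors the model name and prompt used in the integration test above.

```swift
import FirebaseVertexAI

// Hypothetical example of requesting image output via the new
// `responseModalities` parameter (Public Preview); not part of this diff.
func generateKittenImage() async throws {
  // Ask the model to return both text and image parts.
  let config = GenerationConfig(responseModalities: [.text, .image])
  let model = VertexAI.vertexAI().generativeModel(
    modelName: "gemini-2.0-flash-exp",
    generationConfig: config
  )

  let response = try await model.generateContent(
    "Generate an image of a cute cartoon kitten playing with a ball of yarn."
  )

  // Generated images come back as inline data parts (e.g. "image/png").
  guard let candidate = response.candidates.first,
        let imagePart = candidate.content.parts
        .first(where: { $0 is InlineDataPart }) as? InlineDataPart else {
    print("No image part returned; text: \(response.text ?? "<none>")")
    return
  }
  print("Received \(imagePart.data.count) bytes of \(imagePart.mimeType)")
}
```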