From 12a7befbf902c39a8633ee1a8d3595ce37e69f5b Mon Sep 17 00:00:00 2001 From: Andrew Heard Date: Fri, 4 Apr 2025 18:11:38 -0400 Subject: [PATCH 1/6] [Vertex AI] Add `responseModalities` to `GenerationConfig` --- .../Sources/GenerationConfig.swift | 8 ++++- .../Types/Public/ResponseModality.swift | 36 +++++++++++++++++++ .../Tests/TestApp/Sources/Constants.swift | 1 + .../GenerateContentIntegrationTests.swift | 36 +++++++++++++++++++ 4 files changed, 80 insertions(+), 1 deletion(-) create mode 100644 FirebaseVertexAI/Sources/Types/Public/ResponseModality.swift diff --git a/FirebaseVertexAI/Sources/GenerationConfig.swift b/FirebaseVertexAI/Sources/GenerationConfig.swift index 7765c053cda..465ce2f9195 100644 --- a/FirebaseVertexAI/Sources/GenerationConfig.swift +++ b/FirebaseVertexAI/Sources/GenerationConfig.swift @@ -48,6 +48,9 @@ public struct GenerationConfig: Sendable { /// Output schema of the generated candidate text. let responseSchema: Schema? + /// Supported modalities of the response. + let responseModalities: [ResponseModality]? + /// Creates a new `GenerationConfig` value. /// /// See the @@ -140,11 +143,12 @@ public struct GenerationConfig: Sendable { /// [Generate structured /// output](https://firebase.google.com/docs/vertex-ai/structured-output?platform=ios) guide /// for more details. + /// - responseModalities: Supported modalities of the response. public init(temperature: Float? = nil, topP: Float? = nil, topK: Int? = nil, candidateCount: Int? = nil, maxOutputTokens: Int? = nil, presencePenalty: Float? = nil, frequencyPenalty: Float? = nil, stopSequences: [String]? = nil, responseMIMEType: String? = nil, - responseSchema: Schema? = nil) { + responseSchema: Schema? = nil, responseModalities: [ResponseModality]? = nil) { // Explicit init because otherwise if we re-arrange the above variables it changes the API // surface. self.temperature = temperature @@ -157,6 +161,7 @@ public struct GenerationConfig: Sendable { self.stopSequences = stopSequences self.responseMIMEType = responseMIMEType self.responseSchema = responseSchema + self.responseModalities = responseModalities } } @@ -175,5 +180,6 @@ extension GenerationConfig: Encodable { case stopSequences case responseMIMEType = "responseMimeType" case responseSchema + case responseModalities } } diff --git a/FirebaseVertexAI/Sources/Types/Public/ResponseModality.swift b/FirebaseVertexAI/Sources/Types/Public/ResponseModality.swift new file mode 100644 index 00000000000..1ed1d0af052 --- /dev/null +++ b/FirebaseVertexAI/Sources/Types/Public/ResponseModality.swift @@ -0,0 +1,36 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// Represents the available response modalities. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *) +public struct ResponseModality: EncodableProtoEnum, Sendable { + enum Kind: String { + case text = "TEXT" + case image = "IMAGE" + case audio = "AUDIO" + } + + /// Text response modality. + public static let text = ResponseModality(kind: .text) + + /// Image response modality. + public static let image = ResponseModality(kind: .image) + + /// Audio response modality. + public static let audio = ResponseModality(kind: .audio) + + let rawValue: String +} diff --git a/FirebaseVertexAI/Tests/TestApp/Sources/Constants.swift b/FirebaseVertexAI/Tests/TestApp/Sources/Constants.swift index f26fec45fb3..3a731813704 100644 --- a/FirebaseVertexAI/Tests/TestApp/Sources/Constants.swift +++ b/FirebaseVertexAI/Tests/TestApp/Sources/Constants.swift @@ -23,4 +23,5 @@ public enum FirebaseAppNames { public enum ModelNames { public static let gemini2Flash = "gemini-2.0-flash-001" public static let gemini2FlashLite = "gemini-2.0-flash-lite-001" + public static let gemini2FlashExperimental = "gemini-2.0-flash-exp" } diff --git a/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift b/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift index bec13f076b6..f07025692ca 100644 --- a/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift +++ b/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift @@ -19,6 +19,10 @@ import FirebaseVertexAI import Testing import VertexAITestApp +#if canImport(UIKit) + import UIKit +#endif // canImport(UIKit) + @Suite(.serialized) struct GenerateContentIntegrationTests { // Set temperature, topP and topK to lowest allowed values to make responses more deterministic. @@ -118,6 +122,38 @@ struct GenerateContentIntegrationTests { #expect(candidatesTokensDetails.tokenCount == usageMetadata.candidatesTokenCount) } + @Test(arguments: [InstanceConfig.vertexV1Beta]) + func generateImage(_ config: InstanceConfig) async throws { + let generationConfig = GenerationConfig( + temperature: 0.0, + topP: 0.0, + topK: 1, + responseModalities: [.text, .image] + ) + let model = VertexAI.componentInstance(config).generativeModel( + modelName: ModelNames.gemini2FlashExperimental, + generationConfig: generationConfig, + safetySettings: safetySettings + ) + let prompt = """ + Generate an image of a cute cartoon kitten playing with a ball of yarn. Do not respond with any + text. + """ + + let response = try await model.generateContent(prompt) + + let candidate = try #require(response.candidates.first) + let inlineDataPart = try #require(candidate.content.parts + .first { $0 is InlineDataPart } as? InlineDataPart) + #expect(inlineDataPart.mimeType == "image/png") + #expect(inlineDataPart.data.count > 0) + #if canImport(UIKit) + let uiImage = try #require(UIImage(data: inlineDataPart.data)) + #expect(uiImage.size.width == 1024.0) + #expect(uiImage.size.height == 1024.0) + #endif // canImport(UIKit) + } + // MARK: Streaming Tests @Test(arguments: InstanceConfig.allConfigs) From 0ca33d427c7263e9c92412453032d9d52534b2c0 Mon Sep 17 00:00:00 2001 From: Andrew Heard Date: Mon, 7 Apr 2025 17:44:49 -0400 Subject: [PATCH 2/6] Add developer API integration testing and update docs --- FirebaseVertexAI/CHANGELOG.md | 9 ++++++ .../Sources/GenerationConfig.swift | 10 ++++++- .../Types/Public/ResponseModality.swift | 30 ++++++++++++++----- .../GenerateContentIntegrationTests.swift | 5 +++- 4 files changed, 45 insertions(+), 9 deletions(-) diff --git a/FirebaseVertexAI/CHANGELOG.md b/FirebaseVertexAI/CHANGELOG.md index d70368aa708..dd822e265ab 100644 --- a/FirebaseVertexAI/CHANGELOG.md +++ b/FirebaseVertexAI/CHANGELOG.md @@ -1,3 +1,12 @@ +# Unreleased +- [added] **Public Preview**: Added support for specifying response modalities + in `GenerationConfig`. This includes **public experimental** support for image + generation using Gemini 2.0 Flash (`gemini-2.0-flash-exp`). (#14658) +

+ Note: This feature is in Public Preview (and relies on experimental models), + which means that it is not subject to any SLA or deprecation policy and could + change in backwards-incompatible ways. + # 11.11.0 - [added] Emits a warning when attempting to use an incompatible model with `GenerativeModel` or `ImagenModel`. (#14610) diff --git a/FirebaseVertexAI/Sources/GenerationConfig.swift b/FirebaseVertexAI/Sources/GenerationConfig.swift index 465ce2f9195..3daebbae692 100644 --- a/FirebaseVertexAI/Sources/GenerationConfig.swift +++ b/FirebaseVertexAI/Sources/GenerationConfig.swift @@ -143,7 +143,15 @@ public struct GenerationConfig: Sendable { /// [Generate structured /// output](https://firebase.google.com/docs/vertex-ai/structured-output?platform=ios) guide /// for more details. - /// - responseModalities: Supported modalities of the response. + /// - responseModalities: The data types (modalities) that may be returned in model responses. + /// + /// See the [multimodal + /// responses](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal-response-generation) + /// documentation for more details. + /// + /// > Warning: Specifying response modalities is a **Public Preview** feature, which means + /// > that it is not subject to any SLA or deprecation policy and could change in + /// > backwards-incompatible ways. public init(temperature: Float? = nil, topP: Float? = nil, topK: Int? = nil, candidateCount: Int? = nil, maxOutputTokens: Int? = nil, presencePenalty: Float? = nil, frequencyPenalty: Float? = nil, diff --git a/FirebaseVertexAI/Sources/Types/Public/ResponseModality.swift b/FirebaseVertexAI/Sources/Types/Public/ResponseModality.swift index 1ed1d0af052..442fed5f434 100644 --- a/FirebaseVertexAI/Sources/Types/Public/ResponseModality.swift +++ b/FirebaseVertexAI/Sources/Types/Public/ResponseModality.swift @@ -14,23 +14,39 @@ import Foundation -/// Represents the available response modalities. +/// Represents the different types, or modalities, of data that a model can produce as output. +/// +/// To configure the desired output modalities for model requests, set the `responseModalities` +/// parameter when initializing a ``GenerationConfig``. See the [multimodal +/// responses](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal-response-generation) +/// documentation for more details. +/// +/// > Important: Support for each response modality, or combination of modalities, depends on the +/// > model. @available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *) public struct ResponseModality: EncodableProtoEnum, Sendable { enum Kind: String { case text = "TEXT" case image = "IMAGE" - case audio = "AUDIO" } - /// Text response modality. + /// Specifies that the model should generate textual content. + /// + /// Use this modality when you need the model to produce written language, such as answers to + /// questions, summaries, creative writing, code snippets, or structured data formats like JSON. public static let text = ResponseModality(kind: .text) - /// Image response modality. + /// **Public Experimental**: Specifies that the model should generate image data. + /// + /// Use this modality when you want the model to create visual content based on the provided input + /// or prompts. The response might contain one or more generated images. See the [image + /// generation](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal-response-generation#image-generation) + /// documentation for more details. + /// + /// > Warning: Image generation using Gemini 2.0 Flash is a **Public Experimental** feature, which + /// > means that it is not subject to any SLA or deprecation policy and could change in + /// > backwards-incompatible ways. public static let image = ResponseModality(kind: .image) - /// Audio response modality. - public static let audio = ResponseModality(kind: .audio) - let rawValue: String } diff --git a/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift b/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift index f07025692ca..b35175ee586 100644 --- a/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift +++ b/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift @@ -122,7 +122,10 @@ struct GenerateContentIntegrationTests { #expect(candidatesTokensDetails.tokenCount == usageMetadata.candidatesTokenCount) } - @Test(arguments: [InstanceConfig.vertexV1Beta]) + @Test(arguments: [ + InstanceConfig.vertexV1Beta, + InstanceConfig.developerV1Beta, + ]) func generateImage(_ config: InstanceConfig) async throws { let generationConfig = GenerationConfig( temperature: 0.0, From 8326319b64a3aa2771a834e8ce81819f3b21b92f Mon Sep 17 00:00:00 2001 From: Andrew Heard Date: Mon, 7 Apr 2025 17:59:09 -0400 Subject: [PATCH 3/6] Remove "no text in response" and add comment for image size --- .../Tests/Integration/GenerateContentIntegrationTests.swift | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift b/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift index b35175ee586..e97af967ae7 100644 --- a/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift +++ b/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift @@ -138,10 +138,7 @@ struct GenerateContentIntegrationTests { generationConfig: generationConfig, safetySettings: safetySettings ) - let prompt = """ - Generate an image of a cute cartoon kitten playing with a ball of yarn. Do not respond with any - text. - """ + let prompt = "Generate an image of a cute cartoon kitten playing with a ball of yarn." let response = try await model.generateContent(prompt) @@ -152,6 +149,7 @@ struct GenerateContentIntegrationTests { #expect(inlineDataPart.data.count > 0) #if canImport(UIKit) let uiImage = try #require(UIImage(data: inlineDataPart.data)) + // Gemini 2.0 Flash Experimental returns 1024x1024 sized images. #expect(uiImage.size.width == 1024.0) #expect(uiImage.size.height == 1024.0) #endif // canImport(UIKit) From ec5df60ad6666bb0a6a9cd7f5d4ac005900ad5ef Mon Sep 17 00:00:00 2001 From: Andrew Heard Date: Mon, 7 Apr 2025 18:07:20 -0400 Subject: [PATCH 4/6] Update `GenerationConfig` unit test to include `responseModalities` --- FirebaseVertexAI/Tests/Unit/GenerationConfigTests.swift | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/FirebaseVertexAI/Tests/Unit/GenerationConfigTests.swift b/FirebaseVertexAI/Tests/Unit/GenerationConfigTests.swift index 23f85e8bdbd..5585c1ae995 100644 --- a/FirebaseVertexAI/Tests/Unit/GenerationConfigTests.swift +++ b/FirebaseVertexAI/Tests/Unit/GenerationConfigTests.swift @@ -61,7 +61,8 @@ final class GenerationConfigTests: XCTestCase { frequencyPenalty: frequencyPenalty, stopSequences: stopSequences, responseMIMEType: responseMIMEType, - responseSchema: .array(items: .string()) + responseSchema: .array(items: .string()), + responseModalities: [.text, .image] ) let jsonData = try encoder.encode(generationConfig) @@ -74,6 +75,10 @@ final class GenerationConfigTests: XCTestCase { "maxOutputTokens" : \(maxOutputTokens), "presencePenalty" : \(presencePenalty), "responseMimeType" : "\(responseMIMEType)", + "responseModalities" : [ + "TEXT", + "IMAGE" + ], "responseSchema" : { "items" : { "nullable" : false, From d959d33695020910cc918bed2090a07569b7e117 Mon Sep 17 00:00:00 2001 From: Andrew Heard Date: Tue, 8 Apr 2025 16:17:11 -0400 Subject: [PATCH 5/6] Handle 503 errors in integration test using `withKnownIssue` --- .../GenerateContentIntegrationTests.swift | 22 +++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift b/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift index e97af967ae7..6654a14fda6 100644 --- a/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift +++ b/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift @@ -23,6 +23,8 @@ import VertexAITestApp import UIKit #endif // canImport(UIKit) +@testable import struct FirebaseVertexAI.BackendError + @Suite(.serialized) struct GenerateContentIntegrationTests { // Set temperature, topP and topK to lowest allowed values to make responses more deterministic. @@ -140,8 +142,17 @@ struct GenerateContentIntegrationTests { ) let prompt = "Generate an image of a cute cartoon kitten playing with a ball of yarn." - let response = try await model.generateContent(prompt) + var response: GenerateContentResponse? + try await withKnownIssue( + "Backend may fail with a 503 - Service Unavailable error when overloaded", + isIntermittent: true + ) { + response = try await model.generateContent(prompt) + } matching: { issue in + (issue.error as? BackendError).map { $0.httpResponseCode == 503 } ?? false + } + guard let response else { return } let candidate = try #require(response.candidates.first) let inlineDataPart = try #require(candidate.content.parts .first { $0 is InlineDataPart } as? InlineDataPart) @@ -149,9 +160,12 @@ struct GenerateContentIntegrationTests { #expect(inlineDataPart.data.count > 0) #if canImport(UIKit) let uiImage = try #require(UIImage(data: inlineDataPart.data)) - // Gemini 2.0 Flash Experimental returns 1024x1024 sized images. - #expect(uiImage.size.width == 1024.0) - #expect(uiImage.size.height == 1024.0) + // Gemini 2.0 Flash Experimental returns images sized to fit within a 1024x1024 pixel box but + // dimensions may vary depending on the aspect ratio. + #expect(uiImage.size.width <= 1024) + #expect(uiImage.size.width >= 500) + #expect(uiImage.size.height <= 1024) + #expect(uiImage.size.height >= 500) #endif // canImport(UIKit) } From 9a986d55cd4713c25204aee7b48b440f1214846f Mon Sep 17 00:00:00 2001 From: Andrew Heard Date: Tue, 8 Apr 2025 16:23:06 -0400 Subject: [PATCH 6/6] Reword CHANGELOG slightly --- FirebaseVertexAI/CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FirebaseVertexAI/CHANGELOG.md b/FirebaseVertexAI/CHANGELOG.md index dd822e265ab..b33ebfd7192 100644 --- a/FirebaseVertexAI/CHANGELOG.md +++ b/FirebaseVertexAI/CHANGELOG.md @@ -3,7 +3,7 @@ in `GenerationConfig`. This includes **public experimental** support for image generation using Gemini 2.0 Flash (`gemini-2.0-flash-exp`). (#14658)

- Note: This feature is in Public Preview (and relies on experimental models), + Note: This feature is in Public Preview and relies on experimental models, which means that it is not subject to any SLA or deprecation policy and could change in backwards-incompatible ways.