diff --git a/FirebaseVertexAI/CHANGELOG.md b/FirebaseVertexAI/CHANGELOG.md
index d70368aa708..b33ebfd7192 100644
--- a/FirebaseVertexAI/CHANGELOG.md
+++ b/FirebaseVertexAI/CHANGELOG.md
@@ -1,3 +1,12 @@
+# Unreleased
+- [added] **Public Preview**: Added support for specifying response modalities
+ in `GenerationConfig`. This includes **Public Experimental** support for image
+ generation using Gemini 2.0 Flash (`gemini-2.0-flash-exp`). (#14658)
+
+ Note: This feature is in Public Preview and relies on experimental models,
+ which means that it is not subject to any SLA or deprecation policy and could
+ change in backwards-incompatible ways.
+
# 11.11.0
- [added] Emits a warning when attempting to use an incompatible model with
`GenerativeModel` or `ImagenModel`. (#14610)
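
For reference, a minimal usage sketch of the Public Preview feature described in the entry above; it assumes the SDK's existing `VertexAI.vertexAI()` and `generativeModel(modelName:generationConfig:)` entry points and the experimental `gemini-2.0-flash-exp` model named in the entry, and is illustrative only:

```swift
import FirebaseVertexAI

// Requires FirebaseApp.configure() to have been called first.
let vertexAI = VertexAI.vertexAI()

// Request both text and image output (Public Preview; could change in
// backwards-incompatible ways).
let config = GenerationConfig(responseModalities: [.text, .image])

// `gemini-2.0-flash-exp` is the experimental model referenced in the entry above.
let model = vertexAI.generativeModel(
  modelName: "gemini-2.0-flash-exp",
  generationConfig: config
)
```
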
diff --git a/FirebaseVertexAI/Sources/GenerationConfig.swift b/FirebaseVertexAI/Sources/GenerationConfig.swift
index 7765c053cda..3daebbae692 100644
--- a/FirebaseVertexAI/Sources/GenerationConfig.swift
+++ b/FirebaseVertexAI/Sources/GenerationConfig.swift
@@ -48,6 +48,9 @@ public struct GenerationConfig: Sendable {
/// Output schema of the generated candidate text.
let responseSchema: Schema?
+ /// Supported modalities of the response.
+ let responseModalities: [ResponseModality]?
+
/// Creates a new `GenerationConfig` value.
///
/// See the
@@ -140,11 +143,20 @@ public struct GenerationConfig: Sendable {
/// [Generate structured
/// output](https://firebase.google.com/docs/vertex-ai/structured-output?platform=ios) guide
/// for more details.
+ /// - responseModalities: The data types (modalities) that may be returned in model responses.
+ ///
+ /// See the [multimodal
+ /// responses](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal-response-generation)
+ /// documentation for more details.
+ ///
+ /// > Warning: Specifying response modalities is a **Public Preview** feature, which means
+ /// > that it is not subject to any SLA or deprecation policy and could change in
+ /// > backwards-incompatible ways.
public init(temperature: Float? = nil, topP: Float? = nil, topK: Int? = nil,
candidateCount: Int? = nil, maxOutputTokens: Int? = nil,
presencePenalty: Float? = nil, frequencyPenalty: Float? = nil,
stopSequences: [String]? = nil, responseMIMEType: String? = nil,
- responseSchema: Schema? = nil) {
+ responseSchema: Schema? = nil, responseModalities: [ResponseModality]? = nil) {
// Explicit init because otherwise if we re-arrange the above variables it changes the API
// surface.
self.temperature = temperature
@@ -157,6 +169,7 @@ public struct GenerationConfig: Sendable {
self.stopSequences = stopSequences
self.responseMIMEType = responseMIMEType
self.responseSchema = responseSchema
+ self.responseModalities = responseModalities
}
}
@@ -175,5 +188,6 @@ extension GenerationConfig: Encodable {
case stopSequences
case responseMIMEType = "responseMimeType"
case responseSchema
+ case responseModalities
}
}
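
As a narrower sketch of the new initializer parameter added in this hunk (the temperature values are arbitrary; existing call sites compile unchanged):

```swift
import FirebaseVertexAI

// Omitting `responseModalities` leaves the optional nil, so the field is not
// encoded into the request and the backend default applies.
let textOnly = GenerationConfig(temperature: 0.2)

// Opting in to multimodal output (Public Preview).
let textAndImage = GenerationConfig(
  temperature: 0.2,
  responseModalities: [.text, .image]
)
```
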
diff --git a/FirebaseVertexAI/Sources/Types/Public/ResponseModality.swift b/FirebaseVertexAI/Sources/Types/Public/ResponseModality.swift
new file mode 100644
index 00000000000..442fed5f434
--- /dev/null
+++ b/FirebaseVertexAI/Sources/Types/Public/ResponseModality.swift
@@ -0,0 +1,52 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+import Foundation
+
+/// Represents the different types, or modalities, of data that a model can produce as output.
+///
+/// To configure the desired output modalities for model requests, set the `responseModalities`
+/// parameter when initializing a ``GenerationConfig``. See the [multimodal
+/// responses](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal-response-generation)
+/// documentation for more details.
+///
+/// > Important: Support for each response modality, or combination of modalities, depends on the
+/// > model.
+@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
+public struct ResponseModality: EncodableProtoEnum, Sendable {
+ enum Kind: String {
+ case text = "TEXT"
+ case image = "IMAGE"
+ }
+
+ /// Specifies that the model should generate textual content.
+ ///
+ /// Use this modality when you need the model to produce written language, such as answers to
+ /// questions, summaries, creative writing, code snippets, or structured data formats like JSON.
+ public static let text = ResponseModality(kind: .text)
+
+ /// **Public Experimental**: Specifies that the model should generate image data.
+ ///
+ /// Use this modality when you want the model to create visual content based on the provided input
+ /// or prompts. The response might contain one or more generated images. See the [image
+ /// generation](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal-response-generation#image-generation)
+ /// documentation for more details.
+ ///
+ /// > Warning: Image generation using Gemini 2.0 Flash is a **Public Experimental** feature, which
+ /// > means that it is not subject to any SLA or deprecation policy and could change in
+ /// > backwards-incompatible ways.
+ public static let image = ResponseModality(kind: .image)
+
+ let rawValue: String
+}
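
To show how image output surfaces to callers, a sketch of reading a generated image back out of a response; the part handling mirrors the integration test added below (`InlineDataPart`, `mimeType`, and `data` all appear there), and `model` is assumed to be configured with `responseModalities: [.text, .image]`:

```swift
import FirebaseVertexAI

func printGeneratedImages(from model: GenerativeModel) async throws {
  let response = try await model.generateContent(
    "Generate an image of a cute cartoon kitten playing with a ball of yarn."
  )

  // Generated images arrive as inline data parts alongside any text parts.
  let imageParts = response.candidates.first?.content.parts
    .compactMap { $0 as? InlineDataPart } ?? []

  for imagePart in imageParts {
    print("Received a \(imagePart.mimeType) image of \(imagePart.data.count) bytes.")
  }
}
```

Modeling `ResponseModality` as a struct with static members, rather than a frozen Swift `enum`, appears to follow the SDK's existing proto-enum pattern and leaves room for additional modalities without a source-breaking change.
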
diff --git a/FirebaseVertexAI/Tests/TestApp/Sources/Constants.swift b/FirebaseVertexAI/Tests/TestApp/Sources/Constants.swift
index f26fec45fb3..3a731813704 100644
--- a/FirebaseVertexAI/Tests/TestApp/Sources/Constants.swift
+++ b/FirebaseVertexAI/Tests/TestApp/Sources/Constants.swift
@@ -23,4 +23,5 @@ public enum FirebaseAppNames {
public enum ModelNames {
public static let gemini2Flash = "gemini-2.0-flash-001"
public static let gemini2FlashLite = "gemini-2.0-flash-lite-001"
+ public static let gemini2FlashExperimental = "gemini-2.0-flash-exp"
}
diff --git a/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift b/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift
index acbe552e1a0..715be6e3e32 100644
--- a/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift
+++ b/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift
@@ -19,6 +19,12 @@ import FirebaseVertexAI
import Testing
import VertexAITestApp
+#if canImport(UIKit)
+ import UIKit
+#endif // canImport(UIKit)
+
+@testable import struct FirebaseVertexAI.BackendError
+
@Suite(.serialized)
struct GenerateContentIntegrationTests {
// Set temperature, topP and topK to lowest allowed values to make responses more deterministic.
@@ -119,6 +125,51 @@ struct GenerateContentIntegrationTests {
#expect(candidatesTokensDetails.tokenCount == usageMetadata.candidatesTokenCount)
}
+ @Test(arguments: [
+ InstanceConfig.vertexV1Beta,
+ InstanceConfig.developerV1Beta,
+ ])
+ func generateImage(_ config: InstanceConfig) async throws {
+ let generationConfig = GenerationConfig(
+ temperature: 0.0,
+ topP: 0.0,
+ topK: 1,
+ responseModalities: [.text, .image]
+ )
+ let model = VertexAI.componentInstance(config).generativeModel(
+ modelName: ModelNames.gemini2FlashExperimental,
+ generationConfig: generationConfig,
+ safetySettings: safetySettings
+ )
+ let prompt = "Generate an image of a cute cartoon kitten playing with a ball of yarn."
+
+ var response: GenerateContentResponse?
+ try await withKnownIssue(
+ "Backend may fail with a 503 - Service Unavailable error when overloaded",
+ isIntermittent: true
+ ) {
+ response = try await model.generateContent(prompt)
+ } matching: { issue in
+ (issue.error as? BackendError).map { $0.httpResponseCode == 503 } ?? false
+ }
+
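+ // `response` is nil only when the known intermittent 503 issue was recorded above; skip the remaining checks.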
+ guard let response else { return }
+ let candidate = try #require(response.candidates.first)
+ let inlineDataPart = try #require(candidate.content.parts
+ .first { $0 is InlineDataPart } as? InlineDataPart)
+ #expect(inlineDataPart.mimeType == "image/png")
+ #expect(inlineDataPart.data.count > 0)
+ #if canImport(UIKit)
+ let uiImage = try #require(UIImage(data: inlineDataPart.data))
+ // Gemini 2.0 Flash Experimental returns images sized to fit within a 1024x1024 pixel box, but
+ // dimensions may vary depending on the aspect ratio.
+ #expect(uiImage.size.width <= 1024)
+ #expect(uiImage.size.width >= 500)
+ #expect(uiImage.size.height <= 1024)
+ #expect(uiImage.size.height >= 500)
+ #endif // canImport(UIKit)
+ }
+
// MARK: Streaming Tests
@Test(arguments: InstanceConfig.allConfigs)
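
The unit-test hunk below asserts the wire format for the new field; as a standalone sketch of that same encoding (synthesized `Encodable` conformance, with nil fields omitted), assuming nothing beyond what this diff adds:

```swift
import FirebaseVertexAI
import Foundation

// Encodes a config that sets only `responseModalities`; the modalities are
// serialized as their proto enum strings, matching the "TEXT" / "IMAGE"
// values asserted in the unit test below.
func encodedGenerationConfigJSON() throws -> String {
  let config = GenerationConfig(responseModalities: [.text, .image])
  let encoder = JSONEncoder()
  encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
  let data = try encoder.encode(config)
  return String(data: data, encoding: .utf8) ?? ""
}
// Returns:
// {
//   "responseModalities" : [
//     "TEXT",
//     "IMAGE"
//   ]
// }
```
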
diff --git a/FirebaseVertexAI/Tests/Unit/GenerationConfigTests.swift b/FirebaseVertexAI/Tests/Unit/GenerationConfigTests.swift
index 23f85e8bdbd..5585c1ae995 100644
--- a/FirebaseVertexAI/Tests/Unit/GenerationConfigTests.swift
+++ b/FirebaseVertexAI/Tests/Unit/GenerationConfigTests.swift
@@ -61,7 +61,8 @@ final class GenerationConfigTests: XCTestCase {
frequencyPenalty: frequencyPenalty,
stopSequences: stopSequences,
responseMIMEType: responseMIMEType,
- responseSchema: .array(items: .string())
+ responseSchema: .array(items: .string()),
+ responseModalities: [.text, .image]
)
let jsonData = try encoder.encode(generationConfig)
@@ -74,6 +75,10 @@ final class GenerationConfigTests: XCTestCase {
"maxOutputTokens" : \(maxOutputTokens),
"presencePenalty" : \(presencePenalty),
"responseMimeType" : "\(responseMIMEType)",
+ "responseModalities" : [
+ "TEXT",
+ "IMAGE"
+ ],
"responseSchema" : {
"items" : {
"nullable" : false,