From 12a7befbf902c39a8633ee1a8d3595ce37e69f5b Mon Sep 17 00:00:00 2001
From: Andrew Heard <andrewheard@google.com>
Date: Fri, 4 Apr 2025 18:11:38 -0400
Subject: [PATCH 1/6] [Vertex AI] Add `responseModalities` to
 `GenerationConfig`

---
 .../Sources/GenerationConfig.swift            |  8 ++++-
 .../Types/Public/ResponseModality.swift       | 36 +++++++++++++++++++
 .../Tests/TestApp/Sources/Constants.swift     |  1 +
 .../GenerateContentIntegrationTests.swift     | 36 +++++++++++++++++++
 4 files changed, 80 insertions(+), 1 deletion(-)
 create mode 100644 FirebaseVertexAI/Sources/Types/Public/ResponseModality.swift

diff --git a/FirebaseVertexAI/Sources/GenerationConfig.swift b/FirebaseVertexAI/Sources/GenerationConfig.swift
index 7765c053cda..465ce2f9195 100644
--- a/FirebaseVertexAI/Sources/GenerationConfig.swift
+++ b/FirebaseVertexAI/Sources/GenerationConfig.swift
@@ -48,6 +48,9 @@ public struct GenerationConfig: Sendable {
   /// Output schema of the generated candidate text.
   let responseSchema: Schema?
 
+  /// The data types (modalities) that may be returned in model responses.
+  let responseModalities: [ResponseModality]?
+
   /// Creates a new `GenerationConfig` value.
   ///
   /// See the
@@ -140,11 +143,12 @@ public struct GenerationConfig: Sendable {
   ///     [Generate structured
   ///     output](https://firebase.google.com/docs/vertex-ai/structured-output?platform=ios) guide
   ///     for more details.
+  ///   - responseModalities: Supported modalities of the response.
   public init(temperature: Float? = nil, topP: Float? = nil, topK: Int? = nil,
               candidateCount: Int? = nil, maxOutputTokens: Int? = nil,
               presencePenalty: Float? = nil, frequencyPenalty: Float? = nil,
               stopSequences: [String]? = nil, responseMIMEType: String? = nil,
-              responseSchema: Schema? = nil) {
+              responseSchema: Schema? = nil, responseModalities: [ResponseModality]? = nil) {
     // Explicit init because otherwise if we re-arrange the above variables it changes the API
     // surface.
     self.temperature = temperature
@@ -157,6 +161,7 @@ public struct GenerationConfig: Sendable {
     self.stopSequences = stopSequences
     self.responseMIMEType = responseMIMEType
     self.responseSchema = responseSchema
+    self.responseModalities = responseModalities
   }
 }
 
@@ -175,5 +180,6 @@ extension GenerationConfig: Encodable {
     case stopSequences
     case responseMIMEType = "responseMimeType"
     case responseSchema
+    case responseModalities
   }
 }
diff --git a/FirebaseVertexAI/Sources/Types/Public/ResponseModality.swift b/FirebaseVertexAI/Sources/Types/Public/ResponseModality.swift
new file mode 100644
index 00000000000..1ed1d0af052
--- /dev/null
+++ b/FirebaseVertexAI/Sources/Types/Public/ResponseModality.swift
@@ -0,0 +1,36 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+import Foundation
+
+/// Represents the available response modalities.
+@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
+public struct ResponseModality: EncodableProtoEnum, Sendable {
+  enum Kind: String {
+    case text = "TEXT"
+    case image = "IMAGE"
+    case audio = "AUDIO"
+  }
+
+  /// Text response modality.
+  public static let text = ResponseModality(kind: .text)
+
+  /// Image response modality.
+  public static let image = ResponseModality(kind: .image)
+
+  /// Audio response modality.
+  public static let audio = ResponseModality(kind: .audio)
+
+  let rawValue: String
+}
diff --git a/FirebaseVertexAI/Tests/TestApp/Sources/Constants.swift b/FirebaseVertexAI/Tests/TestApp/Sources/Constants.swift
index f26fec45fb3..3a731813704 100644
--- a/FirebaseVertexAI/Tests/TestApp/Sources/Constants.swift
+++ b/FirebaseVertexAI/Tests/TestApp/Sources/Constants.swift
@@ -23,4 +23,5 @@ public enum FirebaseAppNames {
 public enum ModelNames {
   public static let gemini2Flash = "gemini-2.0-flash-001"
   public static let gemini2FlashLite = "gemini-2.0-flash-lite-001"
+  public static let gemini2FlashExperimental = "gemini-2.0-flash-exp"
 }
diff --git a/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift b/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift
index bec13f076b6..f07025692ca 100644
--- a/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift
+++ b/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift
@@ -19,6 +19,10 @@ import FirebaseVertexAI
 import Testing
 import VertexAITestApp
 
+#if canImport(UIKit)
+  import UIKit
+#endif // canImport(UIKit)
+
 @Suite(.serialized)
 struct GenerateContentIntegrationTests {
   // Set temperature, topP and topK to lowest allowed values to make responses more deterministic.
@@ -118,6 +122,38 @@ struct GenerateContentIntegrationTests {
     #expect(candidatesTokensDetails.tokenCount == usageMetadata.candidatesTokenCount)
   }
 
+  @Test(arguments: [InstanceConfig.vertexV1Beta])
+  func generateImage(_ config: InstanceConfig) async throws {
+    let generationConfig = GenerationConfig(
+      temperature: 0.0,
+      topP: 0.0,
+      topK: 1,
+      responseModalities: [.text, .image]
+    )
+    let model = VertexAI.componentInstance(config).generativeModel(
+      modelName: ModelNames.gemini2FlashExperimental,
+      generationConfig: generationConfig,
+      safetySettings: safetySettings
+    )
+    let prompt = """
+    Generate an image of a cute cartoon kitten playing with a ball of yarn. Do not respond with any
+    text.
+    """
+
+    let response = try await model.generateContent(prompt)
+
+    let candidate = try #require(response.candidates.first)
+    let inlineDataPart = try #require(candidate.content.parts
+      .first { $0 is InlineDataPart } as? InlineDataPart)
+    #expect(inlineDataPart.mimeType == "image/png")
+    #expect(inlineDataPart.data.count > 0)
+    #if canImport(UIKit)
+      let uiImage = try #require(UIImage(data: inlineDataPart.data))
+      #expect(uiImage.size.width == 1024.0)
+      #expect(uiImage.size.height == 1024.0)
+    #endif // canImport(UIKit)
+  }
+
   // MARK: Streaming Tests
 
   @Test(arguments: InstanceConfig.allConfigs)

From 0ca33d427c7263e9c92412453032d9d52534b2c0 Mon Sep 17 00:00:00 2001
From: Andrew Heard <andrewheard@google.com>
Date: Mon, 7 Apr 2025 17:44:49 -0400
Subject: [PATCH 2/6] Add developer API integration testing and update docs

---
 FirebaseVertexAI/CHANGELOG.md                 |  9 ++++++
 .../Sources/GenerationConfig.swift            | 10 ++++++-
 .../Types/Public/ResponseModality.swift       | 30 ++++++++++++++-----
 .../GenerateContentIntegrationTests.swift     |  5 +++-
 4 files changed, 45 insertions(+), 9 deletions(-)

diff --git a/FirebaseVertexAI/CHANGELOG.md b/FirebaseVertexAI/CHANGELOG.md
index d70368aa708..dd822e265ab 100644
--- a/FirebaseVertexAI/CHANGELOG.md
+++ b/FirebaseVertexAI/CHANGELOG.md
@@ -1,3 +1,12 @@
+# Unreleased
+- [added] **Public Preview**: Added support for specifying response modalities
+  in `GenerationConfig`. This includes **public experimental** support for image
+  generation using Gemini 2.0 Flash (`gemini-2.0-flash-exp`). (#14658)
+  <br /><br />
+  Note: This feature is in Public Preview (and relies on experimental models),
+  which means that it is not subject to any SLA or deprecation policy and could
+  change in backwards-incompatible ways.
+
 # 11.11.0
 - [added] Emits a warning when attempting to use an incompatible model with
   `GenerativeModel` or `ImagenModel`. (#14610)
diff --git a/FirebaseVertexAI/Sources/GenerationConfig.swift b/FirebaseVertexAI/Sources/GenerationConfig.swift
index 465ce2f9195..3daebbae692 100644
--- a/FirebaseVertexAI/Sources/GenerationConfig.swift
+++ b/FirebaseVertexAI/Sources/GenerationConfig.swift
@@ -143,7 +143,15 @@ public struct GenerationConfig: Sendable {
   ///     [Generate structured
   ///     output](https://firebase.google.com/docs/vertex-ai/structured-output?platform=ios) guide
   ///     for more details.
-  ///   - responseModalities: Supported modalities of the response.
+  ///   - responseModalities: The data types (modalities) that may be returned in model responses.
+  ///
+  ///     See the [multimodal
+  ///     responses](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal-response-generation)
+  ///     documentation for more details.
+  ///
+  ///     > Warning: Specifying response modalities is a **Public Preview** feature, which means
+  ///     > that it is not subject to any SLA or deprecation policy and could change in
+  ///     > backwards-incompatible ways.
   public init(temperature: Float? = nil, topP: Float? = nil, topK: Int? = nil,
               candidateCount: Int? = nil, maxOutputTokens: Int? = nil,
               presencePenalty: Float? = nil, frequencyPenalty: Float? = nil,
diff --git a/FirebaseVertexAI/Sources/Types/Public/ResponseModality.swift b/FirebaseVertexAI/Sources/Types/Public/ResponseModality.swift
index 1ed1d0af052..442fed5f434 100644
--- a/FirebaseVertexAI/Sources/Types/Public/ResponseModality.swift
+++ b/FirebaseVertexAI/Sources/Types/Public/ResponseModality.swift
@@ -14,23 +14,39 @@
 
 import Foundation
 
-/// Represents the available response modalities.
+/// Represents the different types, or modalities, of data that a model can produce as output.
+///
+/// To configure the desired output modalities for model requests, set the `responseModalities`
+/// parameter when initializing a ``GenerationConfig``. See the [multimodal
+/// responses](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal-response-generation)
+/// documentation for more details.
+///
+/// > Important: Support for each response modality, or combination of modalities, depends on the
+/// > model.
 @available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
 public struct ResponseModality: EncodableProtoEnum, Sendable {
   enum Kind: String {
     case text = "TEXT"
     case image = "IMAGE"
-    case audio = "AUDIO"
   }
 
-  /// Text response modality.
+  /// Specifies that the model should generate textual content.
+  ///
+  /// Use this modality when you need the model to produce written language, such as answers to
+  /// questions, summaries, creative writing, code snippets, or structured data formats like JSON.
   public static let text = ResponseModality(kind: .text)
 
-  /// Image response modality.
+  /// **Public Experimental**: Specifies that the model should generate image data.
+  ///
+  /// Use this modality when you want the model to create visual content based on the provided input
+  /// or prompts. The response might contain one or more generated images. See the [image
+  /// generation](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal-response-generation#image-generation)
+  /// documentation for more details.
+  ///
+  /// > Warning: Image generation using Gemini 2.0 Flash is a **Public Experimental** feature, which
+  /// > means that it is not subject to any SLA or deprecation policy and could change in
+  /// > backwards-incompatible ways.
   public static let image = ResponseModality(kind: .image)
 
-  /// Audio response modality.
-  public static let audio = ResponseModality(kind: .audio)
-
   let rawValue: String
 }
diff --git a/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift b/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift
index f07025692ca..b35175ee586 100644
--- a/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift
+++ b/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift
@@ -122,7 +122,10 @@ struct GenerateContentIntegrationTests {
     #expect(candidatesTokensDetails.tokenCount == usageMetadata.candidatesTokenCount)
   }
 
-  @Test(arguments: [InstanceConfig.vertexV1Beta])
+  @Test(arguments: [
+    InstanceConfig.vertexV1Beta,
+    InstanceConfig.developerV1Beta,
+  ])
   func generateImage(_ config: InstanceConfig) async throws {
     let generationConfig = GenerationConfig(
       temperature: 0.0,

From 8326319b64a3aa2771a834e8ce81819f3b21b92f Mon Sep 17 00:00:00 2001
From: Andrew Heard <andrewheard@google.com>
Date: Mon, 7 Apr 2025 17:59:09 -0400
Subject: [PATCH 3/6] Remove "no text in response" and add comment for image
 size

---
 .../Tests/Integration/GenerateContentIntegrationTests.swift | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift b/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift
index b35175ee586..e97af967ae7 100644
--- a/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift
+++ b/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift
@@ -138,10 +138,7 @@ struct GenerateContentIntegrationTests {
       generationConfig: generationConfig,
       safetySettings: safetySettings
     )
-    let prompt = """
-    Generate an image of a cute cartoon kitten playing with a ball of yarn. Do not respond with any
-    text.
-    """
+    let prompt = "Generate an image of a cute cartoon kitten playing with a ball of yarn."
 
     let response = try await model.generateContent(prompt)
 
@@ -152,6 +149,7 @@ struct GenerateContentIntegrationTests {
     #expect(inlineDataPart.data.count > 0)
     #if canImport(UIKit)
       let uiImage = try #require(UIImage(data: inlineDataPart.data))
+      // Gemini 2.0 Flash Experimental returns 1024x1024 sized images.
       #expect(uiImage.size.width == 1024.0)
       #expect(uiImage.size.height == 1024.0)
     #endif // canImport(UIKit)

From ec5df60ad6666bb0a6a9cd7f5d4ac005900ad5ef Mon Sep 17 00:00:00 2001
From: Andrew Heard <andrewheard@google.com>
Date: Mon, 7 Apr 2025 18:07:20 -0400
Subject: [PATCH 4/6] Update `GenerationConfig` unit test to include
 `responseModalities`

---
 FirebaseVertexAI/Tests/Unit/GenerationConfigTests.swift | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/FirebaseVertexAI/Tests/Unit/GenerationConfigTests.swift b/FirebaseVertexAI/Tests/Unit/GenerationConfigTests.swift
index 23f85e8bdbd..5585c1ae995 100644
--- a/FirebaseVertexAI/Tests/Unit/GenerationConfigTests.swift
+++ b/FirebaseVertexAI/Tests/Unit/GenerationConfigTests.swift
@@ -61,7 +61,8 @@ final class GenerationConfigTests: XCTestCase {
       frequencyPenalty: frequencyPenalty,
       stopSequences: stopSequences,
       responseMIMEType: responseMIMEType,
-      responseSchema: .array(items: .string())
+      responseSchema: .array(items: .string()),
+      responseModalities: [.text, .image]
     )
 
     let jsonData = try encoder.encode(generationConfig)
@@ -74,6 +75,10 @@ final class GenerationConfigTests: XCTestCase {
       "maxOutputTokens" : \(maxOutputTokens),
       "presencePenalty" : \(presencePenalty),
       "responseMimeType" : "\(responseMIMEType)",
+      "responseModalities" : [
+        "TEXT",
+        "IMAGE"
+      ],
       "responseSchema" : {
         "items" : {
           "nullable" : false,

From d959d33695020910cc918bed2090a07569b7e117 Mon Sep 17 00:00:00 2001
From: Andrew Heard <andrewheard@google.com>
Date: Tue, 8 Apr 2025 16:17:11 -0400
Subject: [PATCH 5/6] Handle 503 errors in integration test using
 `withKnownIssue`

---
 .../GenerateContentIntegrationTests.swift     | 22 +++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift b/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift
index e97af967ae7..6654a14fda6 100644
--- a/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift
+++ b/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift
@@ -23,6 +23,8 @@ import VertexAITestApp
   import UIKit
 #endif // canImport(UIKit)
 
+@testable import struct FirebaseVertexAI.BackendError
+
 @Suite(.serialized)
 struct GenerateContentIntegrationTests {
   // Set temperature, topP and topK to lowest allowed values to make responses more deterministic.
@@ -140,8 +142,17 @@ struct GenerateContentIntegrationTests {
     )
     let prompt = "Generate an image of a cute cartoon kitten playing with a ball of yarn."
 
-    let response = try await model.generateContent(prompt)
+    var response: GenerateContentResponse?
+    try await withKnownIssue(
+      "Backend may fail with a 503 - Service Unavailable error when overloaded",
+      isIntermittent: true
+    ) {
+      response = try await model.generateContent(prompt)
+    } matching: { issue in
+      (issue.error as? BackendError).map { $0.httpResponseCode == 503 } ?? false
+    }
 
+    guard let response else { return }
     let candidate = try #require(response.candidates.first)
     let inlineDataPart = try #require(candidate.content.parts
       .first { $0 is InlineDataPart } as? InlineDataPart)
@@ -149,9 +160,12 @@ struct GenerateContentIntegrationTests {
     #expect(inlineDataPart.data.count > 0)
     #if canImport(UIKit)
       let uiImage = try #require(UIImage(data: inlineDataPart.data))
-      // Gemini 2.0 Flash Experimental returns 1024x1024 sized images.
-      #expect(uiImage.size.width == 1024.0)
-      #expect(uiImage.size.height == 1024.0)
+      // Gemini 2.0 Flash Experimental returns images sized to fit within a 1024x1024 pixel box but
+      // dimensions may vary depending on the aspect ratio.
+      #expect(uiImage.size.width <= 1024)
+      #expect(uiImage.size.width >= 500)
+      #expect(uiImage.size.height <= 1024)
+      #expect(uiImage.size.height >= 500)
     #endif // canImport(UIKit)
   }
 

From 9a986d55cd4713c25204aee7b48b440f1214846f Mon Sep 17 00:00:00 2001
From: Andrew Heard <andrewheard@google.com>
Date: Tue, 8 Apr 2025 16:23:06 -0400
Subject: [PATCH 6/6] Reword CHANGELOG slightly

---
 FirebaseVertexAI/CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/FirebaseVertexAI/CHANGELOG.md b/FirebaseVertexAI/CHANGELOG.md
index dd822e265ab..b33ebfd7192 100644
--- a/FirebaseVertexAI/CHANGELOG.md
+++ b/FirebaseVertexAI/CHANGELOG.md
@@ -3,7 +3,7 @@
   in `GenerationConfig`. This includes **public experimental** support for image
   generation using Gemini 2.0 Flash (`gemini-2.0-flash-exp`). (#14658)
   <br /><br />
-  Note: This feature is in Public Preview (and relies on experimental models),
+  Note: This feature is in Public Preview and relies on experimental models,
   which means that it is not subject to any SLA or deprecation policy and could
   change in backwards-incompatible ways.