[Vertex AI] Add responseModalities to GenerationConfig

andrewheard · andrewheard · commit 12a7befbf902 · 2025-04-04T18:11:38.000-04:00
diff --git a/FirebaseVertexAI/Sources/GenerationConfig.swift b/FirebaseVertexAI/Sources/GenerationConfig.swift
@@ -48,6 +48,9 @@ public struct GenerationConfig: Sendable {
   /// Output schema of the generated candidate text.
   let responseSchema: Schema?
 
+  /// The modalities (see `ResponseModality`, e.g., text or image) requested for the response.
+  let responseModalities: [ResponseModality]?
+
   /// Creates a new `GenerationConfig` value.
   ///
   /// See the
@@ -140,11 +143,12 @@ public struct GenerationConfig: Sendable {
   ///     [Generate structured
   ///     output](https://firebase.google.com/docs/vertex-ai/structured-output?platform=ios) guide
   ///     for more details.
+  ///   - responseModalities: The modalities (see `ResponseModality`) requested for the response.
   public init(temperature: Float? = nil, topP: Float? = nil, topK: Int? = nil,
               candidateCount: Int? = nil, maxOutputTokens: Int? = nil,
               presencePenalty: Float? = nil, frequencyPenalty: Float? = nil,
               stopSequences: [String]? = nil, responseMIMEType: String? = nil,
-              responseSchema: Schema? = nil) {
+              responseSchema: Schema? = nil, responseModalities: [ResponseModality]? = nil) {
     // Explicit init because otherwise if we re-arrange the above variables it changes the API
     // surface.
     self.temperature = temperature
@@ -157,6 +161,7 @@ public struct GenerationConfig: Sendable {
     self.stopSequences = stopSequences
     self.responseMIMEType = responseMIMEType
     self.responseSchema = responseSchema
+    self.responseModalities = responseModalities
   }
 }
 
@@ -175,5 +180,6 @@ extension GenerationConfig: Encodable {
     case stopSequences
     case responseMIMEType = "responseMimeType"
     case responseSchema
+    case responseModalities
   }
 }
diff --git a/FirebaseVertexAI/Sources/Types/Public/ResponseModality.swift b/FirebaseVertexAI/Sources/Types/Public/ResponseModality.swift
@@ -0,0 +1,36 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+import Foundation
+
+/// A modality (text, image or audio) that can be listed in `GenerationConfig.responseModalities`.
+@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
+public struct ResponseModality: EncodableProtoEnum, Sendable {
+  enum Kind: String {
+    case text = "TEXT"
+    case image = "IMAGE"
+    case audio = "AUDIO"
+  }
+
+  /// Text response modality; built from `Kind.text`, whose raw value is `"TEXT"`.
+  public static let text = ResponseModality(kind: .text)
+
+  /// Image response modality; built from `Kind.image`, whose raw value is `"IMAGE"`.
+  public static let image = ResponseModality(kind: .image)
+
+  /// Audio response modality; built from `Kind.audio`, whose raw value is `"AUDIO"`.
+  public static let audio = ResponseModality(kind: .audio)
+
+  let rawValue: String
+}
diff --git a/FirebaseVertexAI/Tests/TestApp/Sources/Constants.swift b/FirebaseVertexAI/Tests/TestApp/Sources/Constants.swift
@@ -23,4 +23,5 @@ public enum FirebaseAppNames {
 public enum ModelNames {
   public static let gemini2Flash = "gemini-2.0-flash-001"
   public static let gemini2FlashLite = "gemini-2.0-flash-lite-001"
+  public static let gemini2FlashExperimental = "gemini-2.0-flash-exp"
 }
diff --git a/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift b/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift
@@ -19,6 +19,10 @@ import FirebaseVertexAI
 import Testing
 import VertexAITestApp
 
+#if canImport(UIKit)
+  import UIKit
+#endif // canImport(UIKit)
+
 @Suite(.serialized)
 struct GenerateContentIntegrationTests {
   // Set temperature, topP and topK to lowest allowed values to make responses more deterministic.
@@ -118,6 +122,38 @@ struct GenerateContentIntegrationTests {
     #expect(candidatesTokensDetails.tokenCount == usageMetadata.candidatesTokenCount)
   }
 
+  // Requests text + image modalities and expects an inline PNG part; runs on Vertex v1beta only.
+  @Test(arguments: [InstanceConfig.vertexV1Beta])
+  func generateImage(_ config: InstanceConfig) async throws {
+    let generationConfig = GenerationConfig(
+      temperature: 0.0, topP: 0.0, topK: 1, responseModalities: [.text, .image]
+    )
+    let model = VertexAI.componentInstance(config).generativeModel(
+      modelName: ModelNames.gemini2FlashExperimental,
+      generationConfig: generationConfig,
+      safetySettings: safetySettings
+    )
+    let prompt = """
+    Generate an image of a cute cartoon kitten playing with a ball of yarn. Do not respond with any
+    text.
+    """
+
+    let response = try await model.generateContent(prompt)
+
+    let candidate = try #require(response.candidates.first)
+    // A single conditional cast per part replaces the previous `is` test followed by `as?`.
+    let inlineDataPart = try #require(
+      candidate.content.parts.compactMap { $0 as? InlineDataPart }.first
+    )
+    #expect(inlineDataPart.mimeType == "image/png")
+    #expect(!inlineDataPart.data.isEmpty)
+    #if canImport(UIKit)
+      let uiImage = try #require(UIImage(data: inlineDataPart.data))
+      #expect(uiImage.size.width == 1024.0)
+      #expect(uiImage.size.height == 1024.0)
+    #endif // canImport(UIKit)
+  }
+
   // MARK: Streaming Tests
 
   @Test(arguments: InstanceConfig.allConfigs)

Original file line number	Diff line number	Diff line change
`@@ -23,4 +23,5 @@ public enum FirebaseAppNames {`
`23`	`23`	`public enum ModelNames {`
`24`	`24`	`public static let gemini2Flash = "gemini-2.0-flash-001"`
`25`	`25`	`public static let gemini2FlashLite = "gemini-2.0-flash-lite-001"`
	`26`	`+ public static let gemini2FlashExperimental = "gemini-2.0-flash-exp"`
`26`	`27`	`}`