Skip to content

Commit c4339fa

Browse files
committed
Add official support for transcripts
1 parent 12f9f27 commit c4339fa

11 files changed

+158
-78
lines changed

FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentServerContent.swift

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,5 +51,7 @@ struct BidiGenerateContentServerContent: Decodable, Sendable {
5151
/// Metadata specifies sources used to ground generated content.
5252
let groundingMetadata: GroundingMetadata?
5353

54+
let inputTranscription: BidiGenerateContentTranscription?
55+
5456
let outputTranscription: BidiGenerateContentTranscription?
5557
}

FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentSetup.swift

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ struct BidiGenerateContentSetup: Encodable {
2929
let model: String
3030

3131
/// Generation config.
32-
let generationConfig: LiveGenerationConfig?
32+
let generationConfig: BidiGenerationConfig?
3333

3434
/// The user provided system instructions for the model.
3535
/// Note: only text should be used in parts and content in each part will be
@@ -48,18 +48,24 @@ struct BidiGenerateContentSetup: Encodable {
4848
/// Configures the handling of realtime input.
4949
let realtimeInputConfig: RealtimeInputConfig?
5050

51-
let inputAudioTranscription: AudioTranscriptionConfig?
51+
/// Input transcription. The transcription is independent of the model turn,
52+
/// which means it doesn't imply any ordering between transcription and model
53+
/// turn.
54+
let inputAudioTranscription: BidiAudioTranscriptionConfig?
5255

53-
let outputAudioTranscription: AudioTranscriptionConfig?
56+
/// Output transcription. The transcription is independent of the model turn,
57+
/// which means it doesn't imply any ordering between transcription and model
58+
/// turn.
59+
let outputAudioTranscription: BidiAudioTranscriptionConfig?
5460

5561
init(model: String,
56-
generationConfig: LiveGenerationConfig? = nil,
62+
generationConfig: BidiGenerationConfig? = nil,
5763
systemInstruction: ModelContent? = nil,
5864
tools: [Tool]? = nil,
5965
toolConfig: ToolConfig? = nil,
6066
realtimeInputConfig: RealtimeInputConfig? = nil,
61-
inputAudioTranscription: AudioTranscriptionConfig? = nil,
62-
outputAudioTranscription: AudioTranscriptionConfig? = nil) {
67+
inputAudioTranscription: BidiAudioTranscriptionConfig? = nil,
68+
outputAudioTranscription: BidiAudioTranscriptionConfig? = nil) {
6369
self.model = model
6470
self.generationConfig = generationConfig
6571
self.systemInstruction = systemInstruction
@@ -71,4 +77,4 @@ struct BidiGenerateContentSetup: Encodable {
7177
}
7278
}
7379

74-
struct AudioTranscriptionConfig: Encodable {}
80+
struct BidiAudioTranscriptionConfig: Encodable {}

FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentTranscription.swift

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,5 +15,4 @@
1515
@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
1616
struct BidiGenerateContentTranscription: Decodable, Sendable {
1717
let text: String?
18-
let finished: Bool?
1918
}
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
// Copyright 2025 Google LLC
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
import Foundation
16+
17+
/// Configuration options for live content generation.
18+
@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
19+
internal struct BidiGenerationConfig: Encodable, Sendable {
20+
let temperature: Float?
21+
let topP: Float?
22+
let topK: Int?
23+
let candidateCount: Int?
24+
let maxOutputTokens: Int?
25+
let presencePenalty: Float?
26+
let frequencyPenalty: Float?
27+
let responseModalities: [ResponseModality]?
28+
let speechConfig: BidiSpeechConfig?
29+
30+
init(temperature: Float? = nil, topP: Float? = nil, topK: Int? = nil,
31+
candidateCount: Int? = nil, maxOutputTokens: Int? = nil,
32+
presencePenalty: Float? = nil, frequencyPenalty: Float? = nil,
33+
responseModalities: [ResponseModality]? = nil,
34+
speechConfig: BidiSpeechConfig? = nil
35+
) {
36+
self.temperature = temperature
37+
self.topP = topP
38+
self.topK = topK
39+
self.candidateCount = candidateCount
40+
self.maxOutputTokens = maxOutputTokens
41+
self.presencePenalty = presencePenalty
42+
self.frequencyPenalty = frequencyPenalty
43+
self.responseModalities = responseModalities
44+
self.speechConfig = speechConfig
45+
}
46+
}

FirebaseAI/Sources/Types/Internal/Live/SpeechConfig.swift renamed to FirebaseAI/Sources/Types/Internal/Live/BidiSpeechConfig.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import Foundation
1616

1717
/// Speech generation config.
18-
struct SpeechConfig: Encodable, Sendable {
18+
struct BidiSpeechConfig: Encodable, Sendable {
1919
/// The configuration for the speaker to use.
2020
let voiceConfig: VoiceConfig
2121

FirebaseAI/Sources/Types/Internal/Live/LiveSessionService.swift

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -180,11 +180,12 @@ actor LiveSessionService {
180180
do {
181181
let setup = BidiGenerateContentSetup(
182182
model: modelResourceName,
183-
generationConfig: generationConfig,
183+
generationConfig: generationConfig?.bidiGenerationConfig,
184184
systemInstruction: systemInstruction,
185185
tools: tools,
186186
toolConfig: toolConfig,
187-
outputAudioTranscription: AudioTranscriptionConfig()
187+
inputAudioTranscription: generationConfig?.inputAudioTranscription,
188+
outputAudioTranscription: generationConfig?.outputAudioTranscription
188189
)
189190
let data = try jsonEncoder.encode(BidiGenerateContentClientMessage.setup(setup))
190191
try await webSocket.send(.data(data))
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
// Copyright 2025 Google LLC
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
/// Configuration options for audio transcriptions when communicating with a live model.
16+
///
17+
/// While there are currently no options, this will likely change in the future. For now, just providing
18+
/// an instance of this struct will enable audio transcriptions for the corresponding input or output fields.
19+
@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
20+
public struct AudioTranscriptionConfig: Sendable {
21+
let audioTranscriptionConfig: BidiAudioTranscriptionConfig
22+
23+
init(_ audioTranscriptionConfig: BidiAudioTranscriptionConfig) {
24+
self.audioTranscriptionConfig = audioTranscriptionConfig
25+
}
26+
27+
public init() {
28+
self.init(BidiAudioTranscriptionConfig())
29+
}
30+
}

FirebaseAI/Sources/Types/Public/Live/LiveGenerationConfig.swift

Lines changed: 41 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -17,32 +17,9 @@ import Foundation
1717
/// Configuration options for live content generation.
1818
@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
1919
public struct LiveGenerationConfig: Sendable {
20-
/// Controls the degree of randomness in token selection.
21-
let temperature: Float?
22-
23-
/// Controls diversity of generated text.
24-
let topP: Float?
25-
26-
/// Limits the number of highest probability words considered.
27-
let topK: Int?
28-
29-
/// The number of response variations to return.
30-
let candidateCount: Int?
31-
32-
/// Maximum number of tokens that can be generated in the response.
33-
let maxOutputTokens: Int?
34-
35-
/// Controls the likelihood of repeating the same words or phrases already generated in the text.
36-
let presencePenalty: Float?
37-
38-
/// Controls the likelihood of repeating words, with the penalty increasing for each repetition.
39-
let frequencyPenalty: Float?
40-
41-
/// Supported modalities of the response.
42-
let responseModalities: [ResponseModality]?
43-
44-
/// Controls the voice of the model during conversation.
45-
let speechConfig: SpeechConfig?
20+
let bidiGenerationConfig: BidiGenerationConfig
21+
let inputAudioTranscription: BidiAudioTranscriptionConfig?
22+
let outputAudioTranscription: BidiAudioTranscriptionConfig?
4623

4724
/// Creates a new ``LiveGenerationConfig`` value.
4825
///
@@ -125,38 +102,49 @@ public struct LiveGenerationConfig: Sendable {
125102
/// > backwards-incompatible ways.
126103
/// - speechConfig: Controls the voice of the model, when streaming `audio` via
127104
/// ``ResponseModality``.
105+
/// - inputAudioTranscription: Configures (and enables) input transcriptions when streaming to the model.
106+
///
107+
/// Input transcripts are the model's interpretation of audio data sent to it, and they are populated in model responses via ``LiveServerContent``.
108+
/// When this field is set to `nil`, input transcripts are not populated in model responses.
109+
/// - outputAudioTranscription: Configures (and enables) output transcriptions when streaming to the model.
110+
///
111+
/// Output transcripts are text representations of the audio the model is sending to the client, and they are populated in model responses via ``LiveServerContent``.
112+
/// When this field is set to `nil`, output transcripts are not populated in model responses.
113+
///
114+
/// > Important: Transcripts are independent of the model turn. This means transcripts may come earlier or later than when
115+
/// > the model sends the corresponding audio responses.
128116
public init(temperature: Float? = nil, topP: Float? = nil, topK: Int? = nil,
129117
candidateCount: Int? = nil, maxOutputTokens: Int? = nil,
130118
presencePenalty: Float? = nil, frequencyPenalty: Float? = nil,
131119
responseModalities: [ResponseModality]? = nil,
132-
speechConfig: LiveSpeechConfig? = nil) {
133-
// Explicit init because otherwise if we re-arrange the above variables it changes the API
134-
// surface.
135-
self.temperature = temperature
136-
self.topP = topP
137-
self.topK = topK
138-
self.candidateCount = candidateCount
139-
self.maxOutputTokens = maxOutputTokens
140-
self.presencePenalty = presencePenalty
141-
self.frequencyPenalty = frequencyPenalty
142-
self.responseModalities = responseModalities
143-
self.speechConfig = speechConfig?.speechConfig
120+
speechConfig: SpeechConfig? = nil,
121+
inputAudioTranscription: AudioTranscriptionConfig? = nil,
122+
outputAudioTranscription: AudioTranscriptionConfig? = nil
123+
) {
124+
self.init(
125+
BidiGenerationConfig(
126+
temperature: temperature,
127+
topP: topP,
128+
topK: topK,
129+
candidateCount: candidateCount,
130+
maxOutputTokens: maxOutputTokens,
131+
presencePenalty: presencePenalty,
132+
frequencyPenalty: frequencyPenalty,
133+
responseModalities: responseModalities,
134+
speechConfig: speechConfig?.speechConfig
135+
),
136+
inputAudioTranscription: inputAudioTranscription?.audioTranscriptionConfig,
137+
outputAudioTranscription: outputAudioTranscription?.audioTranscriptionConfig
138+
)
144139
}
145-
}
146-
147-
// MARK: - Codable Conformances
148140

149-
@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
150-
extension LiveGenerationConfig: Encodable {
151-
enum CodingKeys: String, CodingKey {
152-
case temperature
153-
case topP
154-
case topK
155-
case candidateCount
156-
case maxOutputTokens
157-
case presencePenalty
158-
case frequencyPenalty
159-
case responseModalities
160-
case speechConfig
141+
init(
142+
_ bidiGenerationConfig: BidiGenerationConfig,
143+
inputAudioTranscription: BidiAudioTranscriptionConfig? = nil,
144+
outputAudioTranscription: BidiAudioTranscriptionConfig? = nil
145+
) {
146+
self.bidiGenerationConfig = bidiGenerationConfig
147+
self.inputAudioTranscription = inputAudioTranscription
148+
self.outputAudioTranscription = outputAudioTranscription
161149
}
162150
}

FirebaseAI/Sources/Types/Public/Live/LiveServerContent.swift

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -60,13 +60,21 @@ public struct LiveServerContent: Sendable {
6060
/// Metadata specifying the sources used to ground generated content.
6161
public var groundingMetadata: GroundingMetadata? { serverContent.groundingMetadata }
6262

63-
// TODO: remove
64-
public var transcript: LiveTranscript? {
65-
if let transcript = serverContent.outputTranscription {
66-
LiveTranscript(transcript)
67-
} else {
68-
nil
69-
}
63+
/// The model's interpretation of what the client said in an audio message.
64+
///
65+
/// This field is only populated when an ``AudioTranscriptionConfig`` is provided to ``LiveGenerationConfig``.
66+
public var inputTranscription: LiveTranscription? {
67+
serverContent.inputTranscription.map { LiveTranscription($0) }
68+
}
69+
70+
/// Transcription matching the model's audio response.
71+
///
72+
/// This field is only populated when an ``AudioTranscriptionConfig`` is provided to ``LiveGenerationConfig``.
73+
///
74+
/// > Important: Transcripts are independent of the model turn. This means transcripts may come earlier or later than when
75+
/// > the model sends the corresponding audio responses.
76+
public var outputTranscription: LiveTranscription? {
77+
serverContent.outputTranscription.map { LiveTranscription($0) }
7078
}
7179

7280
init(_ serverContent: BidiGenerateContentServerContent) {

FirebaseAI/Sources/Types/Public/Live/LiveTranscript.swift renamed to FirebaseAI/Sources/Types/Public/Live/LiveTranscription.swift

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,12 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15-
// TODO: remove
15+
/// Text transcription of some audio form during a live interaction with the model.
1616
@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
17-
public struct LiveTranscript: Sendable {
17+
public struct LiveTranscription: Sendable {
1818
let transcript: BidiGenerateContentTranscription
19+
/// Text representing the model's interpretation of what the audio said.
1920
public var text: String? { transcript.text }
20-
public var finished: Bool? { transcript.finished }
2121

2222
init(_ transcript: BidiGenerateContentTranscription) {
2323
self.transcript = transcript

0 commit comments

Comments
 (0)