Skip to content

Commit c4339fa

Browse files
committed
Add official support for transcripts
1 parent 12f9f27 commit c4339fa

11 files changed

+158
-78
lines changed

FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentServerContent.swift

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,5 +51,7 @@ struct BidiGenerateContentServerContent: Decodable, Sendable {
5151
/// Metadata specifies sources used to ground generated content.
5252
let groundingMetadata: GroundingMetadata?
5353

54+
let inputTranscription: BidiGenerateContentTranscription?
55+
5456
let outputTranscription: BidiGenerateContentTranscription?
5557
}

FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentSetup.swift

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ struct BidiGenerateContentSetup: Encodable {
2929
let model: String
3030

3131
/// Generation config.
32-
let generationConfig: LiveGenerationConfig?
32+
let generationConfig: BidiGenerationConfig?
3333

3434
/// The user provided system instructions for the model.
3535
/// Note: only text should be used in parts and content in each part will be
@@ -48,18 +48,24 @@ struct BidiGenerateContentSetup: Encodable {
4848
/// Configures the handling of realtime input.
4949
let realtimeInputConfig: RealtimeInputConfig?
5050

51-
let inputAudioTranscription: AudioTranscriptionConfig?
51+
/// Input transcription. The transcription is independent of the model turn,
52+
/// which means it doesn't imply any ordering between transcription and model
53+
/// turn.
54+
let inputAudioTranscription: BidiAudioTranscriptionConfig?
5255

53-
let outputAudioTranscription: AudioTranscriptionConfig?
56+
/// Output transcription. The transcription is independent of the model turn,
57+
/// which means it doesn't imply any ordering between transcription and model
58+
/// turn.
59+
let outputAudioTranscription: BidiAudioTranscriptionConfig?
5460

5561
init(model: String,
56-
generationConfig: LiveGenerationConfig? = nil,
62+
generationConfig: BidiGenerationConfig? = nil,
5763
systemInstruction: ModelContent? = nil,
5864
tools: [Tool]? = nil,
5965
toolConfig: ToolConfig? = nil,
6066
realtimeInputConfig: RealtimeInputConfig? = nil,
61-
inputAudioTranscription: AudioTranscriptionConfig? = nil,
62-
outputAudioTranscription: AudioTranscriptionConfig? = nil) {
67+
inputAudioTranscription: BidiAudioTranscriptionConfig? = nil,
68+
outputAudioTranscription: BidiAudioTranscriptionConfig? = nil) {
6369
self.model = model
6470
self.generationConfig = generationConfig
6571
self.systemInstruction = systemInstruction
@@ -71,4 +77,4 @@ struct BidiGenerateContentSetup: Encodable {
7177
}
7278
}
7379

74-
struct AudioTranscriptionConfig: Encodable {}
80+
struct BidiAudioTranscriptionConfig: Encodable {}

FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentTranscription.swift

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,5 +15,4 @@
1515
@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
1616
struct BidiGenerateContentTranscription: Decodable, Sendable {
1717
let text: String?
18-
let finished: Bool?
1918
}
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
// Copyright 2025 Google LLC
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
import Foundation
16+
17+
/// Configuration options for live content generation.
18+
@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
19+
internal struct BidiGenerationConfig: Encodable, Sendable {
20+
let temperature: Float?
21+
let topP: Float?
22+
let topK: Int?
23+
let candidateCount: Int?
24+
let maxOutputTokens: Int?
25+
let presencePenalty: Float?
26+
let frequencyPenalty: Float?
27+
let responseModalities: [ResponseModality]?
28+
let speechConfig: BidiSpeechConfig?
29+
30+
init(temperature: Float? = nil, topP: Float? = nil, topK: Int? = nil,
31+
candidateCount: Int? = nil, maxOutputTokens: Int? = nil,
32+
presencePenalty: Float? = nil, frequencyPenalty: Float? = nil,
33+
responseModalities: [ResponseModality]? = nil,
34+
speechConfig: BidiSpeechConfig? = nil
35+
) {
36+
self.temperature = temperature
37+
self.topP = topP
38+
self.topK = topK
39+
self.candidateCount = candidateCount
40+
self.maxOutputTokens = maxOutputTokens
41+
self.presencePenalty = presencePenalty
42+
self.frequencyPenalty = frequencyPenalty
43+
self.responseModalities = responseModalities
44+
self.speechConfig = speechConfig
45+
}
46+
}

FirebaseAI/Sources/Types/Internal/Live/SpeechConfig.swift renamed to FirebaseAI/Sources/Types/Internal/Live/BidiSpeechConfig.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import Foundation
1616

1717
/// Speech generation config.
18-
struct SpeechConfig: Encodable, Sendable {
18+
struct BidiSpeechConfig: Encodable, Sendable {
1919
/// The configuration for the speaker to use.
2020
let voiceConfig: VoiceConfig
2121

FirebaseAI/Sources/Types/Internal/Live/LiveSessionService.swift

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -180,11 +180,12 @@ actor LiveSessionService {
180180
do {
181181
let setup = BidiGenerateContentSetup(
182182
model: modelResourceName,
183-
generationConfig: generationConfig,
183+
generationConfig: generationConfig?.bidiGenerationConfig,
184184
systemInstruction: systemInstruction,
185185
tools: tools,
186186
toolConfig: toolConfig,
187-
outputAudioTranscription: AudioTranscriptionConfig()
187+
inputAudioTranscription: generationConfig?.inputAudioTranscription,
188+
outputAudioTranscription: generationConfig?.outputAudioTranscription
188189
)
189190
let data = try jsonEncoder.encode(BidiGenerateContentClientMessage.setup(setup))
190191
try await webSocket.send(.data(data))
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
// Copyright 2025 Google LLC
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
/// Configuration options for audio transcriptions when communicating with a live model.
16+
///
17+
/// While there are currently no options, this will likely change in the future. For now, just providing
18+
/// an instance of this struct will enable audio transcriptions for the corresponding input or output fields.
19+
@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
20+
public struct AudioTranscriptionConfig: Sendable {
21+
let audioTranscriptionConfig: BidiAudioTranscriptionConfig
22+
23+
init(_ audioTranscriptionConfig: BidiAudioTranscriptionConfig) {
24+
self.audioTranscriptionConfig = audioTranscriptionConfig
25+
}
26+
27+
public init() {
28+
self.init(BidiAudioTranscriptionConfig())
29+
}
30+
}

FirebaseAI/Sources/Types/Public/Live/LiveGenerationConfig.swift

Lines changed: 41 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -17,32 +17,9 @@ import Foundation
1717
/// Configuration options for live content generation.
1818
@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
1919
public struct LiveGenerationConfig: Sendable {
20-
/// Controls the degree of randomness in token selection.
21-
let temperature: Float?
22-
23-
/// Controls diversity of generated text.
24-
let topP: Float?
25-
26-
/// Limits the number of highest probability words considered.
27-
let topK: Int?
28-
29-
/// The number of response variations to return.
30-
let candidateCount: Int?
31-
32-
/// Maximum number of tokens that can be generated in the response.
33-
let maxOutputTokens: Int?
34-
35-
/// Controls the likelihood of repeating the same words or phrases already generated in the text.
36-
let presencePenalty: Float?
37-
38-
/// Controls the likelihood of repeating words, with the penalty increasing for each repetition.
39-
let frequencyPenalty: Float?
40-
41-
/// Supported modalities of the response.
42-
let responseModalities: [ResponseModality]?
43-
44-
/// Controls the voice of the model during conversation.
45-
let speechConfig: SpeechConfig?
20+
let bidiGenerationConfig: BidiGenerationConfig
21+
let inputAudioTranscription: BidiAudioTranscriptionConfig?
22+
let outputAudioTranscription: BidiAudioTranscriptionConfig?
4623

4724
/// Creates a new ``LiveGenerationConfig`` value.
4825
///
@@ -125,38 +102,49 @@ public struct LiveGenerationConfig: Sendable {
125102
/// > backwards-incompatible ways.
126103
/// - speechConfig: Controls the voice of the model, when streaming `audio` via
127104
/// ``ResponseModality``.
105+
/// - inputAudioTranscription: Configures (and enables) input transcriptions when streaming to the model.
106+
///
107+
/// Input transcripts are the model's interpretation of audio data sent to it, and they are populated in model responses via ``LiveServerContent``.
108+
/// When this field is set to `nil`, input transcripts are not populated in model responses.
109+
/// - outputAudioTranscription: Configures (and enables) output transcriptions when streaming to the model.
110+
///
111+
/// Output transcripts are text representations of the audio the model is sending to the client, and they are populated in model responses via ``LiveServerContent``.
112+
/// When this field is set to `nil`, output transcripts are not populated in model responses.
113+
///
114+
/// > Important: Transcripts are independent of the model turn. This means transcripts may come earlier or later than when
115+
/// > the model sends the corresponding audio responses.
128116
public init(temperature: Float? = nil, topP: Float? = nil, topK: Int? = nil,
129117
candidateCount: Int? = nil, maxOutputTokens: Int? = nil,
130118
presencePenalty: Float? = nil, frequencyPenalty: Float? = nil,
131119
responseModalities: [ResponseModality]? = nil,
132-
speechConfig: LiveSpeechConfig? = nil) {
133-
// Explicit init because otherwise if we re-arrange the above variables it changes the API
134-
// surface.
135-
self.temperature = temperature
136-
self.topP = topP
137-
self.topK = topK
138-
self.candidateCount = candidateCount
139-
self.maxOutputTokens = maxOutputTokens
140-
self.presencePenalty = presencePenalty
141-
self.frequencyPenalty = frequencyPenalty
142-
self.responseModalities = responseModalities
143-
self.speechConfig = speechConfig?.speechConfig
120+
speechConfig: SpeechConfig? = nil,
121+
inputAudioTranscription: AudioTranscriptionConfig? = nil,
122+
outputAudioTranscription: AudioTranscriptionConfig? = nil
123+
) {
124+
self.init(
125+
BidiGenerationConfig(
126+
temperature: temperature,
127+
topP: topP,
128+
topK: topK,
129+
candidateCount: candidateCount,
130+
maxOutputTokens: maxOutputTokens,
131+
presencePenalty: presencePenalty,
132+
frequencyPenalty: frequencyPenalty,
133+
responseModalities: responseModalities,
134+
speechConfig: speechConfig?.speechConfig
135+
),
136+
inputAudioTranscription: inputAudioTranscription?.audioTranscriptionConfig,
137+
outputAudioTranscription: outputAudioTranscription?.audioTranscriptionConfig
138+
)
144139
}
145-
}
146-
147-
// MARK: - Codable Conformances
148140

149-
@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
150-
extension LiveGenerationConfig: Encodable {
151-
enum CodingKeys: String, CodingKey {
152-
case temperature
153-
case topP
154-
case topK
155-
case candidateCount
156-
case maxOutputTokens
157-
case presencePenalty
158-
case frequencyPenalty
159-
case responseModalities
160-
case speechConfig
141+
init(
142+
_ bidiGenerationConfig: BidiGenerationConfig,
143+
inputAudioTranscription: BidiAudioTranscriptionConfig? = nil,
144+
outputAudioTranscription: BidiAudioTranscriptionConfig? = nil
145+
) {
146+
self.bidiGenerationConfig = bidiGenerationConfig
147+
self.inputAudioTranscription = inputAudioTranscription
148+
self.outputAudioTranscription = outputAudioTranscription
161149
}
162150
}

FirebaseAI/Sources/Types/Public/Live/LiveServerContent.swift

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -60,13 +60,21 @@ public struct LiveServerContent: Sendable {
6060
/// Metadata specifying the sources used to ground generated content.
6161
public var groundingMetadata: GroundingMetadata? { serverContent.groundingMetadata }
6262

63-
// TODO: remove
64-
public var transcript: LiveTranscript? {
65-
if let transcript = serverContent.outputTranscription {
66-
LiveTranscript(transcript)
67-
} else {
68-
nil
69-
}
63+
/// The model's interpretation of what the client said in an audio message.
64+
///
65+
/// This field is only populated when an ``AudioTranscriptionConfig`` is provided to ``LiveGenerationConfig``.
66+
public var inputTranscription: LiveTranscription? {
67+
serverContent.inputTranscription.map { LiveTranscription($0) }
68+
}
69+
70+
/// Transcription matching the model's audio response.
71+
///
72+
/// This field is only populated when an ``AudioTranscriptionConfig`` is provided to ``LiveGenerationConfig``.
73+
///
74+
/// > Important: Transcripts are independent of the model turn. This means transcripts may come earlier or later than when
75+
/// > the model sends the corresponding audio responses.
76+
public var outputTranscription: LiveTranscription? {
77+
serverContent.outputTranscription.map { LiveTranscription($0) }
7078
}
7179

7280
init(_ serverContent: BidiGenerateContentServerContent) {

FirebaseAI/Sources/Types/Public/Live/LiveTranscript.swift renamed to FirebaseAI/Sources/Types/Public/Live/LiveTranscription.swift

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,12 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15-
// TODO: remove
15+
/// Text transcription of some audio form during a live interaction with the model.
1616
@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
17-
public struct LiveTranscript: Sendable {
17+
public struct LiveTranscription: Sendable {
1818
let transcript: BidiGenerateContentTranscription
19+
/// Text representing the model's interpretation of what the audio said.
1920
public var text: String? { transcript.text }
20-
public var finished: Bool? { transcript.finished }
2121

2222
init(_ transcript: BidiGenerateContentTranscription) {
2323
self.transcript = transcript

0 commit comments

Comments
 (0)