Skip to content

Commit b5b0107

Browse files
committed
Add support for SpeechConfig
1 parent a9eb3d8 commit b5b0107

File tree

8 files changed

+160
-12
lines changed

8 files changed

+160
-12
lines changed

FirebaseAI/Sources/AILog.swift

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,6 @@ enum AILog {
7272
case liveSessionFailedToSendClientMessage = 3021
7373
case liveSessionUnexpectedResponse = 3022
7474

75-
7675
// SDK State Errors
7776
case generateContentResponseNoCandidates = 4000
7877
case generateContentResponseNoText = 4001
@@ -81,7 +80,6 @@ enum AILog {
8180
case invalidWebsocketURL = 4004
8281
case duplicateLiveSessionSetupComplete = 4005
8382

84-
8583
// SDK Debugging
8684
case loadRequestStreamResponseLine = 5000
8785
}

FirebaseAI/Sources/Types/Internal/AppCheck.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import FirebaseAppCheckInterop
1616

1717
// TODO: document
18-
internal extension AppCheckInterop {
18+
extension AppCheckInterop {
1919
// TODO: Document
2020
func fetchAppCheckToken(limitedUse: Bool,
2121
domain: String) async throws -> FIRAppCheckTokenResultInterop {

FirebaseAI/Sources/Types/Internal/Live/LiveSessionService.swift

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,8 @@ actor LiveSessionService {
100100

101101
/// Start a new connection to the backend.
102102
///
103-
/// Seperated into its own function to make it easier to surface a way to call it seperately when resuming the same session.
103+
/// Separated into its own function to make it easier to surface a way to call it separately when
104+
/// resuming the same session.
104105
public func connect() {
105106
setupTask.cancel()
106107
setupTask = Task { [weak self] in
@@ -197,8 +198,8 @@ actor LiveSessionService {
197198
} else if let liveMessage = LiveServerMessage.tryFrom(response) {
198199
responseContinuation.yield(liveMessage)
199200
} else {
200-
// we don't raise an error, since this allows us to add support internally but not publicly
201-
// we still log it in debug though, in case it's not expected
201+
// we don't raise an error, since this allows us to add support internally but not
202+
// publicly. We still log it in debug though, in case it's not expected.
202203
AILog.debug(
203204
code: .liveSessionUnsupportedMessage,
204205
"The server sent a message that we don't currently have a mapping for: \(response)"
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
// Copyright 2025 Google LLC
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
import Foundation
16+
17+
/// Settings that govern how the model generates speech.
struct SpeechConfig: Encodable, Sendable {
  /// Which speaker (voice) configuration to use.
  let voiceConfig: VoiceConfig

  /// Optional language code (ISO 639, e.g. en-US) for the speech synthesis.
  let languageCode: String?

  /// Stores both values unchanged.
  ///
  /// - Parameters:
  ///   - voiceConfig: The speaker configuration to use.
  ///   - languageCode: The language code for the synthesized speech, or `nil` to leave it unset.
  init(voiceConfig config: VoiceConfig, languageCode code: String?) {
    voiceConfig = config
    languageCode = code
  }
}
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
// Copyright 2025 Google LLC
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
import Foundation
16+
17+
/// Configuration for the speaker to use.
///
/// Each value selects exactly one kind of voice. `Sendable` is declared explicitly, matching the
/// payload types, because `SpeechConfig` is `Sendable` and stores a value of this type.
enum VoiceConfig: Sendable {
  /// Configuration for the prebuilt voice to use.
  case prebuiltVoiceConfig(PrebuiltVoiceConfig)

  /// Configuration for the custom voice to use.
  case customVoiceConfig(CustomVoiceConfig)
}
25+
26+
/// Settings for a prebuilt (preset) speaker voice.
///
/// Modeled as its own type rather than a bare string on the parent proto,
/// because more options are likely to be added here.
struct PrebuiltVoiceConfig: Encodable, Sendable {
  /// Name of the preset voice to use.
  let voiceName: String

  /// - Parameter voiceName: The name of the preset voice; stored unchanged.
  init(voiceName name: String) {
    voiceName = name
  }
}
38+
39+
/// Settings for a custom speaker voice.
struct CustomVoiceConfig: Encodable, Sendable {
  /// The sample of the custom voice, in pcm16 s16e format.
  let customVoiceSample: Data

  /// - Parameter customVoiceSample: The raw voice sample bytes; stored unchanged.
  init(customVoiceSample sample: Data) {
    customVoiceSample = sample
  }
}
48+
49+
// MARK: - Encodable conformance
50+
51+
// Encodes a `VoiceConfig` as a keyed container holding a single entry: the key names the active
// case and the value is that case's payload.
extension VoiceConfig: Encodable {
  /// One coding key per enum case; the key name matches the case name.
  enum CodingKeys: CodingKey {
    case prebuiltVoiceConfig
    case customVoiceConfig
  }

  /// Writes the payload of the active case under the key named after that case.
  ///
  /// - Parameter encoder: The encoder to write to.
  /// - Throws: Any error thrown while encoding the payload.
  func encode(to encoder: any Encoder) throws {
    var container = encoder.container(keyedBy: CodingKeys.self)
    switch self {
    case let .prebuiltVoiceConfig(prebuiltVoice):
      try container.encode(prebuiltVoice, forKey: .prebuiltVoiceConfig)
    case let .customVoiceConfig(customVoice):
      try container.encode(customVoice, forKey: .customVoiceConfig)
    }
  }
}

FirebaseAI/Sources/Types/Public/Live/LiveGenerationConfig.swift

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414

1515
import Foundation
1616

17-
// TODO: add support for SpeechConfig
1817
/// A struct defining model parameters to be used when sending generative AI
1918
/// requests to the backend model.
2019
@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
@@ -43,6 +42,9 @@ public struct LiveGenerationConfig: Sendable {
4342
/// Supported modalities of the response.
4443
let responseModalities: [ResponseModality]?
4544

45+
/// Controls the voice of the model during conversation.
46+
let speechConfig: SpeechConfig?
47+
4648
/// Creates a new `LiveGenerationConfig` value.
4749
///
4850
/// See the
@@ -122,10 +124,13 @@ public struct LiveGenerationConfig: Sendable {
122124
/// > Warning: Specifying response modalities is a **Public Preview** feature, which means
123125
/// > that it is not subject to any SLA or deprecation policy and could change in
124126
/// > backwards-incompatible ways.
127+
/// - speechConfig: Controls the voice of the model, when streaming `audio` via
128+
/// ``ResponseModality``.
125129
public init(temperature: Float? = nil, topP: Float? = nil, topK: Int? = nil,
126130
candidateCount: Int? = nil, maxOutputTokens: Int? = nil,
127131
presencePenalty: Float? = nil, frequencyPenalty: Float? = nil,
128-
responseModalities: [ResponseModality]? = nil) {
132+
responseModalities: [ResponseModality]? = nil,
133+
speechConfig: LiveSpeechConfig? = nil) {
129134
// Explicit init because otherwise if we re-arrange the above variables it changes the API
130135
// surface.
131136
self.temperature = temperature
@@ -136,6 +141,7 @@ public struct LiveGenerationConfig: Sendable {
136141
self.presencePenalty = presencePenalty
137142
self.frequencyPenalty = frequencyPenalty
138143
self.responseModalities = responseModalities
144+
self.speechConfig = speechConfig?.speechConfig
139145
}
140146
}
141147

@@ -152,5 +158,6 @@ extension LiveGenerationConfig: Encodable {
152158
case presencePenalty
153159
case frequencyPenalty
154160
case responseModalities
161+
case speechConfig
155162
}
156163
}

FirebaseAI/Sources/Types/Public/Live/LiveServerMessage.swift

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@ public struct LiveServerMessage: Sendable {
2525
/// Request for the client to execute the provided functions.
2626
case toolCall(LiveServerToolCall)
2727

28-
/// Notification for the client that a previously issued ``LiveServerToolCall`` should be cancelled.
28+
/// Notification for the client that a previously issued ``LiveServerToolCall`` should be
29+
/// cancelled.
2930
case toolCallCancellation(LiveServerToolCallCancellation)
3031

3132
/// Server will disconnect soon.
@@ -35,14 +36,14 @@ public struct LiveServerMessage: Sendable {
3536
/// The actual message sent from the server.
3637
public var messageType: MessageType
3738

38-
///
39+
// TODO: document
3940
public var usageMetadata: GenerateContentResponse.UsageMetadata? { serverMessage.usageMetadata }
4041
}
4142

4243
// MARK: - Internal parsing
4344

4445
@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
45-
internal extension LiveServerMessage {
46+
extension LiveServerMessage {
4647
static func tryFrom(_ serverMessage: BidiGenerateContentServerMessage) -> Self? {
4748
guard let messageType = LiveServerMessage.MessageType.tryFrom(serverMessage.messageType) else {
4849
return nil
@@ -53,7 +54,7 @@ internal extension LiveServerMessage {
5354
}
5455

5556
@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
56-
internal extension LiveServerMessage.MessageType {
57+
extension LiveServerMessage.MessageType {
5758
static func tryFrom(_ serverMessage: BidiGenerateContentServerMessage.MessageType) -> Self? {
5859
return switch serverMessage {
5960
case .setupComplete:
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
// Copyright 2025 Google LLC
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
import Foundation
16+
17+
/// Public configuration for the voice the model speaks with during a conversation.
@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
public struct LiveSpeechConfig: Sendable {
  /// The internal speech configuration wrapped by this value.
  let speechConfig: SpeechConfig

  /// Wraps an already-built internal `SpeechConfig`.
  init(_ speechConfig: SpeechConfig) {
    self.speechConfig = speechConfig
  }

  /// Creates a new `LiveSpeechConfig` value.
  ///
  /// - Parameters:
  ///   - voiceName: The name of the prebuilt voice to be used for the model's speech response.
  ///
  ///     To learn more about the available voices, see the docs on
  ///     [Voice options](https://ai.google.dev/gemini-api/docs/speech-generation#voices).
  ///   - languageCode: ISO-639 language code to use when parsing text sent from the client,
  ///     instead of audio. By default, the model will attempt to detect the input language
  ///     automatically.
  ///
  ///     To learn which codes are supported, see the docs on
  ///     [Supported languages](https://ai.google.dev/gemini-api/docs/speech-generation#languages).
  public init(voiceName: String, languageCode: String? = nil) {
    // Only prebuilt voices can be selected through this initializer.
    let prebuiltVoice = PrebuiltVoiceConfig(voiceName: voiceName)
    let wrapped = SpeechConfig(
      voiceConfig: .prebuiltVoiceConfig(prebuiltVoice),
      languageCode: languageCode
    )
    self.init(wrapped)
  }
}

0 commit comments

Comments
 (0)