Skip to content

Commit 864824a

Browse files
author
David Motsonashvili
committed
Add LiveAPI Transcription
1 parent 0e81807 commit 864824a

File tree

6 files changed

+140
-14
lines changed

6 files changed

+140
-14
lines changed

firebase-ai/src/main/kotlin/com/google/firebase/ai/LiveGenerativeModel.kt

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,9 @@ internal constructor(
111111
modelName,
112112
config?.toInternal(),
113113
tools?.map { it.toInternal() },
114-
systemInstruction?.toInternal()
114+
systemInstruction?.toInternal(),
115+
config?.inputAudioTranscription?.toInternal(),
116+
config?.outputAudioTranscription?.toInternal()
115117
)
116118
.toInternal()
117119
val data: String = Json.encodeToString(clientMessage)
@@ -135,7 +137,7 @@ internal constructor(
135137
} catch (e: ClosedReceiveChannelException) {
136138
val reason = webSession?.closeReason?.await()
137139
val message =
138-
"Channel was closed by the server.${if(reason!=null) " Details: ${reason.message}" else "" }"
140+
"Channel was closed by the server.${if (reason != null) " Details: ${reason.message}" else ""}"
139141
throw ServiceConnectionHandshakeFailedException(message, e)
140142
}
141143
}
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
/*
2+
* Copyright 2025 Google LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package com.google.firebase.ai.type
18+
19+
import kotlinx.serialization.SerialName
20+
import kotlinx.serialization.Serializable
21+
import kotlinx.serialization.json.JsonObject
22+
23+
/**
24+
* Configuration for audio transcription, set through [LiveGenerationConfig.Builder].
25+
* @property enable If true, the server will use Gemini to transcribe the audio.
26+
* @property prefixPrompt Prefix prompt for the audio transcription operation. This is useful to override
27+
* the default prefix prompt that only asks the model to transcribe the audio. Overriding can be
28+
* useful to provide additional context to the model, such as what language is expected to be spoken
29+
* in the audio.
30+
*/
31+
public class AudioTranscriptionConfig(
32+
internal val enable: Boolean? = null,
33+
internal val prefixPrompt: String? = null
34+
) {
35+
36+
internal fun toInternal() = JsonObject(emptyMap()) // NOTE(review): always an empty JSON object; enable/prefixPrompt are not included — confirm this is intended.
37+
38+
}

firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveClientSetupMessage.kt

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package com.google.firebase.ai.type
1818

1919
import kotlinx.serialization.ExperimentalSerializationApi
2020
import kotlinx.serialization.Serializable
21+
import kotlinx.serialization.json.JsonObject
2122

2223
/**
2324
* First message in a live session.
@@ -32,7 +33,9 @@ internal class LiveClientSetupMessage(
3233
// needs its own config class
3334
val generationConfig: LiveGenerationConfig.Internal?,
3435
val tools: List<Tool.Internal>?,
35-
val systemInstruction: Content.Internal?
36+
val systemInstruction: Content.Internal?,
37+
val inputAudioTranscription: JsonObject?,
38+
val outputAudioTranscription: JsonObject?,
3639
) {
3740
@Serializable
3841
internal class Internal(val setup: LiveClientSetup) {
@@ -41,10 +44,21 @@ internal class LiveClientSetupMessage(
4144
val model: String,
4245
val generationConfig: LiveGenerationConfig.Internal?,
4346
val tools: List<Tool.Internal>?,
44-
val systemInstruction: Content.Internal?
47+
val systemInstruction: Content.Internal?,
48+
val inputAudioTranscription: JsonObject?,
49+
val outputAudioTranscription: JsonObject?,
4550
)
4651
}
4752

4853
fun toInternal() =
49-
Internal(Internal.LiveClientSetup(model, generationConfig, tools, systemInstruction))
54+
Internal(
55+
Internal.LiveClientSetup(
56+
model,
57+
generationConfig,
58+
tools,
59+
systemInstruction,
60+
inputAudioTranscription,
61+
outputAudioTranscription
62+
)
63+
)
5064
}

firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveGenerationConfig.kt

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,11 @@ import kotlinx.serialization.Serializable
5353
*
5454
* @property speechConfig Specifies the voice configuration of the audio response from the server.
5555
*
56+
* @property inputAudioTranscription Specifies the configuration for transcribing input audio.
57+
*
58+
* @property outputAudioTranscription Specifies the configuration for transcribing output audio from
59+
* the model.
60+
*
5661
* Refer to the
5762
* [Control generated output](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal/control-generated-output)
5863
* guide for more details.
@@ -67,7 +72,9 @@ private constructor(
6772
internal val presencePenalty: Float?,
6873
internal val frequencyPenalty: Float?,
6974
internal val responseModality: ResponseModality?,
70-
internal val speechConfig: SpeechConfig?
75+
internal val speechConfig: SpeechConfig?,
76+
internal val inputAudioTranscription: AudioTranscriptionConfig?,
77+
internal val outputAudioTranscription: AudioTranscriptionConfig?,
7178
) {
7279

7380
/**
@@ -91,6 +98,10 @@ private constructor(
9198
* @property responseModality See [LiveGenerationConfig.responseModality]
9299
*
93100
* @property speechConfig See [LiveGenerationConfig.speechConfig]
101+
*
102+
* @property inputAudioTranscription See [LiveGenerationConfig.inputAudioTranscription]
103+
*
104+
* @property outputAudioTranscription See [LiveGenerationConfig.outputAudioTranscription]
94105
*/
95106
public class Builder {
96107
@JvmField public var temperature: Float? = null
@@ -101,6 +112,8 @@ private constructor(
101112
@JvmField public var frequencyPenalty: Float? = null
102113
@JvmField public var responseModality: ResponseModality? = null
103114
@JvmField public var speechConfig: SpeechConfig? = null
115+
@JvmField public var inputAudioTranscription: AudioTranscriptionConfig? = null
116+
@JvmField public var outputAudioTranscription: AudioTranscriptionConfig? = null
104117

105118
public fun setTemperature(temperature: Float?): Builder = apply {
106119
this.temperature = temperature
@@ -123,6 +136,14 @@ private constructor(
123136
this.speechConfig = speechConfig
124137
}
125138

139+
// NOTE(review): this setter is named "...Transcript" while the property it writes is
// "inputAudioTranscription" — confirm the intended public name.
/** Sets [inputAudioTranscription] and returns this [Builder] so calls can be chained. */
public fun setInputAudioTranscript(config: AudioTranscriptionConfig?): Builder = apply {
140+
this.inputAudioTranscription = config
141+
}
142+
143+
// NOTE(review): this setter is named "...Transcript" while the property it writes is
// "outputAudioTranscription" — confirm the intended public name.
/** Sets [outputAudioTranscription] and returns this [Builder] so calls can be chained. */
public fun setOutputAudioTranscript(config: AudioTranscriptionConfig?): Builder = apply {
144+
this.outputAudioTranscription = config
145+
}
146+
126147
/** Create a new [LiveGenerationConfig] with the attached arguments. */
127148
public fun build(): LiveGenerationConfig =
128149
LiveGenerationConfig(
@@ -133,7 +154,9 @@ private constructor(
133154
presencePenalty = presencePenalty,
134155
frequencyPenalty = frequencyPenalty,
135156
speechConfig = speechConfig,
136-
responseModality = responseModality
157+
responseModality = responseModality,
158+
inputAudioTranscription = inputAudioTranscription,
159+
outputAudioTranscription = outputAudioTranscription,
137160
)
138161
}
139162

firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveServerMessage.kt

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -82,24 +82,40 @@ public class LiveServerContent(
8282
* [interrupted] -> [turnComplete].
8383
*/
8484
public val generationComplete: Boolean,
85+
86+
/**
87+
* The input transcription. The transcription is independent of the model turn, which means it
88+
* doesn't imply any ordering between transcription and model turn.
89+
*/
90+
public val inputTranscription: Transcription?,
91+
92+
/**
93+
* The output transcription. The transcription is independent of the model turn, which means it
94+
* doesn't imply any ordering between transcription and model turn.
95+
*/
96+
public val outputTranscription: Transcription?
8597
) : LiveServerMessage {
8698
@OptIn(ExperimentalSerializationApi::class)
8799
@Serializable
88100
internal data class Internal(
89-
val modelTurn: Content.Internal? = null,
90-
val interrupted: Boolean = false,
91-
val turnComplete: Boolean = false,
92-
val generationComplete: Boolean = false
101+
val modelTurn: Content.Internal?,
102+
val interrupted: Boolean?,
103+
val turnComplete: Boolean?,
104+
val generationComplete: Boolean?,
105+
val inputTranscription: Transcription.Internal?,
106+
val outputTranscription: Transcription.Internal?
93107
)
94108
@Serializable
95109
internal data class InternalWrapper(val serverContent: Internal) : InternalLiveServerMessage {
96110
@OptIn(ExperimentalSerializationApi::class)
97111
override fun toPublic() =
98112
LiveServerContent(
99113
serverContent.modelTurn?.toPublic(),
100-
serverContent.interrupted,
101-
serverContent.turnComplete,
102-
serverContent.generationComplete
114+
serverContent.interrupted ?: false,
115+
serverContent.turnComplete?: false,
116+
serverContent.generationComplete?: false,
117+
serverContent.inputTranscription?.toPublic(),
118+
serverContent.outputTranscription?.toPublic()
103119
)
104120
}
105121
}
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
/*
2+
* Copyright 2025 Google LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package com.google.firebase.ai.type
18+
19+
import kotlinx.serialization.Serializable
20+
21+
/**
22+
* Audio transcription message, exposed on [LiveServerContent] for input and output audio.
23+
* @property text The transcribed text; may be `null`.
24+
*/
25+
public class Transcription(public val text: String?) {
26+
27+
@Serializable
28+
internal data class Internal(val text: String?) { // Serializable counterpart of [Transcription].
29+
fun toPublic(): Transcription { // Copies [text] unchanged into the public type.
30+
return Transcription(text)
31+
}
32+
}
33+
}

0 commit comments

Comments
 (0)