Commit 0d99e6e

Merge remote-tracking branch 'origin/main' into QueryHashIncorporatesAuthUid2

2 parents 5b851e7 + 89c3e53

26 files changed: +602, -86 lines

firebase-ai/CHANGELOG.md

Lines changed: 2 additions & 0 deletions
@@ -1,5 +1,7 @@
 # Unreleased
 
+- [changed] Added better scheduling and louder output for Live API.
+- [changed] Added support for input and output transcription. (#7482)
 - [feature] Added support for sending realtime audio and video in a `LiveSession`.
 - [changed] Removed redundant internal exception types. (#7475)
 

firebase-ai/api.txt

Lines changed: 22 additions & 1 deletion
@@ -154,6 +154,9 @@ package com.google.firebase.ai.java {
     method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(boolean enableInterruptions);
     method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler);
     method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler, boolean enableInterruptions);
+    method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler, kotlin.jvm.functions.Function2<? super com.google.firebase.ai.type.Transcription?,? super com.google.firebase.ai.type.Transcription?,kotlin.Unit>? transcriptHandler, boolean enableInterruptions);
+    method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(kotlin.jvm.functions.Function2<? super com.google.firebase.ai.type.Transcription?,? super com.google.firebase.ai.type.Transcription?,kotlin.Unit>? transcriptHandler);
+    method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(kotlin.jvm.functions.Function2<? super com.google.firebase.ai.type.Transcription?,? super com.google.firebase.ai.type.Transcription?,kotlin.Unit>? transcriptHandler, boolean enableInterruptions);
     method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> stopAudioConversation();
     method public abstract void stopReceiving();
     field public static final com.google.firebase.ai.java.LiveSessionFutures.Companion Companion;
@@ -174,6 +177,10 @@ package com.google.firebase.ai.type {
     ctor public AudioRecordInitializationFailedException(String message);
   }
 
+  public final class AudioTranscriptionConfig {
+    ctor public AudioTranscriptionConfig();
+  }
+
   public final class BlockReason {
     method public String getName();
     method public int getOrdinal();
@@ -839,15 +846,19 @@ package com.google.firebase.ai.type {
     ctor public LiveGenerationConfig.Builder();
     method public com.google.firebase.ai.type.LiveGenerationConfig build();
     method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setFrequencyPenalty(Float? frequencyPenalty);
+    method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setInputAudioTranscription(com.google.firebase.ai.type.AudioTranscriptionConfig? config);
     method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setMaxOutputTokens(Integer? maxOutputTokens);
+    method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setOutputAudioTranscription(com.google.firebase.ai.type.AudioTranscriptionConfig? config);
     method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setPresencePenalty(Float? presencePenalty);
     method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setResponseModality(com.google.firebase.ai.type.ResponseModality? responseModality);
     method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setSpeechConfig(com.google.firebase.ai.type.SpeechConfig? speechConfig);
     method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setTemperature(Float? temperature);
     method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setTopK(Integer? topK);
     method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setTopP(Float? topP);
     field public Float? frequencyPenalty;
+    field public com.google.firebase.ai.type.AudioTranscriptionConfig? inputAudioTranscription;
     field public Integer? maxOutputTokens;
+    field public com.google.firebase.ai.type.AudioTranscriptionConfig? outputAudioTranscription;
     field public Float? presencePenalty;
     field public com.google.firebase.ai.type.ResponseModality? responseModality;
     field public com.google.firebase.ai.type.SpeechConfig? speechConfig;
@@ -865,14 +876,18 @@ package com.google.firebase.ai.type {
   }
 
   @com.google.firebase.ai.type.PublicPreviewAPI public final class LiveServerContent implements com.google.firebase.ai.type.LiveServerMessage {
-    ctor public LiveServerContent(com.google.firebase.ai.type.Content? content, boolean interrupted, boolean turnComplete, boolean generationComplete);
+    ctor @Deprecated public LiveServerContent(com.google.firebase.ai.type.Content? content, boolean interrupted, boolean turnComplete, boolean generationComplete, com.google.firebase.ai.type.Transcription? inputTranscription, com.google.firebase.ai.type.Transcription? outputTranscription);
     method public com.google.firebase.ai.type.Content? getContent();
     method public boolean getGenerationComplete();
+    method public com.google.firebase.ai.type.Transcription? getInputTranscription();
     method public boolean getInterrupted();
+    method public com.google.firebase.ai.type.Transcription? getOutputTranscription();
     method public boolean getTurnComplete();
     property public final com.google.firebase.ai.type.Content? content;
     property public final boolean generationComplete;
+    property public final com.google.firebase.ai.type.Transcription? inputTranscription;
     property public final boolean interrupted;
+    property public final com.google.firebase.ai.type.Transcription? outputTranscription;
     property public final boolean turnComplete;
   }
 
@@ -909,6 +924,7 @@ package com.google.firebase.ai.type {
     method public suspend Object? sendVideoRealtime(com.google.firebase.ai.type.InlineData video, kotlin.coroutines.Continuation<? super kotlin.Unit>);
     method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public suspend Object? startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler = null, boolean enableInterruptions = false, kotlin.coroutines.Continuation<? super kotlin.Unit>);
     method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public suspend Object? startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler = null, kotlin.coroutines.Continuation<? super kotlin.Unit>);
+    method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public suspend Object? startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler = null, kotlin.jvm.functions.Function2<? super com.google.firebase.ai.type.Transcription?,? super com.google.firebase.ai.type.Transcription?,kotlin.Unit>? transcriptHandler = null, boolean enableInterruptions = false, kotlin.coroutines.Continuation<? super kotlin.Unit>);
     method public void stopAudioConversation();
     method public void stopReceiving();
   }
@@ -1235,6 +1251,11 @@ package com.google.firebase.ai.type {
     ctor public ToolConfig(com.google.firebase.ai.type.FunctionCallingConfig? functionCallingConfig);
   }
 
+  public final class Transcription {
+    method public String? getText();
+    property public final String? text;
+  }
+
   public final class UnknownException extends com.google.firebase.ai.type.FirebaseAIException {
   }
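The api.txt surface above is the whole opt-in transcription story: an AudioTranscriptionConfig attached to LiveGenerationConfig turns the feature on, and the new startAudioConversation overloads deliver Transcription values as they arrive. A minimal Kotlin sketch of the call pattern; the converse function and the way session is obtained are illustrative, while the types and signatures come from the API listing above:

@file:OptIn(PublicPreviewAPI::class)

import android.Manifest
import androidx.annotation.RequiresPermission
import com.google.firebase.ai.type.AudioTranscriptionConfig
import com.google.firebase.ai.type.LiveGenerationConfig
import com.google.firebase.ai.type.LiveSession
import com.google.firebase.ai.type.PublicPreviewAPI
import com.google.firebase.ai.type.Transcription

// Transcription is enabled by the mere presence of AudioTranscriptionConfig;
// the config class carries no options of its own yet.
val config: LiveGenerationConfig =
  LiveGenerationConfig.Builder()
    .setInputAudioTranscription(AudioTranscriptionConfig())
    .setOutputAudioTranscription(AudioTranscriptionConfig())
    .build()

// `session` is assumed to be a LiveSession already connected with the config above.
@RequiresPermission(Manifest.permission.RECORD_AUDIO)
suspend fun converse(session: LiveSession) {
  session.startAudioConversation(
    transcriptHandler = { input: Transcription?, output: Transcription? ->
      // Per the new overloads, the first argument is the input (user)
      // transcription and the second is the output (model) transcription.
      input?.text?.let { println("user:  $it") }
      output?.text?.let { println("model: $it") }
    }
  )
}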

firebase-ai/src/main/kotlin/com/google/firebase/ai/LiveGenerativeModel.kt

Lines changed: 4 additions & 2 deletions
@@ -111,7 +111,9 @@ internal constructor(
         modelName,
         config?.toInternal(),
         tools?.map { it.toInternal() },
-        systemInstruction?.toInternal()
+        systemInstruction?.toInternal(),
+        config?.inputAudioTranscription?.toInternal(),
+        config?.outputAudioTranscription?.toInternal()
       )
       .toInternal()
     val data: String = Json.encodeToString(clientMessage)
@@ -135,7 +137,7 @@
     } catch (e: ClosedReceiveChannelException) {
       val reason = webSession?.closeReason?.await()
       val message =
-        "Channel was closed by the server.${if(reason!=null) " Details: ${reason.message}" else "" }"
+        "Channel was closed by the server.${if (reason != null) " Details: ${reason.message}" else ""}"
       throw ServiceConnectionHandshakeFailedException(message, e)
     }
   }

firebase-ai/src/main/kotlin/com/google/firebase/ai/common/APIController.kt

Lines changed: 2 additions & 0 deletions
@@ -77,6 +77,7 @@ import kotlinx.coroutines.flow.map
 import kotlinx.coroutines.launch
 import kotlinx.coroutines.withTimeout
 import kotlinx.serialization.ExperimentalSerializationApi
+import kotlinx.serialization.json.ClassDiscriminatorMode
 import kotlinx.serialization.json.Json
 
 @OptIn(ExperimentalSerializationApi::class)
@@ -85,6 +86,7 @@ internal val JSON = Json {
   prettyPrint = false
   isLenient = true
   explicitNulls = false
+  classDiscriminatorMode = ClassDiscriminatorMode.NONE
 }
 
 /**
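For context on the classDiscriminatorMode line: when kotlinx.serialization encodes a value through a polymorphic (for example sealed) hierarchy, by default it injects a "type" discriminator field into the JSON object, and ClassDiscriminatorMode.NONE suppresses that. A self-contained sketch of the difference; the Msg/Text hierarchy is made up for illustration and requires kotlinx.serialization 1.6.3 or newer:

import kotlinx.serialization.ExperimentalSerializationApi
import kotlinx.serialization.Serializable
import kotlinx.serialization.encodeToString
import kotlinx.serialization.json.ClassDiscriminatorMode
import kotlinx.serialization.json.Json

@Serializable sealed interface Msg
@Serializable data class Text(val text: String) : Msg

@OptIn(ExperimentalSerializationApi::class)
fun main() {
  val noDiscriminator = Json { classDiscriminatorMode = ClassDiscriminatorMode.NONE }
  val msg: Msg = Text("hi")

  // Default behavior injects the serial name as a discriminator,
  // e.g. {"type":"Text","text":"hi"}.
  println(Json.encodeToString(msg))

  // With NONE, only the declared fields are emitted, matching a wire format
  // that dispatches on JSON shape rather than on a "type" field: {"text":"hi"}.
  println(noDiscriminator.encodeToString(msg))
}

Note that NONE only affects encoding; decoding a polymorphic type without a discriminator is not supported in this mode.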

firebase-ai/src/main/kotlin/com/google/firebase/ai/common/util/android.kt

Lines changed: 6 additions & 6 deletions
@@ -17,10 +17,8 @@
 package com.google.firebase.ai.common.util
 
 import android.media.AudioRecord
-import kotlin.time.Duration.Companion.milliseconds
 import kotlinx.coroutines.delay
 import kotlinx.coroutines.flow.flow
-import kotlinx.coroutines.yield
 
 /**
  * The minimum buffer size for this instance.
@@ -40,15 +38,17 @@ internal fun AudioRecord.readAsFlow() = flow {
 
   while (true) {
     if (recordingState != AudioRecord.RECORDSTATE_RECORDING) {
-      // TODO(vguthal): Investigate if both yield and delay are required.
-      delay(10.milliseconds)
-      yield()
+      // delay uses a different scheduler in the backend, so it's "stickier" in its enforcement when
+      // compared to yield.
+      delay(0)
       continue
     }
     val bytesRead = read(buffer, 0, buffer.size)
     if (bytesRead > 0) {
      emit(buffer.copyOf(bytesRead))
     }
-    yield()
+    // delay uses a different scheduler in the backend, so it's "stickier" in its enforcement when
+    // compared to yield.
+    delay(0)
   }
 }

firebase-ai/src/main/kotlin/com/google/firebase/ai/java/LiveSessionFutures.kt

Lines changed: 92 additions & 1 deletion
@@ -29,6 +29,7 @@ import com.google.firebase.ai.type.LiveSession
 import com.google.firebase.ai.type.MediaData
 import com.google.firebase.ai.type.PublicPreviewAPI
 import com.google.firebase.ai.type.SessionAlreadyReceivingException
+import com.google.firebase.ai.type.Transcription
 import io.ktor.websocket.close
 import kotlinx.coroutines.reactive.asPublisher
 import org.reactivestreams.Publisher
@@ -41,6 +42,13 @@ import org.reactivestreams.Publisher
 @PublicPreviewAPI
 public abstract class LiveSessionFutures internal constructor() {
 
+  /**
+   * Starts an audio conversation with the model, which can only be stopped using
+   * [stopAudioConversation].
+   */
+  @RequiresPermission(RECORD_AUDIO)
+  public abstract fun startAudioConversation(): ListenableFuture<Unit>
+
   /**
    * Starts an audio conversation with the model, which can only be stopped using
    * [stopAudioConversation] or [close].
@@ -56,9 +64,14 @@ public abstract class LiveSessionFutures internal constructor() {
   /**
    * Starts an audio conversation with the model, which can only be stopped using
    * [stopAudioConversation].
+   * @param transcriptHandler A callback function that is invoked whenever the model receives a
+   * transcript. The first [Transcription] object is the input transcription, and the second is the
+   * output transcription
    */
   @RequiresPermission(RECORD_AUDIO)
-  public abstract fun startAudioConversation(): ListenableFuture<Unit>
+  public abstract fun startAudioConversation(
+    transcriptHandler: ((Transcription?, Transcription?) -> Unit)?,
+  ): ListenableFuture<Unit>
 
   /**
    * Starts an audio conversation with the model, which can only be stopped using
@@ -73,6 +86,26 @@ public abstract class LiveSessionFutures internal constructor() {
   @RequiresPermission(RECORD_AUDIO)
   public abstract fun startAudioConversation(enableInterruptions: Boolean): ListenableFuture<Unit>
 
+  /**
+   * Starts an audio conversation with the model, which can only be stopped using
+   * [stopAudioConversation] or [close].
+   *
+   * @param transcriptHandler A callback function that is invoked whenever the model receives a
+   * transcript. The first [Transcription] object is the input transcription, and the second is the
+   * output transcription
+   *
+   * @param enableInterruptions If enabled, allows the user to speak over or interrupt the model's
+   * ongoing reply.
+   *
+   * **WARNING**: The user interruption feature relies on device-specific support, and may not be
+   * consistently available.
+   */
+  @RequiresPermission(RECORD_AUDIO)
+  public abstract fun startAudioConversation(
+    transcriptHandler: ((Transcription?, Transcription?) -> Unit)?,
+    enableInterruptions: Boolean
+  ): ListenableFuture<Unit>
+
   /**
    * Starts an audio conversation with the model, which can only be stopped using
    * [stopAudioConversation] or [close].
@@ -92,6 +125,30 @@ public abstract class LiveSessionFutures internal constructor() {
     enableInterruptions: Boolean
   ): ListenableFuture<Unit>
 
+  /**
+   * Starts an audio conversation with the model, which can only be stopped using
+   * [stopAudioConversation] or [close].
+   *
+   * @param functionCallHandler A callback function that is invoked whenever the model receives a
+   * function call.
+   *
+   * @param transcriptHandler A callback function that is invoked whenever the model receives a
+   * transcript. The first [Transcription] object is the input transcription, and the second is the
+   * output transcription
+   *
+   * @param enableInterruptions If enabled, allows the user to speak over or interrupt the model's
+   * ongoing reply.
+   *
+   * **WARNING**: The user interruption feature relies on device-specific support, and may not be
+   * consistently available.
+   */
+  @RequiresPermission(RECORD_AUDIO)
+  public abstract fun startAudioConversation(
+    functionCallHandler: ((FunctionCallPart) -> FunctionResponsePart)?,
+    transcriptHandler: ((Transcription?, Transcription?) -> Unit)?,
+    enableInterruptions: Boolean
+  ): ListenableFuture<Unit>
+
   /**
    * Stops the audio conversation with the Gemini Server.
    *
@@ -233,6 +290,14 @@ public abstract class LiveSessionFutures internal constructor() {
       functionCallHandler: ((FunctionCallPart) -> FunctionResponsePart)?
     ) = SuspendToFutureAdapter.launchFuture { session.startAudioConversation(functionCallHandler) }
 
+    @RequiresPermission(RECORD_AUDIO)
+    override fun startAudioConversation(
+      transcriptHandler: ((Transcription?, Transcription?) -> Unit)?
+    ) =
+      SuspendToFutureAdapter.launchFuture {
+        session.startAudioConversation(transcriptHandler = transcriptHandler)
+      }
+
     @RequiresPermission(RECORD_AUDIO)
     override fun startAudioConversation() =
       SuspendToFutureAdapter.launchFuture { session.startAudioConversation() }
@@ -243,6 +308,32 @@ public abstract class LiveSessionFutures internal constructor() {
        session.startAudioConversation(enableInterruptions = enableInterruptions)
      }
 
+    @RequiresPermission(RECORD_AUDIO)
+    override fun startAudioConversation(
+      transcriptHandler: ((Transcription?, Transcription?) -> Unit)?,
+      enableInterruptions: Boolean
+    ) =
+      SuspendToFutureAdapter.launchFuture {
+        session.startAudioConversation(
+          transcriptHandler = transcriptHandler,
+          enableInterruptions = enableInterruptions
+        )
+      }
+
+    @RequiresPermission(RECORD_AUDIO)
+    override fun startAudioConversation(
+      functionCallHandler: ((FunctionCallPart) -> FunctionResponsePart)?,
+      transcriptHandler: ((Transcription?, Transcription?) -> Unit)?,
+      enableInterruptions: Boolean
+    ) =
+      SuspendToFutureAdapter.launchFuture {
+        session.startAudioConversation(
+          functionCallHandler = functionCallHandler,
+          transcriptHandler = transcriptHandler,
+          enableInterruptions = enableInterruptions
+        )
+      }
+
     @RequiresPermission(RECORD_AUDIO)
     override fun startAudioConversation(
       functionCallHandler: ((FunctionCallPart) -> FunctionResponsePart)?,
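For callers on the ListenableFuture surface, the new overloads mirror the suspend API one for one. A short Kotlin sketch of the transcriptHandler-plus-interruptions variant; how the LiveSessionFutures instance is obtained is out of scope here, and the startWithTranscripts name is illustrative:

@file:OptIn(PublicPreviewAPI::class)

import android.Manifest
import android.util.Log
import androidx.annotation.RequiresPermission
import com.google.common.util.concurrent.ListenableFuture
import com.google.firebase.ai.java.LiveSessionFutures
import com.google.firebase.ai.type.PublicPreviewAPI
import com.google.firebase.ai.type.Transcription

// Starts a conversation that reports transcripts and lets the user interrupt
// the model mid-reply (subject to device support, per the KDoc above).
@RequiresPermission(Manifest.permission.RECORD_AUDIO)
fun startWithTranscripts(futures: LiveSessionFutures): ListenableFuture<Unit> =
  futures.startAudioConversation(
    transcriptHandler = { input: Transcription?, output: Transcription? ->
      input?.text?.let { Log.d("Live", "user: $it") }
      output?.text?.let { Log.d("Live", "model: $it") }
    },
    enableInterruptions = true
  )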

firebase-ai/src/main/kotlin/com/google/firebase/ai/type/AudioHelper.kt

Lines changed: 4 additions & 1 deletion
@@ -162,7 +162,10 @@ internal class AudioHelper(
     fun build(): AudioHelper {
       val playbackTrack =
         AudioTrack(
-          AudioAttributes.Builder().setUsage(AudioAttributes.USAGE_VOICE_COMMUNICATION).build(),
+          AudioAttributes.Builder()
+            .setUsage(AudioAttributes.USAGE_MEDIA)
+            .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
+            .build(),
           AudioFormat.Builder()
             .setSampleRate(24000)
             .setChannelMask(AudioFormat.CHANNEL_OUT_MONO)
firebase-ai/src/main/kotlin/com/google/firebase/ai/type/AudioTranscriptionConfig.kt

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+/*
+ * Copyright 2025 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.firebase.ai.type
+
+import kotlinx.serialization.Serializable
+
+/** The audio transcription configuration. Its presence enables audio transcription */
+public class AudioTranscriptionConfig {
+
+  @Serializable internal object Internal
+
+  internal fun toInternal() = Internal
+}

firebase-ai/src/main/kotlin/com/google/firebase/ai/type/FunctionDeclaration.kt

Lines changed: 2 additions & 2 deletions
@@ -61,12 +61,12 @@ public class FunctionDeclaration(
   internal val schema: Schema =
     Schema.obj(properties = parameters, optionalProperties = optionalParameters, nullable = false)
 
-  internal fun toInternal() = Internal(name, description, schema.toInternal())
+  internal fun toInternal() = Internal(name, description, schema.toInternalOpenApi())
 
   @Serializable
   internal data class Internal(
     val name: String,
     val description: String,
-    val parameters: Schema.Internal
+    val parameters: Schema.InternalOpenAPI
   )
 }

firebase-ai/src/main/kotlin/com/google/firebase/ai/type/GenerationConfig.kt

Lines changed: 2 additions & 2 deletions
@@ -200,7 +200,7 @@ private constructor(
       frequencyPenalty = frequencyPenalty,
       presencePenalty = presencePenalty,
       responseMimeType = responseMimeType,
-      responseSchema = responseSchema?.toInternal(),
+      responseSchema = responseSchema?.toInternalOpenApi(),
       responseModalities = responseModalities?.map { it.toInternal() },
       thinkingConfig = thinkingConfig?.toInternal()
     )
@@ -216,7 +216,7 @@ private constructor(
     @SerialName("response_mime_type") val responseMimeType: String? = null,
     @SerialName("presence_penalty") val presencePenalty: Float? = null,
     @SerialName("frequency_penalty") val frequencyPenalty: Float? = null,
-    @SerialName("response_schema") val responseSchema: Schema.Internal? = null,
+    @SerialName("response_schema") val responseSchema: Schema.InternalOpenAPI? = null,
     @SerialName("response_modalities") val responseModalities: List<String>? = null,
     @SerialName("thinking_config") val thinkingConfig: ThinkingConfig.Internal? = null
   )
