Skip to content

Commit 1bbe2b8

Browse files
authored
Merge branch 'main' into feat/add-no-image-finish-reason
2 parents d47e3a4 + e0e995a commit 1bbe2b8

26 files changed: +593 −83 lines changed

firebase-ai/CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
# Unreleased
22

3+
- [changed] Added better scheduling and louder output for Live API.
4+
- [changed] Added support for input and output transcription. (#7482)
35
- [feature] Added support for sending realtime audio and video in a `LiveSession`.
46
- [changed] Removed redundant internal exception types. (#7475)
57
- [feature] Added support for configuring the aspect ratio when generating images with the

firebase-ai/api.txt

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,9 @@ package com.google.firebase.ai.java {
154154
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(boolean enableInterruptions);
155155
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler);
156156
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler, boolean enableInterruptions);
157+
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler, kotlin.jvm.functions.Function2<? super com.google.firebase.ai.type.Transcription?,? super com.google.firebase.ai.type.Transcription?,kotlin.Unit>? transcriptHandler, boolean enableInterruptions);
158+
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(kotlin.jvm.functions.Function2<? super com.google.firebase.ai.type.Transcription?,? super com.google.firebase.ai.type.Transcription?,kotlin.Unit>? transcriptHandler);
159+
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(kotlin.jvm.functions.Function2<? super com.google.firebase.ai.type.Transcription?,? super com.google.firebase.ai.type.Transcription?,kotlin.Unit>? transcriptHandler, boolean enableInterruptions);
157160
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> stopAudioConversation();
158161
method public abstract void stopReceiving();
159162
field public static final com.google.firebase.ai.java.LiveSessionFutures.Companion Companion;
@@ -191,6 +194,10 @@ package com.google.firebase.ai.type {
191194
ctor public AudioRecordInitializationFailedException(String message);
192195
}
193196

197+
public final class AudioTranscriptionConfig {
198+
ctor public AudioTranscriptionConfig();
199+
}
200+
194201
public final class BlockReason {
195202
method public String getName();
196203
method public int getOrdinal();
@@ -874,15 +881,19 @@ package com.google.firebase.ai.type {
874881
ctor public LiveGenerationConfig.Builder();
875882
method public com.google.firebase.ai.type.LiveGenerationConfig build();
876883
method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setFrequencyPenalty(Float? frequencyPenalty);
884+
method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setInputAudioTranscription(com.google.firebase.ai.type.AudioTranscriptionConfig? config);
877885
method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setMaxOutputTokens(Integer? maxOutputTokens);
886+
method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setOutputAudioTranscription(com.google.firebase.ai.type.AudioTranscriptionConfig? config);
878887
method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setPresencePenalty(Float? presencePenalty);
879888
method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setResponseModality(com.google.firebase.ai.type.ResponseModality? responseModality);
880889
method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setSpeechConfig(com.google.firebase.ai.type.SpeechConfig? speechConfig);
881890
method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setTemperature(Float? temperature);
882891
method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setTopK(Integer? topK);
883892
method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setTopP(Float? topP);
884893
field public Float? frequencyPenalty;
894+
field public com.google.firebase.ai.type.AudioTranscriptionConfig? inputAudioTranscription;
885895
field public Integer? maxOutputTokens;
896+
field public com.google.firebase.ai.type.AudioTranscriptionConfig? outputAudioTranscription;
886897
field public Float? presencePenalty;
887898
field public com.google.firebase.ai.type.ResponseModality? responseModality;
888899
field public com.google.firebase.ai.type.SpeechConfig? speechConfig;
@@ -900,14 +911,18 @@ package com.google.firebase.ai.type {
900911
}
901912

902913
@com.google.firebase.ai.type.PublicPreviewAPI public final class LiveServerContent implements com.google.firebase.ai.type.LiveServerMessage {
903-
ctor public LiveServerContent(com.google.firebase.ai.type.Content? content, boolean interrupted, boolean turnComplete, boolean generationComplete);
914+
ctor @Deprecated public LiveServerContent(com.google.firebase.ai.type.Content? content, boolean interrupted, boolean turnComplete, boolean generationComplete, com.google.firebase.ai.type.Transcription? inputTranscription, com.google.firebase.ai.type.Transcription? outputTranscription);
904915
method public com.google.firebase.ai.type.Content? getContent();
905916
method public boolean getGenerationComplete();
917+
method public com.google.firebase.ai.type.Transcription? getInputTranscription();
906918
method public boolean getInterrupted();
919+
method public com.google.firebase.ai.type.Transcription? getOutputTranscription();
907920
method public boolean getTurnComplete();
908921
property public final com.google.firebase.ai.type.Content? content;
909922
property public final boolean generationComplete;
923+
property public final com.google.firebase.ai.type.Transcription? inputTranscription;
910924
property public final boolean interrupted;
925+
property public final com.google.firebase.ai.type.Transcription? outputTranscription;
911926
property public final boolean turnComplete;
912927
}
913928

@@ -944,6 +959,7 @@ package com.google.firebase.ai.type {
944959
method public suspend Object? sendVideoRealtime(com.google.firebase.ai.type.InlineData video, kotlin.coroutines.Continuation<? super kotlin.Unit>);
945960
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public suspend Object? startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler = null, boolean enableInterruptions = false, kotlin.coroutines.Continuation<? super kotlin.Unit>);
946961
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public suspend Object? startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler = null, kotlin.coroutines.Continuation<? super kotlin.Unit>);
962+
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public suspend Object? startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler = null, kotlin.jvm.functions.Function2<? super com.google.firebase.ai.type.Transcription?,? super com.google.firebase.ai.type.Transcription?,kotlin.Unit>? transcriptHandler = null, boolean enableInterruptions = false, kotlin.coroutines.Continuation<? super kotlin.Unit>);
947963
method public void stopAudioConversation();
948964
method public void stopReceiving();
949965
}
@@ -1270,6 +1286,11 @@ package com.google.firebase.ai.type {
12701286
ctor public ToolConfig(com.google.firebase.ai.type.FunctionCallingConfig? functionCallingConfig);
12711287
}
12721288

1289+
public final class Transcription {
1290+
method public String? getText();
1291+
property public final String? text;
1292+
}
1293+
12731294
public final class UnknownException extends com.google.firebase.ai.type.FirebaseAIException {
12741295
}
12751296

firebase-ai/src/main/kotlin/com/google/firebase/ai/LiveGenerativeModel.kt

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,9 @@ internal constructor(
111111
modelName,
112112
config?.toInternal(),
113113
tools?.map { it.toInternal() },
114-
systemInstruction?.toInternal()
114+
systemInstruction?.toInternal(),
115+
config?.inputAudioTranscription?.toInternal(),
116+
config?.outputAudioTranscription?.toInternal()
115117
)
116118
.toInternal()
117119
val data: String = Json.encodeToString(clientMessage)
@@ -135,7 +137,7 @@ internal constructor(
135137
} catch (e: ClosedReceiveChannelException) {
136138
val reason = webSession?.closeReason?.await()
137139
val message =
138-
"Channel was closed by the server.${if(reason!=null) " Details: ${reason.message}" else "" }"
140+
"Channel was closed by the server.${if (reason != null) " Details: ${reason.message}" else ""}"
139141
throw ServiceConnectionHandshakeFailedException(message, e)
140142
}
141143
}

firebase-ai/src/main/kotlin/com/google/firebase/ai/common/APIController.kt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ import kotlinx.coroutines.flow.map
7777
import kotlinx.coroutines.launch
7878
import kotlinx.coroutines.withTimeout
7979
import kotlinx.serialization.ExperimentalSerializationApi
80+
import kotlinx.serialization.json.ClassDiscriminatorMode
8081
import kotlinx.serialization.json.Json
8182

8283
@OptIn(ExperimentalSerializationApi::class)
@@ -85,6 +86,7 @@ internal val JSON = Json {
8586
prettyPrint = false
8687
isLenient = true
8788
explicitNulls = false
89+
classDiscriminatorMode = ClassDiscriminatorMode.NONE
8890
}
8991

9092
/**

firebase-ai/src/main/kotlin/com/google/firebase/ai/common/util/android.kt

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,8 @@
1717
package com.google.firebase.ai.common.util
1818

1919
import android.media.AudioRecord
20-
import kotlin.time.Duration.Companion.milliseconds
2120
import kotlinx.coroutines.delay
2221
import kotlinx.coroutines.flow.flow
23-
import kotlinx.coroutines.yield
2422

2523
/**
2624
* The minimum buffer size for this instance.
@@ -40,15 +38,17 @@ internal fun AudioRecord.readAsFlow() = flow {
4038

4139
while (true) {
4240
if (recordingState != AudioRecord.RECORDSTATE_RECORDING) {
43-
// TODO(vguthal): Investigate if both yield and delay are required.
44-
delay(10.milliseconds)
45-
yield()
41+
// delay uses a different scheduler in the backend, so it's "stickier" in its enforcement when
42+
// compared to yield.
43+
delay(0)
4644
continue
4745
}
4846
val bytesRead = read(buffer, 0, buffer.size)
4947
if (bytesRead > 0) {
5048
emit(buffer.copyOf(bytesRead))
5149
}
52-
yield()
50+
// delay uses a different scheduler in the backend, so it's "stickier" in its enforcement when
51+
// compared to yield.
52+
delay(0)
5353
}
5454
}

firebase-ai/src/main/kotlin/com/google/firebase/ai/java/LiveSessionFutures.kt

Lines changed: 92 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import com.google.firebase.ai.type.LiveSession
2929
import com.google.firebase.ai.type.MediaData
3030
import com.google.firebase.ai.type.PublicPreviewAPI
3131
import com.google.firebase.ai.type.SessionAlreadyReceivingException
32+
import com.google.firebase.ai.type.Transcription
3233
import io.ktor.websocket.close
3334
import kotlinx.coroutines.reactive.asPublisher
3435
import org.reactivestreams.Publisher
@@ -41,6 +42,13 @@ import org.reactivestreams.Publisher
4142
@PublicPreviewAPI
4243
public abstract class LiveSessionFutures internal constructor() {
4344

45+
/**
46+
* Starts an audio conversation with the model, which can only be stopped using
47+
* [stopAudioConversation].
48+
*/
49+
@RequiresPermission(RECORD_AUDIO)
50+
public abstract fun startAudioConversation(): ListenableFuture<Unit>
51+
4452
/**
4553
* Starts an audio conversation with the model, which can only be stopped using
4654
* [stopAudioConversation] or [close].
@@ -56,9 +64,14 @@ public abstract class LiveSessionFutures internal constructor() {
5664
/**
5765
* Starts an audio conversation with the model, which can only be stopped using
5866
* [stopAudioConversation].
67+
* @param transcriptHandler A callback function that is invoked whenever the model receives a
68+
* transcript. The first [Transcription] object is the input transcription, and the second is the
69+
* output transcription
5970
*/
6071
@RequiresPermission(RECORD_AUDIO)
61-
public abstract fun startAudioConversation(): ListenableFuture<Unit>
72+
public abstract fun startAudioConversation(
73+
transcriptHandler: ((Transcription?, Transcription?) -> Unit)?,
74+
): ListenableFuture<Unit>
6275

6376
/**
6477
* Starts an audio conversation with the model, which can only be stopped using
@@ -73,6 +86,26 @@ public abstract class LiveSessionFutures internal constructor() {
7386
@RequiresPermission(RECORD_AUDIO)
7487
public abstract fun startAudioConversation(enableInterruptions: Boolean): ListenableFuture<Unit>
7588

89+
/**
90+
* Starts an audio conversation with the model, which can only be stopped using
91+
* [stopAudioConversation] or [close].
92+
*
93+
* @param transcriptHandler A callback function that is invoked whenever the model receives a
94+
* transcript. The first [Transcription] object is the input transcription, and the second is the
95+
* output transcription
96+
*
97+
* @param enableInterruptions If enabled, allows the user to speak over or interrupt the model's
98+
* ongoing reply.
99+
*
100+
* **WARNING**: The user interruption feature relies on device-specific support, and may not be
101+
* consistently available.
102+
*/
103+
@RequiresPermission(RECORD_AUDIO)
104+
public abstract fun startAudioConversation(
105+
transcriptHandler: ((Transcription?, Transcription?) -> Unit)?,
106+
enableInterruptions: Boolean
107+
): ListenableFuture<Unit>
108+
76109
/**
77110
* Starts an audio conversation with the model, which can only be stopped using
78111
* [stopAudioConversation] or [close].
@@ -92,6 +125,30 @@ public abstract class LiveSessionFutures internal constructor() {
92125
enableInterruptions: Boolean
93126
): ListenableFuture<Unit>
94127

128+
/**
129+
* Starts an audio conversation with the model, which can only be stopped using
130+
* [stopAudioConversation] or [close].
131+
*
132+
* @param functionCallHandler A callback function that is invoked whenever the model receives a
133+
* function call.
134+
*
135+
* @param transcriptHandler A callback function that is invoked whenever the model receives a
136+
* transcript. The first [Transcription] object is the input transcription, and the second is the
137+
* output transcription
138+
*
139+
* @param enableInterruptions If enabled, allows the user to speak over or interrupt the model's
140+
* ongoing reply.
141+
*
142+
* **WARNING**: The user interruption feature relies on device-specific support, and may not be
143+
* consistently available.
144+
*/
145+
@RequiresPermission(RECORD_AUDIO)
146+
public abstract fun startAudioConversation(
147+
functionCallHandler: ((FunctionCallPart) -> FunctionResponsePart)?,
148+
transcriptHandler: ((Transcription?, Transcription?) -> Unit)?,
149+
enableInterruptions: Boolean
150+
): ListenableFuture<Unit>
151+
95152
/**
96153
* Stops the audio conversation with the Gemini Server.
97154
*
@@ -233,6 +290,14 @@ public abstract class LiveSessionFutures internal constructor() {
233290
functionCallHandler: ((FunctionCallPart) -> FunctionResponsePart)?
234291
) = SuspendToFutureAdapter.launchFuture { session.startAudioConversation(functionCallHandler) }
235292

293+
@RequiresPermission(RECORD_AUDIO)
294+
override fun startAudioConversation(
295+
transcriptHandler: ((Transcription?, Transcription?) -> Unit)?
296+
) =
297+
SuspendToFutureAdapter.launchFuture {
298+
session.startAudioConversation(transcriptHandler = transcriptHandler)
299+
}
300+
236301
@RequiresPermission(RECORD_AUDIO)
237302
override fun startAudioConversation() =
238303
SuspendToFutureAdapter.launchFuture { session.startAudioConversation() }
@@ -243,6 +308,32 @@ public abstract class LiveSessionFutures internal constructor() {
243308
session.startAudioConversation(enableInterruptions = enableInterruptions)
244309
}
245310

311+
@RequiresPermission(RECORD_AUDIO)
312+
override fun startAudioConversation(
313+
transcriptHandler: ((Transcription?, Transcription?) -> Unit)?,
314+
enableInterruptions: Boolean
315+
) =
316+
SuspendToFutureAdapter.launchFuture {
317+
session.startAudioConversation(
318+
transcriptHandler = transcriptHandler,
319+
enableInterruptions = enableInterruptions
320+
)
321+
}
322+
323+
@RequiresPermission(RECORD_AUDIO)
324+
override fun startAudioConversation(
325+
functionCallHandler: ((FunctionCallPart) -> FunctionResponsePart)?,
326+
transcriptHandler: ((Transcription?, Transcription?) -> Unit)?,
327+
enableInterruptions: Boolean
328+
) =
329+
SuspendToFutureAdapter.launchFuture {
330+
session.startAudioConversation(
331+
functionCallHandler = functionCallHandler,
332+
transcriptHandler = transcriptHandler,
333+
enableInterruptions = enableInterruptions
334+
)
335+
}
336+
246337
@RequiresPermission(RECORD_AUDIO)
247338
override fun startAudioConversation(
248339
functionCallHandler: ((FunctionCallPart) -> FunctionResponsePart)?,

firebase-ai/src/main/kotlin/com/google/firebase/ai/type/AudioHelper.kt

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,10 @@ internal class AudioHelper(
162162
fun build(): AudioHelper {
163163
val playbackTrack =
164164
AudioTrack(
165-
AudioAttributes.Builder().setUsage(AudioAttributes.USAGE_VOICE_COMMUNICATION).build(),
165+
AudioAttributes.Builder()
166+
.setUsage(AudioAttributes.USAGE_MEDIA)
167+
.setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
168+
.build(),
166169
AudioFormat.Builder()
167170
.setSampleRate(24000)
168171
.setChannelMask(AudioFormat.CHANNEL_OUT_MONO)
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
/*
2+
* Copyright 2025 Google LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package com.google.firebase.ai.type
18+
19+
import kotlinx.serialization.Serializable
20+
21+
/** The audio transcription configuration. Its presence enables audio transcription */
22+
public class AudioTranscriptionConfig {
23+
24+
@Serializable internal object Internal
25+
26+
internal fun toInternal() = Internal
27+
}

firebase-ai/src/main/kotlin/com/google/firebase/ai/type/FunctionDeclaration.kt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,12 +61,12 @@ public class FunctionDeclaration(
6161
internal val schema: Schema =
6262
Schema.obj(properties = parameters, optionalProperties = optionalParameters, nullable = false)
6363

64-
internal fun toInternal() = Internal(name, description, schema.toInternal())
64+
internal fun toInternal() = Internal(name, description, schema.toInternalOpenApi())
6565

6666
@Serializable
6767
internal data class Internal(
6868
val name: String,
6969
val description: String,
70-
val parameters: Schema.Internal
70+
val parameters: Schema.InternalOpenAPI
7171
)
7272
}

firebase-ai/src/main/kotlin/com/google/firebase/ai/type/GenerationConfig.kt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@ private constructor(
206206
frequencyPenalty = frequencyPenalty,
207207
presencePenalty = presencePenalty,
208208
responseMimeType = responseMimeType,
209-
responseSchema = responseSchema?.toInternal(),
209+
responseSchema = responseSchema?.toInternalOpenApi(),
210210
responseModalities = responseModalities?.map { it.toInternal() },
211211
thinkingConfig = thinkingConfig?.toInternal(),
212212
imageConfig = imageConfig?.toInternal()
@@ -223,7 +223,7 @@ private constructor(
223223
@SerialName("response_mime_type") val responseMimeType: String? = null,
224224
@SerialName("presence_penalty") val presencePenalty: Float? = null,
225225
@SerialName("frequency_penalty") val frequencyPenalty: Float? = null,
226-
@SerialName("response_schema") val responseSchema: Schema.Internal? = null,
226+
@SerialName("response_schema") val responseSchema: Schema.InternalOpenAPI? = null,
227227
@SerialName("response_modalities") val responseModalities: List<String>? = null,
228228
@SerialName("thinking_config") val thinkingConfig: ThinkingConfig.Internal? = null,
229229
@SerialName("image_config") val imageConfig: ImageConfig.Internal? = null

0 commit comments

Comments (0)