Skip to content

Commit 668bd66

Browse files
VinayGuthal and rlazo authored
Audio Conversation user interrupt support and threading fixes (#7413)
This PR does the following 2 things. - Adds a parameter enableInterruptions which, when set to true, will allow users to interrupt the server response. - Fixes a threading issue with audio recording taking up the space by yielding after every recording is done. --------- Co-authored-by: Rodrigo Lazo <[email protected]>
1 parent 803dc34 commit 668bd66

File tree

6 files changed

+92
-8
lines changed

6 files changed

+92
-8
lines changed

firebase-ai/CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
# Unreleased
22

33
- [changed] **Breaking Change**: Removed the `candidateCount` option from `LiveGenerationConfig`
4+
- [changed] Added support for user interrupts for the `startAudioConversation` method in the
5+
`LiveSession` class. (#7413)
46
- [changed] Added support for the URL context tool, which allows the model to access content from
57
provided public web URLs to inform and enhance its responses. (#7382)
68
- [changed] Added better error messages to `ServiceConnectionHandshakeFailedException` (#7412)

firebase-ai/api.txt

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,9 @@ package com.google.firebase.ai.java {
148148
method public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> sendFunctionResponse(java.util.List<com.google.firebase.ai.type.FunctionResponsePart> functionList);
149149
method public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> sendMediaStream(java.util.List<com.google.firebase.ai.type.MediaData> mediaChunks);
150150
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation();
151-
method public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler);
151+
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(boolean enableInterruptions);
152+
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler);
153+
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler, boolean enableInterruptions);
152154
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> stopAudioConversation();
153155
method public abstract void stopReceiving();
154156
field public static final com.google.firebase.ai.java.LiveSessionFutures.Companion Companion;
@@ -891,6 +893,7 @@ package com.google.firebase.ai.type {
891893
method public suspend Object? send(String text, kotlin.coroutines.Continuation<? super kotlin.Unit>);
892894
method public suspend Object? sendFunctionResponse(java.util.List<com.google.firebase.ai.type.FunctionResponsePart> functionList, kotlin.coroutines.Continuation<? super kotlin.Unit>);
893895
method public suspend Object? sendMediaStream(java.util.List<com.google.firebase.ai.type.MediaData> mediaChunks, kotlin.coroutines.Continuation<? super kotlin.Unit>);
896+
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public suspend Object? startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler = null, boolean enableInterruptions = false, kotlin.coroutines.Continuation<? super kotlin.Unit>);
894897
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public suspend Object? startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler = null, kotlin.coroutines.Continuation<? super kotlin.Unit>);
895898
method public void stopAudioConversation();
896899
method public void stopReceiving();

firebase-ai/src/main/kotlin/com/google/firebase/ai/common/util/android.kt

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
package com.google.firebase.ai.common.util
1818

1919
import android.media.AudioRecord
20+
import kotlin.time.Duration.Companion.milliseconds
21+
import kotlinx.coroutines.delay
2022
import kotlinx.coroutines.flow.flow
2123
import kotlinx.coroutines.yield
2224

@@ -38,13 +40,15 @@ internal fun AudioRecord.readAsFlow() = flow {
3840

3941
while (true) {
4042
if (recordingState != AudioRecord.RECORDSTATE_RECORDING) {
43+
// TODO(vguthal): Investigate if both yield and delay are required.
44+
delay(10.milliseconds)
4145
yield()
4246
continue
4347
}
44-
4548
val bytesRead = read(buffer, 0, buffer.size)
4649
if (bytesRead > 0) {
4750
emit(buffer.copyOf(bytesRead))
4851
}
52+
yield()
4953
}
5054
}

firebase-ai/src/main/kotlin/com/google/firebase/ai/java/LiveSessionFutures.kt

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ public abstract class LiveSessionFutures internal constructor() {
4747
* @param functionCallHandler A callback function that is invoked whenever the model receives a
4848
* function call.
4949
*/
50+
@RequiresPermission(RECORD_AUDIO)
5051
public abstract fun startAudioConversation(
5152
functionCallHandler: ((FunctionCallPart) -> FunctionResponsePart)?
5253
): ListenableFuture<Unit>
@@ -58,6 +59,38 @@ public abstract class LiveSessionFutures internal constructor() {
5859
@RequiresPermission(RECORD_AUDIO)
5960
public abstract fun startAudioConversation(): ListenableFuture<Unit>
6061

62+
/**
63+
* Starts an audio conversation with the model, which can only be stopped using
64+
* [stopAudioConversation] or [close].
65+
*
66+
* @param enableInterruptions If enabled, allows the user to speak over or interrupt the model's
67+
* ongoing reply.
68+
*
69+
* **WARNING**: The user interruption feature relies on device-specific support, and may not be
70+
* consistently available.
71+
*/
72+
@RequiresPermission(RECORD_AUDIO)
73+
public abstract fun startAudioConversation(enableInterruptions: Boolean): ListenableFuture<Unit>
74+
75+
/**
76+
* Starts an audio conversation with the model, which can only be stopped using
77+
* [stopAudioConversation] or [close].
78+
*
79+
* @param functionCallHandler A callback function that is invoked whenever the model receives a
80+
* function call.
81+
*
82+
* @param enableInterruptions If enabled, allows the user to speak over or interrupt the model's
83+
* ongoing reply.
84+
*
85+
* **WARNING**: The user interruption feature relies on device-specific support, and may not be
86+
* consistently available.
87+
*/
88+
@RequiresPermission(RECORD_AUDIO)
89+
public abstract fun startAudioConversation(
90+
functionCallHandler: ((FunctionCallPart) -> FunctionResponsePart)?,
91+
enableInterruptions: Boolean
92+
): ListenableFuture<Unit>
93+
6194
/**
6295
* Stops the audio conversation with the Gemini Server.
6396
*
@@ -169,6 +202,24 @@ public abstract class LiveSessionFutures internal constructor() {
169202
override fun startAudioConversation() =
170203
SuspendToFutureAdapter.launchFuture { session.startAudioConversation() }
171204

205+
@RequiresPermission(RECORD_AUDIO)
206+
override fun startAudioConversation(enableInterruptions: Boolean) =
207+
SuspendToFutureAdapter.launchFuture {
208+
session.startAudioConversation(enableInterruptions = enableInterruptions)
209+
}
210+
211+
@RequiresPermission(RECORD_AUDIO)
212+
override fun startAudioConversation(
213+
functionCallHandler: ((FunctionCallPart) -> FunctionResponsePart)?,
214+
enableInterruptions: Boolean
215+
) =
216+
SuspendToFutureAdapter.launchFuture {
217+
session.startAudioConversation(
218+
functionCallHandler,
219+
enableInterruptions = enableInterruptions
220+
)
221+
}
222+
172223
override fun stopAudioConversation() =
173224
SuspendToFutureAdapter.launchFuture { session.stopAudioConversation() }
174225

firebase-ai/src/main/kotlin/com/google/firebase/ai/type/AudioHelper.kt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,6 @@ internal class AudioHelper(
141141
*/
142142
fun listenToRecording(): Flow<ByteArray> {
143143
if (released) return emptyFlow()
144-
145144
resumeRecording()
146145

147146
return recorder.readAsFlow()

firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveSession.kt

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,28 @@ internal constructor(
9797
public suspend fun startAudioConversation(
9898
functionCallHandler: ((FunctionCallPart) -> FunctionResponsePart)? = null
9999
) {
100+
startAudioConversation(functionCallHandler, false)
101+
}
102+
103+
/**
104+
* Starts an audio conversation with the model, which can only be stopped using
105+
* [stopAudioConversation] or [close].
106+
*
107+
* @param functionCallHandler A callback function that is invoked whenever the model receives a
108+
* function call. The [FunctionResponsePart] that the callback function returns will be
109+
* automatically sent to the model.
110+
*
111+
* @param enableInterruptions If enabled, allows the user to speak over or interrupt the model's
112+
* ongoing reply.
113+
*
114+
* **WARNING**: The user interruption feature relies on device-specific support, and may not be
115+
* consistently available.
116+
*/
117+
@RequiresPermission(RECORD_AUDIO)
118+
public suspend fun startAudioConversation(
119+
functionCallHandler: ((FunctionCallPart) -> FunctionResponsePart)? = null,
120+
enableInterruptions: Boolean = false,
121+
) {
100122

101123
val context = firebaseApp.applicationContext
102124
if (
@@ -120,7 +142,7 @@ internal constructor(
120142

121143
recordUserAudio()
122144
processModelResponses(functionCallHandler)
123-
listenForModelPlayback()
145+
listenForModelPlayback(enableInterruptions)
124146
}
125147
}
126148

@@ -375,23 +397,26 @@ internal constructor(
375397
*
376398
* Launched asynchronously on [scope].
377399
*/
378-
private fun listenForModelPlayback() {
400+
private fun listenForModelPlayback(enableInterruptions: Boolean = false) {
379401
scope.launch {
380402
while (isActive) {
381403
val playbackData = playBackQueue.poll()
382404
if (playbackData == null) {
383405
// The model playback queue is complete, so we can continue recording
384406
// TODO(b/408223520): Conditionally resume when param is added
385-
audioHelper?.resumeRecording()
407+
if (!enableInterruptions) {
408+
audioHelper?.resumeRecording()
409+
}
386410
yield()
387411
} else {
388412
/**
389413
* We pause the recording while the model is speaking to avoid interrupting it because of
390414
* no echo cancellation
391415
*/
392416
// TODO(b/408223520): Conditionally pause when param is added
393-
audioHelper?.pauseRecording()
394-
417+
if (enableInterruptions != true) {
418+
audioHelper?.pauseRecording()
419+
}
395420
audioHelper?.playAudio(playbackData)
396421
}
397422
}

0 commit comments

Comments
 (0)