diff --git a/firebase-ai/CHANGELOG.md b/firebase-ai/CHANGELOG.md index abf0bf55c68..dee8bfb1563 100644 --- a/firebase-ai/CHANGELOG.md +++ b/firebase-ai/CHANGELOG.md @@ -1,5 +1,7 @@ # Unreleased +- [changed] Added `LiveAudioConversationConfig` to control different aspects of the conversation + while using the `startAudioConversation` function. - [changed] Added better scheduling and louder output for Live API. - [changed] Added support for input and output transcription. (#7482) - [feature] Added support for sending realtime audio and video in a `LiveSession`. diff --git a/firebase-ai/api.txt b/firebase-ai/api.txt index f73c51d7112..ed3d7135138 100644 --- a/firebase-ai/api.txt +++ b/firebase-ai/api.txt @@ -152,6 +152,7 @@ package com.google.firebase.ai.java { method public abstract com.google.common.util.concurrent.ListenableFuture sendVideoRealtime(com.google.firebase.ai.type.InlineData video); method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture startAudioConversation(); method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture startAudioConversation(boolean enableInterruptions); + method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture startAudioConversation(com.google.firebase.ai.type.LiveAudioConversationConfig liveAudioConversationConfig); method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture startAudioConversation(kotlin.jvm.functions.Function1? functionCallHandler); method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture startAudioConversation(kotlin.jvm.functions.Function1? 
functionCallHandler, boolean enableInterruptions); method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture startAudioConversation(kotlin.jvm.functions.Function1? functionCallHandler, kotlin.jvm.functions.Function2? transcriptHandler, boolean enableInterruptions); @@ -838,6 +839,31 @@ package com.google.firebase.ai.type { public final class InvalidStateException extends com.google.firebase.ai.type.FirebaseAIException { } + @com.google.firebase.ai.type.PublicPreviewAPI public final class LiveAudioConversationConfig { + field public static final com.google.firebase.ai.type.LiveAudioConversationConfig.Companion Companion; + } + + public static final class LiveAudioConversationConfig.Builder { + ctor public LiveAudioConversationConfig.Builder(); + method public com.google.firebase.ai.type.LiveAudioConversationConfig build(); + method public com.google.firebase.ai.type.LiveAudioConversationConfig.Builder setAudioHandler(kotlin.jvm.functions.Function2? audioHandler); + method public com.google.firebase.ai.type.LiveAudioConversationConfig.Builder setEnableInterruptions(boolean enableInterruptions); + method public com.google.firebase.ai.type.LiveAudioConversationConfig.Builder setFunctionCallHandler(kotlin.jvm.functions.Function1? functionCallHandler); + method public com.google.firebase.ai.type.LiveAudioConversationConfig.Builder setTranscriptHandler(kotlin.jvm.functions.Function2? transcriptHandler); + field public kotlin.jvm.functions.Function2? audioHandler; + field public boolean enableInterruptions; + field public kotlin.jvm.functions.Function1? functionCallHandler; + field public kotlin.jvm.functions.Function2? 
transcriptHandler; + } + + public static final class LiveAudioConversationConfig.Companion { + method public com.google.firebase.ai.type.LiveAudioConversationConfig.Builder builder(); + } + + public final class LiveAudioConversationConfigKt { + method public static com.google.firebase.ai.type.LiveAudioConversationConfig liveAudioConversationConfig(kotlin.jvm.functions.Function1 init); + } + @com.google.firebase.ai.type.PublicPreviewAPI public final class LiveGenerationConfig { field public static final com.google.firebase.ai.type.LiveGenerationConfig.Companion Companion; } @@ -922,6 +948,7 @@ package com.google.firebase.ai.type { method @Deprecated public suspend Object? sendMediaStream(java.util.List mediaChunks, kotlin.coroutines.Continuation); method public suspend Object? sendTextRealtime(String text, kotlin.coroutines.Continuation); method public suspend Object? sendVideoRealtime(com.google.firebase.ai.type.InlineData video, kotlin.coroutines.Continuation); + method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public suspend Object? startAudioConversation(com.google.firebase.ai.type.LiveAudioConversationConfig liveAudioConversationConfig, kotlin.coroutines.Continuation); method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public suspend Object? startAudioConversation(kotlin.jvm.functions.Function1? functionCallHandler = null, boolean enableInterruptions = false, kotlin.coroutines.Continuation); method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public suspend Object? startAudioConversation(kotlin.jvm.functions.Function1? functionCallHandler = null, kotlin.coroutines.Continuation); method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public suspend Object? startAudioConversation(kotlin.jvm.functions.Function1? functionCallHandler = null, kotlin.jvm.functions.Function2? 
transcriptHandler = null, boolean enableInterruptions = false, kotlin.coroutines.Continuation); diff --git a/firebase-ai/gradle.properties b/firebase-ai/gradle.properties index c4acd5b3aae..388149e856a 100644 --- a/firebase-ai/gradle.properties +++ b/firebase-ai/gradle.properties @@ -12,5 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. -version=17.5.0 +version=17.5.0 latestReleasedVersion=17.4.0 diff --git a/firebase-ai/src/main/kotlin/com/google/firebase/ai/java/LiveSessionFutures.kt b/firebase-ai/src/main/kotlin/com/google/firebase/ai/java/LiveSessionFutures.kt index 5a04ed9f97c..f84b1e7f6d0 100644 --- a/firebase-ai/src/main/kotlin/com/google/firebase/ai/java/LiveSessionFutures.kt +++ b/firebase-ai/src/main/kotlin/com/google/firebase/ai/java/LiveSessionFutures.kt @@ -24,6 +24,7 @@ import com.google.firebase.ai.type.Content import com.google.firebase.ai.type.FunctionCallPart import com.google.firebase.ai.type.FunctionResponsePart import com.google.firebase.ai.type.InlineData +import com.google.firebase.ai.type.LiveAudioConversationConfig import com.google.firebase.ai.type.LiveServerMessage import com.google.firebase.ai.type.LiveSession import com.google.firebase.ai.type.MediaData @@ -49,6 +50,18 @@ public abstract class LiveSessionFutures internal constructor() { @RequiresPermission(RECORD_AUDIO) public abstract fun startAudioConversation(): ListenableFuture + /** + * Starts an audio conversation with the model, which can only be stopped using + * [stopAudioConversation] or [close]. + * + * @param liveAudioConversationConfig A [LiveAudioConversationConfig] provided by the user to + * control the various aspects of the conversation.
+ */ + @RequiresPermission(RECORD_AUDIO) + public abstract fun startAudioConversation( + liveAudioConversationConfig: LiveAudioConversationConfig + ): ListenableFuture + /** * Starts an audio conversation with the model, which can only be stopped using * [stopAudioConversation] or [close]. * @@ -298,6 +311,12 @@ public abstract class LiveSessionFutures internal constructor() { session.startAudioConversation(transcriptHandler = transcriptHandler) } + @RequiresPermission(RECORD_AUDIO) + override fun startAudioConversation(liveAudioConversationConfig: LiveAudioConversationConfig) = + SuspendToFutureAdapter.launchFuture { + session.startAudioConversation(liveAudioConversationConfig) + } + @RequiresPermission(RECORD_AUDIO) override fun startAudioConversation() = SuspendToFutureAdapter.launchFuture { session.startAudioConversation() } diff --git a/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/AudioHelper.kt b/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/AudioHelper.kt index 06b4a3efe25..e907315fb0e 100644 --- a/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/AudioHelper.kt +++ b/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/AudioHelper.kt @@ -19,7 +19,6 @@ package com.google.firebase.ai.type import android.Manifest import android.media.AudioAttributes import android.media.AudioFormat -import android.media.AudioManager import android.media.AudioRecord import android.media.AudioTrack import android.media.MediaRecorder @@ -157,28 +156,39 @@ internal class AudioHelper( * * It also makes it easier to read, since the long initialization is separate from the * constructor. + * + * @param audioHandler A callback that is invoked with the [AudioRecord.Builder] and the + * [AudioTrack.Builder] before either object is built, offering a final opportunity to + * configure them. The resulting objects will remain valid and effective for the + * duration of the current audio session.
*/ @RequiresPermission(Manifest.permission.RECORD_AUDIO) - fun build(): AudioHelper { - val playbackTrack = - AudioTrack( - AudioAttributes.Builder() - .setUsage(AudioAttributes.USAGE_MEDIA) - .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH) - .build(), + fun build( + audioHandler: ((AudioRecord.Builder, AudioTrack.Builder) -> Unit)? = null + ): AudioHelper { + val playTrackBuilder = AudioTrack.Builder() + playTrackBuilder + .setAudioFormat( AudioFormat.Builder() .setSampleRate(24000) .setChannelMask(AudioFormat.CHANNEL_OUT_MONO) .setEncoding(AudioFormat.ENCODING_PCM_16BIT) - .build(), + .build() + ) + .setAudioAttributes( + AudioAttributes.Builder() + .setUsage(AudioAttributes.USAGE_MEDIA) + .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH) + .build() + ) + .setBufferSizeInBytes( AudioTrack.getMinBufferSize( 24000, AudioFormat.CHANNEL_OUT_MONO, AudioFormat.ENCODING_PCM_16BIT ), - AudioTrack.MODE_STREAM, - AudioManager.AUDIO_SESSION_ID_GENERATE ) + .setTransferMode(AudioTrack.MODE_STREAM) val bufferSize = AudioRecord.getMinBufferSize( @@ -191,15 +201,22 @@ internal class AudioHelper( throw AudioRecordInitializationFailedException( "Audio Record buffer size is invalid ($bufferSize)" ) - - val recorder = - AudioRecord( - MediaRecorder.AudioSource.VOICE_COMMUNICATION, - 16000, - AudioFormat.CHANNEL_IN_MONO, - AudioFormat.ENCODING_PCM_16BIT, - bufferSize - ) + val recorderBuilder = + AudioRecord.Builder() + .setAudioSource(MediaRecorder.AudioSource.VOICE_COMMUNICATION) + .setAudioFormat( + AudioFormat.Builder() + .setEncoding(AudioFormat.ENCODING_PCM_16BIT) + .setSampleRate(16000) + .setChannelMask(AudioFormat.CHANNEL_IN_MONO) + .build() + ) + .setBufferSizeInBytes(bufferSize) + if (audioHandler != null) { + audioHandler(recorderBuilder, playTrackBuilder) + } + val recorder = recorderBuilder.build() + val playbackTrack = playTrackBuilder.build() if (recorder.state != AudioRecord.STATE_INITIALIZED) throw AudioRecordInitializationFailedException( "Audio Record 
initialization has failed. State: ${recorder.state}" diff --git a/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveAudioConversationConfig.kt b/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveAudioConversationConfig.kt new file mode 100644 index 00000000000..3ac00eca76b --- /dev/null +++ b/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveAudioConversationConfig.kt @@ -0,0 +1,130 @@ +/* + * Copyright 2025 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.firebase.ai.type + +import android.media.AudioRecord +import android.media.AudioTrack + +/** + * Configuration parameters to use for conversation config. + * + * @property functionCallHandler A callback that is invoked whenever the model receives a function + * call. The [FunctionResponsePart] that the callback function returns will be automatically sent to + * the model. + * + * @property transcriptHandler A callback that is invoked whenever the model receives a transcript. + * The first [Transcription] object is the input transcription, and the second is the output + * transcription. + * + * @property audioHandler A callback that is invoked immediately following the successful + * initialization of the associated [AudioRecord.Builder] and [AudioTrack.Builder] objects. This + * offers a final opportunity to configure these objects, which will remain valid and effective for + * the duration of the current audio session. 
+ * + * @property enableInterruptions If enabled, allows the user to speak over or interrupt the model's + * ongoing reply. + * + * **WARNING**: The user interruption feature relies on device-specific support, and may not be + * consistently available. + */ +@PublicPreviewAPI +public class LiveAudioConversationConfig +private constructor( + internal val functionCallHandler: ((FunctionCallPart) -> FunctionResponsePart)?, + internal val audioHandler: ((AudioRecord.Builder, AudioTrack.Builder) -> Unit)?, + internal val transcriptHandler: ((Transcription?, Transcription?) -> Unit)?, + internal val enableInterruptions: Boolean +) { + + /** + * Builder for creating a [LiveAudioConversationConfig]. + * + * Mainly intended for Java interop. Kotlin consumers should use [liveAudioConversationConfig] for + * a more idiomatic experience. + * + * @property functionCallHandler See [LiveAudioConversationConfig.functionCallHandler]. + * + * @property audioHandler See [LiveAudioConversationConfig.audioHandler]. + * + * @property transcriptHandler See [LiveAudioConversationConfig.transcriptHandler]. + * + * @property enableInterruptions See [LiveAudioConversationConfig.enableInterruptions]. + */ + public class Builder { + @JvmField public var functionCallHandler: ((FunctionCallPart) -> FunctionResponsePart)? = null + @JvmField public var audioHandler: ((AudioRecord.Builder, AudioTrack.Builder) -> Unit)? = null + @JvmField public var transcriptHandler: ((Transcription?, Transcription?) -> Unit)? = null + @JvmField public var enableInterruptions: Boolean = false + + public fun setFunctionCallHandler( + functionCallHandler: ((FunctionCallPart) -> FunctionResponsePart)? + ): Builder = apply { this.functionCallHandler = functionCallHandler } + + public fun setAudioHandler( + audioHandler: ((AudioRecord.Builder, AudioTrack.Builder) -> Unit)? + ): Builder = apply { this.audioHandler = audioHandler } + + public fun setTranscriptHandler( + transcriptHandler: ((Transcription?, Transcription?) 
-> Unit)? + ): Builder = apply { this.transcriptHandler = transcriptHandler } + + public fun setEnableInterruptions(enableInterruptions: Boolean): Builder = apply { + this.enableInterruptions = enableInterruptions + } + + /** Create a new [LiveAudioConversationConfig] with the attached arguments. */ + public fun build(): LiveAudioConversationConfig = + LiveAudioConversationConfig( + functionCallHandler = functionCallHandler, + audioHandler = audioHandler, + transcriptHandler = transcriptHandler, + enableInterruptions = enableInterruptions + ) + } + + public companion object { + + /** + * Alternative casing for [LiveAudioConversationConfig.Builder]: + * ``` + * val config = LiveAudioConversationConfig.builder() + * ``` + */ + public fun builder(): Builder = Builder() + } +} + +/** + * Helper method to construct a [LiveAudioConversationConfig] in a DSL-like manner. + * + * Example Usage: + * ``` + * liveAudioConversationConfig { + * functionCallHandler = ... + * audioHandler = ... + * ... + * } + * ``` + */ +@OptIn(PublicPreviewAPI::class) +public fun liveAudioConversationConfig( + init: LiveAudioConversationConfig.Builder.() -> Unit +): LiveAudioConversationConfig { + val builder = LiveAudioConversationConfig.builder() + builder.init() + return builder.build() +} diff --git a/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveSession.kt b/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveSession.kt index 37d6f5011cb..a497a01b224 100644 --- a/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveSession.kt +++ b/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveSession.kt @@ -171,6 +171,26 @@ internal constructor( transcriptHandler: ((Transcription?, Transcription?) -> Unit)? 
= null, enableInterruptions: Boolean = false, ) { + startAudioConversation( + liveAudioConversationConfig { + this.functionCallHandler = functionCallHandler + this.transcriptHandler = transcriptHandler + this.enableInterruptions = enableInterruptions + } + ) + } + + /** + * Starts an audio conversation with the model, which can only be stopped using + * [stopAudioConversation] or [close]. + * + * @param liveAudioConversationConfig A [LiveAudioConversationConfig] provided by the user to + * control the various aspects of the conversation. + */ + @RequiresPermission(RECORD_AUDIO) + public suspend fun startAudioConversation( + liveAudioConversationConfig: LiveAudioConversationConfig + ) { val context = firebaseApp.applicationContext if ( @@ -191,11 +211,14 @@ internal constructor( networkScope = CoroutineScope(blockingDispatcher + childJob() + CoroutineName("LiveSession Network")) audioScope = CoroutineScope(audioDispatcher + childJob() + CoroutineName("LiveSession Audio")) - audioHelper = AudioHelper.build() + audioHelper = AudioHelper.build(liveAudioConversationConfig.audioHandler) recordUserAudio() - processModelResponses(functionCallHandler, transcriptHandler) - listenForModelPlayback(enableInterruptions) + processModelResponses( + liveAudioConversationConfig.functionCallHandler, + liveAudioConversationConfig.transcriptHandler + ) + listenForModelPlayback(liveAudioConversationConfig.enableInterruptions) } }