diff --git a/firebase-ai/CHANGELOG.md b/firebase-ai/CHANGELOG.md
index bc356414fb7..053febea25a 100644
--- a/firebase-ai/CHANGELOG.md
+++ b/firebase-ai/CHANGELOG.md
@@ -1,5 +1,6 @@
 # Unreleased
 
+- [feature] Added support for sending realtime audio and video in a `LiveSession`.
 - [changed] Removed redundant internal exception types. (#7475)
 
 # 17.4.0
diff --git a/firebase-ai/api.txt b/firebase-ai/api.txt
index f8df1f045bc..ba27e5682d8 100644
--- a/firebase-ai/api.txt
+++ b/firebase-ai/api.txt
@@ -145,8 +145,11 @@ package com.google.firebase.ai.java {
     method public abstract org.reactivestreams.Publisher receive();
     method public abstract com.google.common.util.concurrent.ListenableFuture send(com.google.firebase.ai.type.Content content);
     method public abstract com.google.common.util.concurrent.ListenableFuture send(String text);
+    method public abstract com.google.common.util.concurrent.ListenableFuture sendAudioRealtime(com.google.firebase.ai.type.InlineData audio);
     method public abstract com.google.common.util.concurrent.ListenableFuture sendFunctionResponse(java.util.List functionList);
-    method public abstract com.google.common.util.concurrent.ListenableFuture sendMediaStream(java.util.List mediaChunks);
+    method @Deprecated public abstract com.google.common.util.concurrent.ListenableFuture sendMediaStream(java.util.List mediaChunks);
+    method public abstract com.google.common.util.concurrent.ListenableFuture sendTextRealtime(String text);
+    method public abstract com.google.common.util.concurrent.ListenableFuture sendVideoRealtime(com.google.firebase.ai.type.InlineData video);
     method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture startAudioConversation();
     method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture startAudioConversation(boolean enableInterruptions);
     method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture startAudioConversation(kotlin.jvm.functions.Function1? functionCallHandler);
@@ -801,6 +804,14 @@ package com.google.firebase.ai.type {
   public static final class ImagenSubjectReferenceType.Companion {
   }
 
+  public final class InlineData {
+    ctor public InlineData(byte[] data, String mimeType);
+    method public byte[] getData();
+    method public String getMimeType();
+    property public final byte[] data;
+    property public final String mimeType;
+  }
+
   public final class InlineDataPart implements com.google.firebase.ai.type.Part {
     ctor public InlineDataPart(byte[] inlineData, String mimeType);
     method public byte[] getInlineData();
@@ -891,20 +902,23 @@ package com.google.firebase.ai.type {
     method public kotlinx.coroutines.flow.Flow receive();
     method public suspend Object? send(com.google.firebase.ai.type.Content content, kotlin.coroutines.Continuation);
     method public suspend Object? send(String text, kotlin.coroutines.Continuation);
+    method public suspend Object? sendAudioRealtime(com.google.firebase.ai.type.InlineData audio, kotlin.coroutines.Continuation);
     method public suspend Object? sendFunctionResponse(java.util.List functionList, kotlin.coroutines.Continuation);
-    method public suspend Object? sendMediaStream(java.util.List mediaChunks, kotlin.coroutines.Continuation);
+    method @Deprecated public suspend Object? sendMediaStream(java.util.List mediaChunks, kotlin.coroutines.Continuation);
+    method public suspend Object? sendTextRealtime(String text, kotlin.coroutines.Continuation);
+    method public suspend Object? sendVideoRealtime(com.google.firebase.ai.type.InlineData video, kotlin.coroutines.Continuation);
     method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public suspend Object? startAudioConversation(kotlin.jvm.functions.Function1? functionCallHandler = null, boolean enableInterruptions = false, kotlin.coroutines.Continuation);
     method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public suspend Object? startAudioConversation(kotlin.jvm.functions.Function1? functionCallHandler = null, kotlin.coroutines.Continuation);
     method public void stopAudioConversation();
     method public void stopReceiving();
   }
 
-  @com.google.firebase.ai.type.PublicPreviewAPI public final class MediaData {
-    ctor public MediaData(byte[] data, String mimeType);
-    method public byte[] getData();
-    method public String getMimeType();
-    property public final byte[] data;
-    property public final String mimeType;
+  @Deprecated @com.google.firebase.ai.type.PublicPreviewAPI public final class MediaData {
+    ctor @Deprecated public MediaData(byte[] data, String mimeType);
+    method @Deprecated public byte[] getData();
+    method @Deprecated public String getMimeType();
+    property @Deprecated public final byte[] data;
+    property @Deprecated public final String mimeType;
   }
 
   public final class ModalityTokenCount {
diff --git a/firebase-ai/gradle.properties b/firebase-ai/gradle.properties
index c1ee825b2cb..c4acd5b3aae 100644
--- a/firebase-ai/gradle.properties
+++ b/firebase-ai/gradle.properties
@@ -12,5 +12,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-version=17.4.1
+version=17.5.0
 latestReleasedVersion=17.4.0
diff --git a/firebase-ai/src/main/kotlin/com/google/firebase/ai/java/LiveSessionFutures.kt b/firebase-ai/src/main/kotlin/com/google/firebase/ai/java/LiveSessionFutures.kt
index a9615ac2afb..2fb74689643 100644
--- a/firebase-ai/src/main/kotlin/com/google/firebase/ai/java/LiveSessionFutures.kt
+++ b/firebase-ai/src/main/kotlin/com/google/firebase/ai/java/LiveSessionFutures.kt
@@ -23,6 +23,7 @@ import com.google.common.util.concurrent.ListenableFuture
 import com.google.firebase.ai.type.Content
 import com.google.firebase.ai.type.FunctionCallPart
 import com.google.firebase.ai.type.FunctionResponsePart
+import com.google.firebase.ai.type.InlineData
 import com.google.firebase.ai.type.LiveServerMessage
 import com.google.firebase.ai.type.LiveSession
 import com.google.firebase.ai.type.MediaData
@@ -126,6 +127,30 @@ public abstract class LiveSessionFutures internal constructor() {
     functionList: List<FunctionResponsePart>
   ): ListenableFuture<Unit>
 
+  /**
+   * Sends audio data to the server in realtime. See
+   * https://ai.google.dev/api/live#bidigeneratecontentrealtimeinput for details about realtime
+   * input usage.
+   * @param audio The audio data to send.
+   */
+  public abstract fun sendAudioRealtime(audio: InlineData): ListenableFuture<Unit>
+
+  /**
+   * Sends video data to the server in realtime. See
+   * https://ai.google.dev/api/live#bidigeneratecontentrealtimeinput for details about realtime
+   * input usage.
+   * @param video The video data to send. The MIME type can be either a video or an image format.
+   */
+  public abstract fun sendVideoRealtime(video: InlineData): ListenableFuture<Unit>
+
+  /**
+   * Sends text data to the server in realtime. See
+   * https://ai.google.dev/api/live#bidigeneratecontentrealtimeinput for details about realtime
+   * input usage.
+   * @param text The text data to send.
+   */
+  public abstract fun sendTextRealtime(text: String): ListenableFuture<Unit>
+
   /**
    * Streams client data to the model.
    *
@@ -133,6 +158,7 @@ public abstract class LiveSessionFutures internal constructor() {
    *
    * @param mediaChunks The list of [MediaData] instances representing the media data to be sent.
    */
+  @Deprecated("Use sendAudioRealtime, sendVideoRealtime, or sendTextRealtime instead")
   public abstract fun sendMediaStream(mediaChunks: List<MediaData>): ListenableFuture<Unit>
 
   /**
@@ -190,6 +216,15 @@
     override fun sendFunctionResponse(functionList: List<FunctionResponsePart>) =
       SuspendToFutureAdapter.launchFuture { session.sendFunctionResponse(functionList) }
 
+    override fun sendAudioRealtime(audio: InlineData): ListenableFuture<Unit> =
+      SuspendToFutureAdapter.launchFuture { session.sendAudioRealtime(audio) }
+
+    override fun sendVideoRealtime(video: InlineData): ListenableFuture<Unit> =
+      SuspendToFutureAdapter.launchFuture { session.sendVideoRealtime(video) }
+
+    override fun sendTextRealtime(text: String): ListenableFuture<Unit> =
+      SuspendToFutureAdapter.launchFuture { session.sendTextRealtime(text) }
+
     override fun sendMediaStream(mediaChunks: List<MediaData>) =
       SuspendToFutureAdapter.launchFuture { session.sendMediaStream(mediaChunks) }
 
diff --git a/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveSession.kt b/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveSession.kt
index c703cd959c3..0e6796ab01b 100644
--- a/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveSession.kt
+++ b/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveSession.kt
@@ -28,6 +28,7 @@ import com.google.firebase.ai.common.JSON
 import com.google.firebase.ai.common.util.CancelledCoroutineScope
 import com.google.firebase.ai.common.util.accumulateUntil
 import com.google.firebase.ai.common.util.childJob
+import com.google.firebase.ai.type.MediaData.Internal
 import com.google.firebase.annotations.concurrent.Blocking
 import io.ktor.client.plugins.websocket.DefaultClientWebSocketSession
 import io.ktor.websocket.Frame
@@ -255,6 +256,51 @@ internal constructor(
     }
   }
 
+  /**
+   * Sends an audio input stream to the model, using the realtime API.
+   *
+   * To learn more about the supported audio formats and how they should be provided, see the
+   * docs on
+   * [Supported audio formats](https://cloud.google.com/vertex-ai/generative-ai/docs/live-api#supported-audio-formats)
+   *
+   * @param audio Raw audio data used to update the model on the client's conversation. For best
+   * results, send 16-bit PCM audio at 24kHz.
+   */
+  public suspend fun sendAudioRealtime(audio: InlineData) {
+    FirebaseAIException.catchAsync {
+      val jsonString =
+        Json.encodeToString(BidiGenerateContentRealtimeInputSetup(audio = audio).toInternal())
+      session.send(Frame.Text(jsonString))
+    }
+  }
+
+  /**
+   * Sends a video input stream to the model, using the realtime API.
+   *
+   * @param video Encoded video data, used to update the model on the client's conversation. The
+   * MIME type can be a video format (e.g., `video/webm`) or an image format (e.g., `image/jpeg`).
+   */
+  public suspend fun sendVideoRealtime(video: InlineData) {
+    FirebaseAIException.catchAsync {
+      val jsonString =
+        Json.encodeToString(BidiGenerateContentRealtimeInputSetup(video = video).toInternal())
+      session.send(Frame.Text(jsonString))
+    }
+  }
+
+  /**
+   * Sends a text input stream to the model, using the realtime API.
+   *
+   * @param text Text content to append to the current client's conversation.
+   */
+  public suspend fun sendTextRealtime(text: String) {
+    FirebaseAIException.catchAsync {
+      val jsonString =
+        Json.encodeToString(BidiGenerateContentRealtimeInputSetup(text = text).toInternal())
+      session.send(Frame.Text(jsonString))
+    }
+  }
+
   /**
    * Streams client data to the model.
    *
@@ -262,13 +308,17 @@ internal constructor(
    *
    * @param mediaChunks The list of [MediaData] instances representing the media data to be sent.
    */
+  @Deprecated("Use sendAudioRealtime, sendVideoRealtime, or sendTextRealtime instead")
   public suspend fun sendMediaStream(
     mediaChunks: List<MediaData>,
   ) {
     FirebaseAIException.catchAsync {
       val jsonString =
         Json.encodeToString(
-          BidiGenerateContentRealtimeInputSetup(mediaChunks.map { (it.toInternal()) }).toInternal()
+          BidiGenerateContentRealtimeInputSetup(
+            mediaChunks.map { InlineData(it.data, it.mimeType) }
+          )
+            .toInternal()
         )
       session.send(Frame.Text(jsonString))
     }
@@ -324,7 +374,7 @@ internal constructor(
       ?.listenToRecording()
       ?.buffer(UNLIMITED)
       ?.accumulateUntil(MIN_BUFFER_SIZE)
-      ?.onEach { sendMediaStream(listOf(MediaData(it, "audio/pcm"))) }
+      ?.onEach { sendAudioRealtime(InlineData(it, "audio/pcm")) }
       ?.catch { throw FirebaseAIException.from(it) }
       ?.launchIn(scope)
   }
@@ -464,15 +514,31 @@ internal constructor(
    *
    * End of turn is derived from user activity (eg; end of speech).
    */
-  internal class BidiGenerateContentRealtimeInputSetup(val mediaChunks: List<MediaData.Internal>) {
+  internal class BidiGenerateContentRealtimeInputSetup(
+    val mediaChunks: List<InlineData>? = null,
+    val audio: InlineData? = null,
+    val video: InlineData? = null,
+    val text: String? = null
+  ) {
     @Serializable
     internal class Internal(val realtimeInput: BidiGenerateContentRealtimeInput) {
       @Serializable
       internal data class BidiGenerateContentRealtimeInput(
-        val mediaChunks: List<MediaData.Internal>
+        val mediaChunks: List<InlineData.Internal>?,
+        val audio: InlineData.Internal?,
+        val video: InlineData.Internal?,
+        val text: String?
       )
     }
-    fun toInternal() = Internal(Internal.BidiGenerateContentRealtimeInput(mediaChunks))
+    fun toInternal() =
+      Internal(
+        Internal.BidiGenerateContentRealtimeInput(
+          mediaChunks?.map { it.toInternal() },
+          audio?.toInternal(),
+          video?.toInternal(),
+          text
+        )
+      )
   }
 
   private companion object {
diff --git a/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/MediaData.kt b/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/MediaData.kt
index 1262027989d..7647c687934 100644
--- a/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/MediaData.kt
+++ b/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/MediaData.kt
@@ -27,6 +27,7 @@ import kotlinx.serialization.Serializable
 * [Firebase documentation](https://firebase.google.com/docs/vertex-ai/input-file-requirements).
 */
 @PublicPreviewAPI
+@Deprecated("Use InlineData instead", ReplaceWith("InlineData"))
 public class MediaData(public val data: ByteArray, public val mimeType: String) {
   @Serializable
   internal class Internal(
diff --git a/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/Part.kt b/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/Part.kt
index 4312fd5bdbd..d232c222b10 100644
--- a/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/Part.kt
+++ b/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/Part.kt
@@ -19,6 +19,7 @@ package com.google.firebase.ai.type
 import android.graphics.Bitmap
 import android.graphics.BitmapFactory
 import android.util.Log
+import com.google.firebase.ai.type.ImagenImageFormat.Internal
 import java.io.ByteArrayOutputStream
 import kotlinx.serialization.DeserializationStrategy
 import kotlinx.serialization.SerialName
@@ -161,14 +162,22 @@ internal constructor(
   @Serializable
   internal data class Internal(
-    @SerialName("inlineData") val inlineData: InlineData,
+    @SerialName("inlineData") val inlineData: InlineData.Internal,
     val thought: Boolean? = null,
     val thoughtSignature: String? = null
-  ) : InternalPart {
+  ) : InternalPart
+}
 
-    @Serializable
-    internal data class InlineData(@SerialName("mimeType") val mimeType: String, val data: Base64)
-  }
+/**
+ * Represents binary data with an associated MIME type.
+ * @property data the binary data as a [ByteArray]
+ * @property mimeType an IANA standard MIME type.
+ */
+public class InlineData(public val data: ByteArray, public val mimeType: String) {
+  @Serializable internal data class Internal(val mimeType: String, val data: Base64)
+
+  internal fun toInternal() =
+    Internal(mimeType, android.util.Base64.encodeToString(data, BASE_64_FLAGS))
 }
 
 /** Represents function call name and params received from requests. */
@@ -334,13 +343,13 @@ internal fun Part.toInternal(): InternalPart {
     is TextPart -> TextPart.Internal(text, isThought, thoughtSignature)
     is ImagePart ->
       InlineDataPart.Internal(
-        InlineDataPart.Internal.InlineData("image/jpeg", encodeBitmapToBase64Jpeg(image)),
+        InlineData.Internal("image/jpeg", encodeBitmapToBase64Jpeg(image)),
         isThought,
         thoughtSignature
       )
     is InlineDataPart ->
       InlineDataPart.Internal(
-        InlineDataPart.Internal.InlineData(
+        InlineData.Internal(
           mimeType,
           android.util.Base64.encodeToString(inlineData, BASE_64_FLAGS)
         ),
diff --git a/firebase-ai/src/testUtil/java/com/google/firebase/ai/JavaCompileTests.java b/firebase-ai/src/testUtil/java/com/google/firebase/ai/JavaCompileTests.java
index fd2fb81687b..ef18dd94ae7 100644
--- a/firebase-ai/src/testUtil/java/com/google/firebase/ai/JavaCompileTests.java
+++ b/firebase-ai/src/testUtil/java/com/google/firebase/ai/JavaCompileTests.java
@@ -51,6 +51,7 @@ import com.google.firebase.ai.type.ImagenEditingConfig;
 import com.google.firebase.ai.type.ImagenInlineImage;
 import com.google.firebase.ai.type.ImagenMaskReference;
+import com.google.firebase.ai.type.InlineData;
 import com.google.firebase.ai.type.InlineDataPart;
 import com.google.firebase.ai.type.LiveGenerationConfig;
 import com.google.firebase.ai.type.LiveServerContent;
@@ -365,6 +366,9 @@ public void onComplete() {
     byte[] bytes = new byte[] {(byte) 0xCA, (byte) 0xFE, (byte) 0xBA, (byte) 0xBE};
     session.sendMediaStream(List.of(new MediaData(bytes, "image/jxl")));
+    session.sendAudioRealtime(new InlineData(bytes, "audio/jxl"));
+    session.sendVideoRealtime(new InlineData(bytes, "image/jxl"));
+    session.sendTextRealtime("text");
     FunctionResponsePart functionResponse =
         new FunctionResponsePart("myFunction", new JsonObject(Map.of()));