Merged
Changes from 16 commits
1 change: 1 addition & 0 deletions firebase-ai/CHANGELOG.md
@@ -1,5 +1,6 @@
# Unreleased

- [feature] Added support for sending realtime audio and video in a `LiveSession`.
- [changed] Removed redundant internal exception types. (#7475)

# 17.4.0
2 changes: 1 addition & 1 deletion firebase-ai/gradle.properties
@@ -12,5 +12,5 @@
# See the License for the specific language governing permissions and
# limitations under the License.

version=17.4.1
version=17.5.0
latestReleasedVersion=17.4.0
@@ -23,6 +23,7 @@ import com.google.common.util.concurrent.ListenableFuture
import com.google.firebase.ai.type.Content
import com.google.firebase.ai.type.FunctionCallPart
import com.google.firebase.ai.type.FunctionResponsePart
import com.google.firebase.ai.type.InlineData
import com.google.firebase.ai.type.LiveServerMessage
import com.google.firebase.ai.type.LiveSession
import com.google.firebase.ai.type.MediaData
@@ -126,13 +127,38 @@ public abstract class LiveSessionFutures internal constructor() {
functionList: List<FunctionResponsePart>
): ListenableFuture<Unit>

/**
* Sends audio data to the server in realtime. See
* https://ai.google.dev/api/live#bidigeneratecontentrealtimeinput for details on realtime input
* usage.
* @param audio The audio data to send.
*/
public abstract fun sendAudioRealtime(audio: InlineData): ListenableFuture<Unit>

/**
* Sends video data to the server in realtime. See
* https://ai.google.dev/api/live#bidigeneratecontentrealtimeinput for details on realtime input
* usage.
* @param video The video data to send. The MIME type can be either a video format (e.g.,
* `video/webm`) or an image format (e.g., `image/jpeg`).
*/
public abstract fun sendVideoRealtime(video: InlineData): ListenableFuture<Unit>

/**
* Sends text data to the server in realtime. See
* https://ai.google.dev/api/live#bidigeneratecontentrealtimeinput for details on realtime input
* usage.
* @param text The text data to send.
*/
public abstract fun sendTextRealtime(text: String): ListenableFuture<Unit>

/**
* Streams client data to the model.
*
* Calling this after [startAudioConversation] will play the response audio immediately.
*
* @param mediaChunks The list of [MediaData] instances representing the media data to be sent.
*/
@Deprecated("Use sendAudioRealtime, sendVideoRealtime, or sendTextRealtime instead")
public abstract fun sendMediaStream(mediaChunks: List<MediaData>): ListenableFuture<Unit>

/**
@@ -190,6 +216,15 @@ public abstract class LiveSessionFutures internal constructor() {
override fun sendFunctionResponse(functionList: List<FunctionResponsePart>) =
SuspendToFutureAdapter.launchFuture { session.sendFunctionResponse(functionList) }

override fun sendAudioRealtime(audio: InlineData): ListenableFuture<Unit> =
SuspendToFutureAdapter.launchFuture { session.sendAudioRealtime(audio) }

override fun sendVideoRealtime(video: InlineData): ListenableFuture<Unit> =
SuspendToFutureAdapter.launchFuture { session.sendVideoRealtime(video) }

override fun sendTextRealtime(text: String): ListenableFuture<Unit> =
SuspendToFutureAdapter.launchFuture { session.sendTextRealtime(text) }

override fun sendMediaStream(mediaChunks: List<MediaData>) =
SuspendToFutureAdapter.launchFuture { session.sendMediaStream(mediaChunks) }

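For non-coroutine callers, the futures-based variants mirror the suspend API one-to-one. A minimal sketch of using them; the package for `LiveSessionFutures` and the surrounding setup are assumptions, not shown in this diff:

```kotlin
import com.google.firebase.ai.java.LiveSessionFutures // assumed location of the futures wrapper
import com.google.firebase.ai.type.InlineData
import java.util.concurrent.Executor

// Illustrative only: send one realtime audio chunk without coroutines.
fun sendChunk(session: LiveSessionFutures, pcmBytes: ByteArray, executor: Executor) {
  val future = session.sendAudioRealtime(InlineData(pcmBytes, "audio/pcm"))
  // The returned ListenableFuture completes once the chunk is handed to the session.
  future.addListener({ /* chunk sent */ }, executor)
}
```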
@@ -28,6 +28,7 @@ import com.google.firebase.ai.common.JSON
import com.google.firebase.ai.common.util.CancelledCoroutineScope
import com.google.firebase.ai.common.util.accumulateUntil
import com.google.firebase.ai.common.util.childJob
import com.google.firebase.ai.type.MediaData.Internal
import com.google.firebase.annotations.concurrent.Blocking
import io.ktor.client.plugins.websocket.DefaultClientWebSocketSession
import io.ktor.websocket.Frame
@@ -255,20 +256,69 @@ internal constructor(
}
}

/**
* Sends an audio input stream to the model, using the realtime API.
*
* To learn more about supported audio formats, and the form in which they should be provided, see
* the docs on
* [Supported audio formats](https://cloud.google.com/vertex-ai/generative-ai/docs/live-api#supported-audio-formats)
*
* @param audio Raw audio data used to update the model on the client's conversation. For best
* results, send 16-bit PCM audio at 24kHz.
*/
public suspend fun sendAudioRealtime(audio: InlineData) {
FirebaseAIException.catchAsync {
val jsonString =
Json.encodeToString(BidiGenerateContentRealtimeInputSetup(audio = audio).toInternal())
session.send(Frame.Text(jsonString))
}
}

/**
* Sends a video input stream to the model, using the realtime API.
*
* @param video Encoded video data, used to update the model on the client's conversation. The
* MIME type can be a video format (e.g., `video/webm`) or an image format (e.g., `image/jpeg`).
*/
public suspend fun sendVideoRealtime(video: InlineData) {
FirebaseAIException.catchAsync {
val jsonString =
Json.encodeToString(BidiGenerateContentRealtimeInputSetup(video = video).toInternal())
session.send(Frame.Text(jsonString))
}
}

/**
* Sends text input to the model, using the realtime API.
*
* @param text Text content to append to the current client's conversation.
*/
public suspend fun sendTextRealtime(text: String) {
FirebaseAIException.catchAsync {
val jsonString =
Json.encodeToString(BidiGenerateContentRealtimeInputSetup(text = text).toInternal())
session.send(Frame.Text(jsonString))
}
}

/**
* Streams client data to the model.
*
* Calling this after [startAudioConversation] will play the response audio immediately.
*
* @param mediaChunks The list of [MediaData] instances representing the media data to be sent.
*/
@Deprecated("Use sendAudioRealtime, sendVideoRealtime, or sendTextRealtime instead")
public suspend fun sendMediaStream(
mediaChunks: List<MediaData>,
) {
FirebaseAIException.catchAsync {
val jsonString =
Json.encodeToString(
BidiGenerateContentRealtimeInputSetup(mediaChunks.map { (it.toInternal()) }).toInternal()
BidiGenerateContentRealtimeInputSetup(
mediaChunks.map { InlineData(it.data, it.mimeType) }
)
.toInternal()
)
session.send(Frame.Text(jsonString))
}
@@ -324,7 +374,7 @@ internal constructor(
?.listenToRecording()
?.buffer(UNLIMITED)
?.accumulateUntil(MIN_BUFFER_SIZE)
?.onEach { sendMediaStream(listOf(MediaData(it, "audio/pcm"))) }
?.onEach { sendAudioRealtime(InlineData(it, "audio/pcm")) }
?.catch { throw FirebaseAIException.from(it) }
?.launchIn(scope)
}
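The conversation loop above now feeds recorded audio buffers through `sendAudioRealtime` rather than the deprecated `sendMediaStream`. For application code, a minimal sketch of the new suspend API; the session and payloads are illustrative, and preview-API opt-in annotations are omitted:

```kotlin
import com.google.firebase.ai.type.InlineData
import com.google.firebase.ai.type.LiveSession

// Illustrative only: stream one audio chunk, one video frame, and a text turn
// over an already-connected LiveSession.
suspend fun sendRealtimeInput(session: LiveSession, pcm: ByteArray, jpegFrame: ByteArray) {
  // For best results, send 16-bit PCM audio at 24kHz.
  session.sendAudioRealtime(InlineData(pcm, "audio/pcm"))
  // The video method accepts video MIME types or per-frame image MIME types.
  session.sendVideoRealtime(InlineData(jpegFrame, "image/jpeg"))
  session.sendTextRealtime("Describe what you can see and hear.")
}
```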
@@ -464,15 +514,31 @@ internal constructor(
*
* End of turn is derived from user activity (e.g., end of speech).
*/
internal class BidiGenerateContentRealtimeInputSetup(val mediaChunks: List<MediaData.Internal>) {
internal class BidiGenerateContentRealtimeInputSetup(
val mediaChunks: List<InlineData>? = null,
val audio: InlineData? = null,
val video: InlineData? = null,
val text: String? = null
) {
@Serializable
internal class Internal(val realtimeInput: BidiGenerateContentRealtimeInput) {
@Serializable
internal data class BidiGenerateContentRealtimeInput(
val mediaChunks: List<MediaData.Internal>
val mediaChunks: List<InlineData.Internal>?,
val audio: InlineData.Internal?,
val video: InlineData.Internal?,
val text: String?
)
}
fun toInternal() = Internal(Internal.BidiGenerateContentRealtimeInput(mediaChunks))
fun toInternal() =
Internal(
Internal.BidiGenerateContentRealtimeInput(
mediaChunks?.map { it.toInternal() },
audio?.toInternal(),
video?.toInternal(),
text
)
)
}
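For reference, a standalone sketch of the frame shape `toInternal()` serializes to, using illustrative mirror types (the real `Internal` classes are not public, and this assumes the plain `Json` instance used in this hunk):

```kotlin
import kotlinx.serialization.Serializable
import kotlinx.serialization.encodeToString
import kotlinx.serialization.json.Json

// Mirror of the internal wire types above, for illustration only.
@Serializable data class InlineDataWire(val mimeType: String, val data: String)
@Serializable
data class RealtimeInputWire(
  val mediaChunks: List<InlineDataWire>?,
  val audio: InlineDataWire?,
  val video: InlineDataWire?,
  val text: String?
)
@Serializable data class FrameWire(val realtimeInput: RealtimeInputWire)

fun main() {
  val frame = FrameWire(RealtimeInputWire(null, InlineDataWire("audio/pcm", "AAAA"), null, null))
  // With default Json settings the unused fields are serialized as explicit nulls:
  // {"realtimeInput":{"mediaChunks":null,"audio":{"mimeType":"audio/pcm","data":"AAAA"},"video":null,"text":null}}
  println(Json.encodeToString(frame))
}
```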

private companion object {
@@ -27,6 +27,7 @@ import kotlinx.serialization.Serializable
* [Firebase documentation](https://firebase.google.com/docs/vertex-ai/input-file-requirements).
*/
@PublicPreviewAPI
@Deprecated("Use InlineData instead", ReplaceWith("InlineData"))
public class MediaData(public val data: ByteArray, public val mimeType: String) {
@Serializable
internal class Internal(
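A minimal migration sketch for callers of the deprecated type (identifiers are illustrative; preview-API opt-in annotations are omitted):

```kotlin
import com.google.firebase.ai.type.InlineData
import com.google.firebase.ai.type.LiveSession
import com.google.firebase.ai.type.MediaData

// Illustrative only: moving from the deprecated batch API to the typed one.
suspend fun migrate(session: LiveSession, pcmBytes: ByteArray) {
  // Before (deprecated):
  @Suppress("DEPRECATION")
  session.sendMediaStream(listOf(MediaData(pcmBytes, "audio/pcm")))
  // After: same bytes and MIME type, wrapped in InlineData.
  session.sendAudioRealtime(InlineData(pcmBytes, "audio/pcm"))
}
```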
18 changes: 11 additions & 7 deletions firebase-ai/src/main/kotlin/com/google/firebase/ai/type/Part.kt
@@ -19,6 +19,7 @@ package com.google.firebase.ai.type
import android.graphics.Bitmap
import android.graphics.BitmapFactory
import android.util.Log
import com.google.firebase.ai.type.ImagenImageFormat.Internal
import java.io.ByteArrayOutputStream
import kotlinx.serialization.DeserializationStrategy
import kotlinx.serialization.SerialName
@@ -161,14 +162,17 @@

@Serializable
internal data class Internal(
@SerialName("inlineData") val inlineData: InlineData,
@SerialName("inlineData") val inlineData: InlineData.Internal,
val thought: Boolean? = null,
val thoughtSignature: String? = null
) : InternalPart {
) : InternalPart
}

@Serializable
internal data class InlineData(@SerialName("mimeType") val mimeType: String, val data: Base64)
}
public class InlineData(public val data: ByteArray, public val mimeType: String) {
@Serializable internal data class Internal(val mimeType: String, val data: Base64)

internal fun toInternal() =
Internal(mimeType, android.util.Base64.encodeToString(data, BASE_64_FLAGS))
}

/** Represents function call name and params received from requests. */
@@ -334,13 +338,13 @@ internal fun Part.toInternal(): InternalPart {
is TextPart -> TextPart.Internal(text, isThought, thoughtSignature)
is ImagePart ->
InlineDataPart.Internal(
InlineDataPart.Internal.InlineData("image/jpeg", encodeBitmapToBase64Jpeg(image)),
InlineData.Internal("image/jpeg", encodeBitmapToBase64Jpeg(image)),
isThought,
thoughtSignature
)
is InlineDataPart ->
InlineDataPart.Internal(
InlineDataPart.Internal.InlineData(
InlineData.Internal(
mimeType,
android.util.Base64.encodeToString(inlineData, BASE_64_FLAGS)
),
@@ -51,6 +51,7 @@
import com.google.firebase.ai.type.ImagenEditingConfig;
import com.google.firebase.ai.type.ImagenInlineImage;
import com.google.firebase.ai.type.ImagenMaskReference;
import com.google.firebase.ai.type.InlineData;
import com.google.firebase.ai.type.InlineDataPart;
import com.google.firebase.ai.type.LiveGenerationConfig;
import com.google.firebase.ai.type.LiveServerContent;
@@ -365,6 +366,9 @@ public void onComplete() {

byte[] bytes = new byte[] {(byte) 0xCA, (byte) 0xFE, (byte) 0xBA, (byte) 0xBE};
session.sendMediaStream(List.of(new MediaData(bytes, "image/jxl")));
session.sendAudioRealtime(new InlineData(bytes, "audio/jxl"));
session.sendVideoRealtime(new InlineData(bytes, "image/jxl"));
session.sendTextRealtime("text");

FunctionResponsePart functionResponse =
new FunctionResponsePart("myFunction", new JsonObject(Map.of()));