Merged
Changes from 16 commits
1 change: 1 addition & 0 deletions firebase-ai/CHANGELOG.md
@@ -1,5 +1,6 @@
# Unreleased

- [feature] Added support for sending realtime audio and video in a `LiveSession`.
- [changed] Removed redundant internal exception types. (#7475)

# 17.4.0
2 changes: 1 addition & 1 deletion firebase-ai/gradle.properties
@@ -12,5 +12,5 @@
# See the License for the specific language governing permissions and
# limitations under the License.

version=17.4.1
version=17.5.0
latestReleasedVersion=17.4.0
@@ -23,6 +23,7 @@ import com.google.common.util.concurrent.ListenableFuture
import com.google.firebase.ai.type.Content
import com.google.firebase.ai.type.FunctionCallPart
import com.google.firebase.ai.type.FunctionResponsePart
import com.google.firebase.ai.type.InlineData
import com.google.firebase.ai.type.LiveServerMessage
import com.google.firebase.ai.type.LiveSession
import com.google.firebase.ai.type.MediaData
@@ -126,13 +127,38 @@ public abstract class LiveSessionFutures internal constructor() {
functionList: List<FunctionResponsePart>
): ListenableFuture<Unit>

/**
* Sends audio data to the server in realtime. See
* https://ai.google.dev/api/live#bidigeneratecontentrealtimeinput for details on realtime input
* usage.
* @param audio The audio data to send.
*/
public abstract fun sendAudioRealtime(audio: InlineData): ListenableFuture<Unit>

/**
* Sends video data to the server in realtime. See
* https://ai.google.dev/api/live#bidigeneratecontentrealtimeinput for details on realtime input
* usage.
* @param video The video data to send. The MIME type can be either a video format (e.g.,
* `video/webm`) or an image format (e.g., `image/jpeg`).
*/
public abstract fun sendVideoRealtime(video: InlineData): ListenableFuture<Unit>

/**
* Sends text data to the server in realtime. See
* https://ai.google.dev/api/live#bidigeneratecontentrealtimeinput for details on realtime input
* usage.
* @param text The text data to send.
*/
public abstract fun sendTextRealtime(text: String): ListenableFuture<Unit>

/**
* Streams client data to the model.
*
* Calling this after [startAudioConversation] will play the response audio immediately.
*
* @param mediaChunks The list of [MediaData] instances representing the media data to be sent.
*/
@Deprecated("Use sendAudioRealtime, sendVideoRealtime, or sendTextRealtime instead")
public abstract fun sendMediaStream(mediaChunks: List<MediaData>): ListenableFuture<Unit>

/**
@@ -190,6 +216,15 @@ public abstract class LiveSessionFutures internal constructor() {
override fun sendFunctionResponse(functionList: List<FunctionResponsePart>) =
SuspendToFutureAdapter.launchFuture { session.sendFunctionResponse(functionList) }

override fun sendAudioRealtime(audio: InlineData): ListenableFuture<Unit> =
SuspendToFutureAdapter.launchFuture { session.sendAudioRealtime(audio) }

override fun sendVideoRealtime(video: InlineData): ListenableFuture<Unit> =
SuspendToFutureAdapter.launchFuture { session.sendVideoRealtime(video) }

override fun sendTextRealtime(text: String): ListenableFuture<Unit> =
SuspendToFutureAdapter.launchFuture { session.sendTextRealtime(text) }

override fun sendMediaStream(mediaChunks: List<MediaData>) =
SuspendToFutureAdapter.launchFuture { session.sendMediaStream(mediaChunks) }

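For non-coroutine callers, the futures-based variants mirror the suspend API one-to-one. A minimal sketch of using them; the package for `LiveSessionFutures` and the surrounding setup are assumptions, not shown in this diff:

```kotlin
import com.google.firebase.ai.java.LiveSessionFutures // assumed location of the futures wrapper
import com.google.firebase.ai.type.InlineData
import java.util.concurrent.Executor

// Illustrative only: send one realtime audio chunk without coroutines.
fun sendChunk(session: LiveSessionFutures, pcmBytes: ByteArray, executor: Executor) {
  val future = session.sendAudioRealtime(InlineData(pcmBytes, "audio/pcm"))
  // The returned ListenableFuture completes once the chunk is handed to the session.
  future.addListener({ /* chunk sent */ }, executor)
}
```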
@@ -28,6 +28,7 @@ import com.google.firebase.ai.common.JSON
import com.google.firebase.ai.common.util.CancelledCoroutineScope
import com.google.firebase.ai.common.util.accumulateUntil
import com.google.firebase.ai.common.util.childJob
import com.google.firebase.ai.type.MediaData.Internal
import com.google.firebase.annotations.concurrent.Blocking
import io.ktor.client.plugins.websocket.DefaultClientWebSocketSession
import io.ktor.websocket.Frame
@@ -255,20 +256,69 @@ internal constructor(
}
}

/**
* Sends an audio input stream to the model, using the realtime API.
*
* To learn more about supported audio formats, and the form in which they should be provided, see
* the docs on
* [Supported audio formats](https://cloud.google.com/vertex-ai/generative-ai/docs/live-api#supported-audio-formats)
*
* @param audio Raw audio data used to update the model on the client's conversation. For best
* results, send 16-bit PCM audio at 24kHz.
*/
public suspend fun sendAudioRealtime(audio: InlineData) {
FirebaseAIException.catchAsync {
val jsonString =
Json.encodeToString(BidiGenerateContentRealtimeInputSetup(audio = audio).toInternal())
session.send(Frame.Text(jsonString))
}
}

/**
* Sends a video input stream to the model, using the realtime API.
*
* @param video Encoded video data, used to update the model on the client's conversation. The
* MIME type can be a video format (e.g., `video/webm`) or an image format (e.g., `image/jpeg`).
*/
public suspend fun sendVideoRealtime(video: InlineData) {
FirebaseAIException.catchAsync {
val jsonString =
Json.encodeToString(BidiGenerateContentRealtimeInputSetup(video = video).toInternal())
session.send(Frame.Text(jsonString))
}
}

/**
* Sends text input to the model, using the realtime API.
*
* @param text Text content to append to the current client's conversation.
*/
public suspend fun sendTextRealtime(text: String) {
FirebaseAIException.catchAsync {
val jsonString =
Json.encodeToString(BidiGenerateContentRealtimeInputSetup(text = text).toInternal())
session.send(Frame.Text(jsonString))
}
}

/**
* Streams client data to the model.
*
* Calling this after [startAudioConversation] will play the response audio immediately.
*
* @param mediaChunks The list of [MediaData] instances representing the media data to be sent.
*/
@Deprecated("Use sendAudioRealtime, sendVideoRealtime, or sendTextRealtime instead")
public suspend fun sendMediaStream(
mediaChunks: List<MediaData>,
) {
FirebaseAIException.catchAsync {
val jsonString =
Json.encodeToString(
BidiGenerateContentRealtimeInputSetup(mediaChunks.map { (it.toInternal()) }).toInternal()
BidiGenerateContentRealtimeInputSetup(
mediaChunks.map { InlineData(it.data, it.mimeType) }
)
.toInternal()
)
session.send(Frame.Text(jsonString))
}
@@ -324,7 +374,7 @@ internal constructor(
?.listenToRecording()
?.buffer(UNLIMITED)
?.accumulateUntil(MIN_BUFFER_SIZE)
?.onEach { sendMediaStream(listOf(MediaData(it, "audio/pcm"))) }
?.onEach { sendAudioRealtime(InlineData(it, "audio/pcm")) }
?.catch { throw FirebaseAIException.from(it) }
?.launchIn(scope)
}
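The conversation loop above now feeds recorded audio buffers through `sendAudioRealtime` rather than the deprecated `sendMediaStream`. For application code, a minimal sketch of the new suspend API; the session and payloads are illustrative, and preview-API opt-in annotations are omitted:

```kotlin
import com.google.firebase.ai.type.InlineData
import com.google.firebase.ai.type.LiveSession

// Illustrative only: stream one audio chunk, one video frame, and a text turn
// over an already-connected LiveSession.
suspend fun sendRealtimeInput(session: LiveSession, pcm: ByteArray, jpegFrame: ByteArray) {
  // For best results, send 16-bit PCM audio at 24kHz.
  session.sendAudioRealtime(InlineData(pcm, "audio/pcm"))
  // The video method accepts video MIME types or per-frame image MIME types.
  session.sendVideoRealtime(InlineData(jpegFrame, "image/jpeg"))
  session.sendTextRealtime("Describe what you can see and hear.")
}
```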
@@ -464,15 +514,31 @@ internal constructor(
*
* End of turn is derived from user activity (e.g., end of speech).
*/
internal class BidiGenerateContentRealtimeInputSetup(val mediaChunks: List<MediaData.Internal>) {
internal class BidiGenerateContentRealtimeInputSetup(
val mediaChunks: List<InlineData>? = null,
val audio: InlineData? = null,
val video: InlineData? = null,
val text: String? = null
) {
@Serializable
internal class Internal(val realtimeInput: BidiGenerateContentRealtimeInput) {
@Serializable
internal data class BidiGenerateContentRealtimeInput(
val mediaChunks: List<MediaData.Internal>
val mediaChunks: List<InlineData.Internal>?,
val audio: InlineData.Internal?,
val video: InlineData.Internal?,
val text: String?
)
}
fun toInternal() = Internal(Internal.BidiGenerateContentRealtimeInput(mediaChunks))
fun toInternal() =
Internal(
Internal.BidiGenerateContentRealtimeInput(
mediaChunks?.map { it.toInternal() },
audio?.toInternal(),
video?.toInternal(),
text
)
)
}
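For reference, a standalone sketch of the frame shape `toInternal()` serializes to, using illustrative mirror types (the real `Internal` classes are not public, and this assumes the plain `Json` instance used in this hunk):

```kotlin
import kotlinx.serialization.Serializable
import kotlinx.serialization.encodeToString
import kotlinx.serialization.json.Json

// Mirror of the internal wire types above, for illustration only.
@Serializable data class InlineDataWire(val mimeType: String, val data: String)
@Serializable
data class RealtimeInputWire(
  val mediaChunks: List<InlineDataWire>?,
  val audio: InlineDataWire?,
  val video: InlineDataWire?,
  val text: String?
)
@Serializable data class FrameWire(val realtimeInput: RealtimeInputWire)

fun main() {
  val frame = FrameWire(RealtimeInputWire(null, InlineDataWire("audio/pcm", "AAAA"), null, null))
  // With default Json settings the unused fields are serialized as explicit nulls:
  // {"realtimeInput":{"mediaChunks":null,"audio":{"mimeType":"audio/pcm","data":"AAAA"},"video":null,"text":null}}
  println(Json.encodeToString(frame))
}
```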

private companion object {
@@ -27,6 +27,7 @@ import kotlinx.serialization.Serializable
* [Firebase documentation](https://firebase.google.com/docs/vertex-ai/input-file-requirements).
*/
@PublicPreviewAPI
@Deprecated("Use InlineData instead", ReplaceWith("InlineData"))
public class MediaData(public val data: ByteArray, public val mimeType: String) {
@Serializable
internal class Internal(
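A minimal migration sketch for callers of the deprecated type (identifiers are illustrative; preview-API opt-in annotations are omitted):

```kotlin
import com.google.firebase.ai.type.InlineData
import com.google.firebase.ai.type.LiveSession
import com.google.firebase.ai.type.MediaData

// Illustrative only: moving from the deprecated batch API to the typed one.
suspend fun migrate(session: LiveSession, pcmBytes: ByteArray) {
  // Before (deprecated):
  @Suppress("DEPRECATION")
  session.sendMediaStream(listOf(MediaData(pcmBytes, "audio/pcm")))
  // After: same bytes and MIME type, wrapped in InlineData.
  session.sendAudioRealtime(InlineData(pcmBytes, "audio/pcm"))
}
```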
18 changes: 11 additions & 7 deletions firebase-ai/src/main/kotlin/com/google/firebase/ai/type/Part.kt
@@ -19,6 +19,7 @@ package com.google.firebase.ai.type
import android.graphics.Bitmap
import android.graphics.BitmapFactory
import android.util.Log
import com.google.firebase.ai.type.ImagenImageFormat.Internal
import java.io.ByteArrayOutputStream
import kotlinx.serialization.DeserializationStrategy
import kotlinx.serialization.SerialName
@@ -161,14 +162,17 @@

@Serializable
internal data class Internal(
@SerialName("inlineData") val inlineData: InlineData,
@SerialName("inlineData") val inlineData: InlineData.Internal,
val thought: Boolean? = null,
val thoughtSignature: String? = null
) : InternalPart {
) : InternalPart
}

@Serializable
internal data class InlineData(@SerialName("mimeType") val mimeType: String, val data: Base64)
}
public class InlineData(public val data: ByteArray, public val mimeType: String) {
@Serializable internal data class Internal(val mimeType: String, val data: Base64)

internal fun toInternal() =
Internal(mimeType, android.util.Base64.encodeToString(data, BASE_64_FLAGS))
}

/** Represents function call name and params received from requests. */
@@ -334,13 +338,13 @@ internal fun Part.toInternal(): InternalPart {
is TextPart -> TextPart.Internal(text, isThought, thoughtSignature)
is ImagePart ->
InlineDataPart.Internal(
InlineDataPart.Internal.InlineData("image/jpeg", encodeBitmapToBase64Jpeg(image)),
InlineData.Internal("image/jpeg", encodeBitmapToBase64Jpeg(image)),
isThought,
thoughtSignature
)
is InlineDataPart ->
InlineDataPart.Internal(
InlineDataPart.Internal.InlineData(
InlineData.Internal(
mimeType,
android.util.Base64.encodeToString(inlineData, BASE_64_FLAGS)
),
@@ -51,6 +51,7 @@
import com.google.firebase.ai.type.ImagenEditingConfig;
import com.google.firebase.ai.type.ImagenInlineImage;
import com.google.firebase.ai.type.ImagenMaskReference;
import com.google.firebase.ai.type.InlineData;
import com.google.firebase.ai.type.InlineDataPart;
import com.google.firebase.ai.type.LiveGenerationConfig;
import com.google.firebase.ai.type.LiveServerContent;
@@ -365,6 +366,9 @@ public void onComplete() {

byte[] bytes = new byte[] {(byte) 0xCA, (byte) 0xFE, (byte) 0xBA, (byte) 0xBE};
session.sendMediaStream(List.of(new MediaData(bytes, "image/jxl")));
session.sendAudioRealtime(new InlineData(bytes, "audio/jxl"));
session.sendVideoRealtime(new InlineData(bytes, "image/jxl"));
session.sendTextRealtime("text");

FunctionResponsePart functionResponse =
new FunctionResponsePart("myFunction", new JsonObject(Map.of()));