Merged · Changes from 8 commits
24 changes: 23 additions & 1 deletion firebase-ai/api.txt
@@ -154,6 +154,9 @@ package com.google.firebase.ai.java {
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(boolean enableInterruptions);
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler);
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler, boolean enableInterruptions);
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler, kotlin.jvm.functions.Function2<? super com.google.firebase.ai.type.Transcription?,? super com.google.firebase.ai.type.Transcription?,kotlin.Unit>? transcriptHandler = null, boolean enableInterruptions);
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(kotlin.jvm.functions.Function2<? super com.google.firebase.ai.type.Transcription?,? super com.google.firebase.ai.type.Transcription?,kotlin.Unit>? transcriptHandler = null);
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(kotlin.jvm.functions.Function2<? super com.google.firebase.ai.type.Transcription?,? super com.google.firebase.ai.type.Transcription?,kotlin.Unit>? transcriptHandler = null, boolean enableInterruptions);
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> stopAudioConversation();
method public abstract void stopReceiving();
field public static final com.google.firebase.ai.java.LiveSessionFutures.Companion Companion;
@@ -174,6 +177,10 @@ package com.google.firebase.ai.type {
ctor public AudioRecordInitializationFailedException(String message);
}

public final class AudioTranscriptionConfig {
ctor public AudioTranscriptionConfig();
}

public final class BlockReason {
method public String getName();
method public int getOrdinal();
@@ -839,15 +846,19 @@ package com.google.firebase.ai.type {
ctor public LiveGenerationConfig.Builder();
method public com.google.firebase.ai.type.LiveGenerationConfig build();
method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setFrequencyPenalty(Float? frequencyPenalty);
method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setInputAudioTranscript(com.google.firebase.ai.type.AudioTranscriptionConfig? config);
method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setMaxOutputTokens(Integer? maxOutputTokens);
method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setOutputAudioTranscript(com.google.firebase.ai.type.AudioTranscriptionConfig? config);
method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setPresencePenalty(Float? presencePenalty);
method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setResponseModality(com.google.firebase.ai.type.ResponseModality? responseModality);
method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setSpeechConfig(com.google.firebase.ai.type.SpeechConfig? speechConfig);
method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setTemperature(Float? temperature);
method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setTopK(Integer? topK);
method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setTopP(Float? topP);
field public Float? frequencyPenalty;
field public com.google.firebase.ai.type.AudioTranscriptionConfig? inputAudioTranscription;
field public Integer? maxOutputTokens;
field public com.google.firebase.ai.type.AudioTranscriptionConfig? outputAudioTranscription;
field public Float? presencePenalty;
field public com.google.firebase.ai.type.ResponseModality? responseModality;
field public com.google.firebase.ai.type.SpeechConfig? speechConfig;
@@ -865,14 +876,18 @@ package com.google.firebase.ai.type {
}

@com.google.firebase.ai.type.PublicPreviewAPI public final class LiveServerContent implements com.google.firebase.ai.type.LiveServerMessage {
ctor public LiveServerContent(com.google.firebase.ai.type.Content? content, boolean interrupted, boolean turnComplete, boolean generationComplete);
ctor public LiveServerContent(com.google.firebase.ai.type.Content? content, boolean interrupted, boolean turnComplete, boolean generationComplete, com.google.firebase.ai.type.Transcription? inputTranscription, com.google.firebase.ai.type.Transcription? outputTranscription);
method public com.google.firebase.ai.type.Content? getContent();
method public boolean getGenerationComplete();
method public com.google.firebase.ai.type.Transcription? getInputTranscription();
method public boolean getInterrupted();
method public com.google.firebase.ai.type.Transcription? getOutputTranscription();
method public boolean getTurnComplete();
property public final com.google.firebase.ai.type.Content? content;
property public final boolean generationComplete;
property public final com.google.firebase.ai.type.Transcription? inputTranscription;
property public final boolean interrupted;
property public final com.google.firebase.ai.type.Transcription? outputTranscription;
property public final boolean turnComplete;
}

@@ -909,6 +924,7 @@ package com.google.firebase.ai.type {
method public suspend Object? sendVideoRealtime(com.google.firebase.ai.type.InlineData video, kotlin.coroutines.Continuation<? super kotlin.Unit>);
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public suspend Object? startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler = null, boolean enableInterruptions = false, kotlin.coroutines.Continuation<? super kotlin.Unit>);
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public suspend Object? startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler = null, kotlin.coroutines.Continuation<? super kotlin.Unit>);
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public suspend Object? startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler = null, kotlin.jvm.functions.Function2<? super com.google.firebase.ai.type.Transcription?,? super com.google.firebase.ai.type.Transcription?,kotlin.Unit>? transcriptHandler = null, boolean enableInterruptions = false, kotlin.coroutines.Continuation<? super kotlin.Unit>);
method public void stopAudioConversation();
method public void stopReceiving();
}
@@ -1235,6 +1251,12 @@ package com.google.firebase.ai.type {
ctor public ToolConfig(com.google.firebase.ai.type.FunctionCallingConfig? functionCallingConfig);
}

public final class Transcription {
ctor public Transcription(String? text);
method public String? getText();
property public final String? text;
}

public final class UnknownException extends com.google.firebase.ai.type.FirebaseAIException {
}

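Taken together, the api.txt changes add transcription plumbing in several places: an AudioTranscriptionConfig marker type, input/output transcription slots on LiveGenerationConfig, transcription fields on LiveServerContent, and transcriptHandler overloads on the session APIs. A minimal sketch of how they compose; the entry point, backend, and model name follow the SDK's documented pattern but are assumptions here, not part of this diff:

```kotlin
import com.google.firebase.Firebase
import com.google.firebase.ai.ai
import com.google.firebase.ai.type.AudioTranscriptionConfig
import com.google.firebase.ai.type.GenerativeBackend
import com.google.firebase.ai.type.LiveGenerationConfig
import com.google.firebase.ai.type.PublicPreviewAPI
import com.google.firebase.ai.type.ResponseModality

@OptIn(PublicPreviewAPI::class)
suspend fun startTranscribedConversation() {
    // Attaching an AudioTranscriptionConfig to a side enables transcription for that side.
    val config = LiveGenerationConfig.Builder()
        .setResponseModality(ResponseModality.AUDIO)
        .setInputAudioTranscript(AudioTranscriptionConfig())
        .setOutputAudioTranscript(AudioTranscriptionConfig())
        .build()

    // Illustrative model name and backend; any Live API model works the same way.
    val session = Firebase.ai(backend = GenerativeBackend.vertexAI())
        .liveModel("gemini-2.0-flash-live-preview-04-09", generationConfig = config)
        .connect()

    // Requires the RECORD_AUDIO runtime permission.
    session.startAudioConversation(
        transcriptHandler = { input, output ->
            // Either argument may be null, as may the text inside it.
            input?.text?.let { println("user: $it") }
            output?.text?.let { println("model: $it") }
        }
    )
}
```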
@@ -111,7 +111,9 @@ internal constructor(
modelName,
config?.toInternal(),
tools?.map { it.toInternal() },
systemInstruction?.toInternal()
systemInstruction?.toInternal(),
config?.inputAudioTranscription?.toInternal(),
config?.outputAudioTranscription?.toInternal()
)
.toInternal()
val data: String = Json.encodeToString(clientMessage)
@@ -135,7 +137,7 @@
} catch (e: ClosedReceiveChannelException) {
val reason = webSession?.closeReason?.await()
val message =
"Channel was closed by the server.${if(reason!=null) " Details: ${reason.message}" else "" }"
"Channel was closed by the server.${if (reason != null) " Details: ${reason.message}" else ""}"
throw ServiceConnectionHandshakeFailedException(message, e)
}
}
LiveSessionFutures.kt
@@ -29,6 +29,7 @@ import com.google.firebase.ai.type.LiveSession
import com.google.firebase.ai.type.MediaData
import com.google.firebase.ai.type.PublicPreviewAPI
import com.google.firebase.ai.type.SessionAlreadyReceivingException
import com.google.firebase.ai.type.Transcription
import io.ktor.websocket.close
import kotlinx.coroutines.reactive.asPublisher
import org.reactivestreams.Publisher
@@ -53,6 +54,18 @@ public abstract class LiveSessionFutures internal constructor() {
functionCallHandler: ((FunctionCallPart) -> FunctionResponsePart)?
): ListenableFuture<Unit>

/**
* Starts an audio conversation with the model, which can only be stopped using
* [stopAudioConversation].
* @param transcriptHandler A callback function that is invoked whenever a transcription is
* received from the model. The first [Transcription] object is the transcription of the user's
* input audio, and the second is the transcription of the model's output audio.
*/
@RequiresPermission(RECORD_AUDIO)
public abstract fun startAudioConversation(
transcriptHandler: ((Transcription?, Transcription?) -> Unit)? = null,
): ListenableFuture<Unit>

/**
* Starts an audio conversation with the model, which can only be stopped using
* [stopAudioConversation].
@@ -73,6 +86,50 @@
@RequiresPermission(RECORD_AUDIO)
public abstract fun startAudioConversation(enableInterruptions: Boolean): ListenableFuture<Unit>

/**
* Starts an audio conversation with the model, which can only be stopped using
* [stopAudioConversation] or [close].
*
* @param transcriptHandler A callback function that is invoked whenever a transcription is
* received from the model. The first [Transcription] object is the transcription of the user's
* input audio, and the second is the transcription of the model's output audio.
*
* @param enableInterruptions If enabled, allows the user to speak over or interrupt the model's
* ongoing reply.
*
* **WARNING**: The user interruption feature relies on device-specific support, and may not be
* consistently available.
*/
@RequiresPermission(RECORD_AUDIO)
public abstract fun startAudioConversation(
transcriptHandler: ((Transcription?, Transcription?) -> Unit)? = null,
enableInterruptions: Boolean
): ListenableFuture<Unit>

/**
* Starts an audio conversation with the model, which can only be stopped using
* [stopAudioConversation] or [close].
*
* @param functionCallHandler A callback function that is invoked whenever the model receives a
* function call.
*
* @param transcriptHandler A callback function that is invoked whenever a transcription is
* received from the model. The first [Transcription] object is the transcription of the user's
* input audio, and the second is the transcription of the model's output audio.
*
* @param enableInterruptions If enabled, allows the user to speak over or interrupt the model's
* ongoing reply.
*
* **WARNING**: The user interruption feature relies on device-specific support, and may not be
* consistently available.
*/
@RequiresPermission(RECORD_AUDIO)
public abstract fun startAudioConversation(
functionCallHandler: ((FunctionCallPart) -> FunctionResponsePart)?,
transcriptHandler: ((Transcription?, Transcription?) -> Unit)? = null,
enableInterruptions: Boolean
): ListenableFuture<Unit>

/**
* Starts an audio conversation with the model, which can only be stopped using
* [stopAudioConversation] or [close].
@@ -233,6 +290,14 @@ public abstract class LiveSessionFutures internal constructor() {
functionCallHandler: ((FunctionCallPart) -> FunctionResponsePart)?
) = SuspendToFutureAdapter.launchFuture { session.startAudioConversation(functionCallHandler) }

@RequiresPermission(RECORD_AUDIO)
override fun startAudioConversation(
transcriptHandler: ((Transcription?, Transcription?) -> Unit)?
) =
SuspendToFutureAdapter.launchFuture {
session.startAudioConversation(transcriptHandler = transcriptHandler)
}

@RequiresPermission(RECORD_AUDIO)
override fun startAudioConversation() =
SuspendToFutureAdapter.launchFuture { session.startAudioConversation() }
@@ -243,6 +308,32 @@
session.startAudioConversation(enableInterruptions = enableInterruptions)
}

@RequiresPermission(RECORD_AUDIO)
override fun startAudioConversation(
transcriptHandler: ((Transcription?, Transcription?) -> Unit)?,
enableInterruptions: Boolean
) =
SuspendToFutureAdapter.launchFuture {
session.startAudioConversation(
transcriptHandler = transcriptHandler,
enableInterruptions = enableInterruptions
)
}

@RequiresPermission(RECORD_AUDIO)
override fun startAudioConversation(
functionCallHandler: ((FunctionCallPart) -> FunctionResponsePart)?,
transcriptHandler: ((Transcription?, Transcription?) -> Unit)?,
enableInterruptions: Boolean
) =
SuspendToFutureAdapter.launchFuture {
session.startAudioConversation(
functionCallHandler = functionCallHandler,
transcriptHandler = transcriptHandler,
enableInterruptions = enableInterruptions
)
}

@RequiresPermission(RECORD_AUDIO)
override fun startAudioConversation(
functionCallHandler: ((FunctionCallPart) -> FunctionResponsePart)?,
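The same capability reaches Java callers through the LiveSessionFutures overloads above. A sketch of the transcript-handler variant, assuming the companion's existing from(LiveSession) factory; obtaining the underlying session is out of scope here:

```kotlin
import com.google.common.util.concurrent.ListenableFuture
import com.google.firebase.ai.java.LiveSessionFutures
import com.google.firebase.ai.type.LiveSession
import com.google.firebase.ai.type.PublicPreviewAPI

@OptIn(PublicPreviewAPI::class)
fun listenWithTranscripts(session: LiveSession): ListenableFuture<Unit> {
    val futures = LiveSessionFutures.from(session)
    // Requires the RECORD_AUDIO runtime permission; the named argument selects the
    // single-parameter overload, since enableInterruptions has no default there.
    return futures.startAudioConversation(
        transcriptHandler = { input, output ->
            println("user: ${input?.text} / model: ${output?.text}")
        }
    )
}
```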
AudioTranscriptionConfig.kt (new file)
@@ -0,0 +1,27 @@
/*
* Copyright 2025 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.google.firebase.ai.type

import kotlinx.serialization.Serializable

/** The audio transcription configuration. Its presence enables audio transcription. */
public class AudioTranscriptionConfig() {

@Serializable internal object Internal

internal fun toInternal() = Internal
}
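AudioTranscriptionConfig carries no fields: it acts purely as a presence flag, and its @Serializable Internal object encodes as an empty JSON object. A standalone sketch of that kotlinx.serialization behavior, using a stand-in object since Internal is module-private:

```kotlin
import kotlinx.serialization.Serializable
import kotlinx.serialization.encodeToString
import kotlinx.serialization.json.Json

// A @Serializable object carries no state and encodes as an empty JSON object.
@Serializable
object TranscriptionMarker

fun main() {
    println(Json.encodeToString(TranscriptionMarker)) // prints: {}
}
```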
LiveClientSetupMessage.kt
@@ -32,7 +32,9 @@ internal class LiveClientSetupMessage(
// needs its own config class
val generationConfig: LiveGenerationConfig.Internal?,
val tools: List<Tool.Internal>?,
val systemInstruction: Content.Internal?
val systemInstruction: Content.Internal?,
val inputAudioTranscription: AudioTranscriptionConfig.Internal?,
val outputAudioTranscription: AudioTranscriptionConfig.Internal?,
) {
@Serializable
internal class Internal(val setup: LiveClientSetup) {
@@ -41,10 +43,21 @@
val model: String,
val generationConfig: LiveGenerationConfig.Internal?,
val tools: List<Tool.Internal>?,
val systemInstruction: Content.Internal?
val systemInstruction: Content.Internal?,
val inputAudioTranscription: AudioTranscriptionConfig.Internal?,
val outputAudioTranscription: AudioTranscriptionConfig.Internal?,
)
}

fun toInternal() =
Internal(Internal.LiveClientSetup(model, generationConfig, tools, systemInstruction))
Internal(
Internal.LiveClientSetup(
model,
generationConfig,
tools,
systemInstruction,
inputAudioTranscription,
outputAudioTranscription
)
)
}
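With the two new fields threaded through LiveClientSetupMessage, the serialized setup envelope gains inputAudioTranscription / outputAudioTranscription entries whenever the corresponding configs are set. A standalone mimic of that shape; the real classes are internal, and the exact wire format is inferred rather than shown in this diff:

```kotlin
import kotlinx.serialization.Serializable
import kotlinx.serialization.encodeToString
import kotlinx.serialization.json.Json

// Stand-in types. With the default Json configuration, null fields equal their
// defaults and are omitted, so transcription stays off unless a marker is attached.
@Serializable
object Marker

@Serializable
data class Setup(
    val model: String,
    val inputAudioTranscription: Marker? = null,
    val outputAudioTranscription: Marker? = null,
)

@Serializable
data class SetupMessage(val setup: Setup)

fun main() {
    val msg = SetupMessage(Setup("models/example", Marker, Marker))
    println(Json.encodeToString(msg))
    // {"setup":{"model":"models/example","inputAudioTranscription":{},"outputAudioTranscription":{}}}
}
```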
LiveGenerationConfig.kt
@@ -53,6 +53,11 @@ import kotlinx.serialization.Serializable
*
* @property speechConfig Specifies the voice configuration of the audio response from the server.
*
* @property inputAudioTranscription Specifies the configuration for transcribing input audio.
*
* @property outputAudioTranscription Specifies the configuration for transcribing output audio from
* the model.
*
* Refer to the
* [Control generated output](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal/control-generated-output)
* guide for more details.
@@ -67,7 +72,9 @@ private constructor(
internal val presencePenalty: Float?,
internal val frequencyPenalty: Float?,
internal val responseModality: ResponseModality?,
internal val speechConfig: SpeechConfig?
internal val speechConfig: SpeechConfig?,
internal val inputAudioTranscription: AudioTranscriptionConfig?,
internal val outputAudioTranscription: AudioTranscriptionConfig?,
) {

/**
@@ -91,6 +98,10 @@
* @property responseModality See [LiveGenerationConfig.responseModality]
*
* @property speechConfig See [LiveGenerationConfig.speechConfig]
*
* @property inputAudioTranscription See [LiveGenerationConfig.inputAudioTranscription]
*
* @property outputAudioTranscription See [LiveGenerationConfig.outputAudioTranscription]
*/
public class Builder {
@JvmField public var temperature: Float? = null
@@ -101,6 +112,8 @@ private constructor(
@JvmField public var frequencyPenalty: Float? = null
@JvmField public var responseModality: ResponseModality? = null
@JvmField public var speechConfig: SpeechConfig? = null
@JvmField public var inputAudioTranscription: AudioTranscriptionConfig? = null
@JvmField public var outputAudioTranscription: AudioTranscriptionConfig? = null

public fun setTemperature(temperature: Float?): Builder = apply {
this.temperature = temperature
@@ -123,6 +136,14 @@ private constructor(
this.speechConfig = speechConfig
}

public fun setInputAudioTranscript(config: AudioTranscriptionConfig?): Builder = apply {
this.inputAudioTranscription = config
}

public fun setOutputAudioTranscript(config: AudioTranscriptionConfig?): Builder = apply {
this.outputAudioTranscription = config
}

/** Create a new [LiveGenerationConfig] with the attached arguments. */
public fun build(): LiveGenerationConfig =
LiveGenerationConfig(
Expand All @@ -133,7 +154,9 @@ private constructor(
presencePenalty = presencePenalty,
frequencyPenalty = frequencyPenalty,
speechConfig = speechConfig,
responseModality = responseModality
responseModality = responseModality,
inputAudioTranscription = inputAudioTranscription,
outputAudioTranscription = outputAudioTranscription,
)
}

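Transcriptions also surface on the raw message stream: LiveServerContent now exposes inputTranscription and outputTranscription, as the api.txt diff shows. For callers driving the session manually instead of using startAudioConversation, a sketch; it assumes the session's existing receive() flow of LiveServerMessage:

```kotlin
import com.google.firebase.ai.type.LiveServerContent
import com.google.firebase.ai.type.LiveSession
import com.google.firebase.ai.type.PublicPreviewAPI

@OptIn(PublicPreviewAPI::class)
suspend fun collectTranscripts(session: LiveSession) {
    session.receive().collect { message ->
        if (message is LiveServerContent) {
            // Either field may be null, depending on which transcription configs were set.
            message.inputTranscription?.text?.let { println("user: $it") }
            message.outputTranscription?.text?.let { println("model: $it") }
        }
    }
}
```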