diff --git a/common/api-review/ai.api.md b/common/api-review/ai.api.md
index 8b019190c1f..5dcc442b77e 100644
--- a/common/api-review/ai.api.md
+++ b/common/api-review/ai.api.md
@@ -85,6 +85,11 @@ export class ArraySchema extends Schema {
     toJSON(): SchemaRequest;
 }
 
+// @beta
+export interface AudioConversationController {
+    stop: () => Promise<void>;
+}
+
 // @public
 export abstract class Backend {
     protected constructor(type: BackendType);
@@ -710,7 +715,7 @@ export interface LiveGenerationConfig {
     frequencyPenalty?: number;
     maxOutputTokens?: number;
     presencePenalty?: number;
-    responseModalities?: [ResponseModality];
+    responseModalities?: ResponseModality[];
     speechConfig?: SpeechConfig;
     temperature?: number;
     topK?: number;
@@ -787,6 +792,7 @@ export class LiveSession {
     // @internal
     constructor(webSocketHandler: WebSocketHandler, serverMessages: AsyncGenerator<unknown>);
     close(): Promise<void>;
+    inConversation: boolean;
     isClosed: boolean;
     receive(): AsyncGenerator<LiveServerContent | LiveServerToolCall | LiveServerToolCallCancellation>;
     send(request: string | Array<string | Part>, turnComplete?: boolean): Promise<void>;
@@ -860,7 +866,7 @@ export const POSSIBLE_ROLES: readonly ["user", "model", "function", "system"];
 
 // @beta
 export interface PrebuiltVoiceConfig {
-    voiceConfig?: string;
+    voiceName?: string;
 }
 
 // @public
@@ -882,6 +888,7 @@ export interface RequestOptions {
 export const ResponseModality: {
     readonly TEXT: "TEXT";
     readonly IMAGE: "IMAGE";
+    readonly AUDIO: "AUDIO";
 };
 
 // @beta
@@ -1031,6 +1038,14 @@ export interface SpeechConfig {
     voiceConfig?: VoiceConfig;
 }
 
+// @beta
+export function startAudioConversation(liveSession: LiveSession, options?: StartAudioConversationOptions): Promise<AudioConversationController>;
+
+// @beta
+export interface StartAudioConversationOptions {
+    functionCallingHandler?: (functionCalls: LiveServerToolCall['functionCalls']) => Promise<Part>;
+}
+
 // @public
 export interface StartChatParams extends BaseParams {
     // (undocumented)
diff --git a/docs-devsite/_toc.yaml b/docs-devsite/_toc.yaml
index 6979b475536..e6a1443c562 100644
--- a/docs-devsite/_toc.yaml
+++ b/docs-devsite/_toc.yaml
@@ -16,6 +16,8 @@ toc:
       path: /docs/reference/js/ai.anyofschema.md
     - title: ArraySchema
       path: /docs/reference/js/ai.arrayschema.md
+    - title: AudioConversationController
+      path: /docs/reference/js/ai.audioconversationcontroller.md
     - title: Backend
       path: /docs/reference/js/ai.backend.md
     - title: BaseParams
       path: /docs/reference/js/ai.baseparams.md
@@ -160,6 +162,8 @@
       path: /docs/reference/js/ai.segment.md
     - title: SpeechConfig
       path: /docs/reference/js/ai.speechconfig.md
+    - title: StartAudioConversationOptions
+      path: /docs/reference/js/ai.startaudioconversationoptions.md
     - title: StartChatParams
       path: /docs/reference/js/ai.startchatparams.md
     - title: StringSchema
diff --git a/docs-devsite/ai.audioconversationcontroller.md b/docs-devsite/ai.audioconversationcontroller.md
new file mode 100644
index 00000000000..18820a2fe55
--- /dev/null
+++ b/docs-devsite/ai.audioconversationcontroller.md
@@ -0,0 +1,41 @@
Project: /docs/reference/js/_project.yaml
Book: /docs/reference/_book.yaml
page_type: reference

{% comment %}
DO NOT EDIT THIS FILE!
This is generated by the JS SDK team, and any local changes will be
overwritten. Changes should be made in the source code at
https://github.com/firebase/firebase-js-sdk
{% endcomment %}

# AudioConversationController interface

> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
>

A controller for managing an active audio conversation.
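The controller is returned by [startAudioConversation()](./ai.md#startaudioconversation_01c8e7f); a typical flow holds on to it and calls `stop()` when the user ends the call. A minimal sketch (the button element is illustrative, not part of this API):

```typescript
const controller = await startAudioConversation(liveSession);
// `hangUpButton` is an assumed DOM element, shown only for illustration.
hangUpButton.addEventListener('click', () => {
  // stop() resolves once the microphone and audio resources are released.
  void controller.stop();
});
```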
Signature:

```typescript
export interface AudioConversationController
```

## Properties

| Property | Type | Description |
| --- | --- | --- |
| [stop](./ai.audioconversationcontroller.md#audioconversationcontrollerstop) | () => Promise<void> | (Public Preview) Stops the audio conversation, closes the microphone connection, and cleans up resources. Returns a promise that resolves when cleanup is complete. |

## AudioConversationController.stop

> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
>

Stops the audio conversation, closes the microphone connection, and cleans up resources. Returns a promise that resolves when cleanup is complete.

Signature:

```typescript
stop: () => Promise<void>;
```
diff --git a/docs-devsite/ai.livegenerationconfig.md b/docs-devsite/ai.livegenerationconfig.md
index a9724ccd86c..1a920afa1e7 100644
--- a/docs-devsite/ai.livegenerationconfig.md
+++ b/docs-devsite/ai.livegenerationconfig.md
@@ -28,7 +28,7 @@ export interface LiveGenerationConfig
 | [frequencyPenalty](./ai.livegenerationconfig.md#livegenerationconfigfrequencypenalty) | number | (Public Preview) Frequency penalties. |
 | [maxOutputTokens](./ai.livegenerationconfig.md#livegenerationconfigmaxoutputtokens) | number | (Public Preview) Specifies the maximum number of tokens that can be generated in the response. The number of tokens per word varies depending on the language outputted. Is unbounded by default. |
 | [presencePenalty](./ai.livegenerationconfig.md#livegenerationconfigpresencepenalty) | number | (Public Preview) Positive penalties. |
-| [responseModalities](./ai.livegenerationconfig.md#livegenerationconfigresponsemodalities) | \[[ResponseModality](./ai.md#responsemodality)\] | (Public Preview) The modalities of the response. |
+| [responseModalities](./ai.livegenerationconfig.md#livegenerationconfigresponsemodalities) | [ResponseModality](./ai.md#responsemodality)\[\] | (Public Preview) The modalities of the response. |
 | [speechConfig](./ai.livegenerationconfig.md#livegenerationconfigspeechconfig) | [SpeechConfig](./ai.speechconfig.md#speechconfig_interface) | (Public Preview) Configuration for speech synthesis. |
 | [temperature](./ai.livegenerationconfig.md#livegenerationconfigtemperature) | number | (Public Preview) Controls the degree of randomness in token selection. A temperature value of 0 means that the highest probability tokens are always selected. In this case, responses for a given prompt are mostly deterministic, but a small amount of variation is still possible. |
 | [topK](./ai.livegenerationconfig.md#livegenerationconfigtopk) | number | (Public Preview) Changes how the model selects tokens for output. A topK value of 1 means the selected token is the most probable among all tokens in the model's vocabulary, while a topK value of 3 means that the next token is selected from among the 3 most probable using probabilities sampled. Tokens are then further filtered with the highest selected temperature sampling. Defaults to 40 if unspecified. |
@@ -83,7 +83,7 @@ The modalities of the response.
 Signature:
 
 ```typescript
-responseModalities?: [ResponseModality];
+responseModalities?: ResponseModality[];
 ```
 
 ## LiveGenerationConfig.speechConfig
diff --git a/docs-devsite/ai.livesession.md b/docs-devsite/ai.livesession.md
index bb2eca88b89..6ae2cde711c 100644
--- a/docs-devsite/ai.livesession.md
+++ b/docs-devsite/ai.livesession.md
@@ -29,6 +29,7 @@ export declare class LiveSession
 
 | Property | Modifiers | Type | Description |
 | --- | --- | --- | --- |
+| [inConversation](./ai.livesession.md#livesessioninconversation) | | boolean | (Public Preview) Indicates whether this Live session is being controlled by an AudioConversationController. |
 | [isClosed](./ai.livesession.md#livesessionisclosed) | | boolean | (Public Preview) Indicates whether this Live session is closed. |
 
 ## Methods
@@ -41,6 +42,19 @@ export declare class LiveSession
 | [sendMediaChunks(mediaChunks)](./ai.livesession.md#livesessionsendmediachunks) | | (Public Preview) Sends realtime input to the server. |
 | [sendMediaStream(mediaChunkStream)](./ai.livesession.md#livesessionsendmediastream) | | (Public Preview) Sends a stream of [GenerativeContentBlob](./ai.generativecontentblob.md#generativecontentblob_interface). |
 
+## LiveSession.inConversation
+
+> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
+>
+
+Indicates whether this Live session is being controlled by an `AudioConversationController`.
+
+Signature:
+
+```typescript
+inConversation: boolean;
+```
+
 ## LiveSession.isClosed
 
 > This API is provided as a preview for developers and may change based on feedback that we receive.
diff --git a/docs-devsite/ai.md b/docs-devsite/ai.md
index a70772dc763..e6811e96afa 100644
--- a/docs-devsite/ai.md
+++ b/docs-devsite/ai.md
@@ -22,6 +22,8 @@ The Firebase AI Web SDK.
 | [getGenerativeModel(ai, modelParams, requestOptions)](./ai.md#getgenerativemodel_80bd839) | Returns a [GenerativeModel](./ai.generativemodel.md#generativemodel_class) class with methods for inference and other functionality. |
 | [getImagenModel(ai, modelParams, requestOptions)](./ai.md#getimagenmodel_e1f6645) | (Public Preview) Returns an [ImagenModel](./ai.imagenmodel.md#imagenmodel_class) class with methods for using Imagen.Only Imagen 3 models (named imagen-3.0-*) are supported. |
 | [getLiveGenerativeModel(ai, modelParams)](./ai.md#getlivegenerativemodel_f2099ac) | (Public Preview) Returns a [LiveGenerativeModel](./ai.livegenerativemodel.md#livegenerativemodel_class) class for real-time, bidirectional communication.The Live API is only supported in modern browser windows and Node >= 22. |
+| function(liveSession, ...) |
+| [startAudioConversation(liveSession, options)](./ai.md#startaudioconversation_01c8e7f) | (Public Preview) Starts a real-time, bidirectional audio conversation with the model. This helper function manages the complexities of microphone access, audio recording, playback, and interruptions. |
 
 ## Classes
@@ -53,6 +55,7 @@ The Firebase AI Web SDK.
 | --- | --- |
 | [AI](./ai.ai.md#ai_interface) | An instance of the Firebase AI SDK.Do not create this instance directly. Instead, use [getAI()](./ai.md#getai_a94a413). |
 | [AIOptions](./ai.aioptions.md#aioptions_interface) | Options for initializing the AI service using [getAI()](./ai.md#getai_a94a413). This allows specifying which backend to use (Vertex AI Gemini API or Gemini Developer API) and configuring its specific options (like location for Vertex AI). |
+| [AudioConversationController](./ai.audioconversationcontroller.md#audioconversationcontroller_interface) | (Public Preview) A controller for managing an active audio conversation. |
 | [BaseParams](./ai.baseparams.md#baseparams_interface) | Base parameters for a number of methods. |
 | [Citation](./ai.citation.md#citation_interface) | A single citation. |
 | [CitationMetadata](./ai.citationmetadata.md#citationmetadata_interface) | Citation metadata that may be found on a [GenerateContentCandidate](./ai.generatecontentcandidate.md#generatecontentcandidate_interface). |
@@ -112,6 +115,7 @@ The Firebase AI Web SDK.
 | [SearchEntrypoint](./ai.searchentrypoint.md#searchentrypoint_interface) | Google search entry point. |
 | [Segment](./ai.segment.md#segment_interface) | Represents a specific segment within a [Content](./ai.content.md#content_interface) object, often used to pinpoint the exact location of text or data that grounding information refers to. |
 | [SpeechConfig](./ai.speechconfig.md#speechconfig_interface) | (Public Preview) Configures speech synthesis. |
+| [StartAudioConversationOptions](./ai.startaudioconversationoptions.md#startaudioconversationoptions_interface) | (Public Preview) Options for [startAudioConversation()](./ai.md#startaudioconversation_01c8e7f). |
 | [StartChatParams](./ai.startchatparams.md#startchatparams_interface) | Params for [GenerativeModel.startChat()](./ai.generativemodel.md#generativemodelstartchat). |
 | [TextPart](./ai.textpart.md#textpart_interface) | Content part interface if the part represents a text string. |
 | [ThinkingConfig](./ai.thinkingconfig.md#thinkingconfig_interface) | Configuration for "thinking" behavior of compatible Gemini models.Certain models utilize a thinking process before generating a response. This allows them to reason through complex problems and plan a more coherent and accurate answer. |
@@ -307,6 +311,76 @@ export declare function getLiveGenerativeModel(ai: AI, modelParams: LiveModelPar
 
 If the `apiKey` or `projectId` fields are missing in your Firebase config.
 
+## function(liveSession, ...)
+
+### startAudioConversation(liveSession, options) {:#startaudioconversation_01c8e7f}
+
+> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
+>
+
+Starts a real-time, bidirectional audio conversation with the model. This helper function manages the complexities of microphone access, audio recording, playback, and interruptions.
+
+Important: This function must be called in response to a user gesture (for example, a button click) to comply with [browser autoplay policies](https://developer.mozilla.org/en-US/docs/Web/API/Web_Audio_API/Best_practices#autoplay_policy).
+
+Signature:
+
+```typescript
+export declare function startAudioConversation(liveSession: LiveSession, options?: StartAudioConversationOptions): Promise<AudioConversationController>;
+```
+
+#### Parameters
+
+| Parameter | Type | Description |
+| --- | --- | --- |
+| liveSession | [LiveSession](./ai.livesession.md#livesession_class) | An active [LiveSession](./ai.livesession.md#livesession_class) instance. |
+| options | [StartAudioConversationOptions](./ai.startaudioconversationoptions.md#startaudioconversationoptions_interface) | Configuration options for the audio conversation. |
Returns:

Promise<[AudioConversationController](./ai.audioconversationcontroller.md#audioconversationcontroller_interface)>

A `Promise` that resolves with an [AudioConversationController](./ai.audioconversationcontroller.md#audioconversationcontroller_interface).

#### Exceptions

`AIError` if the environment does not support required Web APIs (`UNSUPPORTED`), if a conversation is already active (`REQUEST_ERROR`), the session is closed (`SESSION_CLOSED`), or if an unexpected initialization error occurs (`ERROR`).

`DOMException` Thrown by `navigator.mediaDevices.getUserMedia()` if issues occur with microphone access, such as permissions being denied (`NotAllowedError`) or no compatible hardware being found (`NotFoundError`). See the [MDN documentation](https://developer.mozilla.org/en-US/docs/Web/API/MediaDevices/getUserMedia#exceptions) for a full list of exceptions.

### Example

```javascript
const liveSession = await model.connect();
let conversationController;

// This function must be called from within a click handler.
async function startConversation() {
  try {
    conversationController = await startAudioConversation(liveSession);
  } catch (e) {
    // Handle AI-specific errors
    if (e instanceof AIError) {
      console.error("AI Error:", e.message);
    }
    // Handle microphone permission and hardware errors
    else if (e instanceof DOMException) {
      console.error("Microphone Error:", e.message);
    }
    // Handle other unexpected errors
    else {
      console.error("An unexpected error occurred:", e);
    }
  }
}

// Later, to stop the conversation:
// if (conversationController) {
//   await conversationController.stop();
// }
```

## AIErrorCode

Standardized error codes that [AIError](./ai.aierror.md#aierror_class) can have.
@@ -589,6 +663,7 @@ Generation modalities to be returned in generation responses.
 ResponseModality: {
   readonly TEXT: "TEXT";
   readonly IMAGE: "IMAGE";
+  readonly AUDIO: "AUDIO";
 }
 ```
diff --git a/docs-devsite/ai.prebuiltvoiceconfig.md b/docs-devsite/ai.prebuiltvoiceconfig.md
index a5600e9a650..8627ae184b3 100644
--- a/docs-devsite/ai.prebuiltvoiceconfig.md
+++ b/docs-devsite/ai.prebuiltvoiceconfig.md
@@ -25,9 +25,9 @@ export interface PrebuiltVoiceConfig
 
 | Property | Type | Description |
 | --- | --- | --- |
-| [voiceConfig](./ai.prebuiltvoiceconfig.md#prebuiltvoiceconfigvoiceconfig) | string | (Public Preview) The voice name to use for speech synthesis.For a full list of names and demos of what each voice sounds like, see [Chirp 3: HD Voices](https://cloud.google.com/text-to-speech/docs/chirp3-hd). |
+| [voiceName](./ai.prebuiltvoiceconfig.md#prebuiltvoiceconfigvoicename) | string | (Public Preview) The voice name to use for speech synthesis.For a full list of names and demos of what each voice sounds like, see [Chirp 3: HD Voices](https://cloud.google.com/text-to-speech/docs/chirp3-hd). |
 
-## PrebuiltVoiceConfig.voiceConfig
+## PrebuiltVoiceConfig.voiceName
 
 > This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
 >
@@ -39,5 +39,5 @@ For a full list of names and demos of what each voice sounds like, see [Chirp 3: HD Voices](https://cloud.google.com/text-to-speech/docs/chirp3-hd).
 
 Signature:
 
 ```typescript
-voiceConfig?: string;
+voiceName?: string;
 ```
diff --git a/docs-devsite/ai.startaudioconversationoptions.md b/docs-devsite/ai.startaudioconversationoptions.md
new file mode 100644
index 00000000000..08e91d2c7b5
--- /dev/null
+++ b/docs-devsite/ai.startaudioconversationoptions.md
@@ -0,0 +1,41 @@
Project: /docs/reference/js/_project.yaml
Book: /docs/reference/_book.yaml
page_type: reference

{% comment %}
DO NOT EDIT THIS FILE!
This is generated by the JS SDK team, and any local changes will be
overwritten. Changes should be made in the source code at
https://github.com/firebase/firebase-js-sdk
{% endcomment %}

# StartAudioConversationOptions interface

> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
>

Options for [startAudioConversation()](./ai.md#startaudioconversation_01c8e7f).

Signature:

```typescript
export interface StartAudioConversationOptions
```

## Properties

| Property | Type | Description |
| --- | --- | --- |
| [functionCallingHandler](./ai.startaudioconversationoptions.md#startaudioconversationoptionsfunctioncallinghandler) | (functionCalls: [LiveServerToolCall](./ai.liveservertoolcall.md#liveservertoolcall_interface)\['functionCalls'\]) => Promise<[Part](./ai.md#part)> | (Public Preview) An async handler that is called when the model requests a function to be executed. The handler should perform the function call and return the result as a Part, which will then be sent back to the model. |

## StartAudioConversationOptions.functionCallingHandler

> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
>

An async handler that is called when the model requests a function to be executed. The handler should perform the function call and return the result as a `Part`, which will then be sent back to the model.

Signature:

```typescript
functionCallingHandler?: (functionCalls: LiveServerToolCall['functionCalls']) => Promise<Part>;
```
diff --git a/packages/ai/src/api.ts b/packages/ai/src/api.ts
index 418c17bb49c..5b7d02c934a 100644
--- a/packages/ai/src/api.ts
+++ b/packages/ai/src/api.ts
@@ -45,6 +45,11 @@ export * from './requests/schema-builder';
 export { ImagenImageFormat } from './requests/imagen-image-format';
 export { AIModel, GenerativeModel, LiveGenerativeModel, ImagenModel, AIError };
 export { Backend, VertexAIBackend, GoogleAIBackend } from './backend';
+export {
+  startAudioConversation,
+  AudioConversationController,
+  StartAudioConversationOptions
+} from './methods/live-session-helpers';
 
 declare module '@firebase/component' {
   interface NameServiceMapping {
diff --git a/packages/ai/src/methods/live-session-helpers.test.ts b/packages/ai/src/methods/live-session-helpers.test.ts
new file mode 100644
index 00000000000..d7d1e2aabbf
--- /dev/null
+++ b/packages/ai/src/methods/live-session-helpers.test.ts
@@ -0,0 +1,356 @@
/**
 * @license
 * Copyright 2025 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import { expect, use } from 'chai';
import sinon, { SinonFakeTimers, SinonStub, SinonStubbedInstance } from 'sinon';
import sinonChai from 'sinon-chai';
import chaiAsPromised from 'chai-as-promised';
import { AIError } from '../errors';
import { startAudioConversation } from './live-session-helpers';
import { LiveServerContent, LiveServerToolCall, Part } from '../types';
import { logger } from '../logger';
import { isNode } from '@firebase/util';

use(sinonChai);
use(chaiAsPromised);

// A mock message generator to simulate receiving messages from the server.
class MockMessageGenerator {
  private resolvers: Array<(result: IteratorResult<unknown>) => void> = [];
  isDone = false;

  next(): Promise<IteratorResult<unknown>> {
    return new Promise(resolve => this.resolvers.push(resolve));
  }

  simulateMessage(message: any): void {
    const resolver = this.resolvers.shift();
    if (resolver) {
      resolver({ value: message, done: false });
    }
  }

  endStream(): void {
    if (this.isDone) {
      return;
    }
    this.isDone = true;
    this.resolvers.forEach(resolve =>
      resolve({ value: undefined, done: true })
    );
    this.resolvers = [];
  }
}

// A mock LiveSession to intercept calls to the server.
class MockLiveSession {
  isClosed = false;
  inConversation = false;
  send = sinon.stub();
  sendMediaChunks = sinon.stub();
  messageGenerator = new MockMessageGenerator();
  receive = (): MockMessageGenerator => this.messageGenerator;
}

// Stubs and mocks for Web APIs used by the helpers.
let mockAudioContext: SinonStubbedInstance<AudioContext>;
let mockMediaStream: SinonStubbedInstance<MediaStream>;
let getUserMediaStub: SinonStub;
let mockWorkletNode: SinonStubbedInstance<AudioWorkletNode>;
let mockSourceNode: SinonStubbedInstance<MediaStreamAudioSourceNode>;
let mockAudioBufferSource: any;

function setupGlobalMocks(): void {
  // Mock AudioWorkletNode
  mockWorkletNode = {
    port: {
      postMessage: sinon.stub(),
      onmessage: null
    },
    connect: sinon.stub(),
    disconnect: sinon.stub()
  } as any;
  sinon.stub(global, 'AudioWorkletNode').returns(mockWorkletNode);

  // Mock AudioContext
  mockAudioBufferSource = {
    connect: sinon.stub(),
    start: sinon.stub(),
    stop: sinon.stub(),
    onended: null,
    buffer: { duration: 0.5 } // Mock duration for scheduling
  };
  mockSourceNode = {
    connect: sinon.stub(),
    disconnect: sinon.stub()
  } as any;
  mockAudioContext = {
    resume: sinon.stub().resolves(),
    close: sinon.stub().resolves(),
    createBuffer: sinon.stub().returns({
      getChannelData: sinon.stub().returns(new Float32Array(1))
    } as any),
    createBufferSource: sinon.stub().returns(mockAudioBufferSource),
    createMediaStreamSource: sinon.stub().returns(mockSourceNode),
    audioWorklet: {
      addModule: sinon.stub().resolves()
    },
    state: 'suspended' as AudioContextState,
    currentTime: 0
  } as any;
  sinon.stub(global, 'AudioContext').returns(mockAudioContext);

  // Mock other globals
  sinon.stub(global, 'Blob').returns({} as Blob);
  sinon.stub(URL, 'createObjectURL').returns('blob:http://localhost/fake-url');

  // Mock getUserMedia
  mockMediaStream = {
    getTracks: sinon.stub().returns([{ stop: sinon.stub() } as any])
  } as any;
  getUserMediaStub = sinon.stub().resolves(mockMediaStream);
  if (typeof navigator === 'undefined') {
    (global as any).navigator = {
      mediaDevices: { getUserMedia: getUserMediaStub }
    };
  } else {
    if (!navigator.mediaDevices) {
      (navigator as any).mediaDevices = {};
    }
    sinon
      .stub(navigator.mediaDevices, 'getUserMedia')
      .callsFake(getUserMediaStub);
  }
}

describe('Audio Conversation Helpers', () => {
  let clock: SinonFakeTimers;

  if (isNode()) {
    return;
  }

  beforeEach(() => {
    clock = sinon.useFakeTimers();
    setupGlobalMocks();
  });

  afterEach(() => {
    sinon.restore();
    clock.restore();
  });

  describe('startAudioConversation', () => {
    let liveSession: MockLiveSession;
    beforeEach(() => {
      liveSession = new MockLiveSession();
    });

    it('should throw if the session is closed.', async () => {
      liveSession.isClosed = true;
      await expect(
        startAudioConversation(liveSession as any)
      ).to.be.rejectedWith(AIError, /on a closed LiveSession/);
    });

    it('should throw if a conversation is in progress.', async () => {
      liveSession.inConversation = true;
      await expect(
        startAudioConversation(liveSession as any)
      ).to.be.rejectedWith(AIError, /is already in progress/);
    });

    it('should throw if APIs are not supported.', async () => {
      (global as any).AudioWorkletNode = undefined; // Simulate lack of support
      await expect(
        startAudioConversation(liveSession as any)
      ).to.be.rejectedWith(AIError, /not supported in this environment/);
    });

    it('should throw if microphone permissions are denied.', async () => {
      getUserMediaStub.rejects(
        new DOMException('Permission denied', 'NotAllowedError')
      );
      await expect(
        startAudioConversation(liveSession as any)
      ).to.be.rejectedWith(DOMException, /Permission denied/);
    });

    it('should return a controller with a stop method on success.', async () => {
      const controller = await startAudioConversation(liveSession as any);
      expect(controller).to.have.property('stop').that.is.a('function');
      // Ensure it doesn't throw during cleanup
      await expect(controller.stop()).to.be.fulfilled;
    });
  });

  describe('AudioConversationRunner', () => {
    let liveSession: MockLiveSession;
    let warnStub: SinonStub;

    beforeEach(() => {
      liveSession = new MockLiveSession();
      warnStub = sinon.stub(logger, 'warn');
    });

    afterEach(() => {
      warnStub.restore();
    });

    it('should send processed audio chunks received from the worklet.', async () => {
      const controller = await startAudioConversation(liveSession as any);
      expect(mockWorkletNode.port.onmessage).to.be.a('function');

      // Simulate the worklet sending a message
      const fakeAudioData = new Int16Array(128);
      mockWorkletNode.port.onmessage!({ data: fakeAudioData } as MessageEvent);

      await clock.tickAsync(1);

      expect(liveSession.sendMediaChunks).to.have.been.calledOnce;
      const [sentChunk] = liveSession.sendMediaChunks.getCall(0).args[0];
      expect(sentChunk.mimeType).to.equal('audio/pcm');
      expect(sentChunk.data).to.be.a('string');
      await controller.stop();
    });

    it('should queue and play audio from a serverContent message.', async () => {
      const controller = await startAudioConversation(liveSession as any);
      const serverMessage: LiveServerContent = {
        type: 'serverContent',
        modelTurn: {
          role: 'model',
          parts: [
            { inlineData: { mimeType: 'audio/pcm', data: '1111222233334444' } }
          ] // base64 for dummy data
        }
      };

      liveSession.messageGenerator.simulateMessage(serverMessage);
      await clock.tickAsync(1); // allow message processing

      expect(mockAudioContext.createBuffer).to.have.been.calledOnce;
      expect(mockAudioBufferSource.start).to.have.been.calledOnce;
      await controller.stop();
    });

    it('should call function handler and send result on toolCall message.', async () => {
      const handlerStub = sinon.stub().resolves({
        functionResponse: { name: 'get_weather', response: { temp: '72F' } }
      } as Part);
      const controller = await startAudioConversation(liveSession as any, {
        functionCallingHandler: handlerStub
      });

      const toolCallMessage: LiveServerToolCall = {
        type: 'toolCall',
        functionCalls: [{ name: 'get_weather', args: { location: 'LA' } }]
      };

      liveSession.messageGenerator.simulateMessage(toolCallMessage);
      await clock.tickAsync(1);

      expect(handlerStub).to.have.been.calledOnceWith(
        toolCallMessage.functionCalls
      );
      expect(liveSession.send).to.have.been.calledOnceWith([
        { functionResponse: { name: 'get_weather', response: { temp: '72F' } } }
      ]);
      await controller.stop();
    });

    it('should clear queue and stop sources on an interruption message.', async () => {
      const controller = await startAudioConversation(liveSession as any);

      // 1. Enqueue some audio that is "playing"
      const playingMessage: LiveServerContent = {
        type: 'serverContent',
        modelTurn: {
          parts: [
            { inlineData: { mimeType: 'audio/pcm', data: '1111222233334444' } }
          ],
          role: 'model'
        }
      };
      liveSession.messageGenerator.simulateMessage(playingMessage);
      await clock.tickAsync(1);
      expect(mockAudioBufferSource.start).to.have.been.calledOnce;

      // 2. Enqueue another chunk that is now scheduled
      liveSession.messageGenerator.simulateMessage(playingMessage);
      await clock.tickAsync(1);
      expect(mockAudioBufferSource.start).to.have.been.calledTwice;
      // 3. Send interruption message
      const interruptionMessage: LiveServerContent = {
        type: 'serverContent',
        interrupted: true
      };
      liveSession.messageGenerator.simulateMessage(interruptionMessage);
      await clock.tickAsync(1);

      // Assert that all scheduled sources were stopped.
      expect(mockAudioBufferSource.stop).to.have.been.calledTwice;

      // 4. Send new audio post-interruption
      const newMessage: LiveServerContent = {
        type: 'serverContent',
        modelTurn: {
          parts: [
            { inlineData: { mimeType: 'audio/pcm', data: '1111222233334444' } }
          ],
          role: 'model'
        }
      };
      liveSession.messageGenerator.simulateMessage(newMessage);
      await clock.tickAsync(1);

      // Assert a new source was created and started (total of 3 starts)
      expect(mockAudioBufferSource.start).to.have.been.calledThrice;

      await controller.stop();
    });

    it('should warn if no function handler is provided for a toolCall message.', async () => {
      const controller = await startAudioConversation(liveSession as any);
      liveSession.messageGenerator.simulateMessage({
        type: 'toolCall',
        functionCalls: [{ name: 'test' }]
      });
      await clock.tickAsync(1);

      expect(warnStub).to.have.been.calledWithMatch(
        /functionCallingHandler is undefined/
      );
      await controller.stop();
    });

    it('stop() should call cleanup and release all resources.', async () => {
      const controller = await startAudioConversation(liveSession as any);

      // The runner's cleanup method is internal and cannot be spied on directly,
      // so assert on the observable effects of stop(): every mocked resource
      // should be torn down.
      await controller.stop();

      expect(mockWorkletNode.disconnect).to.have.been.calledOnce;
      expect(mockSourceNode.disconnect).to.have.been.calledOnce;
      expect(mockMediaStream.getTracks()[0].stop).to.have.been.calledOnce;
      expect(mockAudioContext.close).to.have.been.calledOnce;
      expect(liveSession.inConversation).to.be.false;
    });
  });
});
diff --git a/packages/ai/src/methods/live-session-helpers.ts b/packages/ai/src/methods/live-session-helpers.ts
new file mode 100644
index 00000000000..e52715de36e
--- /dev/null
+++ b/packages/ai/src/methods/live-session-helpers.ts
@@ -0,0 +1,497 @@
/**
 * @license
 * Copyright 2025 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import { AIError } from '../errors';
import { logger } from '../logger';
import {
  AIErrorCode,
  GenerativeContentBlob,
  LiveServerContent,
  LiveServerToolCall,
  Part
} from '../types';
import { LiveSession } from './live-session';
import { Deferred } from '@firebase/util';

const SERVER_INPUT_SAMPLE_RATE = 16_000;
const SERVER_OUTPUT_SAMPLE_RATE = 24_000;

const AUDIO_PROCESSOR_NAME = 'audio-processor';

/**
 * The JS for an `AudioWorkletProcessor`.
 * This processor is responsible for taking raw audio from the microphone,
 * converting it to the required 16-bit 16kHz PCM, and posting it back to the main thread.
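 *
 * The target rate comes from SERVER_INPUT_SAMPLE_RATE (16,000 Hz); an
 * AudioContext typically runs natively at 44,100 or 48,000 Hz, which is why
 * the processor resamples each block before posting it.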
+ * + * See: https://developer.mozilla.org/en-US/docs/Web/API/AudioWorkletProcessor + * + * It is defined as a string here so that it can be converted into a `Blob` + * and loaded at runtime. + */ +const audioProcessorWorkletString = ` + class AudioProcessor extends AudioWorkletProcessor { + constructor(options) { + super(); + this.targetSampleRate = options.processorOptions.targetSampleRate; + // 'sampleRate' is a global variable available inside the AudioWorkletGlobalScope, + // representing the native sample rate of the AudioContext. + this.inputSampleRate = sampleRate; + } + + /** + * This method is called by the browser's audio engine for each block of audio data. + * Input is a single input, with a single channel (input[0][0]). + */ + process(inputs) { + const input = inputs[0]; + if (input && input.length > 0 && input[0].length > 0) { + const pcmData = input[0]; // Float32Array of raw audio samples. + + // Simple linear interpolation for resampling. + const resampled = new Float32Array(Math.round(pcmData.length * this.targetSampleRate / this.inputSampleRate)); + const ratio = pcmData.length / resampled.length; + for (let i = 0; i < resampled.length; i++) { + resampled[i] = pcmData[Math.floor(i * ratio)]; + } + + // Convert Float32 (-1, 1) samples to Int16 (-32768, 32767) + const resampledInt16 = new Int16Array(resampled.length); + for (let i = 0; i < resampled.length; i++) { + const sample = Math.max(-1, Math.min(1, resampled[i])); + if (sample < 0) { + resampledInt16[i] = sample * 32768; + } else { + resampledInt16[i] = sample * 32767; + } + } + + this.port.postMessage(resampledInt16); + } + // Return true to keep the processor alive and processing the next audio block. + return true; + } + } + + // Register the processor with a name that can be used to instantiate it from the main thread. + registerProcessor('${AUDIO_PROCESSOR_NAME}', AudioProcessor); +`; + +/** + * A controller for managing an active audio conversation. + * + * @beta + */ +export interface AudioConversationController { + /** + * Stops the audio conversation, closes the microphone connection, and + * cleans up resources. Returns a promise that resolves when cleanup is complete. + */ + stop: () => Promise; +} + +/** + * Options for {@link startAudioConversation}. + * + * @beta + */ +export interface StartAudioConversationOptions { + /** + * An async handler that is called when the model requests a function to be executed. + * The handler should perform the function call and return the result as a `Part`, + * which will then be sent back to the model. + */ + functionCallingHandler?: ( + functionCalls: LiveServerToolCall['functionCalls'] + ) => Promise; +} + +/** + * Dependencies needed by the {@link AudioConversationRunner}. + * + * @internal + */ +interface RunnerDependencies { + audioContext: AudioContext; + mediaStream: MediaStream; + sourceNode: MediaStreamAudioSourceNode; + workletNode: AudioWorkletNode; +} + +/** + * Encapsulates the core logic of an audio conversation. + * + * @internal + */ +export class AudioConversationRunner { + /** A flag to indicate if the conversation has been stopped. */ + private isStopped = false; + /** A deferred that contains a promise that is resolved when stop() is called, to unblock the receive loop. */ + private readonly stopDeferred = new Deferred(); + /** A promise that tracks the lifecycle of the main `runReceiveLoop`. */ + private readonly receiveLoopPromise: Promise; + + /** A FIFO queue of 24kHz, 16-bit PCM audio chunks received from the server. 
  private readonly playbackQueue: ArrayBuffer[] = [];
  /** Tracks scheduled audio sources. Used to cancel scheduled audio when the model is interrupted. */
  private scheduledSources: AudioBufferSourceNode[] = [];
  /** A high-precision timeline pointer for scheduling gapless audio playback. */
  private nextStartTime = 0;
  /** A mutex to prevent the playback processing loop from running multiple times concurrently. */
  private isPlaybackLoopRunning = false;

  constructor(
    private readonly liveSession: LiveSession,
    private readonly options: StartAudioConversationOptions,
    private readonly deps: RunnerDependencies
  ) {
    this.liveSession.inConversation = true;

    // Start listening for messages from the server.
    this.receiveLoopPromise = this.runReceiveLoop().finally(() =>
      this.cleanup()
    );

    // Set up the handler for receiving processed audio data from the worklet.
    // Message data has been resampled to 16kHz 16-bit PCM.
    this.deps.workletNode.port.onmessage = event => {
      if (this.isStopped) {
        return;
      }

      const pcm16 = event.data as Int16Array;
      const base64 = btoa(
        String.fromCharCode.apply(
          null,
          Array.from(new Uint8Array(pcm16.buffer))
        )
      );

      const chunk: GenerativeContentBlob = {
        mimeType: 'audio/pcm',
        data: base64
      };
      void this.liveSession.sendMediaChunks([chunk]);
    };
  }

  /**
   * Stops the conversation and unblocks the main receive loop.
   */
  async stop(): Promise<void> {
    if (this.isStopped) {
      return;
    }
    this.isStopped = true;
    this.stopDeferred.resolve(); // Unblock the receive loop
    await this.receiveLoopPromise; // Wait for the loop and cleanup to finish
  }

  /**
   * Cleans up all audio resources (nodes, stream tracks, context) and marks the
   * session as no longer in a conversation.
   */
  private cleanup(): void {
    this.interruptPlayback(); // Ensure all audio is stopped on final cleanup.
    this.deps.workletNode.port.onmessage = null;
    this.deps.workletNode.disconnect();
    this.deps.sourceNode.disconnect();
    this.deps.mediaStream.getTracks().forEach(track => track.stop());
    if (this.deps.audioContext.state !== 'closed') {
      void this.deps.audioContext.close();
    }
    this.liveSession.inConversation = false;
  }

  /**
   * Adds audio data to the queue and ensures the playback loop is running.
   */
  private enqueueAndPlay(audioData: ArrayBuffer): void {
    this.playbackQueue.push(audioData);
    // Will no-op if it's already running.
    void this.processPlaybackQueue();
  }

  /**
   * Stops all current and pending audio playback and clears the queue. This is
   * called when the server indicates the model's speech was interrupted
   * (`LiveServerContent.interrupted`).
   */
  private interruptPlayback(): void {
    // Stop all sources that have been scheduled. The onended event will fire for each,
    // which will clean up the scheduledSources array.
    [...this.scheduledSources].forEach(source => source.stop(0));

    // Clear the internal buffer of unprocessed audio chunks.
    this.playbackQueue.length = 0;

    // Reset the playback clock to start fresh.
    this.nextStartTime = this.deps.audioContext.currentTime;
  }

  /**
   * Processes the playback queue in a loop, scheduling each chunk in a gapless sequence.
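   * Each chunk is scheduled at max(currentTime, nextStartTime), and nextStartTime
   * then advances by the chunk's duration, so consecutive buffers play back to
   * back on the AudioContext clock.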
+ */ + private async processPlaybackQueue(): Promise { + if (this.isPlaybackLoopRunning) { + return; + } + this.isPlaybackLoopRunning = true; + + while (this.playbackQueue.length > 0 && !this.isStopped) { + const pcmRawBuffer = this.playbackQueue.shift()!; + try { + const pcm16 = new Int16Array(pcmRawBuffer); + const frameCount = pcm16.length; + + const audioBuffer = this.deps.audioContext.createBuffer( + 1, + frameCount, + SERVER_OUTPUT_SAMPLE_RATE + ); + + // Convert 16-bit PCM to 32-bit PCM, required by the Web Audio API. + const channelData = audioBuffer.getChannelData(0); + for (let i = 0; i < frameCount; i++) { + channelData[i] = pcm16[i] / 32768; // Normalize to Float32 range [-1.0, 1.0] + } + + const source = this.deps.audioContext.createBufferSource(); + source.buffer = audioBuffer; + source.connect(this.deps.audioContext.destination); + + // Track the source and set up a handler to remove it from tracking when it finishes. + this.scheduledSources.push(source); + source.onended = () => { + this.scheduledSources = this.scheduledSources.filter( + s => s !== source + ); + }; + + // To prevent gaps, schedule the next chunk to start either now (if we're catching up) + // or exactly when the previous chunk is scheduled to end. + this.nextStartTime = Math.max( + this.deps.audioContext.currentTime, + this.nextStartTime + ); + source.start(this.nextStartTime); + + // Update the schedule for the *next* chunk. + this.nextStartTime += audioBuffer.duration; + } catch (e) { + logger.error('Error playing audio:', e); + } + } + + this.isPlaybackLoopRunning = false; + } + + /** + * The main loop that listens for and processes messages from the server. + */ + private async runReceiveLoop(): Promise { + const messageGenerator = this.liveSession.receive(); + while (!this.isStopped) { + const result = await Promise.race([ + messageGenerator.next(), + this.stopDeferred.promise + ]); + + if (this.isStopped || !result || result.done) { + break; + } + + const message = result.value; + if (message.type === 'serverContent') { + const serverContent = message as LiveServerContent; + if (serverContent.interrupted) { + this.interruptPlayback(); + } + + const audioPart = serverContent.modelTurn?.parts.find(part => + part.inlineData?.mimeType.startsWith('audio/') + ); + if (audioPart?.inlineData) { + const audioData = Uint8Array.from( + atob(audioPart.inlineData.data), + c => c.charCodeAt(0) + ).buffer; + this.enqueueAndPlay(audioData); + } + } else if (message.type === 'toolCall') { + if (!this.options.functionCallingHandler) { + logger.warn( + 'Received tool call message, but StartAudioConversationOptions.functionCallingHandler is undefined. Ignoring tool call.' + ); + } else { + try { + const resultPart = await this.options.functionCallingHandler( + message.functionCalls + ); + if (!this.isStopped) { + void this.liveSession.send([resultPart]); + } + } catch (e) { + throw new AIError( + AIErrorCode.ERROR, + `Function calling handler failed: ${(e as Error).message}` + ); + } + } + } + } + } +} + +/** + * Starts a real-time, bidirectional audio conversation with the model. This helper function manages + * the complexities of microphone access, audio recording, playback, and interruptions. + * + * @remarks Important: This function must be called in response to a user gesture + * (for example, a button click) to comply with {@link https://developer.mozilla.org/en-US/docs/Web/API/Web_Audio_API/Best_practices#autoplay_policy | browser autoplay policies}. 
+ * + * @example + * ```javascript + * const liveSession = await model.connect(); + * let conversationController; + * + * // This function must be called from within a click handler. + * async function startConversation() { + * try { + * conversationController = await startAudioConversation(liveSession); + * } catch (e) { + * // Handle AI-specific errors + * if (e instanceof AIError) { + * console.error("AI Error:", e.message); + * } + * // Handle microphone permission and hardware errors + * else if (e instanceof DOMException) { + * console.error("Microphone Error:", e.message); + * } + * // Handle other unexpected errors + * else { + * console.error("An unexpected error occurred:", e); + * } + * } + * } + * + * // Later, to stop the conversation: + * // if (conversationController) { + * // await conversationController.stop(); + * // } + * ``` + * + * @param liveSession - An active {@link LiveSession} instance. + * @param options - Configuration options for the audio conversation. + * @returns A `Promise` that resolves with an {@link AudioConversationController}. + * @throws `AIError` if the environment does not support required Web APIs (`UNSUPPORTED`), if a conversation is already active (`REQUEST_ERROR`), the session is closed (`SESSION_CLOSED`), or if an unexpected initialization error occurs (`ERROR`). + * @throws `DOMException` Thrown by `navigator.mediaDevices.getUserMedia()` if issues occur with microphone access, such as permissions being denied (`NotAllowedError`) or no compatible hardware being found (`NotFoundError`). See the {@link https://developer.mozilla.org/en-US/docs/Web/API/MediaDevices/getUserMedia#exceptions | MDN documentation} for a full list of exceptions. + * + * @beta + */ +export async function startAudioConversation( + liveSession: LiveSession, + options: StartAudioConversationOptions = {} +): Promise { + if (liveSession.isClosed) { + throw new AIError( + AIErrorCode.SESSION_CLOSED, + 'Cannot start audio conversation on a closed LiveSession.' + ); + } + + if (liveSession.inConversation) { + throw new AIError( + AIErrorCode.REQUEST_ERROR, + 'An audio conversation is already in progress for this session.' + ); + } + + // Check for necessary Web API support. + if ( + typeof AudioWorkletNode === 'undefined' || + typeof AudioContext === 'undefined' || + typeof navigator === 'undefined' || + !navigator.mediaDevices + ) { + throw new AIError( + AIErrorCode.UNSUPPORTED, + 'Audio conversation is not supported in this environment. It requires the Web Audio API and AudioWorklet support.' + ); + } + + let audioContext: AudioContext | undefined; + try { + // 1. Set up the audio context. This must be in response to a user gesture. + // See: https://developer.mozilla.org/en-US/docs/Web/API/Web_Audio_API/Best_practices#autoplay_policy + audioContext = new AudioContext(); + if (audioContext.state === 'suspended') { + await audioContext.resume(); + } + + // 2. Prompt for microphone access and get the media stream. + // This can throw a variety of permission or hardware-related errors. + const mediaStream = await navigator.mediaDevices.getUserMedia({ + audio: true + }); + + // 3. Load the AudioWorklet processor. + // See: https://developer.mozilla.org/en-US/docs/Web/API/AudioWorklet + const workletBlob = new Blob([audioProcessorWorkletString], { + type: 'application/javascript' + }); + const workletURL = URL.createObjectURL(workletBlob); + await audioContext.audioWorklet.addModule(workletURL); + + // 4. 
    const sourceNode = audioContext.createMediaStreamSource(mediaStream);
    const workletNode = new AudioWorkletNode(
      audioContext,
      AUDIO_PROCESSOR_NAME,
      {
        processorOptions: { targetSampleRate: SERVER_INPUT_SAMPLE_RATE }
      }
    );
    sourceNode.connect(workletNode);

    // 5. Instantiate and return the runner which manages the conversation.
    const runner = new AudioConversationRunner(liveSession, options, {
      audioContext,
      mediaStream,
      sourceNode,
      workletNode
    });

    return { stop: () => runner.stop() };
  } catch (e) {
    // Ensure the audio context is closed on any setup error.
    if (audioContext && audioContext.state !== 'closed') {
      void audioContext.close();
    }

    // Re-throw specific, known error types directly. The user may want to handle `DOMException`
    // errors differently (for example, if permission to access audio device was denied).
    if (e instanceof AIError || e instanceof DOMException) {
      throw e;
    }

    // Wrap any other unexpected errors in a standard AIError.
    throw new AIError(
      AIErrorCode.ERROR,
      `Failed to initialize audio recording: ${(e as Error).message}`
    );
  }
}
diff --git a/packages/ai/src/methods/live-session.ts b/packages/ai/src/methods/live-session.ts
index b257d0a5787..11e5346adc0 100644
--- a/packages/ai/src/methods/live-session.ts
+++ b/packages/ai/src/methods/live-session.ts
@@ -47,6 +47,12 @@ export class LiveSession {
    * @beta
    */
   isClosed = false;
+  /**
+   * Indicates whether this Live session is being controlled by an `AudioConversationController`.
+   *
+   * @beta
+   */
+  inConversation = false;
 
   /**
    * @internal
diff --git a/packages/ai/src/types/enums.ts b/packages/ai/src/types/enums.ts
index b5e4e60ab4f..7196aad81a0 100644
--- a/packages/ai/src/types/enums.ts
+++ b/packages/ai/src/types/enums.ts
@@ -325,7 +325,12 @@ export const ResponseModality = {
    * Image.
    * @beta
    */
-  IMAGE: 'IMAGE'
+  IMAGE: 'IMAGE',
+  /**
+   * Audio.
+   * @beta
+   */
+  AUDIO: 'AUDIO'
 } as const;
 
 /**
diff --git a/packages/ai/src/types/requests.ts b/packages/ai/src/types/requests.ts
index f081149649c..80c0911c328 100644
--- a/packages/ai/src/types/requests.ts
+++ b/packages/ai/src/types/requests.ts
@@ -178,7 +178,7 @@ export interface LiveGenerationConfig {
   /**
    * The modalities of the response.
    */
-  responseModalities?: [ResponseModality];
+  responseModalities?: ResponseModality[];
 }
 
 /**
@@ -369,7 +369,7 @@ export interface PrebuiltVoiceConfig {
    *
    * For a full list of names and demos of what each voice sounds like, see {@link https://cloud.google.com/text-to-speech/docs/chirp3-hd | Chirp 3: HD Voices}.
    */
-  voiceConfig?: string;
+  voiceName?: string;
 }
 
 /**
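A minimal end-to-end sketch of how the pieces added here fit together. It assumes an initialized `FirebaseApp` (`app`), an illustrative Live-capable model name, and that `VoiceConfig` nests `PrebuiltVoiceConfig` as `{ prebuiltVoiceConfig: { voiceName } }`; the `VoiceConfig` shape and the voice name `'Aoede'` are not shown in this diff:

```typescript
import {
  getAI,
  getLiveGenerativeModel,
  startAudioConversation,
  ResponseModality
} from 'firebase/ai';

const model = getLiveGenerativeModel(getAI(app), {
  model: 'my-live-model', // illustrative; use a Live API-capable model name
  generationConfig: {
    responseModalities: [ResponseModality.AUDIO],
    speechConfig: { voiceConfig: { prebuiltVoiceConfig: { voiceName: 'Aoede' } } }
  }
});

const liveSession = await model.connect();

// Must be called from a user-gesture handler to satisfy autoplay policies.
const controller = await startAudioConversation(liveSession, {
  // Execute the model-requested function and return the result as a Part.
  functionCallingHandler: async functionCalls => {
    const call = functionCalls?.[0];
    return {
      functionResponse: { name: call?.name ?? 'unknown', response: { ok: true } }
    };
  }
});

// Later: tear down the conversation, then the session.
await controller.stop();
await liveSession.close();
```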