diff --git a/.changeset/dull-ligers-bow.md b/.changeset/dull-ligers-bow.md new file mode 100644 index 00000000000..95e813f3fb6 --- /dev/null +++ b/.changeset/dull-ligers-bow.md @@ -0,0 +1,6 @@ +--- +'firebase': minor +'@firebase/ai': minor +--- + +Add `sendTextRealtime()`, `sendAudioRealtime()`, and `sendVideoRealtime()` to the `LiveSession` class, and deprecate `sendMediaChunks()` and `sendMediaStream()`. diff --git a/common/api-review/ai.api.md b/common/api-review/ai.api.md index d3c43d906fd..485e87fbbe5 100644 --- a/common/api-review/ai.api.md +++ b/common/api-review/ai.api.md @@ -994,9 +994,14 @@ export class LiveSession { isClosed: boolean; receive(): AsyncGenerator; send(request: string | Array, turnComplete?: boolean): Promise; + sendAudioRealtime(blob: GenerativeContentBlob): Promise; sendFunctionResponses(functionResponses: FunctionResponse[]): Promise; + // @deprecated (undocumented) sendMediaChunks(mediaChunks: GenerativeContentBlob[]): Promise; + // @deprecated (undocumented) sendMediaStream(mediaChunkStream: ReadableStream): Promise; + sendTextRealtime(text: string): Promise; + sendVideoRealtime(blob: GenerativeContentBlob): Promise; } // @public diff --git a/docs-devsite/ai.livesession.md b/docs-devsite/ai.livesession.md index 558c5eb3bd6..7aeeb8ee644 100644 --- a/docs-devsite/ai.livesession.md +++ b/docs-devsite/ai.livesession.md @@ -39,9 +39,12 @@ export declare class LiveSession | [close()](./ai.livesession.md#livesessionclose) | | (Public Preview) Closes this session. All methods on this session will throw an error once this resolves. | | [receive()](./ai.livesession.md#livesessionreceive) | | (Public Preview) Yields messages received from the server. This can only be used by one consumer at a time. | | [send(request, turnComplete)](./ai.livesession.md#livesessionsend) | | (Public Preview) Sends content to the server. 
| +| [sendAudioRealtime(blob)](./ai.livesession.md#livesessionsendaudiorealtime) | | (Public Preview) Sends audio data to the server in realtime. | | [sendFunctionResponses(functionResponses)](./ai.livesession.md#livesessionsendfunctionresponses) | | (Public Preview) Sends function responses to the server. | -| [sendMediaChunks(mediaChunks)](./ai.livesession.md#livesessionsendmediachunks) | | (Public Preview) Sends realtime input to the server. | -| [sendMediaStream(mediaChunkStream)](./ai.livesession.md#livesessionsendmediastream) | | (Public Preview) Sends a stream of [GenerativeContentBlob](./ai.generativecontentblob.md#generativecontentblob_interface). | +| [sendMediaChunks(mediaChunks)](./ai.livesession.md#livesessionsendmediachunks) | | (Public Preview) | +| [sendMediaStream(mediaChunkStream)](./ai.livesession.md#livesessionsendmediastream) | | (Public Preview) | +| [sendTextRealtime(text)](./ai.livesession.md#livesessionsendtextrealtime) | | (Public Preview) Sends text to the server in realtime. | +| [sendVideoRealtime(blob)](./ai.livesession.md#livesessionsendvideorealtime) | | (Public Preview) Sends video data to the server in realtime. | ## LiveSession.inConversation @@ -135,6 +138,45 @@ Promise<void> If this session has been closed. +## LiveSession.sendAudioRealtime() + +> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment. +> + +Sends audio data to the server in realtime. + +The server requires that the audio data is base64-encoded 16-bit PCM at 16kHz little-endian. + +Signature: + +```typescript +sendAudioRealtime(blob: GenerativeContentBlob): Promise; +``` + +#### Parameters + +| Parameter | Type | Description | +| --- | --- | --- | +| blob | [GenerativeContentBlob](./ai.generativecontentblob.md#generativecontentblob_interface) | The base64-encoded PCM data to send to the server in realtime. 
| + +Returns: + +Promise<void> + +#### Exceptions + +If this session has been closed. + +### Example + + +```javascript +// const pcmData = ... base64-encoded 16-bit PCM at 16kHz little-endian. +const blob = { mimeType: "audio/pcm", data: pcmData }; +liveSession.sendAudioRealtime(blob); + +``` + ## LiveSession.sendFunctionResponses() > This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment. @@ -167,7 +209,12 @@ If this session has been closed. > This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment. > -Sends realtime input to the server. +> Warning: This API is now obsolete. +> +> Use `sendTextRealtime()`, `sendAudioRealtime()`, and `sendVideoRealtime()` instead. +> +> Sends realtime input to the server. +> Signature: @@ -194,7 +241,12 @@ If this session has been closed. > This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment. > -Sends a stream of [GenerativeContentBlob](./ai.generativecontentblob.md#generativecontentblob_interface). +> Warning: This API is now obsolete. +> +> Use `sendTextRealtime()`, `sendAudioRealtime()`, and `sendVideoRealtime()` instead. +> +> Sends a stream of [GenerativeContentBlob](./ai.generativecontentblob.md#generativecontentblob_interface). +> Signature: @@ -216,3 +268,77 @@ Promise<void> If this session has been closed. +## LiveSession.sendTextRealtime() + +> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment. +> + +Sends text to the server in realtime. + +Signature: + +```typescript +sendTextRealtime(text: string): Promise; +``` + +#### Parameters + +| Parameter | Type | Description | +| --- | --- | --- | +| text | string | The text data to send. 
| + +Returns: + +Promise<void> + +#### Exceptions + +If this session has been closed. + +### Example + + +```javascript +liveSession.sendTextRealtime("Hello, how are you?"); + +``` + +## LiveSession.sendVideoRealtime() + +> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment. +> + +Sends video data to the server in realtime. + +The server requires that the video is sent as individual video frames at 1 FPS. It is recommended to set `mimeType` to `image/jpeg`. + +Signature: + +```typescript +sendVideoRealtime(blob: GenerativeContentBlob): Promise; +``` + +#### Parameters + +| Parameter | Type | Description | +| --- | --- | --- | +| blob | [GenerativeContentBlob](./ai.generativecontentblob.md#generativecontentblob_interface) | The base64-encoded video data to send to the server in realtime. | + +Returns: + +Promise<void> + +#### Exceptions + +If this session has been closed. + +### Example + + +```javascript +// const videoFrame = ... JPEG data +const blob = { mimeType: "image/jpeg", data: videoFrame }; +liveSession.sendVideoRealtime(blob); + +``` + diff --git a/packages/ai/integration/live.test.ts b/packages/ai/integration/live.test.ts index caa18970ab7..0af5bef2242 100644 --- a/packages/ai/integration/live.test.ts +++ b/packages/ai/integration/live.test.ts @@ -154,6 +154,45 @@ describe('Live', function () { }); }); + describe('sendTextRealtime()', () => { + it('should send a single text chunk and receive a response', async () => { + const model = getLiveGenerativeModel(testConfig.ai, { + model: testConfig.model, + generationConfig: textLiveGenerationConfig + }); + const session = await model.connect(); + const responsePromise = nextTurnText(session.receive()); + + await session.sendTextRealtime('Are you an AI? 
Yes or No.'); + + const responseText = await responsePromise; + expect(responseText).to.include('Yes'); + + await session.close(); + }); + }); + + describe('sendAudioRealtime()', () => { + it('should send a single audio chunk and receive a response', async () => { + const model = getLiveGenerativeModel(testConfig.ai, { + model: testConfig.model, + generationConfig: textLiveGenerationConfig + }); + const session = await model.connect(); + const responsePromise = nextTurnText(session.receive()); + + await session.sendAudioRealtime({ + data: HELLO_AUDIO_PCM_BASE64, // "Hey, can you hear me?" + mimeType: 'audio/pcm' + }); + + const responseText = await responsePromise; + expect(responseText).to.include('Yes'); + + await session.close(); + }); + }); + describe('sendMediaChunks()', () => { it('should send a single audio chunk and receive a response', async () => { const model = getLiveGenerativeModel(testConfig.ai, { diff --git a/packages/ai/src/methods/live-session-helpers.test.ts b/packages/ai/src/methods/live-session-helpers.test.ts index cad0475b358..a62315c701d 100644 --- a/packages/ai/src/methods/live-session-helpers.test.ts +++ b/packages/ai/src/methods/live-session-helpers.test.ts @@ -65,7 +65,7 @@ class MockLiveSession { isClosed = false; inConversation = false; send = sinon.stub(); - sendMediaChunks = sinon.stub(); + sendAudioRealtime = sinon.stub(); sendFunctionResponses = sinon.stub(); messageGenerator = new MockMessageGenerator(); receive = (): MockMessageGenerator => this.messageGenerator; @@ -226,8 +226,8 @@ describe('Audio Conversation Helpers', () => { await clock.tickAsync(1); - expect(liveSession.sendMediaChunks).to.have.been.calledOnce; - const [sentChunk] = liveSession.sendMediaChunks.getCall(0).args[0]; + expect(liveSession.sendAudioRealtime).to.have.been.calledOnce; + const sentChunk = liveSession.sendAudioRealtime.getCall(0).args[0]; expect(sentChunk.mimeType).to.equal('audio/pcm'); expect(sentChunk.data).to.be.a('string'); await 
controller.stop(); diff --git a/packages/ai/src/methods/live-session-helpers.ts b/packages/ai/src/methods/live-session-helpers.ts index b3907d6219b..cb3be493f5d 100644 --- a/packages/ai/src/methods/live-session-helpers.ts +++ b/packages/ai/src/methods/live-session-helpers.ts @@ -184,7 +184,7 @@ export class AudioConversationRunner { mimeType: 'audio/pcm', data: base64 }; - void this.liveSession.sendMediaChunks([chunk]); + void this.liveSession.sendAudioRealtime(chunk); }; } diff --git a/packages/ai/src/methods/live-session.test.ts b/packages/ai/src/methods/live-session.test.ts index 7454b1208c9..428e92ec770 100644 --- a/packages/ai/src/methods/live-session.test.ts +++ b/packages/ai/src/methods/live-session.test.ts @@ -110,6 +110,42 @@ describe('LiveSession', () => { }); }); + describe('sendTextRealtime()', () => { + it('should send a correctly formatted realtimeInput message', async () => { + const text = 'foo'; + await session.sendTextRealtime(text); + expect(mockHandler.send).to.have.been.calledOnce; + const sentData = JSON.parse(mockHandler.send.getCall(0).args[0]); + expect(sentData).to.deep.equal({ + realtimeInput: { text } + }); + }); + }); + + describe('sendAudioRealtime()', () => { + it('should send a correctly formatted realtimeInput message', async () => { + const blob = { data: 'abcdef', mimeType: 'audio/pcm' }; + await session.sendAudioRealtime(blob); + expect(mockHandler.send).to.have.been.calledOnce; + const sentData = JSON.parse(mockHandler.send.getCall(0).args[0]); + expect(sentData).to.deep.equal({ + realtimeInput: { audio: blob } + }); + }); + }); + + describe('sendVideoRealtime()', () => { + it('should send a correctly formatted realtimeInput message', async () => { + const blob = { data: 'abcdef', mimeType: 'image/jpeg' }; + await session.sendVideoRealtime(blob); + expect(mockHandler.send).to.have.been.calledOnce; + const sentData = JSON.parse(mockHandler.send.getCall(0).args[0]); + expect(sentData).to.deep.equal({ + realtimeInput: { video: blob 
} + }); + }); + }); + describe('sendMediaChunks()', () => { it('should send a correctly formatted realtimeInput message', async () => { const chunks = [{ data: 'base64', mimeType: 'audio/webm' }]; diff --git a/packages/ai/src/methods/live-session.ts b/packages/ai/src/methods/live-session.ts index 92d325e2f0d..bb52efafc94 100644 --- a/packages/ai/src/methods/live-session.ts +++ b/packages/ai/src/methods/live-session.ts @@ -96,14 +96,19 @@ export class LiveSession { } /** - * Sends realtime input to the server. + * Sends text to the server in realtime. * - * @param mediaChunks - The media chunks to send. + * @example + * ```javascript + * liveSession.sendTextRealtime("Hello, how are you?"); + * ``` + * + * @param text - The text data to send. * @throws If this session has been closed. * * @beta */ - async sendMediaChunks(mediaChunks: GenerativeContentBlob[]): Promise { + async sendTextRealtime(text: string): Promise { if (this.isClosed) { throw new AIError( AIErrorCode.REQUEST_ERROR, @@ -111,27 +116,33 @@ export class LiveSession { ); } - // The backend does not support sending more than one mediaChunk in one message. - // Work around this limitation by sending mediaChunks in separate messages. - mediaChunks.forEach(mediaChunk => { - const message: _LiveClientRealtimeInput = { - realtimeInput: { mediaChunks: [mediaChunk] } - }; - this.webSocketHandler.send(JSON.stringify(message)); - }); + const message: _LiveClientRealtimeInput = { + realtimeInput: { + text + } + }; + this.webSocketHandler.send(JSON.stringify(message)); } /** - * Sends function responses to the server. + * Sends audio data to the server in realtime. * - * @param functionResponses - The function responses to send. + * @remarks The server requires that the audio data is base64-encoded 16-bit PCM at 16kHz + * little-endian. + * + * @example + * ```javascript + * // const pcmData = ... base64-encoded 16-bit PCM at 16kHz little-endian. 
+ * const blob = { mimeType: "audio/pcm", data: pcmData }; + * liveSession.sendAudioRealtime(blob); + * ``` + * + * @param blob - The base64-encoded PCM data to send to the server in realtime. * @throws If this session has been closed. * * @beta */ - async sendFunctionResponses( - functionResponses: FunctionResponse[] - ): Promise { + async sendAudioRealtime(blob: GenerativeContentBlob): Promise { if (this.isClosed) { throw new AIError( AIErrorCode.REQUEST_ERROR, @@ -139,25 +150,32 @@ export class LiveSession { ); } - const message: _LiveClientToolResponse = { - toolResponse: { - functionResponses + const message: _LiveClientRealtimeInput = { + realtimeInput: { + audio: blob } }; this.webSocketHandler.send(JSON.stringify(message)); } /** - * Sends a stream of {@link GenerativeContentBlob}. + * Sends video data to the server in realtime. * - * @param mediaChunkStream - The stream of {@link GenerativeContentBlob} to send. + * @remarks The server requires that the video is sent as individual video frames at 1 FPS. It + * is recommended to set `mimeType` to `image/jpeg`. + * + * @example + * ```javascript + * // const videoFrame = ... JPEG data + * const blob = { mimeType: "image/jpeg", data: videoFrame }; + * liveSession.sendVideoRealtime(blob); + * ``` + * @param blob - The base64-encoded video data to send to the server in realtime. * @throws If this session has been closed. 
* * @beta */ - async sendMediaStream( - mediaChunkStream: ReadableStream - ): Promise { + async sendVideoRealtime(blob: GenerativeContentBlob): Promise { if (this.isClosed) { throw new AIError( AIErrorCode.REQUEST_ERROR, @@ -165,25 +183,38 @@ export class LiveSession { ); } - const reader = mediaChunkStream.getReader(); - while (true) { - try { - const { done, value } = await reader.read(); + const message: _LiveClientRealtimeInput = { + realtimeInput: { + video: blob + } + }; + this.webSocketHandler.send(JSON.stringify(message)); + } - if (done) { - break; - } else if (!value) { - throw new Error('Missing chunk in reader, but reader is not done.'); - } + /** + * Sends function responses to the server. + * + * @param functionResponses - The function responses to send. + * @throws If this session has been closed. + * + * @beta + */ + async sendFunctionResponses( + functionResponses: FunctionResponse[] + ): Promise { + if (this.isClosed) { + throw new AIError( + AIErrorCode.REQUEST_ERROR, + 'This LiveSession has been closed and cannot be used.' + ); + } - await this.sendMediaChunks([value]); - } catch (e) { - // Re-throw any errors that occur during stream consumption or sending. - const message = - e instanceof Error ? e.message : 'Error processing media stream.'; - throw new AIError(AIErrorCode.REQUEST_ERROR, message); + const message: _LiveClientToolResponse = { + toolResponse: { + functionResponses } - } + }; + this.webSocketHandler.send(JSON.stringify(message)); } /** @@ -259,4 +290,73 @@ export class LiveSession { await this.webSocketHandler.close(1000, 'Client closed session.'); } } + + /** + * @deprecated Use `sendTextRealtime()`, `sendAudioRealtime()`, and `sendVideoRealtime()` instead. + * + * Sends realtime input to the server. + * + * @param mediaChunks - The media chunks to send. + * @throws If this session has been closed. 
+ * + * @beta + */ + async sendMediaChunks(mediaChunks: GenerativeContentBlob[]): Promise { + if (this.isClosed) { + throw new AIError( + AIErrorCode.REQUEST_ERROR, + 'This LiveSession has been closed and cannot be used.' + ); + } + + // The backend does not support sending more than one mediaChunk in one message. + // Work around this limitation by sending mediaChunks in separate messages. + mediaChunks.forEach(mediaChunk => { + const message: _LiveClientRealtimeInput = { + realtimeInput: { mediaChunks: [mediaChunk] } + }; + this.webSocketHandler.send(JSON.stringify(message)); + }); + } + + /** + * @deprecated Use `sendTextRealtime()`, `sendAudioRealtime()`, and `sendVideoRealtime()` instead. + * + * Sends a stream of {@link GenerativeContentBlob}. + * + * @param mediaChunkStream - The stream of {@link GenerativeContentBlob} to send. + * @throws If this session has been closed. + * + * @beta + */ + async sendMediaStream( + mediaChunkStream: ReadableStream + ): Promise { + if (this.isClosed) { + throw new AIError( + AIErrorCode.REQUEST_ERROR, + 'This LiveSession has been closed and cannot be used.' + ); + } + + const reader = mediaChunkStream.getReader(); + while (true) { + try { + const { done, value } = await reader.read(); + + if (done) { + break; + } else if (!value) { + throw new Error('Missing chunk in reader, but reader is not done.'); + } + + await this.sendMediaChunks([value]); + } catch (e) { + // Re-throw any errors that occur during stream consumption or sending. + const message = + e instanceof Error ? 
e.message : 'Error processing media stream.'; + throw new AIError(AIErrorCode.REQUEST_ERROR, message); + } + } + } } diff --git a/packages/ai/src/types/live-responses.ts b/packages/ai/src/types/live-responses.ts index d1870fa109f..6b69a0ea350 100644 --- a/packages/ai/src/types/live-responses.ts +++ b/packages/ai/src/types/live-responses.ts @@ -44,7 +44,14 @@ export interface _LiveClientContent { // eslint-disable-next-line @typescript-eslint/naming-convention export interface _LiveClientRealtimeInput { realtimeInput: { - mediaChunks: GenerativeContentBlob[]; + text?: string; + audio?: GenerativeContentBlob; + video?: GenerativeContentBlob; + + /** + * @deprecated Use `text`, `audio`, and `video` instead. + */ + mediaChunks?: GenerativeContentBlob[]; }; }