Skip to content

Commit 1c7a6ed

Browse files
committed
feat: add support for resets in speech sessions
1 parent 0870904 commit 1c7a6ed

File tree

2 files changed

+110
-81
lines changed

2 files changed

+110
-81
lines changed

examples/speech_session.ts

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,11 @@ async function main() {
3131

3232
const readTask = async () => {
3333
for await (const message of speechSession) {
34-
const audioBytes = Buffer.byteLength(message.audio);
35-
console.log(` ** Received from LMNT -- ${audioBytes} bytes ** `);
36-
audioFile.write(message.audio);
34+
if (message.type === 'audio') {
35+
const audioBytes = message.audio.byteLength;
36+
console.log(` ** Received from LMNT -- ${audioBytes} bytes ** `);
37+
audioFile.write(message.audio);
38+
}
3739
}
3840

3941
speechSession.close();

src/lib/websocketStreaming.ts

Lines changed: 105 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -7,17 +7,14 @@ const URL_STREAMING = 'wss://api.lmnt.com/v1/ai/speech/stream';
77
const WEBSOCKET_OPEN_STATE = 1;
88
const WEBSOCKET_NORMAL_CLOSE = 1000;
99

10-
const MESSAGE_EOF = { eof: true };
11-
const MESSAGE_FLUSH = { flush: true };
12-
1310
class MessageQueue {
1411
private messages: any[] = [];
1512
private resolvers: ((value: any) => void)[] = [];
1613

1714
finish() {
1815
while (this.resolvers.length) {
1916
const resolve = this.resolvers.shift();
20-
resolve?.(MESSAGE_EOF);
17+
resolve?.(null);
2118
}
2219
}
2320

@@ -56,33 +53,49 @@ export interface Duration {
5653
text?: string;
5754
}
5855

59-
export interface SpeechStreamResponse {
60-
/**
61-
* The synthesized speech audio data
62-
*/
56+
export type SpeechStreamMessage = AudioMessage | ExtrasMessage | ErrorMessage | CompleteMessage;
57+
58+
/**
59+
* Audio message containing synthesized speech data
60+
*/
61+
export interface AudioMessage {
62+
type: 'audio';
6363
audio: ArrayBuffer | Buffer;
64+
}
6465

65-
/**
66-
* The durations of the generated speech
67-
*/
66+
/**
67+
* Extras message containing metadata about the synthesis
68+
*/
69+
export interface ExtrasMessage {
70+
type: 'extras';
6871
durations?: Duration[];
69-
70-
/**
71-
* A warning message
72-
*/
7372
warning?: string;
74-
75-
/**
76-
* Whether the buffer on the server is empty
77-
*/
7873
buffer_empty?: boolean;
7974
}
8075

76+
/**
77+
* Error message containing error information
78+
*/
79+
export interface ErrorMessage {
80+
type: 'error';
81+
error: string;
82+
}
83+
84+
/**
85+
* Complete message for commands (reset/flush)
86+
*/
87+
export interface CompleteMessage {
88+
type: 'complete';
89+
command: 'reset' | 'flush';
90+
nonce: number;
91+
}
92+
8193
export class SpeechSession {
8294
private socket: WebSocket;
8395
private outMessages: any[] = [];
8496
private inMessages: MessageQueue;
8597
private returnExtras: boolean;
98+
private nextNonce: number = 1;
8699

87100
/**
88101
* Creates a new streaming speech connection.
@@ -107,6 +120,7 @@ export class SpeechSession {
107120
sample_rate: options.sample_rate,
108121
send_extras: options.return_extras,
109122
speed: options.speed,
123+
protocol_version: 2,
110124
});
111125
}
112126

@@ -134,8 +148,12 @@ export class SpeechSession {
134148
this.inMessages.finish();
135149
}
136150

137-
private onMessage(message: any) {
138-
this.inMessages.push(message);
151+
private onMessage(event: MessageEvent) {
152+
const isText = typeof event.data === 'string';
153+
this.inMessages.push({
154+
type: isText ? 'text' : 'binary',
155+
data: event.data,
156+
});
139157
}
140158

141159
/**
@@ -159,17 +177,78 @@ export class SpeechSession {
159177
/**
160178
* Triggers the server to synthesize all currently queued text and return the audio data.
161179
* Use sparingly as it may affect the natural flow of speech synthesis.
180+
*
181+
* @returns The nonce used for this flush command
182+
*/
183+
flush(): number {
184+
const nonce = this.nextNonce++;
185+
this.sendMessage({ command: 'flush', nonce });
186+
return nonce;
187+
}
188+
189+
/**
190+
* Resets the speech session, discarding all queued text and stopping current synthesis.
191+
*
192+
* @returns The nonce used for this reset command
162193
*/
163-
flush() {
164-
this.sendMessage(MESSAGE_FLUSH);
194+
reset(): number {
195+
const nonce = this.nextNonce++;
196+
this.sendMessage({ command: 'reset', nonce });
197+
return nonce;
165198
}
166199

167200
/**
168201
* Finishes the streaming session after writing all text. This will flush remaining data
169202
* and close the connection after all audio has been received.
170203
*/
171204
finish() {
172-
this.sendMessage(MESSAGE_EOF);
205+
this.sendMessage({ command: 'eof' });
206+
}
207+
208+
/**
209+
* Iterate over messages from the server.
210+
*
211+
* @returns AsyncIterator<SpeechStreamMessage>
212+
*/
213+
async *[Symbol.asyncIterator](): AsyncIterator<SpeechStreamMessage> {
214+
while (true) {
215+
const message = await this.inMessages.next();
216+
if (message === null) {
217+
return;
218+
}
219+
220+
if (message.type === 'text') {
221+
yield this.parseTextMessage(message.data);
222+
} else {
223+
try {
224+
const audio = await this.processAudioData(message.data);
225+
yield { type: 'audio', audio };
226+
} catch (error) {
227+
yield { type: 'error', error: error instanceof Error ? error.message : String(error) };
228+
}
229+
}
230+
}
231+
}
232+
233+
private parseTextMessage(textData: string): SpeechStreamMessage {
234+
const messageJson = JSON.parse(textData);
235+
236+
if ('error' in messageJson) {
237+
return { type: 'error', ...messageJson };
238+
}
239+
240+
if ('complete' in messageJson) {
241+
return { type: 'complete', ...messageJson };
242+
}
243+
244+
if (
245+
this.returnExtras &&
246+
(messageJson.durations || messageJson.warning || messageJson.buffer_empty !== undefined)
247+
) {
248+
return { type: 'extras', ...messageJson };
249+
}
250+
251+
return { type: 'error', ...messageJson };
173252
}
174253

175254
private sendMessage(message: any) {
@@ -186,65 +265,13 @@ export class SpeechSession {
186265
}
187266
}
188267

189-
/**
190-
* Returns an async iterator that yields objects containing synthesized speech audio data
191-
* and optional metadata like durations and warnings when return_extras is enabled.
192-
* @returns AsyncIterator<SpeechStreamResponse>
193-
*/
194-
async *[Symbol.asyncIterator](): AsyncIterator<SpeechStreamResponse> {
195-
while (true) {
196-
const message = await this.inMessages.next();
197-
if (message === MESSAGE_EOF) {
198-
return;
199-
}
200-
201-
let data: SpeechStreamResponse;
202-
if (this.returnExtras) {
203-
const msg1Json = this.parseAndCheckForError(message.data, false);
204-
const msg2 = await this.inMessages.next();
205-
const audio = await this.processAudioData(msg2.data);
206-
data = {
207-
audio,
208-
durations: msg1Json['durations'],
209-
};
210-
if ('warning' in msg1Json) {
211-
data.warning = msg1Json['warning'];
212-
}
213-
if ('buffer_empty' in msg1Json) {
214-
data.buffer_empty = msg1Json['buffer_empty'];
215-
}
216-
} else {
217-
const audio = await this.processAudioData(message.data);
218-
data = { audio };
219-
}
220-
yield data;
221-
}
222-
}
223-
224-
private async processAudioData(audioData: any): Promise<ArrayBuffer | Buffer> {
268+
private async processAudioData(audioData: Blob | Buffer): Promise<ArrayBuffer | Buffer> {
225269
if (audioData instanceof Blob) {
226270
return await audioData.arrayBuffer();
227271
} else if (audioData instanceof Buffer) {
228272
return audioData;
229273
} else {
230-
this.parseAndCheckForError(audioData);
231-
}
232-
return new ArrayBuffer(0);
233-
}
234-
235-
private parseAndCheckForError(message: any, requireError = true) {
236-
let messageJson;
237-
try {
238-
messageJson = JSON.parse(message);
239-
} catch {
240-
throw new Error(`Unexpected message type received from server: ${message}`);
241-
}
242-
if ('error' in messageJson) {
243-
throw new LmntError(messageJson['error']);
244-
}
245-
if (requireError) {
246-
throw new Error(`Unexpected message type received from server: ${message}`);
274+
throw new LmntError(`Unexpected message type received from server: ${audioData}`);
247275
}
248-
return messageJson;
249276
}
250277
}

0 commit comments

Comments
 (0)