@@ -7,17 +7,14 @@ const URL_STREAMING = 'wss://api.lmnt.com/v1/ai/speech/stream';
77const WEBSOCKET_OPEN_STATE = 1 ;
88const WEBSOCKET_NORMAL_CLOSE = 1000 ;
99
10- const MESSAGE_EOF = { eof : true } ;
11- const MESSAGE_FLUSH = { flush : true } ;
12-
1310class MessageQueue {
1411 private messages : any [ ] = [ ] ;
1512 private resolvers : ( ( value : any ) => void ) [ ] = [ ] ;
1613
1714 finish ( ) {
1815 while ( this . resolvers . length ) {
1916 const resolve = this . resolvers . shift ( ) ;
20- resolve ?.( MESSAGE_EOF ) ;
17+ resolve ?.( null ) ;
2118 }
2219 }
2320
@@ -56,33 +53,49 @@ export interface Duration {
5653 text ?: string ;
5754}
5855
59- export interface SpeechStreamResponse {
60- /**
61- * The synthesized speech audio data
62- */
56+ export type SpeechStreamMessage = AudioMessage | ExtrasMessage | ErrorMessage | CompleteMessage ;
57+
58+ /**
59+ * Audio message containing synthesized speech data
60+ */
61+ export interface AudioMessage {
62+ type : 'audio' ;
6363 audio : ArrayBuffer | Buffer ;
64+ }
6465
65- /**
66- * The durations of the generated speech
67- */
66+ /**
67+ * Extras message containing metadata about the synthesis
68+ */
69+ export interface ExtrasMessage {
70+ type : 'extras' ;
6871 durations ?: Duration [ ] ;
69-
70- /**
71- * A warning message
72- */
7372 warning ?: string ;
74-
75- /**
76- * Whether the buffer on the server is empty
77- */
7873 buffer_empty ?: boolean ;
7974}
8075
76+ /**
77+ * Error message containing error information
78+ */
79+ export interface ErrorMessage {
80+ type : 'error' ;
81+ error : string ;
82+ }
83+
84+ /**
85+ * Complete message for commands (reset/flush)
86+ */
87+ export interface CompleteMessage {
88+ type : 'complete' ;
89+ command : 'reset' | 'flush' ;
90+ nonce : number ;
91+ }
92+
8193export class SpeechSession {
8294 private socket : WebSocket ;
8395 private outMessages : any [ ] = [ ] ;
8496 private inMessages : MessageQueue ;
8597 private returnExtras : boolean ;
98+ private nextNonce : number = 1 ;
8699
87100 /**
88101 * Creates a new streaming speech connection.
@@ -107,6 +120,7 @@ export class SpeechSession {
107120 sample_rate : options . sample_rate ,
108121 send_extras : options . return_extras ,
109122 speed : options . speed ,
123+ protocol_version : 2 ,
110124 } ) ;
111125 }
112126
@@ -134,8 +148,12 @@ export class SpeechSession {
134148 this . inMessages . finish ( ) ;
135149 }
136150
137- private onMessage ( message : any ) {
138- this . inMessages . push ( message ) ;
151+ private onMessage ( event : MessageEvent ) {
152+ const isText = typeof event . data === 'string' ;
153+ this . inMessages . push ( {
154+ type : isText ? 'text' : 'binary' ,
155+ data : event . data ,
156+ } ) ;
139157 }
140158
141159 /**
@@ -159,17 +177,78 @@ export class SpeechSession {
159177 /**
160178 * Triggers the server to synthesize all currently queued text and return the audio data.
161179 * Use sparingly as it may affect the natural flow of speech synthesis.
180+ *
181+ * @returns The nonce used for this flush command
182+ */
183+ flush ( ) : number {
184+ const nonce = this . nextNonce ++ ;
185+ this . sendMessage ( { command : 'flush' , nonce } ) ;
186+ return nonce ;
187+ }
188+
189+ /**
190+ * Resets the speech session, discarding all queued text and stopping current synthesis.
191+ *
192+ * @returns The nonce used for this reset command
162193 */
163- flush ( ) {
164- this . sendMessage ( MESSAGE_FLUSH ) ;
194+ reset ( ) : number {
195+ const nonce = this . nextNonce ++ ;
196+ this . sendMessage ( { command : 'reset' , nonce } ) ;
197+ return nonce ;
165198 }
166199
167200 /**
168201 * Finishes the streaming session after writing all text. This will flush remaining data
169202 * and close the connection after all audio has been received.
170203 */
171204 finish ( ) {
172- this . sendMessage ( MESSAGE_EOF ) ;
205+ this . sendMessage ( { command : 'eof' } ) ;
206+ }
207+
208+ /**
209+ * Iterate over messages from the server.
210+ *
211+ * @returns AsyncIterator<SpeechStreamMessage>
212+ */
213+ async * [ Symbol . asyncIterator ] ( ) : AsyncIterator < SpeechStreamMessage > {
214+ while ( true ) {
215+ const message = await this . inMessages . next ( ) ;
216+ if ( message === null ) {
217+ return ;
218+ }
219+
220+ if ( message . type === 'text' ) {
221+ yield this . parseTextMessage ( message . data ) ;
222+ } else {
223+ try {
224+ const audio = await this . processAudioData ( message . data ) ;
225+ yield { type : 'audio' , audio } ;
226+ } catch ( error ) {
227+ yield { type : 'error' , error : error instanceof Error ? error . message : String ( error ) } ;
228+ }
229+ }
230+ }
231+ }
232+
233+ private parseTextMessage ( textData : string ) : SpeechStreamMessage {
234+ const messageJson = JSON . parse ( textData ) ;
235+
236+ if ( 'error' in messageJson ) {
237+ return { type : 'error' , ...messageJson } ;
238+ }
239+
240+ if ( 'complete' in messageJson ) {
241+ return { type : 'complete' , ...messageJson } ;
242+ }
243+
244+ if (
245+ this . returnExtras &&
246+ ( messageJson . durations || messageJson . warning || messageJson . buffer_empty !== undefined )
247+ ) {
248+ return { type : 'extras' , ...messageJson } ;
249+ }
250+
251+ return { type : 'error' , ...messageJson } ;
173252 }
174253
175254 private sendMessage ( message : any ) {
@@ -186,65 +265,13 @@ export class SpeechSession {
186265 }
187266 }
188267
189- /**
190- * Returns an async iterator that yields objects containing synthesized speech audio data
191- * and optional metadata like durations and warnings when return_extras is enabled.
192- * @returns AsyncIterator<SpeechStreamResponse>
193- */
194- async * [ Symbol . asyncIterator ] ( ) : AsyncIterator < SpeechStreamResponse > {
195- while ( true ) {
196- const message = await this . inMessages . next ( ) ;
197- if ( message === MESSAGE_EOF ) {
198- return ;
199- }
200-
201- let data : SpeechStreamResponse ;
202- if ( this . returnExtras ) {
203- const msg1Json = this . parseAndCheckForError ( message . data , false ) ;
204- const msg2 = await this . inMessages . next ( ) ;
205- const audio = await this . processAudioData ( msg2 . data ) ;
206- data = {
207- audio,
208- durations : msg1Json [ 'durations' ] ,
209- } ;
210- if ( 'warning' in msg1Json ) {
211- data . warning = msg1Json [ 'warning' ] ;
212- }
213- if ( 'buffer_empty' in msg1Json ) {
214- data . buffer_empty = msg1Json [ 'buffer_empty' ] ;
215- }
216- } else {
217- const audio = await this . processAudioData ( message . data ) ;
218- data = { audio } ;
219- }
220- yield data ;
221- }
222- }
223-
224- private async processAudioData ( audioData : any ) : Promise < ArrayBuffer | Buffer > {
268+ private async processAudioData ( audioData : Blob | Buffer ) : Promise < ArrayBuffer | Buffer > {
225269 if ( audioData instanceof Blob ) {
226270 return await audioData . arrayBuffer ( ) ;
227271 } else if ( audioData instanceof Buffer ) {
228272 return audioData ;
229273 } else {
230- this . parseAndCheckForError ( audioData ) ;
231- }
232- return new ArrayBuffer ( 0 ) ;
233- }
234-
235- private parseAndCheckForError ( message : any , requireError = true ) {
236- let messageJson ;
237- try {
238- messageJson = JSON . parse ( message ) ;
239- } catch {
240- throw new Error ( `Unexpected message type received from server: ${ message } ` ) ;
241- }
242- if ( 'error' in messageJson ) {
243- throw new LmntError ( messageJson [ 'error' ] ) ;
244- }
245- if ( requireError ) {
246- throw new Error ( `Unexpected message type received from server: ${ message } ` ) ;
274+ throw new LmntError ( `Unexpected message type received from server: ${ audioData } ` ) ;
247275 }
248- return messageJson ;
249276 }
250277}
0 commit comments