@@ -14,6 +14,7 @@ Some AI providers support real-time, low-latency interactions over WebSockets. A
- [Cartesia](https://docs.cartesia.ai/api-reference/tts/tts)
- [ElevenLabs](https://elevenlabs.io/docs/conversational-ai/api-reference/conversational-ai/websocket)
- [Fal AI](https://docs.fal.ai/model-apis/model-endpoints/websockets)
- [Deepgram (Workers AI)](https://developers.cloudflare.com/workers-ai/models/?authors=deepgram)

## Authentication
@@ -163,3 +164,215 @@ ws.send(
```

For more information on Fal AI's WebSocket API, see their [HTTP over WebSocket documentation](https://docs.fal.ai/model-apis/model-endpoints/websockets).

### Deepgram (Workers AI)

Workers AI provides Deepgram models for real-time speech-to-text (STT) and text-to-speech (TTS) over WebSocket connections. Both directions use the same gateway endpoint, `wss://gateway.ai.cloudflare.com/v1/<account_id>/<gateway>/workers-ai`, with the model and its options selected through query parameters.

#### Speech-to-Text (STT)

Workers AI supports two Deepgram STT models: `@cf/deepgram/nova-3` and `@cf/deepgram/flux`. The following example streams raw microphone audio to `nova-3` and prints interim and final transcripts as they arrive (a connection sketch for `flux` follows the example):

```javascript
import WebSocket from "ws";
import mic from "mic";

const ws = new WebSocket(
  "wss://gateway.ai.cloudflare.com/v1/<account_id>/<gateway>/workers-ai?model=@cf/deepgram/nova-3&encoding=linear16&sample_rate=16000&interim_results=true",
  {
    headers: {
      "cf-aig-authorization": process.env.CLOUDFLARE_API_KEY,
    },
  },
);

// Configure the microphone for 16 kHz mono audio, matching the
// encoding and sample_rate query parameters above
const micInstance = mic({
  rate: "16000",
  channels: "1",
  debug: false,
  exitOnSilence: 6,
});

const micInputStream = micInstance.getAudioStream();

// Stream raw audio chunks to the gateway as they arrive
micInputStream.on("data", (data) => {
  if (ws.readyState === WebSocket.OPEN) {
    ws.send(data);
  }
});

micInputStream.on("error", (error) => {
  console.error("Microphone error:", error);
});

ws.onopen = () => {
  console.log("Connected to WebSocket");
  console.log("Starting microphone...");
  micInstance.start();
};

// Transcription results arrive as JSON; interim results carry
// is_final === false until the segment is finalized
ws.onmessage = (event) => {
  try {
    const message = JSON.parse(event.data);
    const transcript = message.channel?.alternatives?.[0]?.transcript;
    if (transcript) {
      if (message.is_final) {
        console.log("Final transcript:", transcript);
      } else {
        console.log("Interim transcript:", transcript);
      }
    }
  } catch (error) {
    console.error("Error parsing message:", error);
  }
};

ws.onerror = (error) => {
  console.error("WebSocket error:", error);
};

ws.onclose = () => {
  console.log("WebSocket closed");
  micInstance.stop();
};
```
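
This example depends on the `ws` and `mic` npm packages; `mic` shells out to a system recorder such as SoX or `arecord`, which must be installed separately. For `@cf/deepgram/flux`, the connection flow should be the same apart from the `model` query parameter. A minimal sketch, assuming `flux` accepts the same `encoding` and `sample_rate` options as `nova-3` (check the model page for the options it actually supports):

```javascript
import WebSocket from "ws";

// Assumption: only the model parameter changes and the nova-3 audio
// parameters carry over. Verify against the flux model documentation.
const fluxWs = new WebSocket(
  "wss://gateway.ai.cloudflare.com/v1/<account_id>/<gateway>/workers-ai?model=@cf/deepgram/flux&encoding=linear16&sample_rate=16000",
  {
    headers: {
      "cf-aig-authorization": process.env.CLOUDFLARE_API_KEY,
    },
  },
);
```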

#### Text-to-Speech (TTS)

Workers AI supports the Deepgram `@cf/deepgram/aura-1` model for TTS. The following example converts text typed on the command line into spoken audio:

```javascript
import WebSocket from "ws";
import readline from "readline";
import Speaker from "speaker";

const ws = new WebSocket(
  "wss://gateway.ai.cloudflare.com/v1/<account_id>/<gateway>/workers-ai?model=@cf/deepgram/aura-1",
  {
    headers: {
      "cf-aig-authorization": process.env.CLOUDFLARE_API_KEY,
    },
  },
);

// Speaker management
let currentSpeaker = null;
let isPlayingAudio = false;

// Set up readline for interactive text input
const rl = readline.createInterface({
  input: process.stdin,
  output: process.stdout,
  prompt: "Enter text to speak (or \"quit\" to exit): ",
});

ws.onopen = () => {
  console.log("Connected to Deepgram TTS WebSocket");
  rl.prompt();
};

ws.onmessage = (event) => {
  // Messages are either JSON (metadata, flushed, etc.) or raw audio
  if (event.data instanceof Buffer || event.data instanceof ArrayBuffer) {
    // Raw audio data - create a new speaker if needed
    if (!currentSpeaker) {
      currentSpeaker = new Speaker({
        channels: 1,
        bitDepth: 16,
        sampleRate: 24000,
      });
      isPlayingAudio = true;
    }
    currentSpeaker.write(Buffer.from(event.data));
  } else {
    try {
      const message = JSON.parse(event.data);
      switch (message.type) {
        case "Metadata":
          console.log("Model info:", message.model_name, message.model_version);
          break;
        case "Flushed":
          console.log("Audio complete");
          // End the speaker after a flush to prevent buffer underflow
          if (currentSpeaker && isPlayingAudio) {
            currentSpeaker.end();
            currentSpeaker = null;
            isPlayingAudio = false;
          }
          rl.prompt();
          break;
        case "Cleared":
          console.log("Audio cleared, sequence:", message.sequence_id);
          break;
        case "Warning":
          console.warn("Warning:", message.description);
          break;
      }
    } catch (error) {
      // Not JSON, so treat it as raw audio delivered as a string
      if (!currentSpeaker) {
        currentSpeaker = new Speaker({
          channels: 1,
          bitDepth: 16,
          sampleRate: 24000,
        });
        isPlayingAudio = true;
      }
      currentSpeaker.write(Buffer.from(event.data));
    }
  }
};

ws.onerror = (error) => {
  console.error("WebSocket error:", error);
};

ws.onclose = () => {
  console.log("WebSocket closed");
  if (currentSpeaker) {
    currentSpeaker.end();
  }
  rl.close();
  process.exit(0);
};

// Handle user input
rl.on("line", (input) => {
  const text = input.trim();

  if (text.toLowerCase() === "quit") {
    // Ask the server to close, then close the socket
    ws.send(JSON.stringify({ type: "Close" }));
    ws.close();
    return;
  }

  if (text.length > 0) {
    // Send the text to be synthesized
    ws.send(
      JSON.stringify({
        type: "Speak",
        text: text,
      }),
    );

    // Flush so the audio is returned immediately
    ws.send(JSON.stringify({ type: "Flush" }));
    console.log("Flushing audio");
  }

  rl.prompt();
});

rl.on("close", () => {
  if (ws.readyState === WebSocket.OPEN) {
    ws.close();
  }
});
```
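
This example depends on the `ws` and `speaker` npm packages; `speaker` builds a native addon, so a working compiler toolchain is assumed. The `Cleared` response handled above also implies an interruption path. A minimal sketch, assuming the input counterpart is a `Clear` control message as in Deepgram's native TTS WebSocket API:

```javascript
// Assumption: "Clear" asks the server to discard audio it has buffered
// but not yet sent; the server is expected to reply with the "Cleared"
// message (including sequence_id) handled in the onmessage switch above.
ws.send(JSON.stringify({ type: "Clear" }));
```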