
Commit dd53cb6

thebongy and Rishit Bansal authored
[AI Gateway]: Add docs for Deepgram Workers AI Websocket models on AI Gateway for STT/TTS (#25674)
Co-authored-by: Rishit Bansal <[email protected]>
1 parent 58a306e commit dd53cb6

File tree

1 file changed: +213 -0 lines changed

src/content/docs/ai-gateway/usage/websockets-api/realtime-api.mdx

Lines changed: 213 additions & 0 deletions
@@ -14,6 +14,7 @@ Some AI providers support real-time, low-latency interactions over WebSockets. A

- [Cartesia](https://docs.cartesia.ai/api-reference/tts/tts)
- [ElevenLabs](https://elevenlabs.io/docs/conversational-ai/api-reference/conversational-ai/websocket)
- [Fal AI](https://docs.fal.ai/model-apis/model-endpoints/websockets)
- [Deepgram (Workers AI)](https://developers.cloudflare.com/workers-ai/models/?authors=deepgram)

## Authentication

@@ -163,3 +164,215 @@ ws.send(
```

For more information on Fal AI's WebSocket API, see their [HTTP over WebSocket documentation](https://docs.fal.ai/model-apis/model-endpoints/websockets).

### Deepgram (Workers AI)

Workers AI provides Deepgram models for real-time speech-to-text (STT) and text-to-speech (TTS) capabilities through WebSocket connections.
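
Both examples connect to the gateway's `workers-ai` endpoint, select the model with the `model` query parameter, and authenticate with the `cf-aig-authorization` header; model-specific options such as `encoding` and `sample_rate` travel in the same query string. A minimal connection sketch using the same placeholders as the examples below:

```javascript
import WebSocket from "ws";

// Placeholders: substitute your own account ID and gateway name.
const base =
  "wss://gateway.ai.cloudflare.com/v1/<account_id>/<gateway>/workers-ai";

// The model, plus any model-specific options, go in the query string.
const url = `${base}?model=@cf/deepgram/nova-3&encoding=linear16&sample_rate=16000`;

// Authenticate with the cf-aig-authorization header.
const ws = new WebSocket(url, {
  headers: { "cf-aig-authorization": process.env.CLOUDFLARE_API_KEY },
});
```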

#### Speech-to-Text (STT)

Workers AI supports two Deepgram STT models: `@cf/deepgram/nova-3` and `@cf/deepgram/flux`. The following example demonstrates real-time audio transcription from a microphone:

```javascript
import WebSocket from "ws";
import mic from "mic";

const ws = new WebSocket(
  "wss://gateway.ai.cloudflare.com/v1/<account_id>/<gateway>/workers-ai?model=@cf/deepgram/nova-3&encoding=linear16&sample_rate=16000&interim_results=true",
  {
    headers: {
      "cf-aig-authorization": process.env.CLOUDFLARE_API_KEY,
    },
  },
);

// Configure microphone
const micInstance = mic({
  rate: "16000",
  channels: "1",
  debug: false,
  exitOnSilence: 6,
});

const micInputStream = micInstance.getAudioStream();

micInputStream.on("data", (data) => {
  if (ws.readyState === WebSocket.OPEN) {
    ws.send(data);
  }
});

micInputStream.on("error", (error) => {
  console.error("Microphone error:", error);
});

ws.onopen = () => {
  console.log("Connected to WebSocket");
  console.log("Starting microphone...");
  micInstance.start();
};

ws.onmessage = (event) => {
  try {
    const message = JSON.parse(event.data);
    if (message.channel?.alternatives?.[0]?.transcript) {
      if (message.is_final) {
        console.log("Final transcript:", message.channel.alternatives[0].transcript);
      } else {
        console.log("Interim transcript:", message.channel.alternatives[0].transcript);
      }
    }
  } catch (error) {
    console.error("Error parsing message:", error);
  }
};

ws.onerror = (error) => {
  console.error("WebSocket error:", error);
};

ws.onclose = () => {
  console.log("WebSocket closed");
  micInstance.stop();
};
```
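
To stop a session cleanly rather than dropping the socket, Deepgram's streaming protocol defines a `CloseStream` control message that asks the server to finalize pending transcripts before closing. Whether the gateway forwards it unchanged is an assumption here, so treat this as a sketch and verify against the Workers AI model documentation:

```javascript
// Assumption: the gateway passes Deepgram's CloseStream control message
// through to the model. It asks the server to finalize any pending
// transcripts before the connection is closed.
function stopTranscription() {
  micInstance.stop();
  if (ws.readyState === WebSocket.OPEN) {
    ws.send(JSON.stringify({ type: "CloseStream" }));
  }
}

// For example, stop cleanly on Ctrl+C.
process.on("SIGINT", stopTranscription);
```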

#### Text-to-Speech (TTS)

Workers AI supports the Deepgram `@cf/deepgram/aura-1` model for TTS. The following example demonstrates converting text input to audio:

```javascript
import WebSocket from "ws";
import readline from "readline";
import Speaker from "speaker";

const ws = new WebSocket(
  "wss://gateway.ai.cloudflare.com/v1/<account_id>/<gateway>/workers-ai?model=@cf/deepgram/aura-1",
  {
    headers: {
      "cf-aig-authorization": process.env.CLOUDFLARE_API_KEY,
    },
  },
);

// Speaker management
let currentSpeaker = null;
let isPlayingAudio = false;

// Create a speaker on demand (16-bit mono PCM at 24 kHz)
function ensureSpeaker() {
  if (!currentSpeaker) {
    currentSpeaker = new Speaker({
      channels: 1,
      bitDepth: 16,
      sampleRate: 24000,
    });
    isPlayingAudio = true;
  }
}

// Set up readline for text input
const rl = readline.createInterface({
  input: process.stdin,
  output: process.stdout,
  prompt: 'Enter text to speak (or "quit" to exit): ',
});

ws.onopen = () => {
  console.log("Connected to Deepgram TTS WebSocket");
  rl.prompt();
};

ws.onmessage = (event) => {
  // Check if the message is JSON (metadata, flushed, etc.) or raw audio
  if (event.data instanceof Buffer || event.data instanceof ArrayBuffer) {
    // Raw audio data
    ensureSpeaker();
    currentSpeaker.write(Buffer.from(event.data));
  } else {
    try {
      const message = JSON.parse(event.data);
      switch (message.type) {
        case "Metadata":
          console.log("Model info:", message.model_name, message.model_version);
          break;
        case "Flushed":
          console.log("Audio complete");
          // End speaker after flush to prevent buffer underflow
          if (currentSpeaker && isPlayingAudio) {
            currentSpeaker.end();
            currentSpeaker = null;
            isPlayingAudio = false;
          }
          rl.prompt();
          break;
        case "Cleared":
          console.log("Audio cleared, sequence:", message.sequence_id);
          break;
        case "Warning":
          console.warn("Warning:", message.description);
          break;
      }
    } catch (error) {
      // Not JSON; might be raw audio as a string
      ensureSpeaker();
      currentSpeaker.write(Buffer.from(event.data));
    }
  }
};

ws.onerror = (error) => {
  console.error("WebSocket error:", error);
};

ws.onclose = () => {
  console.log("WebSocket closed");
  if (currentSpeaker) {
    currentSpeaker.end();
  }
  rl.close();
  process.exit(0);
};

// Handle user input
rl.on("line", (input) => {
  const text = input.trim();

  if (text.toLowerCase() === "quit") {
    // Send Close message
    ws.send(JSON.stringify({ type: "Close" }));
    ws.close();
    return;
  }

  if (text.length > 0) {
    // Send text to TTS
    ws.send(
      JSON.stringify({
        type: "Speak",
        text: text,
      }),
    );

    // Flush to get audio immediately
    ws.send(JSON.stringify({ type: "Flush" }));
    console.log("Flushing audio");
  }

  rl.prompt();
});

rl.on("close", () => {
  if (ws.readyState === WebSocket.OPEN) {
    ws.close();
  }
});
```
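
The handler above already listens for `Cleared` responses, which the server sends after a `Clear` control message discards audio it has buffered but not yet sent. A minimal sketch of interrupting playback this way, assuming the gateway forwards the message as Deepgram's API defines it:

```javascript
// Assumption: the gateway forwards Deepgram's Clear control message. The
// server discards buffered audio and replies with a "Cleared" message
// (handled above) carrying the sequence it stopped at.
function interruptPlayback() {
  if (ws.readyState === WebSocket.OPEN) {
    ws.send(JSON.stringify({ type: "Clear" }));
  }
  // Also stop local playback immediately.
  if (currentSpeaker) {
    currentSpeaker.end();
    currentSpeaker = null;
    isPlayingAudio = false;
  }
}
```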
