Skip to content

Commit 2bf214c

Browse files
feat(api): add batch STT transcription endpoint (#2146)
* feat(api): add batch STT transcription endpoint Add POST /transcribe endpoint for batch speech-to-text transcription via file upload. This mirrors the existing real-time WebSocket proxy pattern but for batch processing. Features: - Support for Deepgram, AssemblyAI, and Soniox providers via ?provider= query param - Normalized BatchResponse format matching owhisper_interface::batch::Response - Proper polling for async providers (AssemblyAI, Soniox) - OpenAPI documentation with Zod schemas - Sentry tracing and metrics integration Usage: POST /transcribe?provider=deepgram&language=en Content-Type: audio/wav <audio data> Co-Authored-By: yujonglee <[email protected]> * fix(api): add params.model support to AssemblyAI batch handler Wire params.model to AssemblyAI's speech_model parameter for model selection, matching the behavior of Deepgram and Soniox handlers. Addresses CodeRabbit review feedback. Co-Authored-By: yujonglee <[email protected]> * fix(api): add requireSupabaseAuth middleware to /transcribe endpoint Apply the same auth middleware pattern as /listen to ensure the /transcribe endpoint requires Supabase authentication as documented in the OpenAPI spec. Addresses CodeRabbit review feedback. Co-Authored-By: yujonglee <[email protected]> --------- Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
1 parent 775c138 commit 2bf214c

File tree

7 files changed

+674
-0
lines changed

7 files changed

+674
-0
lines changed

apps/api/src/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ app.use("/webhook/stripe", verifyStripeWebhook);
4949

5050
if (env.NODE_ENV !== "development") {
5151
app.use("/listen", loadTestOverride, requireSupabaseAuth);
52+
app.use("/transcribe", loadTestOverride, requireSupabaseAuth);
5253
}
5354

5455
app.route("/", routes);

apps/api/src/routes.ts

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,39 @@ const WebSocketErrorSchema = z.object({
5555
detail: z.string().optional(),
5656
});
5757

58+
// Zod schemas describing the normalized batch transcription payload.
// The shape mirrors owhisper_interface::batch::Response (Deepgram-style
// channels -> alternatives -> words nesting) so every provider's output
// is presented identically to API consumers.

// A single recognized word with timing and confidence.
const BatchWordSchema = z.object({
  word: z.string(),
  start: z.number(),
  end: z.number(),
  confidence: z.number(),
  // Diarization speaker index; null/absent when the provider gives none.
  speaker: z.number().nullable().optional(),
  // Word with punctuation/casing applied, when the provider supplies it.
  punctuated_word: z.string().nullable().optional(),
});

// One transcription hypothesis for a channel.
const BatchAlternativesSchema = z.object({
  transcript: z.string(),
  confidence: z.number(),
  words: z.array(BatchWordSchema),
});

// Per-audio-channel results.
const BatchChannelSchema = z.object({
  alternatives: z.array(BatchAlternativesSchema),
});

const BatchResultsSchema = z.object({
  channels: z.array(BatchChannelSchema),
});

// Top-level 200 response body: provider-specific metadata plus the
// normalized results tree. metadata is intentionally untyped because
// each upstream provider returns a different shape.
const BatchResponseSchema = z.object({
  metadata: z.unknown(),
  results: BatchResultsSchema,
});

// JSON error payload used for 4xx/5xx responses from /transcribe.
const BatchErrorSchema = z.object({
  error: z.string(),
  detail: z.string().optional(),
});
90+
5891
export const routes = new Hono<AppBindings>();
5992

6093
routes.get(
@@ -336,3 +369,121 @@ routes.get(
336369
return listenSocketHandler(c, next);
337370
},
338371
);
372+
373+
routes.post(
374+
"/transcribe",
375+
describeRoute({
376+
tags: [API_TAGS.APP],
377+
summary: "Batch speech-to-text transcription",
378+
description:
379+
"HTTP endpoint for batch speech-to-text transcription via file upload. Supports Deepgram, AssemblyAI, and Soniox providers. Use query parameter ?provider=deepgram|assemblyai|soniox to select provider. Requires Supabase authentication.",
380+
security: [{ Bearer: [] }],
381+
responses: {
382+
200: {
383+
description: "Transcription completed successfully",
384+
content: {
385+
"application/json": {
386+
schema: resolver(BatchResponseSchema),
387+
},
388+
},
389+
},
390+
400: {
391+
description: "Bad request - missing or invalid audio file",
392+
content: {
393+
"application/json": {
394+
schema: resolver(BatchErrorSchema),
395+
},
396+
},
397+
},
398+
401: {
399+
description: "Unauthorized - missing or invalid authentication",
400+
content: {
401+
"text/plain": {
402+
schema: { type: "string", example: "unauthorized" },
403+
},
404+
},
405+
},
406+
500: {
407+
description: "Internal server error during transcription",
408+
content: {
409+
"application/json": {
410+
schema: resolver(BatchErrorSchema),
411+
},
412+
},
413+
},
414+
502: {
415+
description: "Upstream STT service error",
416+
content: {
417+
"application/json": {
418+
schema: resolver(BatchErrorSchema),
419+
},
420+
},
421+
},
422+
},
423+
}),
424+
async (c) => {
425+
const { transcribeBatch } = await import("./stt");
426+
type BatchProvider = "deepgram" | "assemblyai" | "soniox";
427+
428+
const clientUrl = new URL(c.req.url, "http://localhost");
429+
const provider =
430+
(clientUrl.searchParams.get("provider") as BatchProvider) ?? "deepgram";
431+
432+
const languages = clientUrl.searchParams.getAll("language");
433+
const keywords = clientUrl.searchParams.getAll("keyword");
434+
const model = clientUrl.searchParams.get("model") ?? undefined;
435+
436+
const contentType =
437+
c.req.header("content-type") ?? "application/octet-stream";
438+
439+
return Sentry.startSpan(
440+
{ op: "http.client", name: `stt.batch.${provider}` },
441+
async (span) => {
442+
const startTime = performance.now();
443+
444+
try {
445+
const audioData = await c.req.arrayBuffer();
446+
447+
if (!audioData || audioData.byteLength === 0) {
448+
return c.json(
449+
{ error: "missing_audio_data", detail: "Request body is empty" },
450+
400,
451+
);
452+
}
453+
454+
span.setAttribute("stt.provider", provider);
455+
span.setAttribute("stt.audio_size", audioData.byteLength);
456+
457+
const response = await transcribeBatch(
458+
provider,
459+
audioData,
460+
contentType,
461+
{ languages, keywords, model },
462+
);
463+
464+
Metrics.upstreamLatency(provider, performance.now() - startTime);
465+
span.setAttribute("http.status_code", 200);
466+
467+
return c.json(response, 200);
468+
} catch (error) {
469+
Metrics.upstreamLatency(provider, performance.now() - startTime);
470+
471+
const errorMessage =
472+
error instanceof Error ? error.message : "unknown error";
473+
const isUpstreamError = errorMessage.includes("failed:");
474+
475+
Sentry.captureException(error, {
476+
tags: { provider, operation: "batch_transcribe" },
477+
});
478+
479+
span.setAttribute("http.status_code", isUpstreamError ? 502 : 500);
480+
481+
return c.json(
482+
{ error: "transcription_failed", detail: errorMessage },
483+
isUpstreamError ? 502 : 500,
484+
);
485+
}
486+
},
487+
);
488+
},
489+
);
Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
import { env } from "../env";
2+
import type {
3+
BatchAlternatives,
4+
BatchChannel,
5+
BatchParams,
6+
BatchResponse,
7+
BatchResults,
8+
BatchWord,
9+
} from "./batch-types";
10+
11+
// AssemblyAI REST API base URL and polling policy for async transcript jobs.
const ASSEMBLYAI_API_URL = "https://api.assemblyai.com/v2";
// 3s between status polls; with 200 attempts this allows roughly 10 minutes
// of upstream processing before pollTranscript gives up.
const POLL_INTERVAL_MS = 3000;
const MAX_POLL_ATTEMPTS = 200;

// One word as returned by AssemblyAI. start/end are in milliseconds
// (convertToResponse divides by 1000). `speaker` is a diarization label
// string when speaker_labels is enabled — presumably letters like "A";
// NOTE(review): confirm label format against AssemblyAI docs.
type AssemblyAIWord = {
  text: string;
  start: number;
  end: number;
  confidence: number;
  speaker?: string;
};

// Subset of AssemblyAI's GET /transcript/{id} payload that this module
// reads. Only the "completed" and "error" statuses are acted on here;
// any other status is treated as still-in-progress.
type AssemblyAITranscriptResponse = {
  id: string;
  status: string;
  text?: string;
  words?: AssemblyAIWord[];
  confidence?: number;
  audio_duration?: number;
  error?: string;
};
32+
33+
const uploadAudio = async (audioData: ArrayBuffer): Promise<string> => {
34+
const response = await fetch(`${ASSEMBLYAI_API_URL}/upload`, {
35+
method: "POST",
36+
headers: {
37+
Authorization: env.ASSEMBLYAI_API_KEY,
38+
"Content-Type": "application/octet-stream",
39+
},
40+
body: audioData,
41+
});
42+
43+
if (!response.ok) {
44+
const errorText = await response.text();
45+
throw new Error(
46+
`AssemblyAI upload failed: ${response.status} - ${errorText}`,
47+
);
48+
}
49+
50+
const result = (await response.json()) as { upload_url: string };
51+
return result.upload_url;
52+
};
53+
54+
const createTranscript = async (
55+
audioUrl: string,
56+
params: BatchParams,
57+
): Promise<string> => {
58+
const languageCode =
59+
params.languages && params.languages.length === 1
60+
? params.languages[0]
61+
: undefined;
62+
const languageDetection =
63+
!params.languages ||
64+
params.languages.length === 0 ||
65+
params.languages.length > 1;
66+
67+
const requestBody: Record<string, unknown> = {
68+
audio_url: audioUrl,
69+
speaker_labels: true,
70+
};
71+
72+
if (languageCode) {
73+
requestBody.language_code = languageCode;
74+
}
75+
if (languageDetection) {
76+
requestBody.language_detection = true;
77+
}
78+
if (params.keywords && params.keywords.length > 0) {
79+
requestBody.keyterms_prompt = params.keywords;
80+
}
81+
if (params.model) {
82+
requestBody.speech_model = params.model;
83+
}
84+
85+
const response = await fetch(`${ASSEMBLYAI_API_URL}/transcript`, {
86+
method: "POST",
87+
headers: {
88+
Authorization: env.ASSEMBLYAI_API_KEY,
89+
"Content-Type": "application/json",
90+
},
91+
body: JSON.stringify(requestBody),
92+
});
93+
94+
if (!response.ok) {
95+
const errorText = await response.text();
96+
throw new Error(
97+
`AssemblyAI transcript creation failed: ${response.status} - ${errorText}`,
98+
);
99+
}
100+
101+
const result = (await response.json()) as { id: string };
102+
return result.id;
103+
};
104+
105+
const pollTranscript = async (
106+
transcriptId: string,
107+
): Promise<AssemblyAITranscriptResponse> => {
108+
for (let attempt = 0; attempt < MAX_POLL_ATTEMPTS; attempt++) {
109+
const response = await fetch(
110+
`${ASSEMBLYAI_API_URL}/transcript/${transcriptId}`,
111+
{
112+
headers: {
113+
Authorization: env.ASSEMBLYAI_API_KEY,
114+
},
115+
},
116+
);
117+
118+
if (!response.ok) {
119+
const errorText = await response.text();
120+
throw new Error(
121+
`AssemblyAI poll failed: ${response.status} - ${errorText}`,
122+
);
123+
}
124+
125+
const result = (await response.json()) as AssemblyAITranscriptResponse;
126+
127+
if (result.status === "completed") {
128+
return result;
129+
}
130+
131+
if (result.status === "error") {
132+
throw new Error(
133+
`AssemblyAI transcription failed: ${result.error ?? "unknown error"}`,
134+
);
135+
}
136+
137+
await new Promise((resolve) => setTimeout(resolve, POLL_INTERVAL_MS));
138+
}
139+
140+
throw new Error("AssemblyAI transcription timed out");
141+
};
142+
143+
const convertToResponse = (
144+
result: AssemblyAITranscriptResponse,
145+
): BatchResponse => {
146+
const words: BatchWord[] = (result.words ?? []).map((w) => {
147+
const speaker = w.speaker
148+
? parseInt(w.speaker.replace(/\D/g, ""), 10)
149+
: undefined;
150+
151+
return {
152+
word: w.text,
153+
start: w.start / 1000,
154+
end: w.end / 1000,
155+
confidence: w.confidence,
156+
speaker: Number.isNaN(speaker) ? undefined : speaker,
157+
punctuated_word: w.text,
158+
};
159+
});
160+
161+
const alternatives: BatchAlternatives = {
162+
transcript: result.text ?? "",
163+
confidence: result.confidence ?? 1.0,
164+
words,
165+
};
166+
167+
const channel: BatchChannel = {
168+
alternatives: [alternatives],
169+
};
170+
171+
const results: BatchResults = {
172+
channels: [channel],
173+
};
174+
175+
return {
176+
metadata: {
177+
audio_duration: result.audio_duration,
178+
},
179+
results,
180+
};
181+
};
182+
183+
export const transcribeWithAssemblyAI = async (
184+
audioData: ArrayBuffer,
185+
_contentType: string,
186+
params: BatchParams,
187+
): Promise<BatchResponse> => {
188+
const uploadUrl = await uploadAudio(audioData);
189+
const transcriptId = await createTranscript(uploadUrl, params);
190+
const result = await pollTranscript(transcriptId);
191+
return convertToResponse(result);
192+
};

0 commit comments

Comments
 (0)