diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index b03e112ebc..ff3c9a4a54 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -49,6 +49,7 @@ app.use("/webhook/stripe", verifyStripeWebhook); if (env.NODE_ENV !== "development") { app.use("/listen", loadTestOverride, requireSupabaseAuth); + /* Guards the batch route with the same middleware pair as the listen route; this branch is skipped when NODE_ENV is development. */app.use("/transcribe", loadTestOverride, requireSupabaseAuth); } app.route("/", routes); diff --git a/apps/api/src/routes.ts b/apps/api/src/routes.ts index 72f1c608eb..9a1087ee6f 100644 --- a/apps/api/src/routes.ts +++ b/apps/api/src/routes.ts @@ -55,6 +55,39 @@ const WebSocketErrorSchema = z.object({ detail: z.string().optional(), }); +/** Schema for one transcribed word: text, start and end times, confidence, plus an optional nullable speaker index and punctuated form. */const BatchWordSchema = z.object({ + word: z.string(), + start: z.number(), + end: z.number(), + confidence: z.number(), + speaker: z.number().nullable().optional(), + punctuated_word: z.string().nullable().optional(), +}); + +/** Schema for one transcript alternative: full text, overall confidence and its word list. */const BatchAlternativesSchema = z.object({ + transcript: z.string(), + confidence: z.number(), + words: z.array(BatchWordSchema), +}); + +/** Schema for one audio channel, holding its list of alternatives. */const BatchChannelSchema = z.object({ + alternatives: z.array(BatchAlternativesSchema), +}); + +/** Schema for the results section: the list of channels. */const BatchResultsSchema = z.object({ + channels: z.array(BatchChannelSchema), +}); + +/** Schema for the 200 response body; metadata is left as unknown and is not validated further. */const BatchResponseSchema = z.object({ + metadata: z.unknown(), + results: BatchResultsSchema, +}); + +/** Schema for the 400, 500 and 502 JSON error bodies: an error code with an optional detail message. */const BatchErrorSchema = z.object({ + error: z.string(), + detail: z.string().optional(), +}); + export const routes = new Hono(); routes.get( @@ -336,3 +369,121 @@ routes.get( return listenSocketHandler(c, next); }, ); + +/** POST /transcribe - batch speech-to-text over HTTP. The OpenAPI description is declared first, followed by the request handler. */routes.post( + "/transcribe", + describeRoute({ + tags: [API_TAGS.APP], + summary: "Batch speech-to-text transcription", + description: + "HTTP endpoint for batch speech-to-text transcription via file upload. Supports Deepgram, AssemblyAI, and Soniox providers. Use query parameter ?provider=deepgram|assemblyai|soniox to select provider. 
Requires Supabase authentication.", + security: [{ Bearer: [] }], + responses: { + 200: { + description: "Transcription completed successfully", + content: { + "application/json": { + schema: resolver(BatchResponseSchema), + }, + }, + }, + 400: { + description: "Bad request - missing or invalid audio file", + content: { + "application/json": { + schema: resolver(BatchErrorSchema), + }, + }, + }, + 401: { + description: "Unauthorized - missing or invalid authentication", + content: { + "text/plain": { + schema: { type: "string", example: "unauthorized" }, + }, + }, + }, + 500: { + description: "Internal server error during transcription", + content: { + "application/json": { + schema: resolver(BatchErrorSchema), + }, + }, + }, + 502: { + description: "Upstream STT service error", + content: { + "application/json": { + schema: resolver(BatchErrorSchema), + }, + }, + }, + }, + }), + /** Handler: buffers the request body as audio, sends it to the provider chosen by the provider query parameter (default deepgram) and returns the provider result as JSON. Failures are reported to Sentry and answered with 502 or 500. */async (c) => { + const { transcribeBatch } = await import("./stt"); + /* Local copy of the BatchProvider union that batch-types.ts in this patch also exports. */type BatchProvider = "deepgram" | "assemblyai" | "soniox"; + + const clientUrl = new URL(c.req.url, "http://localhost"); + /* NOTE(review): the provider value is only cast, not validated. An unrecognized value is used unchanged in the span name and the metric label, and transcribeBatch sends it to Deepgram through its default case - consider rejecting unknown values with a 400. */const provider = + (clientUrl.searchParams.get("provider") as BatchProvider) ?? "deepgram"; + + /* Repeated language and keyword query parameters are collected into arrays. */const languages = clientUrl.searchParams.getAll("language"); + const keywords = clientUrl.searchParams.getAll("keyword"); + const model = clientUrl.searchParams.get("model") ?? undefined; + + const contentType = + c.req.header("content-type") ?? 
"application/octet-stream"; + + return Sentry.startSpan( + { op: "http.client", name: `stt.batch.${provider}` }, + async (span) => { + /* The timer starts before the body is read, so the recorded latency covers the body read as well as the provider call. */const startTime = performance.now(); + + try { + /* The whole request body is buffered in memory before it is forwarded. */const audioData = await c.req.arrayBuffer(); + + /* Empty body: answer 400 before any provider call; no latency metric or status attribute is recorded on this path. */if (!audioData || audioData.byteLength === 0) { + return c.json( + { error: "missing_audio_data", detail: "Request body is empty" }, + 400, + ); + } + + span.setAttribute("stt.provider", provider); + span.setAttribute("stt.audio_size", audioData.byteLength); + + /* No fileName argument is passed here, so a Soniox upload uses the default name declared by transcribeWithSoniox. */const response = await transcribeBatch( + provider, + audioData, + contentType, + { languages, keywords, model }, + ); + + Metrics.upstreamLatency(provider, performance.now() - startTime); + span.setAttribute("http.status_code", 200); + + return c.json(response, 200); + } catch (error) { + Metrics.upstreamLatency(provider, performance.now() - startTime); + + const errorMessage = + error instanceof Error ? error.message : "unknown error"; + /* NOTE(review): upstream failures are recognized only by the substring failed: in the message. The timed-out and unexpected-status errors thrown by the provider modules in this patch do not contain it and are therefore answered with 500 - confirm that this is intended. */const isUpstreamError = errorMessage.includes("failed:"); + + Sentry.captureException(error, { + tags: { provider, operation: "batch_transcribe" }, + }); + + span.setAttribute("http.status_code", isUpstreamError ? 502 : 500); + + /* The raw error message, which can embed the upstream response text, is returned to the client as detail. */return c.json( + { error: "transcription_failed", detail: errorMessage }, + isUpstreamError ? 
502 : 500, + ); + } + }, + ); + }, +); diff --git a/apps/api/src/stt/batch-assemblyai.ts b/apps/api/src/stt/batch-assemblyai.ts new file mode 100644 index 0000000000..e02515c811 --- /dev/null +++ b/apps/api/src/stt/batch-assemblyai.ts @@ -0,0 +1,192 @@ +import { env } from "../env"; +import type { + BatchAlternatives, + BatchChannel, + BatchParams, + BatchResponse, + BatchResults, + BatchWord, +} from "./batch-types"; + +const ASSEMBLYAI_API_URL = "https://api.assemblyai.com/v2"; +/* Polling budget: at most MAX_POLL_ATTEMPTS polls with POLL_INTERVAL_MS between them (200 x 3000 ms, roughly ten minutes of waiting). */const POLL_INTERVAL_MS = 3000; +const MAX_POLL_ATTEMPTS = 200; + +/** Fields of an AssemblyAI word object that this module reads. */type AssemblyAIWord = { + text: string; + start: number; + end: number; + confidence: number; + speaker?: string; +}; + +/** Fields of an AssemblyAI transcript object that this module reads. */type AssemblyAITranscriptResponse = { + id: string; + status: string; + text?: string; + words?: AssemblyAIWord[]; + confidence?: number; + audio_duration?: number; + error?: string; +}; + +/** Uploads the raw audio bytes to the AssemblyAI upload endpoint and returns the upload_url field of the response. Throws an Error with the status and response text when the response is not OK. NOTE(review): the return annotation reads Promise with no type argument, and the Record annotations in this patch likewise have none; the arguments appear to have been lost from this copy of the patch - restore them from the original commit. */const uploadAudio = async (audioData: ArrayBuffer): Promise => { + const response = await fetch(`${ASSEMBLYAI_API_URL}/upload`, { + method: "POST", + headers: { + Authorization: env.ASSEMBLYAI_API_KEY, + "Content-Type": "application/octet-stream", + }, + body: audioData, + }); + + if (!response.ok) { + const errorText = await response.text(); + throw new Error( + `AssemblyAI upload failed: ${response.status} - ${errorText}`, + ); + } + + const result = (await response.json()) as { upload_url: string }; + return result.upload_url; +}; + +/** Creates a transcript job for an uploaded audio URL and returns its id. Speaker labels are always requested. Keywords are sent as keyterms_prompt and the model as speech_model. Throws when the response is not OK. */const createTranscript = async ( + audioUrl: string, + params: BatchParams, +): Promise => { + /* Only a request with exactly one language pins language_code. */const languageCode = + params.languages && params.languages.length === 1 + ? 
params.languages[0] + : undefined; + /* Detection is requested when no language or more than one language was supplied. */const languageDetection = + !params.languages || + params.languages.length === 0 || + params.languages.length > 1; + + const requestBody: Record = { + audio_url: audioUrl, + speaker_labels: true, + }; + + if (languageCode) { + requestBody.language_code = languageCode; + } + if (languageDetection) { + requestBody.language_detection = true; + } + if (params.keywords && params.keywords.length > 0) { + requestBody.keyterms_prompt = params.keywords; + } + if (params.model) { + requestBody.speech_model = params.model; + } + + const response = await fetch(`${ASSEMBLYAI_API_URL}/transcript`, { + method: "POST", + headers: { + Authorization: env.ASSEMBLYAI_API_KEY, + "Content-Type": "application/json", + }, + body: JSON.stringify(requestBody), + }); + + if (!response.ok) { + const errorText = await response.text(); + throw new Error( + `AssemblyAI transcript creation failed: ${response.status} - ${errorText}`, + ); + } + + const result = (await response.json()) as { id: string }; + return result.id; +}; + +/** Polls the transcript until its status is completed (the transcript is returned) or error (an Error is thrown). Throws a timed-out Error once MAX_POLL_ATTEMPTS polls have been made. */const pollTranscript = async ( + transcriptId: string, +): Promise => { + for (let attempt = 0; attempt < MAX_POLL_ATTEMPTS; attempt++) { + const response = await fetch( + `${ASSEMBLYAI_API_URL}/transcript/${transcriptId}`, + { + headers: { + Authorization: env.ASSEMBLYAI_API_KEY, + }, + }, + ); + + /* A single non-OK poll response aborts the whole transcription; it is not retried. */if (!response.ok) { + const errorText = await response.text(); + throw new Error( + `AssemblyAI poll failed: ${response.status} - ${errorText}`, + ); + } + + const result = (await response.json()) as AssemblyAITranscriptResponse; + + if (result.status === "completed") { + return result; + } + + if (result.status === "error") { + throw new Error( + `AssemblyAI transcription failed: ${result.error ?? 
"unknown error"}`, + ); + } + + /* Any other status: wait POLL_INTERVAL_MS and poll again. */await new Promise((resolve) => setTimeout(resolve, POLL_INTERVAL_MS)); + } + + throw new Error("AssemblyAI transcription timed out"); +}; + +/** Converts an AssemblyAI transcript into the shared BatchResponse shape with one channel and one alternative. Missing words become an empty list, missing text an empty string and missing confidence 1.0. */const convertToResponse = ( + result: AssemblyAITranscriptResponse, +): BatchResponse => { + const words: BatchWord[] = (result.words ?? []).map((w) => { + /* NOTE(review): the speaker label is reduced to its digits before parseInt. A label without digits (for example a letter-only label) gives NaN and is reported as undefined below - confirm the label format AssemblyAI returns. */const speaker = w.speaker + ? parseInt(w.speaker.replace(/\D/g, ""), 10) + : undefined; + + return { + word: w.text, + /* NOTE(review): presumably converts milliseconds to seconds - confirm the unit of the AssemblyAI word times. */start: w.start / 1000, + end: w.end / 1000, + confidence: w.confidence, + speaker: Number.isNaN(speaker) ? undefined : speaker, + /* The same text is used for both word and punctuated_word. */punctuated_word: w.text, + }; + }); + + const alternatives: BatchAlternatives = { + transcript: result.text ?? "", + confidence: result.confidence ?? 1.0, + words, + }; + + const channel: BatchChannel = { + alternatives: [alternatives], + }; + + const results: BatchResults = { + channels: [channel], + }; + + return { + metadata: { + audio_duration: result.audio_duration, + }, + results, + }; +}; + +/** Transcribes audio with AssemblyAI by running upload, transcript creation, polling and conversion in sequence. The content type argument is not used. Errors from any step propagate to the caller. */export const transcribeWithAssemblyAI = async ( + audioData: ArrayBuffer, + _contentType: string, + params: BatchParams, +): Promise => { + const uploadUrl = await uploadAudio(audioData); + const transcriptId = await createTranscript(uploadUrl, params); + const result = await pollTranscript(transcriptId); + return convertToResponse(result); +}; diff --git a/apps/api/src/stt/batch-deepgram.ts b/apps/api/src/stt/batch-deepgram.ts new file mode 100644 index 0000000000..d3464fe12e --- /dev/null +++ b/apps/api/src/stt/batch-deepgram.ts @@ -0,0 +1,53 @@ +import { env } from "../env"; +import type { BatchParams, BatchResponse } from "./batch-types"; + +const DEEPGRAM_BATCH_URL = "https://api.deepgram.com/v1/listen"; + +/** Transcribes audio with one POST to the Deepgram listen endpoint, forwarding the caller content type. The model defaults to nova-3-general; smart_format, diarize and punctuate are always on. Throws an Error with the status and response text when the response is not OK. */export const transcribeWithDeepgram = async ( + audioData: ArrayBuffer, + contentType: string, + params: BatchParams, +): Promise => { + const url = new URL(DEEPGRAM_BATCH_URL); + + url.searchParams.set("model", params.model ?? 
"nova-3-general"); + url.searchParams.set("smart_format", "true"); + url.searchParams.set("diarize", "true"); + url.searchParams.set("punctuate", "true"); + url.searchParams.set("mip_opt_out", "false"); + + /* Exactly one language is passed through as language; zero or several switch to detect_language. */if (params.languages && params.languages.length > 0) { + if (params.languages.length === 1) { + url.searchParams.set("language", params.languages[0]); + } else { + url.searchParams.set("detect_language", "true"); + } + } else { + url.searchParams.set("detect_language", "true"); + } + + /* NOTE(review): each keyword is appended as a keywords parameter - confirm that this parameter is accepted by the default nova-3-general model. */if (params.keywords && params.keywords.length > 0) { + for (const keyword of params.keywords) { + url.searchParams.append("keywords", keyword); + } + } + + const response = await fetch(url.toString(), { + method: "POST", + headers: { + Authorization: `Token ${env.DEEPGRAM_API_KEY}`, + "Content-Type": contentType, + Accept: "application/json", + }, + body: audioData, + }); + + if (!response.ok) { + const errorText = await response.text(); + throw new Error( + `Deepgram batch transcription failed: ${response.status} - ${errorText}`, + ); + } + + /* The Deepgram JSON body is returned as-is (cast only); it is not checked against the batch response shape. */return response.json() as Promise; +}; diff --git a/apps/api/src/stt/batch-soniox.ts b/apps/api/src/stt/batch-soniox.ts new file mode 100644 index 0000000000..611f629b63 --- /dev/null +++ b/apps/api/src/stt/batch-soniox.ts @@ -0,0 +1,216 @@ +import { env } from "../env"; +import type { + BatchAlternatives, + BatchChannel, + BatchParams, + BatchResponse, + BatchResults, + BatchWord, +} from "./batch-types"; + +const SONIOX_API_HOST = "api.soniox.com"; +/* Polling budget: at most 200 polls, 3000 ms apart. */const POLL_INTERVAL_MS = 3000; +const MAX_POLL_ATTEMPTS = 200; + +/** Fields of a Soniox token that this module reads; every field except text is optional. */type SonioxToken = { + text: string; + start_ms?: number; + end_ms?: number; + confidence?: number; + speaker?: number | string; +}; + +/** Transcript payload: the full text and its tokens. */type SonioxTranscriptResponse = { + text: string; + tokens: SonioxToken[]; +}; + +/** Job status payload read while polling. */type SonioxTranscriptionStatus = { + status: string; + error_message?: string; +}; + +/** Uploads the audio as multipart form data under the field name file and returns the id from the response. Throws when the response is not OK. */const uploadFile = async ( + audioData: ArrayBuffer, + fileName: string, +): Promise => { + const formData = new FormData(); + 
const blob = new Blob([audioData]); + /* The blob is created without a MIME type, so the request content type is not forwarded to Soniox. */formData.append("file", blob, fileName); + + const response = await fetch(`https://${SONIOX_API_HOST}/v1/files`, { + method: "POST", + headers: { + Authorization: `Bearer ${env.SONIOX_API_KEY}`, + }, + body: formData, + }); + + if (!response.ok) { + const errorText = await response.text(); + throw new Error(`Soniox upload failed: ${response.status} - ${errorText}`); + } + + const result = (await response.json()) as { id: string }; + return result.id; +}; + +/** Creates a transcription job for an uploaded file and returns its id. The model defaults to stt-async-v3, and the name stt-v3 is rewritten to stt-async-v3. Languages are sent as language_hints and keywords as context terms. Throws when the response is not OK. */const createTranscription = async ( + fileId: string, + params: BatchParams, +): Promise => { + const model = params.model ?? "stt-async-v3"; + const languageHints = params.languages ?? []; + + /* Speaker diarization and language identification are always enabled. */const requestBody: Record = { + model: model === "stt-v3" ? "stt-async-v3" : model, + file_id: fileId, + enable_speaker_diarization: true, + enable_language_identification: true, + }; + + if (languageHints.length > 0) { + requestBody.language_hints = languageHints; + } + + if (params.keywords && params.keywords.length > 0) { + requestBody.context = { + terms: params.keywords, + }; + } + + const response = await fetch(`https://${SONIOX_API_HOST}/v1/transcriptions`, { + method: "POST", + headers: { + Authorization: `Bearer ${env.SONIOX_API_KEY}`, + "Content-Type": "application/json", + }, + body: JSON.stringify(requestBody), + }); + + if (!response.ok) { + const errorText = await response.text(); + throw new Error( + `Soniox transcription creation failed: ${response.status} - ${errorText}`, + ); + } + + const result = (await response.json()) as { id: string }; + return result.id; +}; + +/** Polls the job until its status is completed. Throws on a non-OK response, on the error status, on any status other than queued or processing, and once MAX_POLL_ATTEMPTS polls have been made. */const pollTranscription = async (transcriptionId: string): Promise => { + for (let attempt = 0; attempt < MAX_POLL_ATTEMPTS; attempt++) { + const response = await fetch( + `https://${SONIOX_API_HOST}/v1/transcriptions/${transcriptionId}`, + { + headers: { + Authorization: `Bearer ${env.SONIOX_API_KEY}`, + }, + }, + ); + + if (!response.ok) { + const errorText = await response.text(); + throw new 
Error(`Soniox poll failed: ${response.status} - ${errorText}`); + } + + const result = (await response.json()) as SonioxTranscriptionStatus; + + if (result.status === "completed") { + return; + } + + if (result.status === "error") { + throw new Error( + `Soniox transcription failed: ${result.error_message ?? "unknown error"}`, + ); + } + + /* Any status other than queued or processing is treated as a hard failure instead of being polled again. */if (result.status !== "queued" && result.status !== "processing") { + throw new Error(`Soniox unexpected status: ${result.status}`); + } + + await new Promise((resolve) => setTimeout(resolve, POLL_INTERVAL_MS)); + } + + throw new Error("Soniox transcription timed out"); +}; + +/** Fetches the transcript (text and tokens) of a transcription id. Throws when the response is not OK; the JSON body is cast without validation. */const getTranscript = async ( + transcriptionId: string, +): Promise => { + const response = await fetch( + `https://${SONIOX_API_HOST}/v1/transcriptions/${transcriptionId}/transcript`, + { + headers: { + Authorization: `Bearer ${env.SONIOX_API_KEY}`, + }, + }, + ); + + if (!response.ok) { + const errorText = await response.text(); + throw new Error( + `Soniox get transcript failed: ${response.status} - ${errorText}`, + ); + } + + return response.json() as Promise; +}; + +/** Normalizes a speaker value to a number. Undefined stays undefined, a negative number becomes undefined, and a string is reduced to its digits and parsed, giving undefined when no digits remain. */const parseSpeaker = ( + speaker: number | string | undefined, +): number | undefined => { + if (speaker === undefined) { + return undefined; + } + if (typeof speaker === "number") { + return speaker >= 0 ? speaker : undefined; + } + const parsed = parseInt(speaker.replace(/\D/g, ""), 10); + return Number.isNaN(parsed) ? undefined : parsed; +}; + +/** Converts a Soniox transcript into the shared BatchResponse shape with one channel and one alternative. NOTE(review): every token is emitted as one word entry - confirm that Soniox tokens correspond to whole words. */const convertToResponse = (result: SonioxTranscriptResponse): BatchResponse => { + const words: BatchWord[] = result.tokens.map((token) => ({ + word: token.text, + /* Millisecond fields are divided by 1000; a missing time becomes 0. */start: (token.start_ms ?? 0) / 1000, + end: (token.end_ms ?? 0) / 1000, + confidence: token.confidence ?? 
1.0, + speaker: parseSpeaker(token.speaker), + punctuated_word: token.text, + })); + + const alternatives: BatchAlternatives = { + transcript: result.text, + /* The overall confidence is a fixed 1.0; no transcript-level value is read from the Soniox result. */confidence: 1.0, + words, + }; + + const channel: BatchChannel = { + alternatives: [alternatives], + }; + + const results: BatchResults = { + channels: [channel], + }; + + return { + /* No metadata is populated for Soniox. */metadata: {}, + results, + }; +}; + +/** Transcribes audio with Soniox by running upload, job creation, polling, transcript fetch and conversion in sequence. The content type argument is not used and fileName defaults to audio.wav. NOTE(review): nothing in this module deletes the uploaded file or the transcription afterwards - confirm whether cleanup is required. */export const transcribeWithSoniox = async ( + audioData: ArrayBuffer, + _contentType: string, + params: BatchParams, + fileName: string = "audio.wav", +): Promise => { + const fileId = await uploadFile(audioData, fileName); + const transcriptionId = await createTranscription(fileId, params); + await pollTranscription(transcriptionId); + const transcript = await getTranscript(transcriptionId); + return convertToResponse(transcript); +}; diff --git a/apps/api/src/stt/batch-types.ts b/apps/api/src/stt/batch-types.ts new file mode 100644 index 0000000000..af542b3173 --- /dev/null +++ b/apps/api/src/stt/batch-types.ts @@ -0,0 +1,35 @@ +/** One transcribed word in the provider-independent response. */export type BatchWord = { + word: string; + start: number; + end: number; + confidence: number; + speaker?: number | null; + punctuated_word?: string | null; +}; + +/** One transcript alternative: full text, overall confidence and its words. */export type BatchAlternatives = { + transcript: string; + confidence: number; + words: BatchWord[]; +}; + +/** One audio channel with its alternatives. */export type BatchChannel = { + alternatives: BatchAlternatives[]; +}; + +/** The results section: the list of channels. */export type BatchResults = { + channels: BatchChannel[]; +}; + +/** Full batch response; metadata is provider specific and typed as unknown. */export type BatchResponse = { + metadata: unknown; + results: BatchResults; +}; + +/** Provider names accepted by transcribeBatch. */export type BatchProvider = "deepgram" | "assemblyai" | "soniox"; + +/** Optional settings passed to every provider: languages, keyword hints and a model name. */export type BatchParams = { + languages?: string[]; + keywords?: string[]; + model?: string; +}; diff --git a/apps/api/src/stt/index.ts b/apps/api/src/stt/index.ts index ae02030b1f..c30e3fc09f 100644 --- a/apps/api/src/stt/index.ts +++ b/apps/api/src/stt/index.ts @@ -1,4 +1,8 @@ import { createAssemblyAIProxy } from "./assemblyai"; +import { transcribeWithAssemblyAI } from "./batch-assemblyai"; +import { 
transcribeWithDeepgram } from "./batch-deepgram"; +import { transcribeWithSoniox } from "./batch-soniox"; +import type { BatchParams, BatchProvider, BatchResponse } from "./batch-types"; import { WsProxyConnection } from "./connection"; import { createDeepgramProxy } from "./deepgram"; import { createSonioxProxy } from "./soniox"; @@ -8,12 +12,34 @@ export { normalizeWsData, type WsPayload } from "./utils"; export { buildDeepgramUrl, createDeepgramProxy } from "./deepgram"; export { buildAssemblyAIUrl, createAssemblyAIProxy } from "./assemblyai"; export { buildSonioxUrl, createSonioxProxy } from "./soniox"; +export type { BatchParams, BatchProvider, BatchResponse } from "./batch-types"; +export { transcribeWithDeepgram } from "./batch-deepgram"; +export { transcribeWithAssemblyAI } from "./batch-assemblyai"; +export { transcribeWithSoniox } from "./batch-soniox"; export const UPSTREAM_URL_HEADER = "x-owh-upstream-url"; export const UPSTREAM_AUTH_HEADER = "x-owh-upstream-auth"; export type SttProvider = "deepgram" | "assemblyai" | "soniox"; +/** Dispatches a batch transcription to the chosen provider and returns its result. fileName is forwarded only to Soniox. Errors thrown by the provider function propagate to the caller. */export async function transcribeBatch( + provider: BatchProvider, + audioData: ArrayBuffer, + contentType: string, + params: BatchParams, + fileName?: string, +): Promise { + switch (provider) { + case "assemblyai": + return transcribeWithAssemblyAI(audioData, contentType, params); + case "soniox": + return transcribeWithSoniox(audioData, contentType, params, fileName); + /* deepgram shares the default branch, so any other runtime value is also sent to Deepgram. */case "deepgram": + default: + return transcribeWithDeepgram(audioData, contentType, params); + } +} + export function createProxyFromRequest( incomingUrl: URL, reqHeaders: Headers,