fastrepl · yujonglee · Dec 5, 2025 · Dec 5, 2025 · Dec 5, 2025 · Dec 5, 2025
diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts
@@ -49,6 +49,7 @@ app.use("/webhook/stripe", verifyStripeWebhook);
 
 if (env.NODE_ENV !== "development") {
   app.use("/listen", loadTestOverride, requireSupabaseAuth);
+  app.use("/transcribe", loadTestOverride, requireSupabaseAuth); // batch STT gets the same middleware chain as /listen
 }
 
 app.route("/", routes);

diff --git a/apps/api/src/routes.ts b/apps/api/src/routes.ts
@@ -55,6 +55,39 @@ const WebSocketErrorSchema = z.object({
   detail: z.string().optional(),
 });
 
+/**
+ * One recognized word of a batch transcript. `speaker` and
+ * `punctuated_word` may each be absent or null.
+ */
+const BatchWordSchema = z.object({
+  word: z.string(),
+  start: z.number(),
+  end: z.number(),
+  confidence: z.number(),
+  speaker: z.number().nullish(),
+  punctuated_word: z.string().nullish(),
+});
+
+/** One transcription hypothesis: full text, overall confidence, and its words. */
+const BatchAlternativesSchema = z.object({
+  transcript: z.string(),
+  confidence: z.number(),
+  words: z.array(BatchWordSchema),
+});
+
+/** A channel carries a list of alternatives. */
+const BatchChannelSchema = z.object({
+  alternatives: z.array(BatchAlternativesSchema),
+});
+
+/** Results are grouped per channel. */
+const BatchResultsSchema = z.object({
+  channels: z.array(BatchChannelSchema),
+});
+
+/** Success body documented for POST /transcribe; `metadata` is left unvalidated. */
+const BatchResponseSchema = z.object({
+  metadata: z.unknown(),
+  results: BatchResultsSchema,
+});
+
+/** Error body documented for POST /transcribe (400 / 500 / 502). */
+const BatchErrorSchema = z.object({
+  error: z.string(),
+  detail: z.string().optional(),
+});
+
 export const routes = new Hono<AppBindings>();
 
 routes.get(
@@ -336,3 +369,121 @@ routes.get(
     return listenSocketHandler(c, next);
   },
 );
+
+/**
+ * POST /transcribe — batch speech-to-text over an uploaded audio body.
+ *
+ * Query parameters:
+ *   - `provider`: deepgram | assemblyai | soniox. Defaults to deepgram when
+ *     the parameter is absent; any other value is rejected with 400 instead
+ *     of being forwarded to the span name, metrics and `transcribeBatch`.
+ *   - `language` (repeatable), `keyword` (repeatable), `model` (optional).
+ *
+ * The raw request body is read as the audio payload and passed to
+ * `transcribeBatch` together with the request's Content-Type header
+ * (falling back to application/octet-stream).
+ *
+ * Responses: 200 with the provider result, 400 for an unsupported provider
+ * or empty body, 502 when the thrown error message looks like an upstream
+ * failure, 500 otherwise.
+ */
+routes.post(
+  "/transcribe",
+  describeRoute({
+    tags: [API_TAGS.APP],
+    summary: "Batch speech-to-text transcription",
+    description:
+      "HTTP endpoint for batch speech-to-text transcription via file upload. Supports Deepgram, AssemblyAI, and Soniox providers. Use query parameter ?provider=deepgram|assemblyai|soniox to select provider. Requires Supabase authentication.",
+    security: [{ Bearer: [] }],
+    responses: {
+      200: {
+        description: "Transcription completed successfully",
+        content: {
+          "application/json": {
+            schema: resolver(BatchResponseSchema),
+          },
+        },
+      },
+      400: {
+        description:
+          "Bad request - unsupported provider, or missing or invalid audio file",
+        content: {
+          "application/json": {
+            schema: resolver(BatchErrorSchema),
+          },
+        },
+      },
+      401: {
+        description: "Unauthorized - missing or invalid authentication",
+        content: {
+          "text/plain": {
+            schema: { type: "string", example: "unauthorized" },
+          },
+        },
+      },
+      500: {
+        description: "Internal server error during transcription",
+        content: {
+          "application/json": {
+            schema: resolver(BatchErrorSchema),
+          },
+        },
+      },
+      502: {
+        description: "Upstream STT service error",
+        content: {
+          "application/json": {
+            schema: resolver(BatchErrorSchema),
+          },
+        },
+      },
+    },
+  }),
+  async (c) => {
+    const { transcribeBatch } = await import("./stt");
+
+    const BATCH_PROVIDERS = ["deepgram", "assemblyai", "soniox"] as const;
+    type BatchProvider = (typeof BATCH_PROVIDERS)[number];
+
+    const clientUrl = new URL(c.req.url, "http://localhost");
+
+    // The provider comes straight from the query string, so check it at
+    // runtime; a type assertion alone would let arbitrary strings through.
+    const requestedProvider =
+      clientUrl.searchParams.get("provider") ?? "deepgram";
+    const provider: BatchProvider | undefined = BATCH_PROVIDERS.find(
+      (candidate) => candidate === requestedProvider,
+    );
+
+    if (!provider) {
+      return c.json(
+        {
+          error: "invalid_provider",
+          detail: `Unsupported provider "${requestedProvider}". Expected one of: ${BATCH_PROVIDERS.join(", ")}`,
+        },
+        400,
+      );
+    }
+
+    const languages = clientUrl.searchParams.getAll("language");
+    const keywords = clientUrl.searchParams.getAll("keyword");
+    const model = clientUrl.searchParams.get("model") ?? undefined;
+
+    const contentType =
+      c.req.header("content-type") ?? "application/octet-stream";
+
+    return Sentry.startSpan(
+      { op: "http.client", name: `stt.batch.${provider}` },
+      async (span) => {
+        const startTime = performance.now();
+
+        try {
+          const audioData = await c.req.arrayBuffer();
+
+          if (!audioData || audioData.byteLength === 0) {
+            // Record the outcome on the span for this early exit as well.
+            span.setAttribute("http.status_code", 400);
+            return c.json(
+              { error: "missing_audio_data", detail: "Request body is empty" },
+              400,
+            );
+          }
+
+          span.setAttribute("stt.provider", provider);
+          span.setAttribute("stt.audio_size", audioData.byteLength);
+
+          const response = await transcribeBatch(
+            provider,
+            audioData,
+            contentType,
+            { languages, keywords, model },
+          );
+
+          Metrics.upstreamLatency(provider, performance.now() - startTime);
+          span.setAttribute("http.status_code", 200);
+
+          return c.json(response, 200);
+        } catch (error) {
+          Metrics.upstreamLatency(provider, performance.now() - startTime);
+
+          const errorMessage =
+            error instanceof Error ? error.message : "unknown error";
+          // Upstream failures are recognized by message text: "... failed: ..."
+          // for rejected requests and "... timed out" for exhausted polling.
+          const isUpstreamError =
+            errorMessage.includes("failed:") ||
+            errorMessage.includes("timed out");
+          const status = isUpstreamError ? 502 : 500;
+
+          Sentry.captureException(error, {
+            tags: { provider, operation: "batch_transcribe" },
+          });
+
+          span.setAttribute("http.status_code", status);
+
+          return c.json(
+            { error: "transcription_failed", detail: errorMessage },
+            status,
+          );
+        }
+      },
+    );
+  },
+);
diff --git a/apps/api/src/stt/batch-assemblyai.ts b/apps/api/src/stt/batch-assemblyai.ts
@@ -0,0 +1,192 @@
+import { env } from "../env";
+import type {
+  BatchAlternatives,
+  BatchChannel,
+  BatchParams,
+  BatchResponse,
+  BatchResults,
+  BatchWord,
+} from "./batch-types";
+
+/** Base URL that every AssemblyAI request in this module is built from. */
+const ASSEMBLYAI_API_URL = "https://api.assemblyai.com/v2";
+/** Pause between two consecutive transcript status requests. */
+const POLL_INTERVAL_MS = 3000;
+/** Maximum number of status requests before polling gives up. */
+const MAX_POLL_ATTEMPTS = 200;
+
+/**
+ * Word entry as read from an AssemblyAI transcript.
+ * `start` / `end` are divided by 1000 during conversion, so they are
+ * presumably milliseconds — confirm against the AssemblyAI API reference.
+ */
+interface AssemblyAIWord {
+  text: string;
+  start: number;
+  end: number;
+  confidence: number;
+  speaker?: string;
+}
+
+/** Subset of the AssemblyAI transcript resource that this module reads. */
+interface AssemblyAITranscriptResponse {
+  id: string;
+  /** Polling stops on "completed" (success) or "error" (failure). */
+  status: string;
+  text?: string;
+  words?: AssemblyAIWord[];
+  confidence?: number;
+  audio_duration?: number;
+  error?: string;
+}
+
+/**
+ * Sends the raw audio bytes to AssemblyAI's `/upload` endpoint.
+ *
+ * @param audioData - Audio payload, posted as `application/octet-stream`.
+ * @returns The `upload_url` field of the JSON response.
+ * @throws Error when the response is not OK; the message carries the HTTP
+ *   status and the response body text.
+ */
+const uploadAudio = async (audioData: ArrayBuffer): Promise<string> => {
+  const uploadResponse = await fetch(`${ASSEMBLYAI_API_URL}/upload`, {
+    method: "POST",
+    headers: {
+      Authorization: env.ASSEMBLYAI_API_KEY,
+      "Content-Type": "application/octet-stream",
+    },
+    body: audioData,
+  });
+
+  if (uploadResponse.ok) {
+    const { upload_url: uploadUrl } = (await uploadResponse.json()) as {
+      upload_url: string;
+    };
+    return uploadUrl;
+  }
+
+  const failureBody = await uploadResponse.text();
+  throw new Error(
+    `AssemblyAI upload failed: ${uploadResponse.status} - ${failureBody}`,
+  );
+};
+
+/**
+ * Creates an AssemblyAI transcript job for previously uploaded audio.
+ *
+ * Language handling: exactly one requested language is sent as
+ * `language_code`; zero or several languages enable `language_detection`.
+ * Speaker labels are always requested. Keywords are sent as
+ * `keyterms_prompt` and the model as `speech_model` when provided.
+ *
+ * @param audioUrl - URL of the audio to transcribe.
+ * @param params - Optional languages, keywords and model.
+ * @returns The `id` field of the created transcript.
+ * @throws Error when the response is not OK; the message carries the HTTP
+ *   status and the response body text.
+ */
+const createTranscript = async (
+  audioUrl: string,
+  params: BatchParams,
+): Promise<string> => {
+  const requestedLanguages = params.languages ?? [];
+  const hasSingleLanguage = requestedLanguages.length === 1;
+  const singleLanguage = hasSingleLanguage ? requestedLanguages[0] : undefined;
+
+  const payload: Record<string, unknown> = {
+    audio_url: audioUrl,
+    speaker_labels: true,
+  };
+
+  if (singleLanguage) {
+    payload.language_code = singleLanguage;
+  }
+  if (!hasSingleLanguage) {
+    payload.language_detection = true;
+  }
+  if ((params.keywords?.length ?? 0) > 0) {
+    payload.keyterms_prompt = params.keywords;
+  }
+  if (params.model) {
+    payload.speech_model = params.model;
+  }
+
+  const createResponse = await fetch(`${ASSEMBLYAI_API_URL}/transcript`, {
+    method: "POST",
+    headers: {
+      Authorization: env.ASSEMBLYAI_API_KEY,
+      "Content-Type": "application/json",
+    },
+    body: JSON.stringify(payload),
+  });
+
+  if (createResponse.ok) {
+    const { id: transcriptId } = (await createResponse.json()) as {
+      id: string;
+    };
+    return transcriptId;
+  }
+
+  const failureBody = await createResponse.text();
+  throw new Error(
+    `AssemblyAI transcript creation failed: ${createResponse.status} - ${failureBody}`,
+  );
+};
+
+/**
+ * Polls an AssemblyAI transcript until it reaches a terminal status.
+ *
+ * Makes at most `MAX_POLL_ATTEMPTS` status requests, waiting
+ * `POLL_INTERVAL_MS` between consecutive requests. The wait happens before
+ * every attempt except the first, so no time is spent sleeping after the
+ * final attempt just to report a timeout.
+ *
+ * @param transcriptId - Identifier of the transcript to poll.
+ * @returns The transcript payload once its status is "completed".
+ * @throws Error when a status request is not OK, when the transcript
+ *   reports status "error", or when all attempts are used up.
+ */
+const pollTranscript = async (
+  transcriptId: string,
+): Promise<AssemblyAITranscriptResponse> => {
+  for (let attempt = 0; attempt < MAX_POLL_ATTEMPTS; attempt++) {
+    if (attempt > 0) {
+      await new Promise<void>((resolve) =>
+        setTimeout(resolve, POLL_INTERVAL_MS),
+      );
+    }
+
+    const response = await fetch(
+      `${ASSEMBLYAI_API_URL}/transcript/${transcriptId}`,
+      {
+        headers: {
+          Authorization: env.ASSEMBLYAI_API_KEY,
+        },
+      },
+    );
+
+    if (!response.ok) {
+      const errorText = await response.text();
+      throw new Error(
+        `AssemblyAI poll failed: ${response.status} - ${errorText}`,
+      );
+    }
+
+    const result = (await response.json()) as AssemblyAITranscriptResponse;
+
+    if (result.status === "completed") {
+      return result;
+    }
+
+    if (result.status === "error") {
+      throw new Error(
+        `AssemblyAI transcription failed: ${result.error ?? "unknown error"}`,
+      );
+    }
+  }
+
+  throw new Error("AssemblyAI transcription timed out");
+};
+
+/**
+ * Turns an AssemblyAI speaker label into a numeric speaker index.
+ *
+ * Labels that contain digits keep the previous behaviour: the digits are
+ * extracted and parsed. Single-letter labels ("A", "B", ...) are mapped to
+ * 0-based indices ("A" -> 0), since stripping non-digits from them leaves
+ * nothing to parse and the speaker would otherwise be dropped.
+ *
+ * @returns The speaker index, or undefined for a missing or unrecognized label.
+ */
+const speakerLabelToIndex = (
+  label: string | null | undefined,
+): number | undefined => {
+  if (!label) {
+    return undefined;
+  }
+
+  const digits = label.replace(/\D/g, "");
+  if (digits !== "") {
+    return parseInt(digits, 10);
+  }
+
+  const trimmed = label.trim();
+  if (/^[A-Za-z]$/.test(trimmed)) {
+    return trimmed.toUpperCase().charCodeAt(0) - "A".charCodeAt(0);
+  }
+
+  return undefined;
+};
+
+/**
+ * Maps a completed AssemblyAI transcript onto the shared batch response.
+ *
+ * - Word `start` / `end` values are divided by 1000.
+ * - `punctuated_word` repeats the word text.
+ * - A single channel with a single alternative is produced.
+ * - Missing `text` becomes "", missing `confidence` becomes 1.0, and missing
+ *   `words` becomes an empty list.
+ *
+ * @param result - Transcript payload returned by polling.
+ * @returns The batch response, with `audio_duration` under `metadata`.
+ */
+const convertToResponse = (
+  result: AssemblyAITranscriptResponse,
+): BatchResponse => {
+  const words: BatchWord[] = (result.words ?? []).map((w) => ({
+    word: w.text,
+    start: w.start / 1000,
+    end: w.end / 1000,
+    confidence: w.confidence,
+    speaker: speakerLabelToIndex(w.speaker),
+    punctuated_word: w.text,
+  }));
+
+  const alternatives: BatchAlternatives = {
+    transcript: result.text ?? "",
+    confidence: result.confidence ?? 1.0,
+    words,
+  };
+
+  const channel: BatchChannel = {
+    alternatives: [alternatives],
+  };
+
+  const results: BatchResults = {
+    channels: [channel],
+  };
+
+  return {
+    metadata: {
+      audio_duration: result.audio_duration,
+    },
+    results,
+  };
+};
+
+/**
+ * Runs the AssemblyAI batch flow: upload the audio, create a transcript
+ * job, poll it to completion, then convert the result.
+ *
+ * @param audioData - Raw audio bytes to transcribe.
+ * @param _contentType - Accepted for signature parity; not used here.
+ * @param params - Languages, keywords and model for the transcript request.
+ * @returns The transcript converted to the shared batch response.
+ */
+export const transcribeWithAssemblyAI = async (
+  audioData: ArrayBuffer,
+  _contentType: string,
+  params: BatchParams,
+): Promise<BatchResponse> => {
+  const hostedAudioUrl = await uploadAudio(audioData);
+  const jobId = await createTranscript(hostedAudioUrl, params);
+  const finishedTranscript = await pollTranscript(jobId);
+
+  return convertToResponse(finishedTranscript);
+};