Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions apps/api/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ app.use("/webhook/stripe", verifyStripeWebhook);

// Outside of local development, every request to the speech-to-text
// endpoints passes through `loadTestOverride` first and then
// `requireSupabaseAuth`. In development neither middleware is registered for
// these paths.
// NOTE(review): `loadTestOverride` is defined elsewhere; presumably it lets
// load-test traffic skip auth — confirm against its implementation.
if (env.NODE_ENV !== "development") {
app.use("/listen", loadTestOverride, requireSupabaseAuth);
app.use("/transcribe", loadTestOverride, requireSupabaseAuth);
}

// Mount the application router at the root path.
app.route("/", routes);
Expand Down
151 changes: 151 additions & 0 deletions apps/api/src/routes.ts
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,39 @@ const WebSocketErrorSchema = z.object({
detail: z.string().optional(),
});

// One transcribed word inside a batch transcription response.
// NOTE(review): `start`/`end` are presumably offsets in seconds — confirm
// against the provider adapters.
const BatchWordSchema = z.object({
word: z.string(),
start: z.number(),
end: z.number(),
confidence: z.number(),
// Speaker index; may be omitted or null.
speaker: z.number().nullable().optional(),
// Word with punctuation/casing applied; may be omitted or null.
punctuated_word: z.string().nullable().optional(),
});

// One transcription hypothesis: full transcript text, its confidence and the
// word-level breakdown.
const BatchAlternativesSchema = z.object({
transcript: z.string(),
confidence: z.number(),
words: z.array(BatchWordSchema),
});

// One audio channel, holding its list of alternatives.
const BatchChannelSchema = z.object({
alternatives: z.array(BatchAlternativesSchema),
});

// Container for the per-channel results.
const BatchResultsSchema = z.object({
channels: z.array(BatchChannelSchema),
});

// Success body of POST /transcribe as advertised in the OpenAPI description.
// `metadata` is intentionally left untyped (`unknown`).
const BatchResponseSchema = z.object({
metadata: z.unknown(),
results: BatchResultsSchema,
});

// Error body of POST /transcribe: a machine-readable `error` code plus an
// optional human-readable `detail`.
const BatchErrorSchema = z.object({
error: z.string(),
detail: z.string().optional(),
});

// Router on which the HTTP routes of this module are registered.
export const routes = new Hono<AppBindings>();

routes.get(
Expand Down Expand Up @@ -336,3 +369,121 @@ routes.get(
return listenSocketHandler(c, next);
},
);

/**
 * POST /transcribe — batch speech-to-text.
 *
 * The raw request body is the audio payload. Query parameters:
 *   - `provider`: one of `deepgram` (default), `assemblyai`, `soniox`
 *   - `language`: repeatable language hint
 *   - `keyword`:  repeatable keyword hint
 *   - `model`:    optional provider-specific model name
 *
 * Responses:
 *   - 200 with the transcription result
 *   - 400 when `provider` is not supported or the body is empty
 *   - 502 when the thrown error message contains "failed:" (the marker used
 *     for upstream failures), 500 for any other error
 */
routes.post(
  "/transcribe",
  describeRoute({
    tags: [API_TAGS.APP],
    summary: "Batch speech-to-text transcription",
    description:
      "HTTP endpoint for batch speech-to-text transcription via file upload. Supports Deepgram, AssemblyAI, and Soniox providers. Use query parameter ?provider=deepgram|assemblyai|soniox to select provider. Requires Supabase authentication.",
    security: [{ Bearer: [] }],
    responses: {
      200: {
        description: "Transcription completed successfully",
        content: {
          "application/json": {
            schema: resolver(BatchResponseSchema),
          },
        },
      },
      400: {
        description:
          "Bad request - unsupported provider or missing/invalid audio file",
        content: {
          "application/json": {
            schema: resolver(BatchErrorSchema),
          },
        },
      },
      401: {
        description: "Unauthorized - missing or invalid authentication",
        content: {
          "text/plain": {
            schema: { type: "string", example: "unauthorized" },
          },
        },
      },
      500: {
        description: "Internal server error during transcription",
        content: {
          "application/json": {
            schema: resolver(BatchErrorSchema),
          },
        },
      },
      502: {
        description: "Upstream STT service error",
        content: {
          "application/json": {
            schema: resolver(BatchErrorSchema),
          },
        },
      },
    },
  }),
  async (c) => {
    const { transcribeBatch } = await import("./stt");
    const BATCH_PROVIDERS = ["deepgram", "assemblyai", "soniox"] as const;
    type BatchProvider = (typeof BATCH_PROVIDERS)[number];

    const clientUrl = new URL(c.req.url, "http://localhost");

    // The provider comes straight from the query string, so it must be
    // validated rather than cast: it is used in the span name, in metric tags
    // and as the dispatch key for the upstream call.
    const requestedProvider =
      clientUrl.searchParams.get("provider") ?? "deepgram";
    const provider: BatchProvider | undefined = BATCH_PROVIDERS.find(
      (candidate) => candidate === requestedProvider,
    );

    if (provider === undefined) {
      return c.json(
        {
          error: "invalid_provider",
          detail: `Unsupported provider "${requestedProvider}". Expected one of: ${BATCH_PROVIDERS.join(", ")}`,
        },
        400,
      );
    }

    const languages = clientUrl.searchParams.getAll("language");
    const keywords = clientUrl.searchParams.getAll("keyword");
    const model = clientUrl.searchParams.get("model") ?? undefined;

    const contentType =
      c.req.header("content-type") ?? "application/octet-stream";

    return Sentry.startSpan(
      { op: "http.client", name: `stt.batch.${provider}` },
      async (span) => {
        const startTime = performance.now();

        try {
          const audioData = await c.req.arrayBuffer();

          if (!audioData || audioData.byteLength === 0) {
            // Record the status on the span like every other exit path does.
            span.setAttribute("http.status_code", 400);
            return c.json(
              { error: "missing_audio_data", detail: "Request body is empty" },
              400,
            );
          }

          span.setAttribute("stt.provider", provider);
          span.setAttribute("stt.audio_size", audioData.byteLength);

          const response = await transcribeBatch(
            provider,
            audioData,
            contentType,
            { languages, keywords, model },
          );

          Metrics.upstreamLatency(provider, performance.now() - startTime);
          span.setAttribute("http.status_code", 200);

          return c.json(response, 200);
        } catch (error) {
          Metrics.upstreamLatency(provider, performance.now() - startTime);

          const errorMessage =
            error instanceof Error ? error.message : "unknown error";
          // Errors whose message contains "failed:" are treated as upstream
          // failures and mapped to 502; everything else is a 500.
          const isUpstreamError = errorMessage.includes("failed:");

          Sentry.captureException(error, {
            tags: { provider, operation: "batch_transcribe" },
          });

          span.setAttribute("http.status_code", isUpstreamError ? 502 : 500);

          return c.json(
            { error: "transcription_failed", detail: errorMessage },
            isUpstreamError ? 502 : 500,
          );
        }
      },
    );
  },
);
192 changes: 192 additions & 0 deletions apps/api/src/stt/batch-assemblyai.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
import { env } from "../env";
import type {
BatchAlternatives,
BatchChannel,
BatchParams,
BatchResponse,
BatchResults,
BatchWord,
} from "./batch-types";

// Base URL of the AssemblyAI v2 REST API; endpoint paths are appended to it.
const ASSEMBLYAI_API_URL = "https://api.assemblyai.com/v2";
// Delay between two consecutive transcript status polls.
const POLL_INTERVAL_MS = 3000;
// Upper bound on status polls before giving up. Together with the interval
// this allows roughly 200 * 3 s = 10 minutes of waiting.
const MAX_POLL_ATTEMPTS = 200;

// A single word as it appears in an AssemblyAI transcript.
// `start`/`end` are divided by 1000 when converted to the batch response, so
// they are presumably milliseconds — confirm against the AssemblyAI API docs.
type AssemblyAIWord = {
text: string;
start: number;
end: number;
confidence: number;
// Speaker label as returned by AssemblyAI; absent when not available.
speaker?: string;
};

// Subset of the AssemblyAI transcript resource that this module reads.
type AssemblyAITranscriptResponse = {
id: string;
// The polling code reacts to "completed" and "error"; any other value is
// treated as "still in progress".
status: string;
text?: string;
words?: AssemblyAIWord[];
confidence?: number;
audio_duration?: number;
// Failure description; read when `status` is "error".
error?: string;
};

/**
 * Uploads raw audio bytes to AssemblyAI.
 *
 * @param audioData - The audio payload, sent as `application/octet-stream`.
 * @returns The `upload_url` reported by AssemblyAI for the uploaded audio.
 * @throws Error when AssemblyAI answers with a non-2xx status; the message
 *   carries the status code and the response body.
 */
const uploadAudio = async (audioData: ArrayBuffer): Promise<string> => {
  const endpoint = `${ASSEMBLYAI_API_URL}/upload`;

  const res = await fetch(endpoint, {
    method: "POST",
    headers: {
      Authorization: env.ASSEMBLYAI_API_KEY,
      "Content-Type": "application/octet-stream",
    },
    body: audioData,
  });

  if (res.ok) {
    const { upload_url: uploadedUrl } = (await res.json()) as {
      upload_url: string;
    };
    return uploadedUrl;
  }

  const failureBody = await res.text();
  throw new Error(`AssemblyAI upload failed: ${res.status} - ${failureBody}`);
};

/**
 * Creates an AssemblyAI transcription job for previously uploaded audio.
 *
 * Language handling: exactly one requested language is forwarded as
 * `language_code`; zero or several languages enable `language_detection`
 * instead. Keywords and model are forwarded only when provided.
 *
 * @param audioUrl - URL of the audio to transcribe.
 * @param params - Optional language, keyword and model hints.
 * @returns The id of the created transcript.
 * @throws Error when AssemblyAI answers with a non-2xx status.
 */
const createTranscript = async (
  audioUrl: string,
  params: BatchParams,
): Promise<string> => {
  const requestedLanguages = params.languages ?? [];
  const singleLanguage =
    requestedLanguages.length === 1 ? requestedLanguages[0] : undefined;
  const wantsDetection = requestedLanguages.length !== 1;
  const hasKeywords = (params.keywords?.length ?? 0) > 0;

  // Optional fields are spread in conditionally; key order matches the order
  // in which they are serialized into the request.
  const payload: Record<string, unknown> = {
    audio_url: audioUrl,
    speaker_labels: true,
    ...(singleLanguage ? { language_code: singleLanguage } : {}),
    ...(wantsDetection ? { language_detection: true } : {}),
    ...(hasKeywords ? { keyterms_prompt: params.keywords } : {}),
    ...(params.model ? { speech_model: params.model } : {}),
  };

  const res = await fetch(`${ASSEMBLYAI_API_URL}/transcript`, {
    method: "POST",
    headers: {
      Authorization: env.ASSEMBLYAI_API_KEY,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(payload),
  });

  if (res.ok) {
    const { id: transcriptId } = (await res.json()) as { id: string };
    return transcriptId;
  }

  const failureBody = await res.text();
  throw new Error(
    `AssemblyAI transcript creation failed: ${res.status} - ${failureBody}`,
  );
};

/**
 * Polls an AssemblyAI transcript until it reaches a terminal state.
 *
 * Waits `POLL_INTERVAL_MS` after every non-terminal poll and gives up after
 * `MAX_POLL_ATTEMPTS` polls.
 *
 * @param transcriptId - Id of the transcript to poll.
 * @returns The transcript resource once its status is "completed".
 * @throws Error when a poll request fails, when the transcript reports
 *   status "error", or when the attempt budget is exhausted.
 */
const pollTranscript = async (
  transcriptId: string,
): Promise<AssemblyAITranscriptResponse> => {
  const statusUrl = `${ASSEMBLYAI_API_URL}/transcript/${transcriptId}`;
  let remainingAttempts = MAX_POLL_ATTEMPTS;

  while (remainingAttempts > 0) {
    remainingAttempts -= 1;

    const res = await fetch(statusUrl, {
      headers: {
        Authorization: env.ASSEMBLYAI_API_KEY,
      },
    });

    if (!res.ok) {
      const failureBody = await res.text();
      throw new Error(`AssemblyAI poll failed: ${res.status} - ${failureBody}`);
    }

    const transcript = (await res.json()) as AssemblyAITranscriptResponse;

    switch (transcript.status) {
      case "completed":
        return transcript;
      case "error":
        throw new Error(
          `AssemblyAI transcription failed: ${transcript.error ?? "unknown error"}`,
        );
      default:
        // Still queued or processing: pause before asking again.
        await new Promise((wake) => setTimeout(wake, POLL_INTERVAL_MS));
    }
  }

  throw new Error("AssemblyAI transcription timed out");
};

/**
 * Converts an AssemblyAI speaker label into a numeric speaker index.
 *
 * AssemblyAI's diarization labels speakers with letters ("A", "B", ...).
 * Stripping non-digits from such a label leaves an empty string, so letter
 * labels are mapped to zero-based indices instead ("A" -> 0, "B" -> 1,
 * "Z" -> 25, "AA" -> 26). Labels that contain digits keep the previous
 * behaviour: the digits are parsed as a base-10 integer.
 *
 * @returns The speaker index, or `undefined` when the label is missing or
 *   contains neither digits nor ASCII letters.
 */
const parseAssemblyAISpeaker = (
  label: string | undefined,
): number | undefined => {
  if (!label) {
    return undefined;
  }

  const digits = label.replace(/\D/g, "");
  if (digits !== "") {
    return parseInt(digits, 10);
  }

  const letters = label.toUpperCase().replace(/[^A-Z]/g, "");
  if (letters === "") {
    return undefined;
  }

  // Bijective base-26: A=1 ... Z=26, shifted to zero-based at the end.
  let oneBased = 0;
  for (const letter of letters) {
    oneBased = oneBased * 26 + (letter.charCodeAt(0) - 64);
  }
  return oneBased - 1;
};

/**
 * Maps a completed AssemblyAI transcript onto the provider-neutral batch
 * response shape (a single channel with a single alternative).
 *
 * - Word `start`/`end` are divided by 1000.
 * - `punctuated_word` mirrors the word text.
 * - A missing transcript text becomes "", a missing confidence becomes 1.0.
 *
 * @param result - Transcript resource returned by AssemblyAI.
 * @returns The batch response with `audio_duration` in `metadata`.
 */
const convertToResponse = (
  result: AssemblyAITranscriptResponse,
): BatchResponse => {
  const words: BatchWord[] = (result.words ?? []).map((w) => ({
    word: w.text,
    start: w.start / 1000,
    end: w.end / 1000,
    confidence: w.confidence,
    speaker: parseAssemblyAISpeaker(w.speaker),
    punctuated_word: w.text,
  }));

  const alternatives: BatchAlternatives = {
    transcript: result.text ?? "",
    confidence: result.confidence ?? 1.0,
    words,
  };

  const channel: BatchChannel = {
    alternatives: [alternatives],
  };

  const results: BatchResults = {
    channels: [channel],
  };

  return {
    metadata: {
      audio_duration: result.audio_duration,
    },
    results,
  };
};

/**
 * Runs a full batch transcription through AssemblyAI: upload the audio,
 * create a transcript job, wait for it to finish and convert the result.
 *
 * @param audioData - Raw audio bytes.
 * @param _contentType - Unused; the upload is always sent as octet-stream.
 * @param params - Optional language, keyword and model hints.
 * @returns The transcription in the provider-neutral batch response shape.
 * @throws Error when any of the upload, create or poll steps fails.
 */
export const transcribeWithAssemblyAI = async (
  audioData: ArrayBuffer,
  _contentType: string,
  params: BatchParams,
): Promise<BatchResponse> => {
  const audioUrl = await uploadAudio(audioData);
  const jobId = await createTranscript(audioUrl, params);
  const finished = await pollTranscript(jobId);

  return convertToResponse(finished);
};
Loading