1 | 1 | import { NextRequest, NextResponse } from 'next/server'; |
2 | 2 | import OpenAI from 'openai'; |
3 | 3 | import { SpeechCreateParams } from 'openai/resources/audio/speech.mjs'; |
4 | | -import { isKokoroModel, stripVoiceWeights } from '@/utils/voice'; |
| 4 | +import { isKokoroModel } from '@/utils/voice'; |
| 5 | +import { LRUCache } from 'lru-cache'; |
| 6 | +import { createHash } from 'crypto'; |
| 7 | + |
| 8 | +export const runtime = 'nodejs'; |
5 | 9 |
6 | 10 | type CustomVoice = string; |
7 | 11 | type ExtendedSpeechParams = Omit<SpeechCreateParams, 'voice'> & { |
8 | 12 | voice: SpeechCreateParams['voice'] | CustomVoice; |
9 | 13 | instructions?: string; |
10 | 14 | }; |
| 15 | +type AudioBufferValue = ArrayBuffer; |
| 16 | + |
| 17 | +const TTS_CACHE_MAX_SIZE_BYTES = Number(process.env.TTS_CACHE_MAX_SIZE_BYTES || 256 * 1024 * 1024); // 256MB |
| 18 | +const TTS_CACHE_TTL_MS = Number(process.env.TTS_CACHE_TTL_MS || 1000 * 60 * 30); // 30 minutes |
| 19 | + |
| 20 | +const ttsAudioCache = new LRUCache<string, AudioBufferValue>({ |
| 21 | + maxSize: TTS_CACHE_MAX_SIZE_BYTES, |
| 22 | + sizeCalculation: (value) => value.byteLength, |
| 23 | + ttl: TTS_CACHE_TTL_MS, |
| 24 | +}); |
| 25 | + |
| 26 | +function makeCacheKey(input: { |
| 27 | + provider: string; |
| 28 | + model: string | null | undefined; |
| 29 | + voice: string | undefined; |
| 30 | + speed: number; |
| 31 | + format: string; |
| 32 | + text: string; |
| 33 | + instructions?: string; |
| 34 | +}) { |
| 35 | + const canonical = { |
| 36 | + provider: input.provider, |
| 37 | + model: input.model || '', |
| 38 | + voice: input.voice || '', |
| 39 | + speed: input.speed, |
| 40 | + format: input.format, |
| 41 | + text: input.text, |
| 42 | + // Only include instructions when present (for models like gpt-4o-mini-tts) |
| 43 | + instructions: input.instructions || undefined, |
| 44 | + }; |
| 45 | + return createHash('sha256').update(JSON.stringify(canonical)).digest('hex'); |
| 46 | +} |
11 | 47 |
12 | 48 | export async function POST(req: NextRequest) { |
13 | 49 | try { |
14 | 50 | // Get API credentials from headers or fall back to environment variables |
15 | 51 | const openApiKey = req.headers.get('x-openai-key') || process.env.API_KEY || 'none'; |
16 | 52 | const openApiBaseUrl = req.headers.get('x-openai-base-url') || process.env.API_BASE; |
17 | 53 | const provider = req.headers.get('x-tts-provider') || 'openai'; |
18 | | - const { text, voice, speed, format, model, instructions } = await req.json(); |
19 | | - console.log('Received TTS request:', { provider, model, voice, speed, format, hasInstructions: Boolean(instructions) }); |
| 54 | + const { text, voice, speed, format, model: requestedModel, instructions } = await req.json(); |
| 55 | + console.log('Received TTS request:', { provider, requestedModel, voice, speed, format, hasInstructions: Boolean(instructions) }); |
20 | 56 |
21 | 57 | if (!text || !voice || !speed) { |
22 | 58 | return NextResponse.json({ error: 'Missing required parameters' }, { status: 400 }); |
23 | 59 | } |
24 | | - |
25 | | - // Apply Deepinfra defaults if provider is deepinfra |
26 | | - const finalModel = provider === 'deepinfra' && !model ? 'hexgrad/Kokoro-82M' : model; |
27 | | - const initialVoice = provider === 'deepinfra' && !voice ? 'af_bella' : voice; |
28 | | - |
29 | | - // For SDK providers (OpenAI/Deepinfra), preserve multi-voice for Kokoro models, otherwise normalize to first token |
30 | | - const isKokoro = isKokoroModel(finalModel); |
31 | | - let normalizedVoice = initialVoice; |
32 | | - if (!isKokoro && typeof normalizedVoice === 'string' && normalizedVoice.includes('+')) { |
33 | | - normalizedVoice = stripVoiceWeights(normalizedVoice.split('+')[0]); |
34 | | - console.log('Normalized multi-voice to single for non-Kokoro SDK provider:', normalizedVoice); |
35 | | - } |
| 60 | + // Use default Kokoro model for Deepinfra if none specified |
| 61 | + const model = provider === 'deepinfra' && !requestedModel ? 'hexgrad/Kokoro-82M' : requestedModel; |
36 | 62 |
37 | 63 | // Initialize OpenAI client with abort signal (OpenAI/deepinfra) |
38 | 64 | const openai = new OpenAI({ |
39 | 65 | apiKey: openApiKey, |
40 | 66 | baseURL: openApiBaseUrl, |
41 | 67 | }); |
42 | 68 |
43 | | - // Unified path: all providers (openai, deepinfra, custom-openai) go through the SDK below. |
44 | | - |
45 | | - // Request audio from OpenAI and pass along the abort signal |
| 69 | + const normalizedVoice = ( |
| 70 | + !isKokoroModel(model) && typeof voice === 'string' && voice.includes('+') |
| 71 | + ? voice.split('+')[0].trim() |
| 72 | + : voice |
| 73 | + ) as SpeechCreateParams['voice']; |
| 74 | + |
46 | 75 | const createParams: ExtendedSpeechParams = { |
47 | | - model: finalModel || 'tts-1', |
48 | | - voice: normalizedVoice as SpeechCreateParams['voice'], |
| 76 | + model: model || 'tts-1', |
| 77 | + voice: normalizedVoice, |
49 | 78 | input: text, |
50 | 79 | speed: speed, |
51 | 80 | response_format: format === 'aac' ? 'aac' : 'mp3', |
52 | 81 | }; |
53 | | - |
54 | 82 | // Only add instructions if model is gpt-4o-mini-tts and instructions are provided |
55 | | - if (finalModel === 'gpt-4o-mini-tts' && instructions) { |
| 83 | + if (model === 'gpt-4o-mini-tts' && instructions) { |
56 | 84 | createParams.instructions = instructions; |
57 | 85 | } |
58 | 86 |
| 87 | + // Compute cache key and check LRU before making provider call |
| 88 | + const contentType = format === 'aac' ? 'audio/aac' : 'audio/mpeg'; |
| 89 | + |
| 90 | + // Key on the voice actually sent to the provider (Kokoro multi-voice weights preserved) |
| 91 | + const voiceForKey = typeof createParams.voice === 'string' |
| 92 | + ? createParams.voice |
| 93 | + : String(createParams.voice); |
| 94 | + |
| 95 | + const cacheKey = makeCacheKey({ |
| 96 | + provider, |
| 97 | + model: createParams.model, |
| 98 | + voice: voiceForKey, |
| 99 | + speed: Number(createParams.speed), |
| 100 | + format: String(createParams.response_format), |
| 101 | + text, |
| 102 | + instructions: createParams.instructions, |
| 103 | + }); |
| 104 | + |
| 105 | + const cachedBuffer = ttsAudioCache.get(cacheKey); |
| 106 | + if (cachedBuffer) { |
| 107 | + console.log('TTS cache HIT for key:', cacheKey.slice(0, 8)); |
| 108 | + return new NextResponse(cachedBuffer, { |
| 109 | + headers: { |
| 110 | + 'Content-Type': contentType, |
| 111 | + 'X-Cache': 'HIT', |
| 112 | + } |
| 113 | + }); |
| 114 | + } |
| 115 | + |
59 | 116 | const response = await openai.audio.speech.create(createParams as SpeechCreateParams, { signal: req.signal }); |
60 | 117 |
61 | 118 | // Read the audio data as an ArrayBuffer and return it with appropriate headers |
62 | 119 | // This will also be aborted if the client cancels |
63 | 120 | const buffer = await response.arrayBuffer(); |
64 | | - const contentType = format === 'aac' ? 'audio/aac' : 'audio/mpeg'; |
| 121 | + |
| 122 | + // Save to cache |
| 123 | + ttsAudioCache.set(cacheKey, buffer); |
| 124 | + |
65 | 125 | return new NextResponse(buffer, { |
66 | 126 | headers: { |
67 | | - 'Content-Type': contentType |
| 127 | + 'Content-Type': contentType, |
| 128 | + 'X-Cache': 'MISS' |
68 | 129 | } |
69 | 130 | }); |
70 | 131 | } catch (error) { |
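
For reference, a minimal client-side sketch of exercising this route. The `/api/tts` path, the placeholder key, and the `requestSpeech` helper are assumptions for illustration; the headers, JSON body fields, and `X-Cache` response header mirror what the handler above reads and sets.

```ts
// Hypothetical usage sketch: the /api/tts path and the placeholder key are
// assumptions; the headers and body fields mirror the handler above.
async function requestSpeech(text: string): Promise<Blob> {
  const res = await fetch('/api/tts', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'x-openai-key': 'sk-...',   // omit to fall back to the server's API_KEY
      'x-tts-provider': 'openai', // or 'deepinfra' / 'custom-openai'
    },
    body: JSON.stringify({ text, voice: 'alloy', speed: 1, format: 'mp3' }),
  });
  if (!res.ok) throw new Error(`TTS request failed: ${res.status}`);
  // 'MISS' on the first call; an identical request within the TTL should
  // return 'HIT' and skip the provider round-trip entirely.
  console.log('X-Cache:', res.headers.get('X-Cache'));
  return res.blob();
}
```

A repeat request only hits the cache when every keyed field matches (provider, model, voice, speed, format, text, and instructions), since the key is a SHA-256 hash over that canonical tuple.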