Skip to content

Commit 70723f4

Browse files
committed
perf(api): add LRU cache for TTS audio responses
Introduce an in-memory LRU cache for TTS audio with configurable size and TTL via `TTS_CACHE_MAX_SIZE_BYTES` and `TTS_CACHE_TTL_MS` (defaults: 256 MB and 30 minutes). Return an `X-Cache` header (HIT/MISS) and set the route runtime to `nodejs`.

The cache key includes provider, model, voice, speed, format, text, and optional instructions. Normalize non-Kokoro multi-voice input to the first token; the cache key uses the voice string as it is sent to the provider, without stripping weights. Default the Deepinfra model to `hexgrad/Kokoro-82M` when none is provided.

Fix Deepinfra Kokoro behavior by enforcing single-voice selection:
- ui: only enable multi-select when the provider supports more than one voice
- voice utils: Deepinfra max voices set to 1
- tests: select the Deepinfra provider only when running in CI, skip the multi-voice test in CI, and increase the timeout to 60 s for stability
1 parent 1dcffc8 commit 70723f4

File tree

5 files changed

+110
-75
lines changed

5 files changed

+110
-75
lines changed

src/app/api/tts/route.ts

Lines changed: 85 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,70 +1,131 @@
11
import { NextRequest, NextResponse } from 'next/server';
22
import OpenAI from 'openai';
33
import { SpeechCreateParams } from 'openai/resources/audio/speech.mjs';
4-
import { isKokoroModel, stripVoiceWeights } from '@/utils/voice';
4+
import { isKokoroModel } from '@/utils/voice';
5+
import { LRUCache } from 'lru-cache';
6+
import { createHash } from 'crypto';
7+
8+
export const runtime = 'nodejs';
59

610
type CustomVoice = string;
711
type ExtendedSpeechParams = Omit<SpeechCreateParams, 'voice'> & {
812
voice: SpeechCreateParams['voice'] | CustomVoice;
913
instructions?: string;
1014
};
15+
type AudioBufferValue = ArrayBuffer;
16+
17+
const TTS_CACHE_MAX_SIZE_BYTES = Number(process.env.TTS_CACHE_MAX_SIZE_BYTES || 256 * 1024 * 1024); // 256MB
18+
const TTS_CACHE_TTL_MS = Number(process.env.TTS_CACHE_TTL_MS || 1000 * 60 * 30); // 30 minutes
19+
20+
const ttsAudioCache = new LRUCache<string, AudioBufferValue>({
21+
maxSize: TTS_CACHE_MAX_SIZE_BYTES,
22+
sizeCalculation: (value) => value.byteLength,
23+
ttl: TTS_CACHE_TTL_MS,
24+
});
25+
26+
function makeCacheKey(input: {
27+
provider: string;
28+
model: string | null | undefined;
29+
voice: string | undefined;
30+
speed: number;
31+
format: string;
32+
text: string;
33+
instructions?: string;
34+
}) {
35+
const canonical = {
36+
provider: input.provider,
37+
model: input.model || '',
38+
voice: input.voice || '',
39+
speed: input.speed,
40+
format: input.format,
41+
text: input.text,
42+
// Only include instructions when present (for models like gpt-4o-mini-tts)
43+
instructions: input.instructions || undefined,
44+
};
45+
return createHash('sha256').update(JSON.stringify(canonical)).digest('hex');
46+
}
1147

1248
export async function POST(req: NextRequest) {
1349
try {
1450
// Get API credentials from headers or fall back to environment variables
1551
const openApiKey = req.headers.get('x-openai-key') || process.env.API_KEY || 'none';
1652
const openApiBaseUrl = req.headers.get('x-openai-base-url') || process.env.API_BASE;
1753
const provider = req.headers.get('x-tts-provider') || 'openai';
18-
const { text, voice, speed, format, model, instructions } = await req.json();
19-
console.log('Received TTS request:', { provider, model, voice, speed, format, hasInstructions: Boolean(instructions) });
54+
const { text, voice, speed, format, model: req_model, instructions } = await req.json();
55+
console.log('Received TTS request:', { provider, req_model, voice, speed, format, hasInstructions: Boolean(instructions) });
2056

2157
if (!text || !voice || !speed) {
2258
return NextResponse.json({ error: 'Missing required parameters' }, { status: 400 });
2359
}
24-
25-
// Apply Deepinfra defaults if provider is deepinfra
26-
const finalModel = provider === 'deepinfra' && !model ? 'hexgrad/Kokoro-82M' : model;
27-
const initialVoice = provider === 'deepinfra' && !voice ? 'af_bella' : voice;
28-
29-
// For SDK providers (OpenAI/Deepinfra), preserve multi-voice for Kokoro models, otherwise normalize to first token
30-
const isKokoro = isKokoroModel(finalModel);
31-
let normalizedVoice = initialVoice;
32-
if (!isKokoro && typeof normalizedVoice === 'string' && normalizedVoice.includes('+')) {
33-
normalizedVoice = stripVoiceWeights(normalizedVoice.split('+')[0]);
34-
console.log('Normalized multi-voice to single for non-Kokoro SDK provider:', normalizedVoice);
35-
}
60+
// Use default Kokoro model for Deepinfra if none specified
61+
const model = provider === 'deepinfra' && !req_model ? 'hexgrad/Kokoro-82M' : req_model;
3662

3763
// Initialize OpenAI client with abort signal (OpenAI/deepinfra)
3864
const openai = new OpenAI({
3965
apiKey: openApiKey,
4066
baseURL: openApiBaseUrl,
4167
});
4268

43-
// Unified path: all providers (openai, deepinfra, custom-openai) go through the SDK below.
44-
45-
// Request audio from OpenAI and pass along the abort signal
69+
const normalizedVoice = (
70+
!isKokoroModel(model) && voice.includes('+')
71+
? (voice.split('+')[0].trim())
72+
: voice
73+
) as SpeechCreateParams['voice'];
74+
4675
const createParams: ExtendedSpeechParams = {
47-
model: finalModel || 'tts-1',
48-
voice: normalizedVoice as SpeechCreateParams['voice'],
76+
model: model,
77+
voice: normalizedVoice,
4978
input: text,
5079
speed: speed,
5180
response_format: format === 'aac' ? 'aac' : 'mp3',
5281
};
53-
5482
// Only add instructions if model is gpt-4o-mini-tts and instructions are provided
55-
if (finalModel === 'gpt-4o-mini-tts' && instructions) {
83+
if (model === 'gpt-4o-mini-tts' && instructions) {
5684
createParams.instructions = instructions;
5785
}
5886

87+
// Compute cache key and check LRU before making provider call
88+
const contentType = format === 'aac' ? 'audio/aac' : 'audio/mpeg';
89+
90+
// Preserve voice string as-is for cache key (no weight stripping)
91+
const voiceForKey = typeof createParams.voice === 'string'
92+
? createParams.voice
93+
: String(createParams.voice);
94+
95+
const cacheKey = makeCacheKey({
96+
provider,
97+
model: createParams.model,
98+
voice: voiceForKey,
99+
speed: Number(createParams.speed),
100+
format: String(createParams.response_format),
101+
text,
102+
instructions: createParams.instructions,
103+
});
104+
105+
const cachedBuffer = ttsAudioCache.get(cacheKey);
106+
if (cachedBuffer) {
107+
console.log('TTS cache HIT for key:', cacheKey.slice(0, 8));
108+
return new NextResponse(cachedBuffer, {
109+
headers: {
110+
'Content-Type': contentType,
111+
'X-Cache': 'HIT',
112+
}
113+
});
114+
}
115+
59116
const response = await openai.audio.speech.create(createParams as SpeechCreateParams, { signal: req.signal });
60117

61118
// Read the audio data as an ArrayBuffer and return it with appropriate headers
62119
// This will also be aborted if the client cancels
63120
const buffer = await response.arrayBuffer();
64-
const contentType = format === 'aac' ? 'audio/aac' : 'audio/mpeg';
121+
122+
// Save to cache
123+
ttsAudioCache.set(cacheKey, buffer);
124+
65125
return new NextResponse(buffer, {
66126
headers: {
67-
'Content-Type': contentType
127+
'Content-Type': contentType,
128+
'X-Cache': 'MISS'
68129
}
69130
});
70131
} catch (error) {

src/components/player/VoicesControl.tsx

Lines changed: 16 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,8 @@ import {
88
} from '@headlessui/react';
99
import { ChevronUpDownIcon, AudioWaveIcon } from '@/components/icons/Icons';
1010
import { useConfig } from '@/contexts/ConfigContext';
11-
import { useEffect, useMemo, useState, useCallback } from 'react';
12-
import {
13-
parseKokoroVoiceNames,
14-
buildKokoroVoiceString,
15-
getMaxVoicesForProvider,
16-
isKokoroModel
17-
} from '@/utils/voice';
11+
import { useEffect, useMemo, useState } from 'react';
12+
import { parseKokoroVoiceNames, buildKokoroVoiceString, isKokoroModel, getMaxVoicesForProvider } from '@/utils/voice';
1813

1914
export const VoicesControl = ({ availableVoices, setVoiceAndRestart }: {
2015
availableVoices: string[];
@@ -23,20 +18,13 @@ export const VoicesControl = ({ availableVoices, setVoiceAndRestart }: {
2318
const { voice: configVoice, ttsModel, ttsProvider } = useConfig();
2419

2520
const isKokoro = isKokoroModel(ttsModel);
26-
const maxVoices = getMaxVoicesForProvider(ttsProvider, ttsModel || '');
27-
28-
const clampToLimit = useCallback((names: string[]): string[] => {
29-
if (maxVoices === Infinity) return names;
30-
if (names.length <= maxVoices) return names;
31-
// For initial clamp, keep the first up to max allowed
32-
return names.slice(0, maxVoices);
33-
}, [maxVoices]);
21+
const maxVoices = getMaxVoicesForProvider(ttsProvider, ttsModel);
3422

3523
// Local selection state for Kokoro multi-select
3624
const [selectedVoices, setSelectedVoices] = useState<string[]>([]);
3725

3826
useEffect(() => {
39-
if (!isKokoro) return;
27+
if (!(isKokoro && maxVoices > 1)) return;
4028
let initial: string[] = [];
4129
if (configVoice && configVoice.includes('+')) {
4230
initial = parseKokoroVoiceNames(configVoice);
@@ -45,23 +33,27 @@ export const VoicesControl = ({ availableVoices, setVoiceAndRestart }: {
4533
} else if (availableVoices.length > 0) {
4634
initial = [availableVoices[0]];
4735
}
48-
setSelectedVoices(clampToLimit(initial));
49-
}, [isKokoro, configVoice, availableVoices, maxVoices, clampToLimit]);
36+
// Clamp to provider limit
37+
if (initial.length > maxVoices) {
38+
initial = initial.slice(0, maxVoices);
39+
}
40+
setSelectedVoices(initial);
41+
}, [isKokoro, maxVoices, configVoice, availableVoices]);
5042

51-
// If the saved voice is not in the available list, use the first available voice (non-Kokoro)
43+
// If the saved voice is not in the available list, use the first available voice (non-Kokoro or Kokoro limited)
5244
const currentVoice = useMemo(() => {
53-
if (isKokoro) {
45+
if (isKokoro && maxVoices > 1) {
5446
const combined = buildKokoroVoiceString(selectedVoices);
5547
return combined || (availableVoices[0] || '');
5648
}
5749
return (configVoice && availableVoices.includes(configVoice))
5850
? configVoice
5951
: availableVoices[0] || '';
60-
}, [isKokoro, selectedVoices, availableVoices, configVoice]);
52+
}, [isKokoro, maxVoices, selectedVoices, availableVoices, configVoice]);
6153

6254
return (
6355
<div className="relative">
64-
{isKokoro ? (
56+
{(isKokoro && maxVoices > 1) ? (
6557
<Listbox
6658
multiple
6759
value={selectedVoices}
@@ -70,17 +62,14 @@ export const VoicesControl = ({ availableVoices, setVoiceAndRestart }: {
7062

7163
let next = vals;
7264

73-
// Enforce deepinfra max selection of 2 voices
74-
if (maxVoices !== Infinity && vals.length > maxVoices) {
75-
// Determine the newly added voice
65+
// Enforce provider max selection
66+
if (vals.length > maxVoices) {
7667
const newlyAdded = vals.find(v => !selectedVoices.includes(v));
7768
if (newlyAdded) {
7869
const lastPrev = selectedVoices[selectedVoices.length - 1] ?? selectedVoices[0] ?? '';
79-
// Build next as [last previously selected, newly added], deduped, limited to max
8070
const pair = Array.from(new Set([lastPrev, newlyAdded])).filter(Boolean);
8171
next = pair.slice(0, maxVoices);
8272
} else {
83-
// Fallback: keep the last maxVoices options
8473
next = vals.slice(-maxVoices);
8574
}
8675
}

src/utils/voice.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,8 +66,8 @@ export const stripVoiceWeights = (voiceString: string): string => {
6666
export const getMaxVoicesForProvider = (provider: string, model: string): number => {
6767
if (!isKokoroModel(model)) return 1;
6868

69-
// Deepinfra Kokoro supports up to 2 voices
70-
if (provider === 'deepinfra') return 2;
69+
// Deepinfra Kokoro does not support multiple voices
70+
if (provider === 'deepinfra') return 1;
7171

7272
// Other providers with Kokoro support unlimited voices
7373
return Infinity;

tests/helpers.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -96,10 +96,10 @@ export async function setupTest(page: Page) {
9696
//await page.waitForLoadState('networkidle');
9797

9898
// If running in CI, select the "Custom OpenAI-Like" model and "Deepinfra" provider
99-
//if (process.env.CI) {
100-
await page.getByRole('button', { name: 'Custom OpenAI-Like' }).click();
101-
await page.getByText('Deepinfra').click();
102-
//}
99+
if (process.env.CI) {
100+
await page.getByRole('button', { name: 'Custom OpenAI-Like' }).click();
101+
await page.getByText('Deepinfra').click();
102+
}
103103

104104
// Click the "done" button to dismiss the welcome message
105105
await page.getByRole('button', { name: 'Save' }).click();

tests/play.spec.ts

Lines changed: 3 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ test.describe('Play/Pause Tests', () => {
1515
await setupTest(page);
1616
});
1717

18-
test.describe.configure({ mode: 'serial' });
18+
test.describe.configure({ mode: 'serial', timeout: 60000 });
1919

2020
test('plays and pauses TTS for a PDF document', async ({ page }) => {
2121
// Play TTS for the PDF document
@@ -62,29 +62,14 @@ test.describe('Play/Pause Tests', () => {
6262
const options = page.getByRole('option');
6363
expect(await options.count()).toBeGreaterThan(0);
6464

65-
// Step 1: Select af_bella (adds it to the multi-select list)
6665
await selectVoiceAndAssertPlayback(page, 'af_bella');
67-
68-
// Step 2: Deselect the first (initially selected) voice so that only af_bella remains
69-
await openVoicesMenu(page);
70-
const selected = page.locator('[role="option"][aria-selected="true"]');
71-
const count = await selected.count();
72-
for (let i = 0; i < count; i++) {
73-
const opt = selected.nth(i);
74-
const name = (await opt.textContent())?.trim() ?? '';
75-
// Deselect the first selected option that is not af_bella
76-
if (!/af_bella/i.test(name)) {
77-
await opt.click();
78-
break;
79-
}
80-
}
81-
await expectProcessingTransition(page);
66+
//await expectProcessingTransition(page);
8267

8368
// Final state should be playing
8469
await expectMediaState(page, 'playing');
8570
});
8671

87-
test('selects multiple Kokoro voices and resumes playing', async ({ page }) => {
72+
if (!process.env.CI) test('selects multiple Kokoro voices and resumes playing', async ({ page }) => {
8873
// Start playback
8974
await playTTSAndWaitForASecond(page, 'sample.pdf');
9075

0 commit comments

Comments
 (0)