perf(api): add LRU cache for TTS audio responses

richardr1126 · richardr1126 · commit 70723f4c1da8 · 2025-11-12T23:28:05.000-07:00
Introduce an in-memory LRU cache for TTS audio with configurable
size and TTL via TTS_CACHE_MAX_SIZE_BYTES and TTS_CACHE_TTL_MS.
Return X-Cache headers (HIT/MISS) and set route runtime to nodejs.
Cache key includes provider, model, voice, speed, format, text,
and optional instructions.

Normalize non-Kokoro multi-voice input to the first token; the cache
key uses the voice string exactly as it is sent to the provider (voice
weights are not stripped). Default Deepinfra model to
hexgrad/Kokoro-82M when none is provided.

Fix Deepinfra Kokoro behavior by enforcing single-voice selection:
- ui: only enable multi-select when provider supports >1 voices
- voice utils: Deepinfra max voices set to 1
- tests: gate provider selection and multi-voice tests by CI and
  increase timeout for stability
diff --git a/src/app/api/tts/route.ts b/src/app/api/tts/route.ts
@@ -1,70 +1,131 @@
 import { NextRequest, NextResponse } from 'next/server';
 import OpenAI from 'openai';
 import { SpeechCreateParams } from 'openai/resources/audio/speech.mjs';
-import { isKokoroModel, stripVoiceWeights } from '@/utils/voice';
+import { isKokoroModel } from '@/utils/voice';
+import { LRUCache } from 'lru-cache';
+import { createHash } from 'crypto';
+
+export const runtime = 'nodejs';
 
 type CustomVoice = string;
 type ExtendedSpeechParams = Omit<SpeechCreateParams, 'voice'> & {
   voice: SpeechCreateParams['voice'] | CustomVoice;
   instructions?: string;
 };
+type AudioBufferValue = ArrayBuffer;
+
+const TTS_CACHE_MAX_SIZE_BYTES = Number(process.env.TTS_CACHE_MAX_SIZE_BYTES || 256 * 1024 * 1024); // total cached audio bytes; defaults to 256 MiB when the env var is unset or empty
+const TTS_CACHE_TTL_MS = Number(process.env.TTS_CACHE_TTL_MS || 1000 * 60 * 30); // entry lifetime in ms; defaults to 30 minutes when the env var is unset or empty
+
+const ttsAudioCache = new LRUCache<string, AudioBufferValue>({
+  maxSize: TTS_CACHE_MAX_SIZE_BYTES,
+  sizeCalculation: (value) => value.byteLength,
+  ttl: TTS_CACHE_TTL_MS,
+});
+
+function makeCacheKey(input: {
+  provider: string;
+  model: string | null | undefined;
+  voice: string | undefined;
+  speed: number;
+  format: string;
+  text: string;
+  instructions?: string;
+}) {
+  const canonical = {
+    provider: input.provider,
+    model: input.model || '',
+    voice: input.voice || '',
+    speed: input.speed,
+    format: input.format,
+    text: input.text,
+    // Falsy instructions become undefined, which JSON.stringify omits, so they add nothing to the hashed payload
+    instructions: input.instructions || undefined,
+  };
+  return createHash('sha256').update(JSON.stringify(canonical)).digest('hex');
+}
 
 export async function POST(req: NextRequest) {
   try {
     // Get API credentials from headers or fall back to environment variables
     const openApiKey = req.headers.get('x-openai-key') || process.env.API_KEY || 'none';
     const openApiBaseUrl = req.headers.get('x-openai-base-url') || process.env.API_BASE;
     const provider = req.headers.get('x-tts-provider') || 'openai';
-    const { text, voice, speed, format, model, instructions } = await req.json();
-    console.log('Received TTS request:', { provider, model, voice, speed, format, hasInstructions: Boolean(instructions) });
+    const { text, voice, speed, format, model: req_model, instructions } = await req.json();
+    console.log('Received TTS request:', { provider, req_model, voice, speed, format, hasInstructions: Boolean(instructions) });
 
     if (!text || !voice || !speed) {
       return NextResponse.json({ error: 'Missing required parameters' }, { status: 400 });
     }
-
-    // Apply Deepinfra defaults if provider is deepinfra
-    const finalModel = provider === 'deepinfra' && !model ? 'hexgrad/Kokoro-82M' : model;
-    const initialVoice = provider === 'deepinfra' && !voice ? 'af_bella' : voice;
-
-    // For SDK providers (OpenAI/Deepinfra), preserve multi-voice for Kokoro models, otherwise normalize to first token
-    const isKokoro = isKokoroModel(finalModel);
-    let normalizedVoice = initialVoice;
-    if (!isKokoro && typeof normalizedVoice === 'string' && normalizedVoice.includes('+')) {
-      normalizedVoice = stripVoiceWeights(normalizedVoice.split('+')[0]);
-      console.log('Normalized multi-voice to single for non-Kokoro SDK provider:', normalizedVoice);
-    }
+    // Use default Kokoro model for Deepinfra if none specified
+    const model = provider === 'deepinfra' && !req_model ? 'hexgrad/Kokoro-82M' : req_model;
 
     // Initialize OpenAI client with abort signal (OpenAI/deepinfra)
     const openai = new OpenAI({
       apiKey: openApiKey,
       baseURL: openApiBaseUrl,
     });
 
-    // Unified path: all providers (openai, deepinfra, custom-openai) go through the SDK below.
-
-    // Request audio from OpenAI and pass along the abort signal
+    const normalizedVoice = (
+      !isKokoroModel(model) && voice.includes('+')
+      ? (voice.split('+')[0].trim())
+      : voice
+    ) as SpeechCreateParams['voice'];
+    
     const createParams: ExtendedSpeechParams = {
-      model: finalModel || 'tts-1',
-      voice: normalizedVoice as SpeechCreateParams['voice'],
+      model: model,
+      voice: normalizedVoice,
       input: text,
       speed: speed,
       response_format: format === 'aac' ? 'aac' : 'mp3',
     };
-
     // Only add instructions if model is gpt-4o-mini-tts and instructions are provided
-    if (finalModel === 'gpt-4o-mini-tts' && instructions) {
+    if (model === 'gpt-4o-mini-tts' && instructions) {
       createParams.instructions = instructions;
     }
 
+    // Compute cache key and check LRU before making provider call
+    const contentType = format === 'aac' ? 'audio/aac' : 'audio/mpeg';
+
+    // Preserve voice string as-is for cache key (no weight stripping)
+    const voiceForKey = typeof createParams.voice === 'string'
+      ? createParams.voice
+      : String(createParams.voice);
+
+    const cacheKey = makeCacheKey({
+      provider,
+      model: createParams.model,
+      voice: voiceForKey,
+      speed: Number(createParams.speed),
+      format: String(createParams.response_format),
+      text,
+      instructions: createParams.instructions,
+    });
+
+    const cachedBuffer = ttsAudioCache.get(cacheKey);
+    if (cachedBuffer) {
+      console.log('TTS cache HIT for key:', cacheKey.slice(0, 8));
+      return new NextResponse(cachedBuffer, {
+        headers: {
+          'Content-Type': contentType,
+          'X-Cache': 'HIT',
+        }
+      });
+    }
+
     const response = await openai.audio.speech.create(createParams as SpeechCreateParams, { signal: req.signal });
 
     // Read the audio data as an ArrayBuffer and return it with appropriate headers
     // This will also be aborted if the client cancels
     const buffer = await response.arrayBuffer();
-    const contentType = format === 'aac' ? 'audio/aac' : 'audio/mpeg';
+
+    // Save to cache
+    ttsAudioCache.set(cacheKey, buffer);
+
     return new NextResponse(buffer, {
       headers: {
-        'Content-Type': contentType
+        'Content-Type': contentType,
+        'X-Cache': 'MISS'
       }
     });
   } catch (error) {
diff --git a/src/components/player/VoicesControl.tsx b/src/components/player/VoicesControl.tsx
@@ -8,13 +8,8 @@ import {
 } from '@headlessui/react';
 import { ChevronUpDownIcon, AudioWaveIcon } from '@/components/icons/Icons';
 import { useConfig } from '@/contexts/ConfigContext';
-import { useEffect, useMemo, useState, useCallback } from 'react';
-import { 
-  parseKokoroVoiceNames, 
-  buildKokoroVoiceString, 
-  getMaxVoicesForProvider,
-  isKokoroModel 
-} from '@/utils/voice';
+import { useEffect, useMemo, useState } from 'react';
+import { parseKokoroVoiceNames, buildKokoroVoiceString, isKokoroModel, getMaxVoicesForProvider } from '@/utils/voice';
 
 export const VoicesControl = ({ availableVoices, setVoiceAndRestart }: {
   availableVoices: string[];
@@ -23,20 +18,13 @@ export const VoicesControl = ({ availableVoices, setVoiceAndRestart }: {
   const { voice: configVoice, ttsModel, ttsProvider } = useConfig();
 
   const isKokoro = isKokoroModel(ttsModel);
-  const maxVoices = getMaxVoicesForProvider(ttsProvider, ttsModel || '');
-
-  const clampToLimit = useCallback((names: string[]): string[] => {
-    if (maxVoices === Infinity) return names;
-    if (names.length <= maxVoices) return names;
-    // For initial clamp, keep the first up to max allowed
-    return names.slice(0, maxVoices);
-  }, [maxVoices]);
+  const maxVoices = getMaxVoicesForProvider(ttsProvider, ttsModel);
 
   // Local selection state for Kokoro multi-select
   const [selectedVoices, setSelectedVoices] = useState<string[]>([]);
 
   useEffect(() => {
-    if (!isKokoro) return;
+    if (!(isKokoro && maxVoices > 1)) return;
     let initial: string[] = [];
     if (configVoice && configVoice.includes('+')) {
       initial = parseKokoroVoiceNames(configVoice);
@@ -45,23 +33,27 @@ export const VoicesControl = ({ availableVoices, setVoiceAndRestart }: {
     } else if (availableVoices.length > 0) {
       initial = [availableVoices[0]];
     }
-    setSelectedVoices(clampToLimit(initial));
-  }, [isKokoro, configVoice, availableVoices, maxVoices, clampToLimit]);
+    // Clamp to provider limit
+    if (initial.length > maxVoices) {
+      initial = initial.slice(0, maxVoices);
+    }
+    setSelectedVoices(initial);
+  }, [isKokoro, maxVoices, configVoice, availableVoices]);
 
-  // If the saved voice is not in the available list, use the first available voice (non-Kokoro)
+  // If the saved voice is not in the available list, use the first available voice (non-Kokoro, or Kokoro limited to a single voice)
   const currentVoice = useMemo(() => {
-    if (isKokoro) {
+    if (isKokoro && maxVoices > 1) {
       const combined = buildKokoroVoiceString(selectedVoices);
       return combined || (availableVoices[0] || '');
     }
     return (configVoice && availableVoices.includes(configVoice))
       ? configVoice
       : availableVoices[0] || '';
-  }, [isKokoro, selectedVoices, availableVoices, configVoice]);
+  }, [isKokoro, maxVoices, selectedVoices, availableVoices, configVoice]);
 
   return (
     <div className="relative">
-      {isKokoro ? (
+      {(isKokoro && maxVoices > 1) ? (
         <Listbox
           multiple
           value={selectedVoices}
@@ -70,17 +62,14 @@ export const VoicesControl = ({ availableVoices, setVoiceAndRestart }: {
 
             let next = vals;
 
-            // Enforce deepinfra max selection of 2 voices
-            if (maxVoices !== Infinity && vals.length > maxVoices) {
-              // Determine the newly added voice
+            // Enforce provider max selection
+            if (vals.length > maxVoices) {
               const newlyAdded = vals.find(v => !selectedVoices.includes(v));
               if (newlyAdded) {
                 const lastPrev = selectedVoices[selectedVoices.length - 1] ?? selectedVoices[0] ?? '';
-                // Build next as [last previously selected, newly added], deduped, limited to max
                 const pair = Array.from(new Set([lastPrev, newlyAdded])).filter(Boolean);
                 next = pair.slice(0, maxVoices);
               } else {
-                // Fallback: keep the last maxVoices options
                 next = vals.slice(-maxVoices);
               }
             }
diff --git a/src/utils/voice.ts b/src/utils/voice.ts
@@ -66,8 +66,8 @@ export const stripVoiceWeights = (voiceString: string): string => {
 export const getMaxVoicesForProvider = (provider: string, model: string): number => {
   if (!isKokoroModel(model)) return 1;
   
-  // Deepinfra Kokoro supports up to 2 voices
-  if (provider === 'deepinfra') return 2;
+  // Deepinfra Kokoro does not support multiple voices
+  if (provider === 'deepinfra') return 1;
   
   // Other providers with Kokoro support unlimited voices
   return Infinity;
diff --git a/tests/helpers.ts b/tests/helpers.ts
@@ -96,10 +96,10 @@ export async function setupTest(page: Page) {
   //await page.waitForLoadState('networkidle');
 
   // If running in CI, select the "Custom OpenAI-Like" model and "Deepinfra" provider
-  //if (process.env.CI) {
-  await page.getByRole('button', { name: 'Custom OpenAI-Like' }).click();
-  await page.getByText('Deepinfra').click();
-  //}
+  if (process.env.CI) {
+    await page.getByRole('button', { name: 'Custom OpenAI-Like' }).click();
+    await page.getByText('Deepinfra').click();
+  }
 
   // Click the "done" button to dismiss the welcome message
   await page.getByRole('button', { name: 'Save' }).click();
diff --git a/tests/play.spec.ts b/tests/play.spec.ts
@@ -15,7 +15,7 @@ test.describe('Play/Pause Tests', () => {
     await setupTest(page);
   });
 
-  test.describe.configure({ mode: 'serial' });
+  test.describe.configure({ mode: 'serial', timeout: 60000 });
 
   test('plays and pauses TTS for a PDF document', async ({ page }) => {
     // Play TTS for the PDF document
@@ -62,29 +62,14 @@ test.describe('Play/Pause Tests', () => {
     const options = page.getByRole('option');
     expect(await options.count()).toBeGreaterThan(0);
 
-    // Step 1: Select af_bella (adds it to the multi-select list)
     await selectVoiceAndAssertPlayback(page, 'af_bella');
-
-    // Step 2: Deselect the first (initially selected) voice so that only af_bella remains
-    await openVoicesMenu(page);
-    const selected = page.locator('[role="option"][aria-selected="true"]');
-    const count = await selected.count();
-    for (let i = 0; i < count; i++) {
-      const opt = selected.nth(i);
-      const name = (await opt.textContent())?.trim() ?? '';
-      // Deselect the first selected option that is not af_bella
-      if (!/af_bella/i.test(name)) {
-        await opt.click();
-        break;
-      }
-    }
-    await expectProcessingTransition(page);
+    //await expectProcessingTransition(page);
 
     // Final state should be playing
     await expectMediaState(page, 'playing');
   });
 
-  test('selects multiple Kokoro voices and resumes playing', async ({ page }) => {
+  if (!process.env.CI) test('selects multiple Kokoro voices and resumes playing', async ({ page }) => {
     // Start playback
     await playTTSAndWaitForASecond(page, 'sample.pdf');