richardr1126
diff --git a/src/app/api/tts/route.ts b/src/app/api/tts/route.ts
Lines changed: 18 additions & 13 deletions
diff --git a/src/app/api/tts/voices/route.ts b/src/app/api/tts/voices/route.ts
Lines changed: 6 additions & 1 deletion
diff --git a/src/components/player/VoicesControl.tsx b/src/components/player/VoicesControl.tsx
Lines changed: 119 additions & 24 deletions
diff --git a/src/contexts/TTSContext.tsx b/src/contexts/TTSContext.tsx
Lines changed: 32 additions & 8 deletions
@@ -1,6 +1,7 @@
 import { NextRequest, NextResponse } from 'next/server';
 import OpenAI from 'openai';
 import { SpeechCreateParams } from 'openai/resources/audio/speech.mjs';
+import { isKokoroModel, stripVoiceWeights } from '@/utils/voice';
 
 type CustomVoice = string;
 type ExtendedSpeechParams = Omit<SpeechCreateParams, 'voice'> & {
@@ -15,30 +16,36 @@ export async function POST(req: NextRequest) {
     const openApiBaseUrl = req.headers.get('x-openai-base-url') || process.env.API_BASE;
     const provider = req.headers.get('x-tts-provider') || 'openai';
     const { text, voice, speed, format, model, instructions } = await req.json();
-    console.log('Received TTS request:', text, voice, speed, format, model);
-
-    if (!openApiKey) {
-      return NextResponse.json({ error: 'Missing OpenAI API key' }, { status: 401 });
-    }
+    console.log('Received TTS request:', { provider, model, voice, speed, format, hasInstructions: Boolean(instructions) });
 
     if (!text || !voice || !speed) {
       return NextResponse.json({ error: 'Missing required parameters' }, { status: 400 });
     }
 
     // Apply Deepinfra defaults if provider is deepinfra
     const finalModel = provider === 'deepinfra' && !model ? 'hexgrad/Kokoro-82M' : model;
-    const finalVoice = provider === 'deepinfra' && !voice ? 'af_bella' : voice;
+    const initialVoice = provider === 'deepinfra' && !voice ? 'af_bella' : voice;
+
+    // Preserve multi-voice strings ("a+b") for Kokoro models; for any other model, keep only the first '+'-separated voice (passed through stripVoiceWeights)
+    const isKokoro = isKokoroModel(finalModel);
+    let normalizedVoice = initialVoice;
+    if (!isKokoro && typeof normalizedVoice === 'string' && normalizedVoice.includes('+')) {
+      normalizedVoice = stripVoiceWeights(normalizedVoice.split('+')[0]);
+      console.log('Normalized multi-voice to single for non-Kokoro SDK provider:', normalizedVoice);
+    }
 
-    // Initialize OpenAI client with abort signal
+    // Initialize the OpenAI SDK client (shared by all providers); the abort signal is passed per request below
     const openai = new OpenAI({
       apiKey: openApiKey,
       baseURL: openApiBaseUrl,
     });
 
+    // Unified path: all providers (openai, deepinfra, custom-openai) go through the SDK below.
+
     // Request audio from OpenAI and pass along the abort signal
     const createParams: ExtendedSpeechParams = {
       model: finalModel || 'tts-1',
-      voice: finalVoice as "alloy",
+      voice: normalizedVoice as SpeechCreateParams['voice'],
       input: text,
       speed: speed,
       response_format: format === 'aac' ? 'aac' : 'mp3',
@@ -51,13 +58,11 @@ export async function POST(req: NextRequest) {
 
     const response = await openai.audio.speech.create(createParams as SpeechCreateParams, { signal: req.signal });
 
-    // Get the audio data as array buffer
+    // Read the audio data as an ArrayBuffer and return it with appropriate headers
     // This will also be aborted if the client cancels
-    const stream = response.body;
-
-    // Return audio data with appropriate headers
+    const buffer = await response.arrayBuffer();
     const contentType = format === 'aac' ? 'audio/aac' : 'audio/mpeg';
-    return new NextResponse(stream, {
+    return new NextResponse(buffer, {
       headers: {
         'Content-Type': contentType
       }
 
@@ -1,4 +1,5 @@
 import { NextRequest, NextResponse } from 'next/server';
+import { isKokoroModel } from '@/utils/voice';
 
 const OPENAI_VOICES = ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'];
 const GPT4O_MINI_VOICES = ['alloy', 'ash', 'coral', 'echo', 'fable', 'onyx', 'nova', 'sage', 'shimmer'];
@@ -29,6 +30,10 @@ function getDefaultVoices(provider: string, model: string): string[] {
 
   // For Custom OpenAI-Like provider
   if (provider === 'custom-openai') {
+    // If using Kokoro-FastAPI (model string contains 'kokoro'), expose full Kokoro voices
+    if (isKokoroModel(model)) {
+      return KOKORO_VOICES;
+    }
     return CUSTOM_OPENAI_VOICES;
   }
 
@@ -72,7 +77,7 @@ async function fetchDeepinfraVoices(apiKey: string): Promise<string[]> {
     }
 
     const data = await response.json();
-    //console.log('Deepinfra voices response:', data);
+    
     // Extract voice names from the response, excluding preset voices
     if (data.voices && Array.isArray(data.voices)) {
       return data.voices
 
@@ -8,40 +8,135 @@ import {
 } from '@headlessui/react';
 import { ChevronUpDownIcon, AudioWaveIcon } from '@/components/icons/Icons';
 import { useConfig } from '@/contexts/ConfigContext';
+import { useEffect, useMemo, useState, useCallback } from 'react';
+import { 
+  parseKokoroVoiceNames, 
+  buildKokoroVoiceString, 
+  getMaxVoicesForProvider,
+  isKokoroModel 
+} from '@/utils/voice';
 
 export const VoicesControl = ({ availableVoices, setVoiceAndRestart }: {
   availableVoices: string[];
   setVoiceAndRestart: (voice: string) => void;
 }) => {
-  const { voice: configVoice } = useConfig();
+  const { voice: configVoice, ttsModel, ttsProvider } = useConfig();
 
-  // If the saved voice is not in the available list, use the first available voice
-  const currentVoice = (configVoice && availableVoices.includes(configVoice)) 
-    ? configVoice 
-    : availableVoices[0] || '';
+  const isKokoro = isKokoroModel(ttsModel);
+  const maxVoices = getMaxVoicesForProvider(ttsProvider, ttsModel || '');
+
+  const clampToLimit = useCallback((names: string[]): string[] => {
+    if (maxVoices === Infinity) return names;
+    if (names.length <= maxVoices) return names;
+    // Over the limit: keep only the first maxVoices entries
+    return names.slice(0, maxVoices);
+  }, [maxVoices]);
+
+  // Local selection state for Kokoro multi-select
+  const [selectedVoices, setSelectedVoices] = useState<string[]>([]);
+
+  useEffect(() => {
+    if (!isKokoro) return;
+    let initial: string[] = [];
+    if (configVoice && configVoice.includes('+')) {
+      initial = parseKokoroVoiceNames(configVoice);
+    } else if (configVoice && availableVoices.includes(configVoice)) {
+      initial = [configVoice];
+    } else if (availableVoices.length > 0) {
+      initial = [availableVoices[0]];
+    }
+    setSelectedVoices(clampToLimit(initial));
+  }, [isKokoro, configVoice, availableVoices, maxVoices, clampToLimit]);
+
+  // If the saved voice is not in the available list, use the first available voice (non-Kokoro)
+  const currentVoice = useMemo(() => {
+    if (isKokoro) {
+      const combined = buildKokoroVoiceString(selectedVoices);
+      return combined || (availableVoices[0] || '');
+    }
+    return (configVoice && availableVoices.includes(configVoice))
+      ? configVoice
+      : availableVoices[0] || '';
+  }, [isKokoro, selectedVoices, availableVoices, configVoice]);
 
   return (
     <div className="relative">
-      <Listbox value={currentVoice} onChange={setVoiceAndRestart}>
-        <ListboxButton className="flex items-center space-x-0.5 sm:space-x-1 bg-transparent text-foreground text-xs sm:text-sm focus:outline-none cursor-pointer hover:bg-offbase rounded pl-1.5 sm:pl-2 pr-0.5 sm:pr-1 py-0.5 sm:py-1 transform transition-transform duration-200 ease-in-out hover:scale-[1.04] hover:text-accent">
-          <AudioWaveIcon className="h-3 w-3 sm:h-3.5 sm:w-3.5" />
-          <span>{currentVoice}</span>
-          <ChevronUpDownIcon className="h-2.5 w-2.5 sm:h-3 sm:w-3" />
-        </ListboxButton>
-        <ListboxOptions anchor='top end' className="absolute z-50 w-28 sm:w-32 max-h-64 overflow-auto rounded-lg bg-base shadow-lg ring-1 ring-black ring-opacity-5 focus:outline-none">
-          {availableVoices.map((voiceId) => (
-            <ListboxOption
-              key={voiceId}
-              value={voiceId}
-              className={({ active, selected }) =>
-                `relative cursor-pointer select-none py-0.5 px-1.5 sm:py-2 sm:px-3 ${active ? 'bg-offbase' : ''} ${selected ? 'font-medium' : ''}`
+      {isKokoro ? (
+        <Listbox
+          multiple
+          value={selectedVoices}
+          onChange={(vals: string[]) => {
+            if (!vals || vals.length === 0) return; // prevent empty selection
+
+            let next = vals;
+
+            // Enforce the provider's voice limit (maxVoices, e.g. 2 for Deepinfra) when it is finite
+            if (maxVoices !== Infinity && vals.length > maxVoices) {
+              // Determine the newly added voice
+              const newlyAdded = vals.find(v => !selectedVoices.includes(v));
+              if (newlyAdded) {
+                const lastPrev = selectedVoices[selectedVoices.length - 1] ?? selectedVoices[0] ?? '';
+                // Build next as [last previously selected, newly added], deduped, limited to max
+                const pair = Array.from(new Set([lastPrev, newlyAdded])).filter(Boolean);
+                next = pair.slice(0, maxVoices);
+              } else {
+                // Fallback: keep the last maxVoices options
+                next = vals.slice(-maxVoices);
               }
-            >
-              <span className='text-xs sm:text-sm'>{voiceId}</span>
-            </ListboxOption>
-          ))}
-        </ListboxOptions>
-      </Listbox>
+            }
+
+            setSelectedVoices(next);
+            const combined = buildKokoroVoiceString(next);
+            if (combined) {
+              setVoiceAndRestart(combined);
+            }
+          }}
+        >
+          <ListboxButton className="flex items-center space-x-0.5 sm:space-x-1 bg-transparent text-foreground text-xs sm:text-sm focus:outline-none cursor-pointer hover:bg-offbase rounded pl-1.5 sm:pl-2 pr-0.5 sm:pr-1 py-0.5 sm:py-1 transform transition-transform duration-200 ease-in-out hover:scale-[1.04] hover:text-accent">
+            <AudioWaveIcon className="h-3 w-3 sm:h-3.5 sm:w-3.5" />
+            <span>
+              {selectedVoices.length > 1
+                ? selectedVoices.join(' + ')
+                : selectedVoices[0] || currentVoice}
+            </span>
+            <ChevronUpDownIcon className="h-2.5 w-2.5 sm:h-3 sm:w-3" />
+          </ListboxButton>
+          <ListboxOptions anchor='top end' className="absolute z-50 w-40 sm:w-44 max-h-64 overflow-auto rounded-lg bg-base shadow-lg ring-1 ring-black ring-opacity-5 focus:outline-none">
+            {availableVoices.map((voiceId) => (
+              <ListboxOption
+                key={voiceId}
+                value={voiceId}
+                className={({ active, selected }) =>
+                  `relative cursor-pointer select-none py-1 px-2 sm:py-2 sm:px-3 ${active ? 'bg-offbase' : ''} ${selected ? 'font-medium bg-accent text-background' : ''} ${selected && active ? 'text-foreground' : ''}`
+                }
+              >
+                <span className='text-xs sm:text-sm'>{voiceId}</span>
+              </ListboxOption>
+            ))}
+          </ListboxOptions>
+        </Listbox>
+      ) : (
+        <Listbox value={currentVoice} onChange={setVoiceAndRestart}>
+          <ListboxButton className="flex items-center space-x-0.5 sm:space-x-1 bg-transparent text-foreground text-xs sm:text-sm focus:outline-none cursor-pointer hover:bg-offbase rounded pl-1.5 sm:pl-2 pr-0.5 sm:pr-1 py-0.5 sm:py-1 transform transition-transform duration-200 ease-in-out hover:scale-[1.04] hover:text-accent">
+            <AudioWaveIcon className="h-3 w-3 sm:h-3.5 sm:w-3.5" />
+            <span>{currentVoice}</span>
+            <ChevronUpDownIcon className="h-2.5 w-2.5 sm:h-3 sm:w-3" />
+          </ListboxButton>
+          <ListboxOptions anchor='top end' className="absolute z-50 w-28 sm:w-32 max-h-64 overflow-auto rounded-lg bg-base shadow-lg ring-1 ring-black ring-opacity-5 focus:outline-none">
+            {availableVoices.map((voiceId) => (
+              <ListboxOption
+                key={voiceId}
+                value={voiceId}
+                className={({ active, selected }) =>
+                  `relative cursor-pointer select-none py-1 px-2 sm:py-2 sm:px-3 ${active ? 'bg-offbase' : ''} ${selected ? 'font-medium bg-accent text-background' : ''} ${selected && active ? 'text-foreground' : ''}`
+                }
+              >
+                <span className='text-xs sm:text-sm'>{voiceId}</span>
+              </ListboxOption>
+            ))}
+          </ListboxOptions>
+        </Listbox>
+      )}
     </div>
   );
 }
@@ -38,6 +38,7 @@ import { getLastDocumentLocation, setLastDocumentLocation } from '@/utils/indexe
 import { useBackgroundState } from '@/hooks/audio/useBackgroundState';
 import { withRetry } from '@/utils/audio';
 import { processTextToSentences } from '@/utils/nlp';
+import { isKokoroModel } from '@/utils/voice';
 
 // Media globals
 declare global {
@@ -155,6 +156,8 @@ export function TTSProvider({ children }: { children: ReactNode }): ReactElement
   const activeAbortControllers = useRef<Set<AbortController>>(new Set());
   // Track if we're restoring from a saved position
   const [pendingRestoreIndex, setPendingRestoreIndex] = useState<number | null>(null);
+  // Monotonic restart counter: each restart bumps it, and only the most recent restart may resume playback
+  const restartSeqRef = useRef(0);
 
   /**
    * Processes text into sentences using the shared NLP utility
@@ -412,19 +415,31 @@ export function TTSProvider({ children }: { children: ReactNode }): ReactElement
    */
   useEffect(() => {
     if (availableVoices.length > 0) {
+      // Allow Kokoro multi-voice strings (e.g., "voice1(0.5)+voice2(0.5)") for any provider
+      const isKokoro = isKokoroModel(configTTSModel);
+
+      if (isKokoro) {
+        // If Kokoro and we have any voice string (including plus/weights), don't override it.
+        // Only default when voice is empty.
+        if (!voice) {
+          setVoice(availableVoices[0]);
+        }
+        return;
+      }
+
       if (!voice || !availableVoices.includes(voice)) {
         console.log(`Voice "${voice || '(empty)'}" not found in available voices. Using "${availableVoices[0]}"`);
         setVoice(availableVoices[0]);
         // Don't save to config - just use it temporarily until user explicitly selects one
       }
     }
-  }, [availableVoices, voice]);
+  }, [availableVoices, voice, configTTSModel]);
 
   /**
    * Generates and plays audio for the current sentence
    * 
    * @param {string} sentence - The sentence to generate audio for
-   * @returns {Promise<AudioBuffer | undefined>} The generated audio buffer
+   * @returns {Promise<ArrayBuffer | undefined>} The generated audio buffer
    */
   const getAudio = useCallback(async (sentence: string): Promise<ArrayBuffer | undefined> => {
     // Check if the audio is already cached
@@ -791,6 +806,9 @@ export function TTSProvider({ children }: { children: ReactNode }): ReactElement
   const setSpeedAndRestart = useCallback((newSpeed: number) => {
     const wasPlaying = isPlaying;
 
+    // Bump restart sequence to invalidate older restarts
+    const mySeq = ++restartSeqRef.current;
+
     // Set a flag to prevent double audio requests during config update
     setIsProcessing(true);
 
@@ -806,8 +824,8 @@ export function TTSProvider({ children }: { children: ReactNode }): ReactElement
     // Update config after state changes
     updateConfigKey('voiceSpeed', newSpeed).then(() => {
       setIsProcessing(false);
-      // Resume playback if it was playing before
-      if (wasPlaying) {
+      // Resume playback if it was playing before and this is the latest restart
+      if (wasPlaying && mySeq === restartSeqRef.current) {
         setIsPlaying(true);
       }
     });
@@ -821,6 +839,9 @@ export function TTSProvider({ children }: { children: ReactNode }): ReactElement
   const setVoiceAndRestart = useCallback((newVoice: string) => {
     const wasPlaying = isPlaying;
 
+    // Bump restart sequence to invalidate older restarts
+    const mySeq = ++restartSeqRef.current;
+
     // Set a flag to prevent double audio requests during config update
     setIsProcessing(true);
 
@@ -836,8 +857,8 @@ export function TTSProvider({ children }: { children: ReactNode }): ReactElement
     // Update config after state changes
     updateConfigKey('voice', newVoice).then(() => {
       setIsProcessing(false);
-      // Resume playback if it was playing before
-      if (wasPlaying) {
+      // Resume playback if it was playing before and this is the latest restart
+      if (wasPlaying && mySeq === restartSeqRef.current) {
         setIsPlaying(true);
       }
     });
@@ -851,6 +872,9 @@ export function TTSProvider({ children }: { children: ReactNode }): ReactElement
   const setAudioPlayerSpeedAndRestart = useCallback((newSpeed: number) => {
     const wasPlaying = isPlaying;
 
+    // Bump restart sequence to invalidate older restarts
+    const mySeq = ++restartSeqRef.current;
+
     // Set a flag to prevent double audio requests during config update
     setIsProcessing(true);
 
@@ -865,8 +889,8 @@ export function TTSProvider({ children }: { children: ReactNode }): ReactElement
     // Update config after state changes
     updateConfigKey('audioPlayerSpeed', newSpeed).then(() => {
       setIsProcessing(false);
-      // Resume playback if it was playing before
-      if (wasPlaying) {
+      // Resume playback if it was playing before and this is the latest restart
+      if (wasPlaying && mySeq === restartSeqRef.current) {
         setIsPlaying(true);
       }
     });