Skip to content

Commit 1dcffc8

Browse files
committed
feat(tts,ui,api): add Kokoro multi-voice selection and SDK support
- Introduce voice utils (model detection, voice parsing/weights, limits)
- Enable Kokoro multi-voice strings across OpenAI/Deepinfra/custom providers
- Normalize non-Kokoro voices to single token for SDK calls
- Expose full Kokoro voice list for custom-openai Kokoro models
- Update TTS API to return audio as ArrayBuffer and improve logging
- Add multi-select UI for Kokoro voices with provider-based clamping
- Preserve Kokoro voice strings in TTSContext and coalesce restarts
- Merge multi-sentence quoted dialogue in NLP sentence splitter
- Update tests for single and multi-voice selection flows
1 parent 21870ed commit 1dcffc8

File tree

8 files changed

+368
-54
lines changed

8 files changed

+368
-54
lines changed

src/app/api/tts/route.ts

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import { NextRequest, NextResponse } from 'next/server';
22
import OpenAI from 'openai';
33
import { SpeechCreateParams } from 'openai/resources/audio/speech.mjs';
4+
import { isKokoroModel, stripVoiceWeights } from '@/utils/voice';
45

56
type CustomVoice = string;
67
type ExtendedSpeechParams = Omit<SpeechCreateParams, 'voice'> & {
@@ -15,30 +16,36 @@ export async function POST(req: NextRequest) {
1516
const openApiBaseUrl = req.headers.get('x-openai-base-url') || process.env.API_BASE;
1617
const provider = req.headers.get('x-tts-provider') || 'openai';
1718
const { text, voice, speed, format, model, instructions } = await req.json();
18-
console.log('Received TTS request:', text, voice, speed, format, model);
19-
20-
if (!openApiKey) {
21-
return NextResponse.json({ error: 'Missing OpenAI API key' }, { status: 401 });
22-
}
19+
console.log('Received TTS request:', { provider, model, voice, speed, format, hasInstructions: Boolean(instructions) });
2320

2421
if (!text || !voice || !speed) {
2522
return NextResponse.json({ error: 'Missing required parameters' }, { status: 400 });
2623
}
2724

2825
// Apply Deepinfra defaults if provider is deepinfra
2926
const finalModel = provider === 'deepinfra' && !model ? 'hexgrad/Kokoro-82M' : model;
30-
const finalVoice = provider === 'deepinfra' && !voice ? 'af_bella' : voice;
27+
const initialVoice = provider === 'deepinfra' && !voice ? 'af_bella' : voice;
28+
29+
// For SDK providers (OpenAI/Deepinfra), preserve multi-voice for Kokoro models, otherwise normalize to first token
30+
const isKokoro = isKokoroModel(finalModel);
31+
let normalizedVoice = initialVoice;
32+
if (!isKokoro && typeof normalizedVoice === 'string' && normalizedVoice.includes('+')) {
33+
normalizedVoice = stripVoiceWeights(normalizedVoice.split('+')[0]);
34+
console.log('Normalized multi-voice to single for non-Kokoro SDK provider:', normalizedVoice);
35+
}
3136

32-
// Initialize OpenAI client with abort signal
37+
// Initialize OpenAI client with abort signal (OpenAI/deepinfra)
3338
const openai = new OpenAI({
3439
apiKey: openApiKey,
3540
baseURL: openApiBaseUrl,
3641
});
3742

43+
// Unified path: all providers (openai, deepinfra, custom-openai) go through the SDK below.
44+
3845
// Request audio from OpenAI and pass along the abort signal
3946
const createParams: ExtendedSpeechParams = {
4047
model: finalModel || 'tts-1',
41-
voice: finalVoice as "alloy",
48+
voice: normalizedVoice as SpeechCreateParams['voice'],
4249
input: text,
4350
speed: speed,
4451
response_format: format === 'aac' ? 'aac' : 'mp3',
@@ -51,13 +58,11 @@ export async function POST(req: NextRequest) {
5158

5259
const response = await openai.audio.speech.create(createParams as SpeechCreateParams, { signal: req.signal });
5360

54-
// Get the audio data as array buffer
61+
// Read the audio data as an ArrayBuffer and return it with appropriate headers
5562
// This will also be aborted if the client cancels
56-
const stream = response.body;
57-
58-
// Return audio data with appropriate headers
63+
const buffer = await response.arrayBuffer();
5964
const contentType = format === 'aac' ? 'audio/aac' : 'audio/mpeg';
60-
return new NextResponse(stream, {
65+
return new NextResponse(buffer, {
6166
headers: {
6267
'Content-Type': contentType
6368
}

src/app/api/tts/voices/route.ts

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import { NextRequest, NextResponse } from 'next/server';
2+
import { isKokoroModel } from '@/utils/voice';
23

34
const OPENAI_VOICES = ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'];
45
const GPT4O_MINI_VOICES = ['alloy', 'ash', 'coral', 'echo', 'fable', 'onyx', 'nova', 'sage', 'shimmer'];
@@ -29,6 +30,10 @@ function getDefaultVoices(provider: string, model: string): string[] {
2930

3031
// For Custom OpenAI-Like provider
3132
if (provider === 'custom-openai') {
33+
// If using Kokoro-FastAPI (model string contains 'kokoro'), expose full Kokoro voices
34+
if (isKokoroModel(model)) {
35+
return KOKORO_VOICES;
36+
}
3237
return CUSTOM_OPENAI_VOICES;
3338
}
3439

@@ -72,7 +77,7 @@ async function fetchDeepinfraVoices(apiKey: string): Promise<string[]> {
7277
}
7378

7479
const data = await response.json();
75-
//console.log('Deepinfra voices response:', data);
80+
7681
// Extract voice names from the response, excluding preset voices
7782
if (data.voices && Array.isArray(data.voices)) {
7883
return data.voices

src/components/player/VoicesControl.tsx

Lines changed: 119 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -8,40 +8,135 @@ import {
88
} from '@headlessui/react';
99
import { ChevronUpDownIcon, AudioWaveIcon } from '@/components/icons/Icons';
1010
import { useConfig } from '@/contexts/ConfigContext';
11+
import { useEffect, useMemo, useState, useCallback } from 'react';
12+
import {
13+
parseKokoroVoiceNames,
14+
buildKokoroVoiceString,
15+
getMaxVoicesForProvider,
16+
isKokoroModel
17+
} from '@/utils/voice';
1118

1219
export const VoicesControl = ({ availableVoices, setVoiceAndRestart }: {
1320
availableVoices: string[];
1421
setVoiceAndRestart: (voice: string) => void;
1522
}) => {
16-
const { voice: configVoice } = useConfig();
23+
const { voice: configVoice, ttsModel, ttsProvider } = useConfig();
1724

18-
// If the saved voice is not in the available list, use the first available voice
19-
const currentVoice = (configVoice && availableVoices.includes(configVoice))
20-
? configVoice
21-
: availableVoices[0] || '';
25+
const isKokoro = isKokoroModel(ttsModel);
26+
const maxVoices = getMaxVoicesForProvider(ttsProvider, ttsModel || '');
27+
28+
const clampToLimit = useCallback((names: string[]): string[] => {
29+
if (maxVoices === Infinity) return names;
30+
if (names.length <= maxVoices) return names;
31+
// For initial clamp, keep the first up to max allowed
32+
return names.slice(0, maxVoices);
33+
}, [maxVoices]);
34+
35+
// Local selection state for Kokoro multi-select
36+
const [selectedVoices, setSelectedVoices] = useState<string[]>([]);
37+
38+
useEffect(() => {
39+
if (!isKokoro) return;
40+
let initial: string[] = [];
41+
if (configVoice && configVoice.includes('+')) {
42+
initial = parseKokoroVoiceNames(configVoice);
43+
} else if (configVoice && availableVoices.includes(configVoice)) {
44+
initial = [configVoice];
45+
} else if (availableVoices.length > 0) {
46+
initial = [availableVoices[0]];
47+
}
48+
setSelectedVoices(clampToLimit(initial));
49+
}, [isKokoro, configVoice, availableVoices, maxVoices, clampToLimit]);
50+
51+
// If the saved voice is not in the available list, use the first available voice (non-Kokoro)
52+
const currentVoice = useMemo(() => {
53+
if (isKokoro) {
54+
const combined = buildKokoroVoiceString(selectedVoices);
55+
return combined || (availableVoices[0] || '');
56+
}
57+
return (configVoice && availableVoices.includes(configVoice))
58+
? configVoice
59+
: availableVoices[0] || '';
60+
}, [isKokoro, selectedVoices, availableVoices, configVoice]);
2261

2362
return (
2463
<div className="relative">
25-
<Listbox value={currentVoice} onChange={setVoiceAndRestart}>
26-
<ListboxButton className="flex items-center space-x-0.5 sm:space-x-1 bg-transparent text-foreground text-xs sm:text-sm focus:outline-none cursor-pointer hover:bg-offbase rounded pl-1.5 sm:pl-2 pr-0.5 sm:pr-1 py-0.5 sm:py-1 transform transition-transform duration-200 ease-in-out hover:scale-[1.04] hover:text-accent">
27-
<AudioWaveIcon className="h-3 w-3 sm:h-3.5 sm:w-3.5" />
28-
<span>{currentVoice}</span>
29-
<ChevronUpDownIcon className="h-2.5 w-2.5 sm:h-3 sm:w-3" />
30-
</ListboxButton>
31-
<ListboxOptions anchor='top end' className="absolute z-50 w-28 sm:w-32 max-h-64 overflow-auto rounded-lg bg-base shadow-lg ring-1 ring-black ring-opacity-5 focus:outline-none">
32-
{availableVoices.map((voiceId) => (
33-
<ListboxOption
34-
key={voiceId}
35-
value={voiceId}
36-
className={({ active, selected }) =>
37-
`relative cursor-pointer select-none py-0.5 px-1.5 sm:py-2 sm:px-3 ${active ? 'bg-offbase' : ''} ${selected ? 'font-medium' : ''}`
64+
{isKokoro ? (
65+
<Listbox
66+
multiple
67+
value={selectedVoices}
68+
onChange={(vals: string[]) => {
69+
if (!vals || vals.length === 0) return; // prevent empty selection
70+
71+
let next = vals;
72+
73+
// Enforce deepinfra max selection of 2 voices
74+
if (maxVoices !== Infinity && vals.length > maxVoices) {
75+
// Determine the newly added voice
76+
const newlyAdded = vals.find(v => !selectedVoices.includes(v));
77+
if (newlyAdded) {
78+
const lastPrev = selectedVoices[selectedVoices.length - 1] ?? selectedVoices[0] ?? '';
79+
// Build next as [last previously selected, newly added], deduped, limited to max
80+
const pair = Array.from(new Set([lastPrev, newlyAdded])).filter(Boolean);
81+
next = pair.slice(0, maxVoices);
82+
} else {
83+
// Fallback: keep the last maxVoices options
84+
next = vals.slice(-maxVoices);
3885
}
39-
>
40-
<span className='text-xs sm:text-sm'>{voiceId}</span>
41-
</ListboxOption>
42-
))}
43-
</ListboxOptions>
44-
</Listbox>
86+
}
87+
88+
setSelectedVoices(next);
89+
const combined = buildKokoroVoiceString(next);
90+
if (combined) {
91+
setVoiceAndRestart(combined);
92+
}
93+
}}
94+
>
95+
<ListboxButton className="flex items-center space-x-0.5 sm:space-x-1 bg-transparent text-foreground text-xs sm:text-sm focus:outline-none cursor-pointer hover:bg-offbase rounded pl-1.5 sm:pl-2 pr-0.5 sm:pr-1 py-0.5 sm:py-1 transform transition-transform duration-200 ease-in-out hover:scale-[1.04] hover:text-accent">
96+
<AudioWaveIcon className="h-3 w-3 sm:h-3.5 sm:w-3.5" />
97+
<span>
98+
{selectedVoices.length > 1
99+
? selectedVoices.join(' + ')
100+
: selectedVoices[0] || currentVoice}
101+
</span>
102+
<ChevronUpDownIcon className="h-2.5 w-2.5 sm:h-3 sm:w-3" />
103+
</ListboxButton>
104+
<ListboxOptions anchor='top end' className="absolute z-50 w-40 sm:w-44 max-h-64 overflow-auto rounded-lg bg-base shadow-lg ring-1 ring-black ring-opacity-5 focus:outline-none">
105+
{availableVoices.map((voiceId) => (
106+
<ListboxOption
107+
key={voiceId}
108+
value={voiceId}
109+
className={({ active, selected }) =>
110+
`relative cursor-pointer select-none py-1 px-2 sm:py-2 sm:px-3 ${active ? 'bg-offbase' : ''} ${selected ? 'font-medium bg-accent text-background' : ''} ${selected && active ? 'text-foreground' : ''}`
111+
}
112+
>
113+
<span className='text-xs sm:text-sm'>{voiceId}</span>
114+
</ListboxOption>
115+
))}
116+
</ListboxOptions>
117+
</Listbox>
118+
) : (
119+
<Listbox value={currentVoice} onChange={setVoiceAndRestart}>
120+
<ListboxButton className="flex items-center space-x-0.5 sm:space-x-1 bg-transparent text-foreground text-xs sm:text-sm focus:outline-none cursor-pointer hover:bg-offbase rounded pl-1.5 sm:pl-2 pr-0.5 sm:pr-1 py-0.5 sm:py-1 transform transition-transform duration-200 ease-in-out hover:scale-[1.04] hover:text-accent">
121+
<AudioWaveIcon className="h-3 w-3 sm:h-3.5 sm:w-3.5" />
122+
<span>{currentVoice}</span>
123+
<ChevronUpDownIcon className="h-2.5 w-2.5 sm:h-3 sm:w-3" />
124+
</ListboxButton>
125+
<ListboxOptions anchor='top end' className="absolute z-50 w-28 sm:w-32 max-h-64 overflow-auto rounded-lg bg-base shadow-lg ring-1 ring-black ring-opacity-5 focus:outline-none">
126+
{availableVoices.map((voiceId) => (
127+
<ListboxOption
128+
key={voiceId}
129+
value={voiceId}
130+
className={({ active, selected }) =>
131+
`relative cursor-pointer select-none py-1 px-2 sm:py-2 sm:px-3 ${active ? 'bg-offbase' : ''} ${selected ? 'font-medium bg-accent text-background' : ''} ${selected && active ? 'text-foreground' : ''}`
132+
}
133+
>
134+
<span className='text-xs sm:text-sm'>{voiceId}</span>
135+
</ListboxOption>
136+
))}
137+
</ListboxOptions>
138+
</Listbox>
139+
)}
45140
</div>
46141
);
47142
}

src/contexts/TTSContext.tsx

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ import { getLastDocumentLocation, setLastDocumentLocation } from '@/utils/indexe
3838
import { useBackgroundState } from '@/hooks/audio/useBackgroundState';
3939
import { withRetry } from '@/utils/audio';
4040
import { processTextToSentences } from '@/utils/nlp';
41+
import { isKokoroModel } from '@/utils/voice';
4142

4243
// Media globals
4344
declare global {
@@ -155,6 +156,8 @@ export function TTSProvider({ children }: { children: ReactNode }): ReactElement
155156
const activeAbortControllers = useRef<Set<AbortController>>(new Set());
156157
// Track if we're restoring from a saved position
157158
const [pendingRestoreIndex, setPendingRestoreIndex] = useState<number | null>(null);
159+
// Guard to coalesce rapid restarts and only resume the latest change
160+
const restartSeqRef = useRef(0);
158161

159162
/**
160163
* Processes text into sentences using the shared NLP utility
@@ -412,19 +415,31 @@ export function TTSProvider({ children }: { children: ReactNode }): ReactElement
412415
*/
413416
useEffect(() => {
414417
if (availableVoices.length > 0) {
418+
// Allow Kokoro multi-voice strings (e.g., "voice1(0.5)+voice2(0.5)") for any provider
419+
const isKokoro = isKokoroModel(configTTSModel);
420+
421+
if (isKokoro) {
422+
// If Kokoro and we have any voice string (including plus/weights), don't override it.
423+
// Only default when voice is empty.
424+
if (!voice) {
425+
setVoice(availableVoices[0]);
426+
}
427+
return;
428+
}
429+
415430
if (!voice || !availableVoices.includes(voice)) {
416431
console.log(`Voice "${voice || '(empty)'}" not found in available voices. Using "${availableVoices[0]}"`);
417432
setVoice(availableVoices[0]);
418433
// Don't save to config - just use it temporarily until user explicitly selects one
419434
}
420435
}
421-
}, [availableVoices, voice]);
436+
}, [availableVoices, voice, configTTSModel]);
422437

423438
/**
424439
* Generates and plays audio for the current sentence
425440
*
426441
* @param {string} sentence - The sentence to generate audio for
427-
* @returns {Promise<AudioBuffer | undefined>} The generated audio buffer
442+
* @returns {Promise<ArrayBuffer | undefined>} The generated audio buffer
428443
*/
429444
const getAudio = useCallback(async (sentence: string): Promise<ArrayBuffer | undefined> => {
430445
// Check if the audio is already cached
@@ -791,6 +806,9 @@ export function TTSProvider({ children }: { children: ReactNode }): ReactElement
791806
const setSpeedAndRestart = useCallback((newSpeed: number) => {
792807
const wasPlaying = isPlaying;
793808

809+
// Bump restart sequence to invalidate older restarts
810+
const mySeq = ++restartSeqRef.current;
811+
794812
// Set a flag to prevent double audio requests during config update
795813
setIsProcessing(true);
796814

@@ -806,8 +824,8 @@ export function TTSProvider({ children }: { children: ReactNode }): ReactElement
806824
// Update config after state changes
807825
updateConfigKey('voiceSpeed', newSpeed).then(() => {
808826
setIsProcessing(false);
809-
// Resume playback if it was playing before
810-
if (wasPlaying) {
827+
// Resume playback if it was playing before and this is the latest restart
828+
if (wasPlaying && mySeq === restartSeqRef.current) {
811829
setIsPlaying(true);
812830
}
813831
});
@@ -821,6 +839,9 @@ export function TTSProvider({ children }: { children: ReactNode }): ReactElement
821839
const setVoiceAndRestart = useCallback((newVoice: string) => {
822840
const wasPlaying = isPlaying;
823841

842+
// Bump restart sequence to invalidate older restarts
843+
const mySeq = ++restartSeqRef.current;
844+
824845
// Set a flag to prevent double audio requests during config update
825846
setIsProcessing(true);
826847

@@ -836,8 +857,8 @@ export function TTSProvider({ children }: { children: ReactNode }): ReactElement
836857
// Update config after state changes
837858
updateConfigKey('voice', newVoice).then(() => {
838859
setIsProcessing(false);
839-
// Resume playback if it was playing before
840-
if (wasPlaying) {
860+
// Resume playback if it was playing before and this is the latest restart
861+
if (wasPlaying && mySeq === restartSeqRef.current) {
841862
setIsPlaying(true);
842863
}
843864
});
@@ -851,6 +872,9 @@ export function TTSProvider({ children }: { children: ReactNode }): ReactElement
851872
const setAudioPlayerSpeedAndRestart = useCallback((newSpeed: number) => {
852873
const wasPlaying = isPlaying;
853874

875+
// Bump restart sequence to invalidate older restarts
876+
const mySeq = ++restartSeqRef.current;
877+
854878
// Set a flag to prevent double audio requests during config update
855879
setIsProcessing(true);
856880

@@ -865,8 +889,8 @@ export function TTSProvider({ children }: { children: ReactNode }): ReactElement
865889
// Update config after state changes
866890
updateConfigKey('audioPlayerSpeed', newSpeed).then(() => {
867891
setIsProcessing(false);
868-
// Resume playback if it was playing before
869-
if (wasPlaying) {
892+
// Resume playback if it was playing before and this is the latest restart
893+
if (wasPlaying && mySeq === restartSeqRef.current) {
870894
setIsPlaying(true);
871895
}
872896
});

0 commit comments

Comments
 (0)