Skip to content

Commit 7046666

Browse files
committed
fix(tts): ensure robust caching for audio and alignments
- New buildCacheKey function creates unique identifiers for TTS cache entries.
- Cache keys now include the sentence, voice, speed, provider, and model parameters.
- Prevents serving cached audio or alignment data that does not match the current TTS parameters.
- Removes the redundant audioCache.clear() calls when the voice or speed changes, as entries are now distinct.
1 parent 773778e commit 7046666

File tree

1 file changed

+63
-17
lines changed

1 file changed

+63
-17
lines changed

src/contexts/TTSContext.tsx

Lines changed: 63 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,22 @@ const mergeContinuation = (text: string, nextText: string): TTSSmartMergeResult
246246
};
247247
};
248248

249+
/**
 * Builds the lookup key used for cached TTS audio and sentence alignments.
 *
 * The key covers every parameter that is passed in (text, voice, speed,
 * provider and model), so entries produced with different settings never
 * share a key.
 *
 * The fields are serialised with `JSON.stringify` rather than joined with a
 * bare delimiter: JSON escaping keeps the field boundaries unambiguous, so a
 * delimiter character inside one value (for example a `|` in a model name)
 * cannot make two different parameter sets produce the same key.
 *
 * @param sentence - The sentence text the cache entry belongs to.
 * @param voice - Voice identifier; a falsy value is normalised to `''`.
 * @param speed - Speed value; any non-finite number (NaN, ±Infinity) is
 *   normalised to `''` so all such inputs share a single key.
 * @param provider - Provider identifier; a falsy value is normalised to `''`.
 * @param model - Model identifier; a falsy value is normalised to `''`.
 * @returns A deterministic string that is equal for equal (normalised) inputs.
 */
const buildCacheKey = (
  sentence: string,
  voice: string,
  speed: number,
  provider: string,
  model: string,
): string => {
  // Property order of an object literal with string keys is insertion order,
  // so the serialised form is stable across calls.
  return JSON.stringify({
    provider: provider || '',
    model: model || '',
    voice: voice || '',
    speed: Number.isFinite(speed) ? speed : '',
    text: sentence,
  });
};
264+
249265
// Create the context
250266
const TTSContext = createContext<TTSContextType | undefined>(undefined);
251267

@@ -745,7 +761,14 @@ export function TTSProvider({ children }: { children: ReactNode }): ReactElement
745761
// served from the local cache.
746762
const ensureAlignment = (arrayBuffer: TTSAudioBuffer) => {
747763
if (!alignmentEnabledForCurrentDoc) return;
748-
if (sentenceAlignmentCacheRef.current.has(sentence)) return;
764+
const alignmentKey = buildCacheKey(
765+
sentence,
766+
voice,
767+
speed,
768+
configTTSProvider,
769+
ttsModel,
770+
);
771+
if (sentenceAlignmentCacheRef.current.has(alignmentKey)) return;
749772

750773
try {
751774
const audioBytes = Array.from(new Uint8Array(arrayBuffer));
@@ -760,7 +783,7 @@ export function TTSProvider({ children }: { children: ReactNode }): ReactElement
760783
return;
761784
}
762785
const alignment = data.alignments[0] as TTSSentenceAlignment;
763-
sentenceAlignmentCacheRef.current.set(sentence, alignment);
786+
sentenceAlignmentCacheRef.current.set(alignmentKey, alignment);
764787

765788
const currentSentence = sentencesRef.current[currentIndexRef.current];
766789
if (currentSentence === sentence) {
@@ -776,8 +799,16 @@ export function TTSProvider({ children }: { children: ReactNode }): ReactElement
776799
}
777800
};
778801

802+
const audioCacheKey = buildCacheKey(
803+
sentence,
804+
voice,
805+
speed,
806+
configTTSProvider,
807+
ttsModel,
808+
);
809+
779810
// Check if the audio is already cached
780-
const cachedAudio = audioCache.get(sentence);
811+
const cachedAudio = audioCache.get(audioCacheKey);
781812
if (cachedAudio) {
782813
console.log('Using cached audio for sentence:', sentence.substring(0, 20));
783814
// If we have audio but no alignment (e.g. after a
@@ -829,7 +860,7 @@ export function TTSProvider({ children }: { children: ReactNode }): ReactElement
829860
activeAbortControllers.current.delete(controller);
830861

831862
// Cache the array buffer
832-
audioCache.set(sentence, arrayBuffer);
863+
audioCache.set(audioCacheKey, arrayBuffer);
833864

834865
// Fire-and-forget alignment request; do not block audio playback
835866
ensureAlignment(arrayBuffer);
@@ -1084,7 +1115,14 @@ export function TTSProvider({ children }: { children: ReactNode }): ReactElement
10841115

10851116
const playAudio = useCallback(async () => {
10861117
const sentence = sentences[currentIndex];
1087-
const cachedAlignment = sentenceAlignmentCacheRef.current.get(sentence);
1118+
const alignmentKey = buildCacheKey(
1119+
sentence,
1120+
voice,
1121+
speed,
1122+
configTTSProvider,
1123+
ttsModel,
1124+
);
1125+
const cachedAlignment = sentenceAlignmentCacheRef.current.get(alignmentKey);
10881126
if (cachedAlignment) {
10891127
setCurrentSentenceAlignment(cachedAlignment);
10901128
setCurrentWordIndex(null);
@@ -1097,7 +1135,7 @@ export function TTSProvider({ children }: { children: ReactNode }): ReactElement
10971135
if (howl) {
10981136
howl.play();
10991137
}
1100-
}, [sentences, currentIndex, playSentenceWithHowl]);
1138+
}, [sentences, currentIndex, playSentenceWithHowl, voice, speed, configTTSProvider, ttsModel]);
11011139

11021140
// Place useBackgroundState after playAudio is defined
11031141
const isBackgrounded = useBackgroundState({
@@ -1153,16 +1191,26 @@ export function TTSProvider({ children }: { children: ReactNode }): ReactElement
11531191
const preloadNextAudio = useCallback(async () => {
11541192
try {
11551193
const nextSentence = sentences[currentIndex + 1];
1156-
if (nextSentence && !audioCache.has(nextSentence) && !preloadRequests.current.has(nextSentence)) {
1194+
if (nextSentence) {
1195+
const nextKey = buildCacheKey(
1196+
nextSentence,
1197+
voice,
1198+
speed,
1199+
configTTSProvider,
1200+
ttsModel,
1201+
);
1202+
1203+
if (!audioCache.has(nextKey) && !preloadRequests.current.has(nextSentence)) {
11571204
// Start preloading but don't wait for it to complete
1158-
processSentence(nextSentence, true).catch(error => {
1159-
console.error('Error preloading next sentence:', error);
1160-
});
1205+
processSentence(nextSentence, true).catch(error => {
1206+
console.error('Error preloading next sentence:', error);
1207+
});
1208+
}
11611209
}
11621210
} catch (error) {
11631211
console.error('Error initiating preload:', error);
11641212
}
1165-
}, [currentIndex, sentences, audioCache, processSentence]);
1213+
}, [currentIndex, sentences, audioCache, processSentence, voice, speed, configTTSProvider, ttsModel]);
11661214

11671215
/**
11681216
* Main Playback Driver
@@ -1251,9 +1299,8 @@ export function TTSProvider({ children }: { children: ReactNode }): ReactElement
12511299
abortAudio(true); // Clear pending requests since speed changed
12521300
setActiveHowl(null);
12531301

1254-
// Update speed, clear cache, and config
1302+
// Update speed and config
12551303
setSpeed(newSpeed);
1256-
audioCache.clear();
12571304

12581305
// Update config after state changes
12591306
updateConfigKey('voiceSpeed', newSpeed).then(() => {
@@ -1263,7 +1310,7 @@ export function TTSProvider({ children }: { children: ReactNode }): ReactElement
12631310
setIsPlaying(true);
12641311
}
12651312
});
1266-
}, [abortAudio, updateConfigKey, audioCache, isPlaying]);
1313+
}, [abortAudio, updateConfigKey, isPlaying]);
12671314

12681315
/**
12691316
* Sets the voice and restarts the playback
@@ -1284,9 +1331,8 @@ export function TTSProvider({ children }: { children: ReactNode }): ReactElement
12841331
abortAudio(true); // Clear pending requests since voice changed
12851332
setActiveHowl(null);
12861333

1287-
// Update voice, clear cache, and config
1334+
// Update voice and config
12881335
setVoice(newVoice);
1289-
audioCache.clear();
12901336

12911337
// Update config after state changes
12921338
updateConfigKey('voice', newVoice).then(() => {
@@ -1296,7 +1342,7 @@ export function TTSProvider({ children }: { children: ReactNode }): ReactElement
12961342
setIsPlaying(true);
12971343
}
12981344
});
1299-
}, [abortAudio, updateConfigKey, audioCache, isPlaying]);
1345+
}, [abortAudio, updateConfigKey, isPlaying]);
13001346

13011347
/**
13021348
* Sets the audio player speed and restarts the playback

0 commit comments

Comments
 (0)