Skip to content

Commit f7969c0

Browse files
john0isaac, John Aziz, pamelafox
authored
On Demand Azure Speech Generation (#1894)
* on demand speech, fix #1892

* cache speech urls

* maintain one audio source across app
  prevent speech generation while streaming the response

* create speechConfig type to group speech config

* Update app/frontend/src/pages/ask/Ask.tsx

  Co-authored-by: Pamela Fox <[email protected]>

* Update app/frontend/src/pages/chat/Chat.tsx

* Preload the sync icon

---------

Co-authored-by: John Aziz <[email protected]>
Co-authored-by: Pamela Fox <[email protected]>
1 parent 781bf21 commit f7969c0

File tree

5 files changed

+113
-61
lines changed

5 files changed

+113
-61
lines changed

app/frontend/src/api/models.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,3 +92,11 @@ export type Config = {
9292
export type SimpleAPIResponse = {
9393
message?: string;
9494
};
95+
96+
export interface SpeechConfig {
97+
speechUrls: (string | null)[];
98+
setSpeechUrls: (urls: (string | null)[]) => void;
99+
audio: HTMLAudioElement;
100+
isPlaying: boolean;
101+
setIsPlaying: (isPlaying: boolean) => void;
102+
}

app/frontend/src/components/Answer/Answer.tsx

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,16 @@ import remarkGfm from "remark-gfm";
66
import rehypeRaw from "rehype-raw";
77

88
import styles from "./Answer.module.css";
9-
import { ChatAppResponse, getCitationFilePath } from "../../api";
9+
import { ChatAppResponse, getCitationFilePath, SpeechConfig } from "../../api";
1010
import { parseAnswerToHtml } from "./AnswerParser";
1111
import { AnswerIcon } from "./AnswerIcon";
1212
import { SpeechOutputBrowser } from "./SpeechOutputBrowser";
1313
import { SpeechOutputAzure } from "./SpeechOutputAzure";
1414

1515
interface Props {
1616
answer: ChatAppResponse;
17+
index: number;
18+
speechConfig: SpeechConfig;
1719
isSelected?: boolean;
1820
isStreaming: boolean;
1921
onCitationClicked: (filePath: string) => void;
@@ -23,11 +25,12 @@ interface Props {
2325
showFollowupQuestions?: boolean;
2426
showSpeechOutputBrowser?: boolean;
2527
showSpeechOutputAzure?: boolean;
26-
speechUrl: string | null;
2728
}
2829

2930
export const Answer = ({
3031
answer,
32+
index,
33+
speechConfig,
3134
isSelected,
3235
isStreaming,
3336
onCitationClicked,
@@ -36,13 +39,11 @@ export const Answer = ({
3639
onFollowupQuestionClicked,
3740
showFollowupQuestions,
3841
showSpeechOutputAzure,
39-
showSpeechOutputBrowser,
40-
speechUrl
42+
showSpeechOutputBrowser
4143
}: Props) => {
4244
const followupQuestions = answer.context?.followup_questions;
4345
const messageContent = answer.message.content;
4446
const parsedAnswer = useMemo(() => parseAnswerToHtml(messageContent, isStreaming, onCitationClicked), [answer]);
45-
4647
const sanitizedAnswerHtml = DOMPurify.sanitize(parsedAnswer.answerHtml);
4748

4849
return (
@@ -67,7 +68,9 @@ export const Answer = ({
6768
onClick={() => onSupportingContentClicked()}
6869
disabled={!answer.context.data_points}
6970
/>
70-
{showSpeechOutputAzure && <SpeechOutputAzure url={speechUrl} />}
71+
{showSpeechOutputAzure && (
72+
<SpeechOutputAzure answer={sanitizedAnswerHtml} index={index} speechConfig={speechConfig} isStreaming={isStreaming} />
73+
)}
7174
{showSpeechOutputBrowser && <SpeechOutputBrowser answer={sanitizedAnswerHtml} />}
7275
</div>
7376
</Stack>
app/frontend/src/components/Answer/SpeechOutputAzure.tsx

Lines changed: 62 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,44 +1,80 @@
11
import { useState } from "react";
22

33
import { IconButton } from "@fluentui/react";
4+
import { getSpeechApi, SpeechConfig } from "../../api";
45

56
interface Props {
6-
url: string | null;
7+
answer: string;
8+
speechConfig: SpeechConfig;
9+
index: number;
10+
isStreaming: boolean;
711
}
812

9-
let audio = new Audio();
13+
export const SpeechOutputAzure = ({ answer, speechConfig, index, isStreaming }: Props) => {
14+
const [isLoading, setIsLoading] = useState(false);
15+
const [localPlayingState, setLocalPlayingState] = useState(false);
1016

11-
export const SpeechOutputAzure = ({ url }: Props) => {
12-
const [isPlaying, setIsPlaying] = useState(false);
17+
const playAudio = async (url: string) => {
18+
speechConfig.audio.src = url;
19+
await speechConfig.audio
20+
.play()
21+
.then(() => {
22+
speechConfig.audio.onended = () => {
23+
speechConfig.setIsPlaying(false);
24+
setLocalPlayingState(false);
25+
};
26+
speechConfig.setIsPlaying(true);
27+
setLocalPlayingState(true);
28+
})
29+
.catch(() => {
30+
alert("Failed to play speech output.");
31+
console.error("Failed to play speech output.");
32+
speechConfig.setIsPlaying(false);
33+
setLocalPlayingState(false);
34+
});
35+
};
1336

14-
const startOrStopAudio = async () => {
15-
if (isPlaying) {
16-
audio.pause();
17-
setIsPlaying(false);
37+
const startOrStopSpeech = async (answer: string) => {
38+
if (speechConfig.isPlaying) {
39+
speechConfig.audio.pause();
40+
speechConfig.audio.currentTime = 0;
41+
speechConfig.setIsPlaying(false);
42+
setLocalPlayingState(false);
1843
return;
1944
}
20-
21-
if (!url) {
22-
console.error("Speech output is not yet available.");
45+
if (speechConfig.speechUrls[index]) {
46+
playAudio(speechConfig.speechUrls[index]);
2347
return;
2448
}
25-
audio = new Audio(url);
26-
await audio.play();
27-
audio.addEventListener("ended", () => {
28-
setIsPlaying(false);
49+
setIsLoading(true);
50+
await getSpeechApi(answer).then(async speechUrl => {
51+
if (!speechUrl) {
52+
alert("Speech output is not available.");
53+
console.error("Speech output is not available.");
54+
return;
55+
}
56+
setIsLoading(false);
57+
speechConfig.setSpeechUrls(speechConfig.speechUrls.map((url, i) => (i === index ? speechUrl : url)));
58+
playAudio(speechUrl);
2959
});
30-
setIsPlaying(true);
3160
};
3261

33-
const color = isPlaying ? "red" : "black";
34-
return (
35-
<IconButton
36-
style={{ color: color }}
37-
iconProps={{ iconName: "Volume3" }}
38-
title="Speak answer"
39-
ariaLabel="Speak answer"
40-
onClick={() => startOrStopAudio()}
41-
disabled={!url}
42-
/>
62+
const color = localPlayingState ? "red" : "black";
63+
64+
// We always preload the Sync icon in hidden mode so that there's no visual glitch when icon changes
65+
return isLoading ? (
66+
<IconButton style={{ color: color }} iconProps={{ iconName: "Sync" }} title="Loading speech" ariaLabel="Loading speech" disabled={true} />
67+
) : (
68+
<>
69+
<IconButton iconProps={{ iconName: "Sync" }} ariaHidden={true} disabled={true} style={{ display: "none" }} />
70+
<IconButton
71+
style={{ color: color }}
72+
iconProps={{ iconName: "Volume3" }}
73+
title="Speak answer"
74+
ariaLabel="Speak answer"
75+
onClick={() => startOrStopSpeech(answer)}
76+
disabled={isStreaming}
77+
/>
78+
</>
4379
);
4480
};

app/frontend/src/pages/ask/Ask.tsx

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ import { useId } from "@fluentui/react-hooks";
44

55
import styles from "./Ask.module.css";
66

7-
import { askApi, configApi, getSpeechApi, ChatAppResponse, ChatAppRequest, RetrievalMode, VectorFieldOptions, GPT4VInput } from "../../api";
7+
import { askApi, configApi, ChatAppResponse, ChatAppRequest, RetrievalMode, VectorFieldOptions, GPT4VInput, SpeechConfig } from "../../api";
88
import { Answer, AnswerError } from "../../components/Answer";
99
import { QuestionInput } from "../../components/QuestionInput";
1010
import { ExampleList } from "../../components/Example";
@@ -48,13 +48,24 @@ export function Component(): JSX.Element {
4848
const [showSpeechInput, setShowSpeechInput] = useState<boolean>(false);
4949
const [showSpeechOutputBrowser, setShowSpeechOutputBrowser] = useState<boolean>(false);
5050
const [showSpeechOutputAzure, setShowSpeechOutputAzure] = useState<boolean>(false);
51+
const audio = useRef(new Audio()).current;
52+
const [isPlaying, setIsPlaying] = useState(false);
5153

5254
const lastQuestionRef = useRef<string>("");
5355

5456
const [isLoading, setIsLoading] = useState<boolean>(false);
5557
const [error, setError] = useState<unknown>();
5658
const [answer, setAnswer] = useState<ChatAppResponse>();
57-
const [speechUrl, setSpeechUrl] = useState<string | null>(null);
59+
// For the Ask tab, this array will hold a maximum of one URL
60+
const [speechUrls, setSpeechUrls] = useState<(string | null)[]>([]);
61+
62+
const speechConfig: SpeechConfig = {
63+
speechUrls,
64+
setSpeechUrls,
65+
audio,
66+
isPlaying,
67+
setIsPlaying
68+
};
5869

5970
const [activeCitation, setActiveCitation] = useState<string>();
6071
const [activeAnalysisPanelTab, setActiveAnalysisPanelTab] = useState<AnalysisPanelTabs | undefined>(undefined);
@@ -82,14 +93,6 @@ export function Component(): JSX.Element {
8293
getConfig();
8394
}, []);
8495

85-
useEffect(() => {
86-
if (answer && showSpeechOutputAzure) {
87-
getSpeechApi(answer.message.content).then(speechUrl => {
88-
setSpeechUrl(speechUrl);
89-
});
90-
}
91-
}, [answer]);
92-
9396
const makeApiRequest = async (question: string) => {
9497
lastQuestionRef.current = question;
9598

@@ -134,7 +137,7 @@ export function Component(): JSX.Element {
134137
};
135138
const result = await askApi(request, token);
136139
setAnswer(result);
137-
setSpeechUrl(null);
140+
setSpeechUrls([null]);
138141
} catch (e) {
139142
setError(e);
140143
} finally {
@@ -256,13 +259,14 @@ export function Component(): JSX.Element {
256259
<div className={styles.askAnswerContainer}>
257260
<Answer
258261
answer={answer}
262+
index={0}
263+
speechConfig={speechConfig}
259264
isStreaming={false}
260265
onCitationClicked={x => onShowCitation(x)}
261266
onThoughtProcessClicked={() => onToggleTab(AnalysisPanelTabs.ThoughtProcessTab)}
262267
onSupportingContentClicked={() => onToggleTab(AnalysisPanelTabs.SupportingContentTab)}
263268
showSpeechOutputAzure={showSpeechOutputAzure}
264269
showSpeechOutputBrowser={showSpeechOutputBrowser}
265-
speechUrl={speechUrl}
266270
/>
267271
</div>
268272
)}

app/frontend/src/pages/chat/Chat.tsx

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,14 @@ import styles from "./Chat.module.css";
99
import {
1010
chatApi,
1111
configApi,
12-
getSpeechApi,
1312
RetrievalMode,
1413
ChatAppResponse,
1514
ChatAppResponseOrError,
1615
ChatAppRequest,
1716
ResponseMessage,
1817
VectorFieldOptions,
19-
GPT4VInput
18+
GPT4VInput,
19+
SpeechConfig
2020
} from "../../api";
2121
import { Answer, AnswerError, AnswerLoading } from "../../components/Answer";
2222
import { QuestionInput } from "../../components/QuestionInput";
@@ -77,6 +77,16 @@ const Chat = () => {
7777
const [showSpeechInput, setShowSpeechInput] = useState<boolean>(false);
7878
const [showSpeechOutputBrowser, setShowSpeechOutputBrowser] = useState<boolean>(false);
7979
const [showSpeechOutputAzure, setShowSpeechOutputAzure] = useState<boolean>(false);
80+
const audio = useRef(new Audio()).current;
81+
const [isPlaying, setIsPlaying] = useState(false);
82+
83+
const speechConfig: SpeechConfig = {
84+
speechUrls,
85+
setSpeechUrls,
86+
audio,
87+
isPlaying,
88+
setIsPlaying
89+
};
8090

8191
const getConfig = async () => {
8292
configApi().then(config => {
@@ -199,6 +209,7 @@ const Chat = () => {
199209
}
200210
setAnswers([...answers, [question, parsedResponse as ChatAppResponse]]);
201211
}
212+
setSpeechUrls([...speechUrls, null]);
202213
} catch (e) {
203214
setError(e);
204215
} finally {
@@ -212,6 +223,7 @@ const Chat = () => {
212223
setActiveCitation(undefined);
213224
setActiveAnalysisPanelTab(undefined);
214225
setAnswers([]);
226+
setSpeechUrls([]);
215227
setStreamedAnswers([]);
216228
setIsLoading(false);
217229
setIsStreaming(false);
@@ -223,19 +235,6 @@ const Chat = () => {
223235
getConfig();
224236
}, []);
225237

226-
useEffect(() => {
227-
if (answers && showSpeechOutputAzure) {
228-
// For each answer that is missing a speech URL, fetch the speech URL
229-
for (let i = 0; i < answers.length; i++) {
230-
if (!speechUrls[i]) {
231-
getSpeechApi(answers[i][1].message.content).then(speechUrl => {
232-
setSpeechUrls([...speechUrls.slice(0, i), speechUrl, ...speechUrls.slice(i + 1)]);
233-
});
234-
}
235-
}
236-
}
237-
}, [answers]);
238-
239238
const onPromptTemplateChange = (_ev?: React.FormEvent<HTMLInputElement | HTMLTextAreaElement>, newValue?: string) => {
240239
setPromptTemplate(newValue || "");
241240
};
@@ -368,6 +367,8 @@ const Chat = () => {
368367
isStreaming={true}
369368
key={index}
370369
answer={streamedAnswer[1]}
370+
index={index}
371+
speechConfig={speechConfig}
371372
isSelected={false}
372373
onCitationClicked={c => onShowCitation(c, index)}
373374
onThoughtProcessClicked={() => onToggleTab(AnalysisPanelTabs.ThoughtProcessTab, index)}
@@ -376,7 +377,6 @@ const Chat = () => {
376377
showFollowupQuestions={useSuggestFollowupQuestions && answers.length - 1 === index}
377378
showSpeechOutputAzure={showSpeechOutputAzure}
378379
showSpeechOutputBrowser={showSpeechOutputBrowser}
379-
speechUrl={speechUrls[index]}
380380
/>
381381
</div>
382382
</div>
@@ -390,6 +390,8 @@ const Chat = () => {
390390
isStreaming={false}
391391
key={index}
392392
answer={answer[1]}
393+
index={index}
394+
speechConfig={speechConfig}
393395
isSelected={selectedAnswer === index && activeAnalysisPanelTab !== undefined}
394396
onCitationClicked={c => onShowCitation(c, index)}
395397
onThoughtProcessClicked={() => onToggleTab(AnalysisPanelTabs.ThoughtProcessTab, index)}
@@ -398,7 +400,6 @@ const Chat = () => {
398400
showFollowupQuestions={useSuggestFollowupQuestions && answers.length - 1 === index}
399401
showSpeechOutputAzure={showSpeechOutputAzure}
400402
showSpeechOutputBrowser={showSpeechOutputBrowser}
401-
speechUrl={speechUrls[index]}
402403
/>
403404
</div>
404405
</div>

0 commit comments

Comments
 (0)