Commit 306ac38

feat: add low cost browser text to speech output using WebSpeechAPI (#1671)
* feat: add low cost browser text to speech output using WebSpeechAPI. This PR adds an optional feature that enables the browser's built-in text to speech when the USE_SPEECH_OUTPUT_BROWSER boolean is set.
* improve and refactor code
* fix typo and remove unnecessary check
* add docs
* fix note
* fix ruff
1 parent f80a3db commit 306ac38
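
For context on the commit title: the feature relies on the browser's built-in Web Speech API (SpeechSynthesis), so answers can be read aloud without calling the Azure Speech service. A minimal standalone sketch of that API, illustrative only and not code from this commit:

// Illustrative sketch of the Web Speech API this feature builds on (not part of the commit).
// Speaks a string with the browser's default voice; support and voices vary by browser.
const utterance = new SpeechSynthesisUtterance("Hello from the browser's built-in text to speech.");
utterance.lang = "en-US";
utterance.onend = () => console.log("Done speaking");
window.speechSynthesis.speak(utterance);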

File tree

15 files changed: +130 -20 lines changed

.azdo/pipelines/azure-dev.yml

Lines changed: 1 addition & 0 deletions

@@ -87,6 +87,7 @@ steps:
   AZURE_COMPUTER_VISION_LOCATION: $(AZURE_COMPUTER_VISION_LOCATION)
   AZURE_COMPUTER_VISION_SKU: $(AZURE_COMPUTER_VISION_SKU)
   USE_SPEECH_INPUT_BROWSER: $(USE_SPEECH_INPUT_BROWSER)
+  USE_SPEECH_OUTPUT_BROWSER: $(USE_SPEECH_OUTPUT_BROWSER)
   USE_SPEECH_OUTPUT_AZURE: $(USE_SPEECH_OUTPUT_AZURE)
   AZURE_SPEECH_SERVICE: $(AZURE_SPEECH_SERVICE)
   AZURE_SPEECH_SERVICE_RESOURCE_GROUP: $(AZURE_SPEECH_SERVICE_RESOURCE_GROUP)

.github/workflows/azure-dev.yml

Lines changed: 1 addition & 0 deletions

@@ -74,6 +74,7 @@ jobs:
   AZURE_VISION_ENDPOINT: ${{ vars.AZURE_VISION_ENDPOINT }}
   VISION_SECRET_NAME: ${{ vars.VISION_SECRET_NAME }}
   USE_SPEECH_INPUT_BROWSER: ${{ vars.USE_SPEECH_INPUT_BROWSER }}
+  USE_SPEECH_OUTPUT_BROWSER: ${{ vars.USE_SPEECH_OUTPUT_BROWSER }}
   USE_SPEECH_OUTPUT_AZURE: ${{ vars.USE_SPEECH_OUTPUT_AZURE }}
   AZURE_SPEECH_SERVICE: ${{ vars.AZURE_SPEECH_SERVICE }}
   AZURE_SPEECH_SERVICE_RESOURCE_GROUP: ${{ vars.AZURE_SPEECH_RESOURCE_GROUP }}

app/backend/app.py

Lines changed: 7 additions & 3 deletions

@@ -63,7 +63,8 @@
     CONFIG_SEARCH_CLIENT,
     CONFIG_SEMANTIC_RANKER_DEPLOYED,
     CONFIG_SPEECH_INPUT_ENABLED,
-    CONFIG_SPEECH_OUTPUT_ENABLED,
+    CONFIG_SPEECH_OUTPUT_AZURE_ENABLED,
+    CONFIG_SPEECH_OUTPUT_BROWSER_ENABLED,
     CONFIG_SPEECH_SERVICE_ID,
     CONFIG_SPEECH_SERVICE_LOCATION,
     CONFIG_SPEECH_SERVICE_TOKEN,

@@ -245,7 +246,8 @@ def config():
             "showVectorOption": current_app.config[CONFIG_VECTOR_SEARCH_ENABLED],
             "showUserUpload": current_app.config[CONFIG_USER_UPLOAD_ENABLED],
             "showSpeechInput": current_app.config[CONFIG_SPEECH_INPUT_ENABLED],
-            "showSpeechOutput": current_app.config[CONFIG_SPEECH_OUTPUT_ENABLED],
+            "showSpeechOutputBrowser": current_app.config[CONFIG_SPEECH_OUTPUT_BROWSER_ENABLED],
+            "showSpeechOutputAzure": current_app.config[CONFIG_SPEECH_OUTPUT_AZURE_ENABLED],
         }
     )

@@ -405,6 +407,7 @@ async def setup_clients():
     USE_GPT4V = os.getenv("USE_GPT4V", "").lower() == "true"
     USE_USER_UPLOAD = os.getenv("USE_USER_UPLOAD", "").lower() == "true"
     USE_SPEECH_INPUT_BROWSER = os.getenv("USE_SPEECH_INPUT_BROWSER", "").lower() == "true"
+    USE_SPEECH_OUTPUT_BROWSER = os.getenv("USE_SPEECH_OUTPUT_BROWSER", "").lower() == "true"
     USE_SPEECH_OUTPUT_AZURE = os.getenv("USE_SPEECH_OUTPUT_AZURE", "").lower() == "true"

     # Use the current user identity to authenticate with Azure OpenAI, AI Search and Blob Storage (no secrets needed,

@@ -536,7 +539,8 @@ async def setup_clients():
     current_app.config[CONFIG_VECTOR_SEARCH_ENABLED] = os.getenv("USE_VECTORS", "").lower() != "false"
     current_app.config[CONFIG_USER_UPLOAD_ENABLED] = bool(USE_USER_UPLOAD)
     current_app.config[CONFIG_SPEECH_INPUT_ENABLED] = USE_SPEECH_INPUT_BROWSER
-    current_app.config[CONFIG_SPEECH_OUTPUT_ENABLED] = USE_SPEECH_OUTPUT_AZURE
+    current_app.config[CONFIG_SPEECH_OUTPUT_BROWSER_ENABLED] = USE_SPEECH_OUTPUT_BROWSER
+    current_app.config[CONFIG_SPEECH_OUTPUT_AZURE_ENABLED] = USE_SPEECH_OUTPUT_AZURE

     # Various approaches to integrate GPT and external knowledge, most applications will use a single one of these patterns
     # or some derivative, here we include several for exploration purposes
app/backend/config.py

Lines changed: 2 additions & 1 deletion

@@ -15,7 +15,8 @@
 CONFIG_OPENAI_CLIENT = "openai_client"
 CONFIG_INGESTER = "ingester"
 CONFIG_SPEECH_INPUT_ENABLED = "speech_input_enabled"
-CONFIG_SPEECH_OUTPUT_ENABLED = "speech_output_enabled"
+CONFIG_SPEECH_OUTPUT_BROWSER_ENABLED = "speech_output_browser_enabled"
+CONFIG_SPEECH_OUTPUT_AZURE_ENABLED = "speech_output_azure_enabled"
 CONFIG_SPEECH_SERVICE_ID = "speech_service_id"
 CONFIG_SPEECH_SERVICE_LOCATION = "speech_service_location"
 CONFIG_SPEECH_SERVICE_TOKEN = "speech_service_token"

app/frontend/src/api/models.ts

Lines changed: 2 additions & 1 deletion

@@ -86,7 +86,8 @@ export type Config = {
     showVectorOption: boolean;
     showUserUpload: boolean;
     showSpeechInput: boolean;
-    showSpeechOutput: boolean;
+    showSpeechOutputBrowser: boolean;
+    showSpeechOutputAzure: boolean;
 };

 export type SimpleAPIResponse = {
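
For illustration (not part of the diff): a sketch of how the frontend could read the two new flags from the backend config() route shown in app.py above. The plain fetch call and the "/config" path are assumptions here; the app's own API helper may differ.

// Hypothetical sketch: read the split speech-output flags from the backend config endpoint.
import { Config } from "./models";

async function fetchConfig(): Promise<Config> {
    const response = await fetch("/config");
    return (await response.json()) as Config;
}

fetchConfig().then(config => {
    // The UI renders the Azure button, the browser button, or neither, depending on these flags.
    console.log("showSpeechOutputBrowser:", config.showSpeechOutputBrowser);
    console.log("showSpeechOutputAzure:", config.showSpeechOutputAzure);
});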

app/frontend/src/components/Answer/Answer.tsx

Lines changed: 8 additions & 4 deletions

@@ -6,7 +6,8 @@ import styles from "./Answer.module.css";
 import { ChatAppResponse, getCitationFilePath } from "../../api";
 import { parseAnswerToHtml } from "./AnswerParser";
 import { AnswerIcon } from "./AnswerIcon";
-import { SpeechOutput } from "./SpeechOutput";
+import { SpeechOutputBrowser } from "./SpeechOutputBrowser";
+import { SpeechOutputAzure } from "./SpeechOutputAzure";

 interface Props {
     answer: ChatAppResponse;

@@ -17,7 +18,8 @@ interface Props {
     onSupportingContentClicked: () => void;
     onFollowupQuestionClicked?: (question: string) => void;
     showFollowupQuestions?: boolean;
-    showSpeechOutput?: boolean;
+    showSpeechOutputBrowser?: boolean;
+    showSpeechOutputAzure?: boolean;
     speechUrl: string | null;
 }

@@ -30,7 +32,8 @@ export const Answer = ({
     onSupportingContentClicked,
     onFollowupQuestionClicked,
     showFollowupQuestions,
-    showSpeechOutput,
+    showSpeechOutputAzure,
+    showSpeechOutputBrowser,
     speechUrl
 }: Props) => {
     const followupQuestions = answer.choices[0].context.followup_questions;

@@ -61,7 +64,8 @@
     onClick={() => onSupportingContentClicked()}
     disabled={!answer.choices[0].context.data_points}
 />
-{showSpeechOutput && <SpeechOutput url={speechUrl} />}
+{showSpeechOutputAzure && <SpeechOutputAzure url={speechUrl} />}
+{showSpeechOutputBrowser && <SpeechOutputBrowser answer={sanitizedAnswerHtml} />}
 </div>
 </Stack>
 </Stack.Item>

app/frontend/src/components/Answer/SpeechOutput.tsx renamed to app/frontend/src/components/Answer/SpeechOutputAzure.tsx

Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@ interface Props {

 let audio = new Audio();

-export const SpeechOutput = ({ url }: Props) => {
+export const SpeechOutputAzure = ({ url }: Props) => {
     const [isPlaying, setIsPlaying] = useState(false);

     const startOrStopAudio = async () => {

app/frontend/src/components/Answer/SpeechOutputBrowser.tsx (new file)

Lines changed: 71 additions & 0 deletions

@@ -0,0 +1,71 @@
+import { useState } from "react";
+import { IconButton } from "@fluentui/react";
+
+interface Props {
+    answer: string;
+}
+
+const SpeechSynthesis = (window as any).speechSynthesis || (window as any).webkitSpeechSynthesis;
+
+let synth: SpeechSynthesis | null = null;
+
+try {
+    synth = SpeechSynthesis;
+} catch (err) {
+    console.error("SpeechSynthesis is not supported");
+}
+
+const getUtterance = function (text: string) {
+    if (synth) {
+        const utterance = new SpeechSynthesisUtterance(text);
+        utterance.lang = "en-US";
+        utterance.volume = 1;
+        utterance.rate = 1;
+        utterance.pitch = 1;
+        utterance.voice = synth.getVoices().filter((voice: SpeechSynthesisVoice) => voice.lang === "en-US")[0];
+        return utterance;
+    }
+};
+
+export const SpeechOutputBrowser = ({ answer }: Props) => {
+    const [isPlaying, setIsPlaying] = useState<boolean>(false);
+
+    const startOrStopSpeech = (answer: string) => {
+        if (synth != null) {
+            if (isPlaying) {
+                synth.cancel(); // removes all utterances from the utterance queue.
+                setIsPlaying(false);
+                return;
+            }
+            const utterance: SpeechSynthesisUtterance | undefined = getUtterance(answer);
+
+            if (!utterance) {
+                return;
+            }
+
+            synth.speak(utterance);
+
+            utterance.onstart = () => {
+                setIsPlaying(true);
+                return;
+            };
+
+            utterance.onend = () => {
+                setIsPlaying(false);
+                return;
+            };
+        }
+    };
+    const color = isPlaying ? "red" : "black";
+
+    return (
+        <IconButton
+            style={{ color: color }}
+            iconProps={{ iconName: "Volume3" }}
+            title="Speak answer"
+            ariaLabel="Speak answer"
+            onClick={() => startOrStopSpeech(answer)}
+            disabled={!synth}
+        />
+    );
+};
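
One Web Speech API detail worth noting about the component above: synth.getVoices() can return an empty array until the browser has loaded its voice list, in which case the utterance typically falls back to the browser's default voice. A small illustrative sketch (not part of this commit) of waiting for the voiceschanged event:

// Illustrative only: getVoices() may be empty until "voiceschanged" fires.
const speechSynth = window.speechSynthesis;

function pickEnglishVoice(): SpeechSynthesisVoice | undefined {
    return speechSynth.getVoices().find(voice => voice.lang === "en-US");
}

if (speechSynth.getVoices().length === 0) {
    speechSynth.addEventListener("voiceschanged", () => {
        console.log("Voices loaded, en-US voice:", pickEnglishVoice()?.name);
    });
} else {
    console.log("Voice already available:", pickEnglishVoice()?.name);
}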
app/frontend/src/components/Answer/index.ts

Lines changed: 2 additions & 1 deletion

@@ -1,4 +1,5 @@
 export * from "./Answer";
 export * from "./AnswerLoading";
 export * from "./AnswerError";
-export * from "./SpeechOutput";
+export * from "./SpeechOutputBrowser";
+export * from "./SpeechOutputAzure";

app/frontend/src/pages/ask/Ask.tsx

Lines changed: 7 additions & 4 deletions

@@ -41,7 +41,8 @@ export function Component(): JSX.Element {
     const [showVectorOption, setShowVectorOption] = useState<boolean>(false);
     const [showUserUpload, setShowUserUpload] = useState<boolean>(false);
     const [showSpeechInput, setShowSpeechInput] = useState<boolean>(false);
-    const [showSpeechOutput, setShowSpeechOutput] = useState<boolean>(false);
+    const [showSpeechOutputBrowser, setShowSpeechOutputBrowser] = useState<boolean>(false);
+    const [showSpeechOutputAzure, setShowSpeechOutputAzure] = useState<boolean>(false);

     const lastQuestionRef = useRef<string>("");

@@ -66,7 +67,8 @@
         }
         setShowUserUpload(config.showUserUpload);
         setShowSpeechInput(config.showSpeechInput);
-        setShowSpeechOutput(config.showSpeechOutput);
+        setShowSpeechOutputBrowser(config.showSpeechOutputBrowser);
+        setShowSpeechOutputAzure(config.showSpeechOutputAzure);
     });
 };

@@ -75,7 +77,7 @@
 }, []);

 useEffect(() => {
-    if (answer && showSpeechOutput) {
+    if (answer && showSpeechOutputAzure) {
         getSpeechApi(answer.choices[0].message.content).then(speechUrl => {
             setSpeechUrl(speechUrl);
         });

@@ -239,7 +241,8 @@
     onCitationClicked={x => onShowCitation(x)}
     onThoughtProcessClicked={() => onToggleTab(AnalysisPanelTabs.ThoughtProcessTab)}
     onSupportingContentClicked={() => onToggleTab(AnalysisPanelTabs.SupportingContentTab)}
-    showSpeechOutput={showSpeechOutput}
+    showSpeechOutputAzure={showSpeechOutputAzure}
+    showSpeechOutputBrowser={showSpeechOutputBrowser}
     speechUrl={speechUrl}
 />
 </div>