Commit 306ac38

feat: add low cost browser text to speech output using WebSpeechAPI (#1671)
* feat: add low cost browser text to speech output using WebSpeechAPI. This PR adds an optional feature that enables the browser's built-in text to speech when the USE_SPEECH_OUTPUT_BROWSER boolean is set.
* improve and refactor code
* fix typo and remove unnecessary check
* add docs
* fix note
* fix ruff
1 parent f80a3db commit 306ac38
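
For context on the commit title: the feature relies on the browser's built-in Web Speech API (SpeechSynthesis), so answers can be read aloud without calling the Azure Speech service. A minimal standalone sketch of that API, illustrative only and not code from this commit:

// Illustrative sketch of the Web Speech API this feature builds on (not part of the commit).
// Speaks a string with the browser's default voice; support and voices vary by browser.
const utterance = new SpeechSynthesisUtterance("Hello from the browser's built-in text to speech.");
utterance.lang = "en-US";
utterance.onend = () => console.log("Done speaking");
window.speechSynthesis.speak(utterance);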

File tree

15 files changed: +130 -20 lines changed

.azdo/pipelines/azure-dev.yml

Lines changed: 1 addition & 0 deletions

@@ -87,6 +87,7 @@ steps:
   AZURE_COMPUTER_VISION_LOCATION: $(AZURE_COMPUTER_VISION_LOCATION)
   AZURE_COMPUTER_VISION_SKU: $(AZURE_COMPUTER_VISION_SKU)
   USE_SPEECH_INPUT_BROWSER: $(USE_SPEECH_INPUT_BROWSER)
+  USE_SPEECH_OUTPUT_BROWSER: $(USE_SPEECH_OUTPUT_BROWSER)
   USE_SPEECH_OUTPUT_AZURE: $(USE_SPEECH_OUTPUT_AZURE)
   AZURE_SPEECH_SERVICE: $(AZURE_SPEECH_SERVICE)
   AZURE_SPEECH_SERVICE_RESOURCE_GROUP: $(AZURE_SPEECH_SERVICE_RESOURCE_GROUP)

.github/workflows/azure-dev.yml

Lines changed: 1 addition & 0 deletions

@@ -74,6 +74,7 @@ jobs:
   AZURE_VISION_ENDPOINT: ${{ vars.AZURE_VISION_ENDPOINT }}
   VISION_SECRET_NAME: ${{ vars.VISION_SECRET_NAME }}
   USE_SPEECH_INPUT_BROWSER: ${{ vars.USE_SPEECH_INPUT_BROWSER }}
+  USE_SPEECH_OUTPUT_BROWSER: ${{ vars.USE_SPEECH_OUTPUT_BROWSER }}
   USE_SPEECH_OUTPUT_AZURE: ${{ vars.USE_SPEECH_OUTPUT_AZURE }}
   AZURE_SPEECH_SERVICE: ${{ vars.AZURE_SPEECH_SERVICE }}
   AZURE_SPEECH_SERVICE_RESOURCE_GROUP: ${{ vars.AZURE_SPEECH_RESOURCE_GROUP }}

app/backend/app.py

Lines changed: 7 additions & 3 deletions

@@ -63,7 +63,8 @@
     CONFIG_SEARCH_CLIENT,
     CONFIG_SEMANTIC_RANKER_DEPLOYED,
     CONFIG_SPEECH_INPUT_ENABLED,
-    CONFIG_SPEECH_OUTPUT_ENABLED,
+    CONFIG_SPEECH_OUTPUT_AZURE_ENABLED,
+    CONFIG_SPEECH_OUTPUT_BROWSER_ENABLED,
     CONFIG_SPEECH_SERVICE_ID,
     CONFIG_SPEECH_SERVICE_LOCATION,
     CONFIG_SPEECH_SERVICE_TOKEN,

@@ -245,7 +246,8 @@ def config():
             "showVectorOption": current_app.config[CONFIG_VECTOR_SEARCH_ENABLED],
             "showUserUpload": current_app.config[CONFIG_USER_UPLOAD_ENABLED],
             "showSpeechInput": current_app.config[CONFIG_SPEECH_INPUT_ENABLED],
-            "showSpeechOutput": current_app.config[CONFIG_SPEECH_OUTPUT_ENABLED],
+            "showSpeechOutputBrowser": current_app.config[CONFIG_SPEECH_OUTPUT_BROWSER_ENABLED],
+            "showSpeechOutputAzure": current_app.config[CONFIG_SPEECH_OUTPUT_AZURE_ENABLED],
         }
     )

@@ -405,6 +407,7 @@ async def setup_clients():
     USE_GPT4V = os.getenv("USE_GPT4V", "").lower() == "true"
     USE_USER_UPLOAD = os.getenv("USE_USER_UPLOAD", "").lower() == "true"
     USE_SPEECH_INPUT_BROWSER = os.getenv("USE_SPEECH_INPUT_BROWSER", "").lower() == "true"
+    USE_SPEECH_OUTPUT_BROWSER = os.getenv("USE_SPEECH_OUTPUT_BROWSER", "").lower() == "true"
     USE_SPEECH_OUTPUT_AZURE = os.getenv("USE_SPEECH_OUTPUT_AZURE", "").lower() == "true"

     # Use the current user identity to authenticate with Azure OpenAI, AI Search and Blob Storage (no secrets needed,

@@ -536,7 +539,8 @@ async def setup_clients():
     current_app.config[CONFIG_VECTOR_SEARCH_ENABLED] = os.getenv("USE_VECTORS", "").lower() != "false"
     current_app.config[CONFIG_USER_UPLOAD_ENABLED] = bool(USE_USER_UPLOAD)
     current_app.config[CONFIG_SPEECH_INPUT_ENABLED] = USE_SPEECH_INPUT_BROWSER
-    current_app.config[CONFIG_SPEECH_OUTPUT_ENABLED] = USE_SPEECH_OUTPUT_AZURE
+    current_app.config[CONFIG_SPEECH_OUTPUT_BROWSER_ENABLED] = USE_SPEECH_OUTPUT_BROWSER
+    current_app.config[CONFIG_SPEECH_OUTPUT_AZURE_ENABLED] = USE_SPEECH_OUTPUT_AZURE

     # Various approaches to integrate GPT and external knowledge, most applications will use a single one of these patterns
     # or some derivative, here we include several for exploration purposes
app/backend/config.py

Lines changed: 2 additions & 1 deletion

@@ -15,7 +15,8 @@
 CONFIG_OPENAI_CLIENT = "openai_client"
 CONFIG_INGESTER = "ingester"
 CONFIG_SPEECH_INPUT_ENABLED = "speech_input_enabled"
-CONFIG_SPEECH_OUTPUT_ENABLED = "speech_output_enabled"
+CONFIG_SPEECH_OUTPUT_BROWSER_ENABLED = "speech_output_browser_enabled"
+CONFIG_SPEECH_OUTPUT_AZURE_ENABLED = "speech_output_azure_enabled"
 CONFIG_SPEECH_SERVICE_ID = "speech_service_id"
 CONFIG_SPEECH_SERVICE_LOCATION = "speech_service_location"
 CONFIG_SPEECH_SERVICE_TOKEN = "speech_service_token"

app/frontend/src/api/models.ts

Lines changed: 2 additions & 1 deletion

@@ -86,7 +86,8 @@ export type Config = {
     showVectorOption: boolean;
     showUserUpload: boolean;
     showSpeechInput: boolean;
-    showSpeechOutput: boolean;
+    showSpeechOutputBrowser: boolean;
+    showSpeechOutputAzure: boolean;
 };

 export type SimpleAPIResponse = {
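
For illustration (not part of the diff): a sketch of how the frontend could read the two new flags from the backend config() route shown in app.py above. The plain fetch call and the "/config" path are assumptions here; the app's own API helper may differ.

// Hypothetical sketch: read the split speech-output flags from the backend config endpoint.
import { Config } from "./models";

async function fetchConfig(): Promise<Config> {
    const response = await fetch("/config");
    return (await response.json()) as Config;
}

fetchConfig().then(config => {
    // The UI renders the Azure button, the browser button, or neither, depending on these flags.
    console.log("showSpeechOutputBrowser:", config.showSpeechOutputBrowser);
    console.log("showSpeechOutputAzure:", config.showSpeechOutputAzure);
});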

app/frontend/src/components/Answer/Answer.tsx

Lines changed: 8 additions & 4 deletions

@@ -6,7 +6,8 @@ import styles from "./Answer.module.css";
 import { ChatAppResponse, getCitationFilePath } from "../../api";
 import { parseAnswerToHtml } from "./AnswerParser";
 import { AnswerIcon } from "./AnswerIcon";
-import { SpeechOutput } from "./SpeechOutput";
+import { SpeechOutputBrowser } from "./SpeechOutputBrowser";
+import { SpeechOutputAzure } from "./SpeechOutputAzure";

 interface Props {
     answer: ChatAppResponse;

@@ -17,7 +18,8 @@ interface Props {
     onSupportingContentClicked: () => void;
     onFollowupQuestionClicked?: (question: string) => void;
     showFollowupQuestions?: boolean;
-    showSpeechOutput?: boolean;
+    showSpeechOutputBrowser?: boolean;
+    showSpeechOutputAzure?: boolean;
     speechUrl: string | null;
 }

@@ -30,7 +32,8 @@ export const Answer = ({
     onSupportingContentClicked,
     onFollowupQuestionClicked,
     showFollowupQuestions,
-    showSpeechOutput,
+    showSpeechOutputAzure,
+    showSpeechOutputBrowser,
     speechUrl
 }: Props) => {
     const followupQuestions = answer.choices[0].context.followup_questions;

@@ -61,7 +64,8 @@
     onClick={() => onSupportingContentClicked()}
     disabled={!answer.choices[0].context.data_points}
 />
-{showSpeechOutput && <SpeechOutput url={speechUrl} />}
+{showSpeechOutputAzure && <SpeechOutputAzure url={speechUrl} />}
+{showSpeechOutputBrowser && <SpeechOutputBrowser answer={sanitizedAnswerHtml} />}
 </div>
 </Stack>
 </Stack.Item>

app/frontend/src/components/Answer/SpeechOutput.tsx renamed to app/frontend/src/components/Answer/SpeechOutputAzure.tsx

Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@ interface Props {

 let audio = new Audio();

-export const SpeechOutput = ({ url }: Props) => {
+export const SpeechOutputAzure = ({ url }: Props) => {
     const [isPlaying, setIsPlaying] = useState(false);

     const startOrStopAudio = async () => {

app/frontend/src/components/Answer/SpeechOutputBrowser.tsx (new file)

Lines changed: 71 additions & 0 deletions

@@ -0,0 +1,71 @@
+import { useState } from "react";
+import { IconButton } from "@fluentui/react";
+
+interface Props {
+    answer: string;
+}
+
+const SpeechSynthesis = (window as any).speechSynthesis || (window as any).webkitSpeechSynthesis;
+
+let synth: SpeechSynthesis | null = null;
+
+try {
+    synth = SpeechSynthesis;
+} catch (err) {
+    console.error("SpeechSynthesis is not supported");
+}
+
+const getUtterance = function (text: string) {
+    if (synth) {
+        const utterance = new SpeechSynthesisUtterance(text);
+        utterance.lang = "en-US";
+        utterance.volume = 1;
+        utterance.rate = 1;
+        utterance.pitch = 1;
+        utterance.voice = synth.getVoices().filter((voice: SpeechSynthesisVoice) => voice.lang === "en-US")[0];
+        return utterance;
+    }
+};
+
+export const SpeechOutputBrowser = ({ answer }: Props) => {
+    const [isPlaying, setIsPlaying] = useState<boolean>(false);
+
+    const startOrStopSpeech = (answer: string) => {
+        if (synth != null) {
+            if (isPlaying) {
+                synth.cancel(); // removes all utterances from the utterance queue.
+                setIsPlaying(false);
+                return;
+            }
+            const utterance: SpeechSynthesisUtterance | undefined = getUtterance(answer);
+
+            if (!utterance) {
+                return;
+            }
+
+            synth.speak(utterance);
+
+            utterance.onstart = () => {
+                setIsPlaying(true);
+                return;
+            };
+
+            utterance.onend = () => {
+                setIsPlaying(false);
+                return;
+            };
+        }
+    };
+    const color = isPlaying ? "red" : "black";
+
+    return (
+        <IconButton
+            style={{ color: color }}
+            iconProps={{ iconName: "Volume3" }}
+            title="Speak answer"
+            ariaLabel="Speak answer"
+            onClick={() => startOrStopSpeech(answer)}
+            disabled={!synth}
+        />
+    );
+};
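
One Web Speech API detail worth noting about the component above: synth.getVoices() can return an empty array until the browser has loaded its voice list, in which case the utterance typically falls back to the browser's default voice. A small illustrative sketch (not part of this commit) of waiting for the voiceschanged event:

// Illustrative only: getVoices() may be empty until "voiceschanged" fires.
const speechSynth = window.speechSynthesis;

function pickEnglishVoice(): SpeechSynthesisVoice | undefined {
    return speechSynth.getVoices().find(voice => voice.lang === "en-US");
}

if (speechSynth.getVoices().length === 0) {
    speechSynth.addEventListener("voiceschanged", () => {
        console.log("Voices loaded, en-US voice:", pickEnglishVoice()?.name);
    });
} else {
    console.log("Voice already available:", pickEnglishVoice()?.name);
}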
app/frontend/src/components/Answer/index.ts

Lines changed: 2 additions & 1 deletion

@@ -1,4 +1,5 @@
 export * from "./Answer";
 export * from "./AnswerLoading";
 export * from "./AnswerError";
-export * from "./SpeechOutput";
+export * from "./SpeechOutputBrowser";
+export * from "./SpeechOutputAzure";

app/frontend/src/pages/ask/Ask.tsx

Lines changed: 7 additions & 4 deletions

@@ -41,7 +41,8 @@ export function Component(): JSX.Element {
     const [showVectorOption, setShowVectorOption] = useState<boolean>(false);
     const [showUserUpload, setShowUserUpload] = useState<boolean>(false);
     const [showSpeechInput, setShowSpeechInput] = useState<boolean>(false);
-    const [showSpeechOutput, setShowSpeechOutput] = useState<boolean>(false);
+    const [showSpeechOutputBrowser, setShowSpeechOutputBrowser] = useState<boolean>(false);
+    const [showSpeechOutputAzure, setShowSpeechOutputAzure] = useState<boolean>(false);

     const lastQuestionRef = useRef<string>("");

@@ -66,7 +67,8 @@
         }
         setShowUserUpload(config.showUserUpload);
         setShowSpeechInput(config.showSpeechInput);
-        setShowSpeechOutput(config.showSpeechOutput);
+        setShowSpeechOutputBrowser(config.showSpeechOutputBrowser);
+        setShowSpeechOutputAzure(config.showSpeechOutputAzure);
     });
 };

@@ -75,7 +77,7 @@
 }, []);

 useEffect(() => {
-    if (answer && showSpeechOutput) {
+    if (answer && showSpeechOutputAzure) {
         getSpeechApi(answer.choices[0].message.content).then(speechUrl => {
             setSpeechUrl(speechUrl);
         });

@@ -239,7 +241,8 @@
     onCitationClicked={x => onShowCitation(x)}
     onThoughtProcessClicked={() => onToggleTab(AnalysisPanelTabs.ThoughtProcessTab)}
     onSupportingContentClicked={() => onToggleTab(AnalysisPanelTabs.SupportingContentTab)}
-    showSpeechOutput={showSpeechOutput}
+    showSpeechOutputAzure={showSpeechOutputAzure}
+    showSpeechOutputBrowser={showSpeechOutputBrowser}
     speechUrl={speechUrl}
 />
 </div>