Commit 1e713fd

Jicheng Lu committed
integrate speech recog
1 parent b0cca21 commit 1e713fd

File tree

4 files changed: +129 −78 lines

src/lib/helpers/types/conversationTypes.js

Lines changed: 1 addition & 0 deletions
@@ -27,6 +27,7 @@
  * @property {string} status - The conversation status.
  * @property {Object} states - The conversation states.
  * @property {string[]} tags - The conversation tags.
+ * @property {boolean?} is_realtime_enabled - Whether the realtime feature is enabled.
  * @property {Date} updated_time - The conversation updated time.
  * @property {Date} created_time - The conversation created time.
  */
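
The new optional flag lets consumers pick a speech backend per conversation. A minimal sketch of the intended check, assuming `conversation` is a ConversationModel returned by getConversation() (this is exactly how chat-box.svelte branches on it below):

    // Hypothetical caller: branch on the new flag.
    if (conversation?.is_realtime_enabled) {
        realtimeChat.start(agentId, conversationId);   // server-side realtime channel
    } else {
        webSpeech.start({ continuous: true });         // in-browser Web Speech API
    }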

src/lib/services/conversation-service.js

Lines changed: 0 additions & 11 deletions
@@ -32,17 +32,6 @@ export async function getConversation(id, isLoadStates = false) {
     return response.data;
 }
 
-/**
- * Get conversation user
- * @param {string} id
- * @returns {Promise<import('$userTypes').UserModel>}
- */
-export async function getConversationUser(id) {
-    let url = replaceUrl(endpoints.conversationUserUrl, {conversationId: id});
-    const response = await axios.get(url);
-    return response.data;
-}
-
 /**
  * Get conversation list
  * @param {import('$conversationTypes').ConversationFilter} filter
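
With getConversationUser removed, callers read the user off the conversation itself. A minimal sketch of the replacement pattern, assuming ConversationModel carries a `user` field (chat-box.svelte below relies on this):

    // Before: conversationUser = await getConversationUser(conversationId);
    const conversation = await getConversation(conversationId, true);
    const conversationUser = conversation?.user;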

src/lib/services/web-speech.js

Lines changed: 96 additions & 55 deletions
@@ -1,75 +1,116 @@
 // // https://developer.mozilla.org/en-US/docs/Web/API/Web_Speech_API/Using_the_Web_Speech_API
+// @ts-ignore
 const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
-const SpeechRecognitionEvent = window.SpeechRecognitionEvent || window.webkitSpeechRecognitionEvent;
 
-const recognition = !navigator.userAgent.includes('Firefox') ? new SpeechRecognition() : null;
-if (recognition) {
-    recognition.continuous = false;
-    recognition.lang = "en-US";
-    recognition.interimResults = false;
-    recognition.maxAlternatives = 1;
-}
-
-
-const synth = window.speechSynthesis;
-
-const utterThis = new SpeechSynthesisUtterance();
-utterThis.pitch = 1;
-utterThis.rate = 1;
 
 export const SPEECH_VOICES = [
     "Microsoft Michelle Online (Natural) - English (United States)",
     "Google US English"
 ];
 
 export const webSpeech = {
-    /** @type {import('$conversationTypes').OnSpeechToTextDetected} */
+    /** @type {SpeechRecognition | null} */
+    recognition: null,
+
+    /** @type {import('$conversationTypes').OnSpeechToTextDetected} */
     onSpeechToTextDetected: () => {},
 
-    start() {
-        if (recognition) {
-            recognition.start();
-            console.log("Ready to receive a voice command.");
+    onRecognitionStarted: () => {},
+
+    onRecognitionEnded: () => {},
+
+    /** @param {{continuous?: boolean, lang?: string, interimResults?: boolean, maxAlternatives?: number}} options */
+    start(options = {
+        continuous: false,
+        lang: "en-US",
+        interimResults: false,
+        maxAlternatives: 1
+    }) {
+        this.recognition = !navigator.userAgent.includes('Firefox') ? new SpeechRecognition() : null;
+        if (this.recognition == null) return;
+
+        this.recognition.continuous = options.continuous || false;
+        this.recognition.lang = options.lang || "en-US";
+        this.recognition.interimResults = options.interimResults || false;
+        this.recognition.maxAlternatives = options.maxAlternatives || 1;
+
+        this.recognition.onstart = () => {
+            console.log('Recognition starts...');
+            this.onRecognitionStarted?.();
+        };
+
+        this.recognition.onresult = (/** @type {any} */ event) => {
+            const len = event.results.length;
+            const text = event.results[len - 1][0].transcript;
+            console.log(`Confidence: ${text} ${event.results[len - 1][0].confidence}`);
+            this.onSpeechToTextDetected?.(text);
+        };
+
+        this.recognition.onsoundstart = () => {
+            console.log('Recognition sound start...');
+        };
+
+        this.recognition.onaudiostart = () => {
+            console.log('Recognition audio start...');
+        };
+
+        this.recognition.onspeechstart = () => {
+            console.log('Recognition speech start...');
+        };
+
+        this.recognition.onnomatch = () => {
+            console.log("I didn't recognize the voice.");
+        };
+
+        this.recognition.onerror = (/** @type {any} */ event) => {
+            console.log(`Error occurred in recognition: ${event.error}`);
+        };
+
+        this.recognition.onend = () => {
+            console.log('Recognition is ended.');
+            this.onRecognitionEnded?.();
+        };
+
+        try {
+            this.recognition.start();
+        } catch (err) {
+            console.log('Error when starting speech recognition...');
+            setTimeout(() => {
+                this.recognition.start();
+            }, 500);
         }
     },
 
+    abort() {
+        if (this.recognition) {
+            this.recognition.abort();
+        }
+    }
+};
+
+export const webSpeaker = {
+    /** @type {SpeechSynthesisUtterance | null} */
+    utter: null,
+
+    synth: window.speechSynthesis,
+
     /** @param {string} transcript */
-    utter(transcript) {
-        setVoiceSynthesis();
-        utterThis.text = transcript
-        synth.speak(utterThis);
+    speak(transcript) {
+        this.utter = new SpeechSynthesisUtterance();
+        this.utter.pitch = 1;
+        this.utter.rate = 1;
+        this.utter.text = transcript;
+
+        // set voice
+        if (this.utter.voice == null) {
+            this.utter.voice = this.synth.getVoices().find(x => SPEECH_VOICES.includes(x.name)) || null;
+        }
+
+        this.synth.speak(this.utter);
     },
 
     stop() {
-        synth.cancel();
+        this.synth.cancel();
+        this.utter = null;
     }
-}
-
-function setVoiceSynthesis() {
-    if (utterThis.voice == null) {
-        const voices = synth.getVoices();
-        for (let i = 0; i < voices.length; i++) {
-            if (SPEECH_VOICES.includes(voices[i].name)) {
-                utterThis.voice = voices[i];
-                console.log(voices[i].name);
-                break;
-            }
-        }
-    }
-}
-
-if (recognition) {
-    recognition.onresult = (/** @type {any} */ event) => {
-        const text = event.results[0][0].transcript;
-        console.log(`Confidence: ${text} ${event.results[0][0].confidence}`);
-        webSpeech.onSpeechToTextDetected(text);
-    };
-
-    recognition.onnomatch = (/** @type {any} */ event) => {
-        console.log("I didn't recognize that color.");
-    };
-
-    recognition.onerror = (/** @type {any} */ event) => {
-        console.log(`Error occurred in recognition: ${event.error}`);
-    };
-}
+};
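
The rewritten module exposes two stateful objects: webSpeech owns a per-session SpeechRecognition instance, and webSpeaker owns speech synthesis. A minimal usage sketch, assuming a browser that supports the Web Speech API (the user-agent check leaves recognition null on Firefox, so start() becomes a no-op there):

    import { webSpeech, webSpeaker } from '$lib/services/web-speech';

    // Wire callbacks before starting a recognition session.
    webSpeech.onSpeechToTextDetected = (text) => console.log('heard:', text);
    webSpeech.onRecognitionStarted = () => console.log('listening...');
    webSpeech.onRecognitionEnded = () => console.log('stopped.');

    webSpeech.start({ continuous: true, lang: 'en-US' });
    // ...later, tear the session down:
    webSpeech.abort();

    // Speak a reply with the first matching voice from SPEECH_VOICES.
    webSpeaker.speak('Hello there!');
    webSpeaker.stop();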

src/routes/chat/[agentId]/[conversationId]/chat-box.svelte

Lines changed: 32 additions & 12 deletions
@@ -31,7 +31,6 @@
     updateConversationMessage,
     updateConversationTags,
     getConversationFiles,
-    getConversationUser,
     uploadConversationFiles,
     getAddressOptions,
     pinConversationToDashboard,
@@ -71,6 +70,7 @@
 import InstantLog from './instant-log/instant-log.svelte';
 import LocalStorageManager from '$lib/helpers/utils/storage-manager';
 import { realtimeChat } from '$lib/services/realtime-chat-service';
+import { webSpeech } from '$lib/services/web-speech';
 
 
 const options = {
@@ -132,7 +132,6 @@
 
 /** @type {any[]} */
 let scrollbars = [];
-let microphoneIcon = "microphone-off";
 
 /** @type {import('$conversationTypes').ConversationModel} */
 let conversation;
@@ -222,7 +221,7 @@
     disableSpeech = navigator.userAgent.includes('Firefox');
     conversation = await getConversation(params.conversationId, true);
     dialogs = await getDialogs(params.conversationId, dialogCount);
-    conversationUser = await getConversationUser(params.conversationId);
+    conversationUser = conversation?.user;
     selectedTags = conversation?.tags || [];
     latestStateLog = conversation?.states;
     initUserSentMessages(dialogs);
@@ -669,17 +668,38 @@
     }
 }
 
-async function startListen() {
+function startListen() {
     if (disableSpeech) return;
 
-    if (!isListening) {
-        realtimeChat.start(params.agentId, params.conversationId);
-        isListening = true;
-        microphoneIcon = "microphone";
+    isListening = !isListening;
+    if (conversation?.is_realtime_enabled) {
+        if (isListening) {
+            realtimeChat.start(params.agentId, params.conversationId);
+        } else {
+            realtimeChat.stop();
+        }
     } else {
-        realtimeChat.stop();
-        isListening = false;
-        microphoneIcon = "microphone-off";
+        webSpeech.onSpeechToTextDetected = (transcript) => {
+            if (!!!_.trim(transcript) || isSendingMsg) {
+                return;
+            }
+
+            sendChatMessage(transcript);
+        };
+        webSpeech.onRecognitionStarted = () => {
+            isListening = true;
+        };
+        webSpeech.onRecognitionEnded = () => {
+            isListening = false;
+        };
+
+        if (isListening) {
+            webSpeech.start({ continuous: true });
+        } else {
+            webSpeech.abort();
+        }
     }
 }
@@ -1819,7 +1839,7 @@
     disabled={isSendingMsg || isThinking || disableAction}
     on:click={() => startListen()}
 >
-    <i class="mdi mdi-{microphoneIcon} md-36" />
+    <i class="mdi mdi-{isListening ? 'microphone' : 'microphone-off'} md-36" />
 </button>
 {/if}
 </div>
