Commit 1e713fd

Jicheng Lu committed
integrate speech recog
1 parent b0cca21 commit 1e713fd

File tree

4 files changed: +129 −78 lines

src/lib/helpers/types/conversationTypes.js

Lines changed: 1 addition & 0 deletions
@@ -27,6 +27,7 @@
  * @property {string} status - The conversation status.
  * @property {Object} states - The conversation states.
  * @property {string[]} tags - The conversation tags.
+ * @property {boolean?} is_realtime_enabled - Whether the realtime feature is enabled.
  * @property {Date} updated_time - The conversation updated time.
  * @property {Date} created_time - The conversation created time.
  */
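
The new optional flag lets consumers pick a speech backend per conversation. A minimal sketch of the intended check, assuming `conversation` is a ConversationModel returned by getConversation() (this is exactly how chat-box.svelte branches on it below):

    // Hypothetical caller: branch on the new flag.
    if (conversation?.is_realtime_enabled) {
        realtimeChat.start(agentId, conversationId);   // server-side realtime channel
    } else {
        webSpeech.start({ continuous: true });         // in-browser Web Speech API
    }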

src/lib/services/conversation-service.js

Lines changed: 0 additions & 11 deletions
@@ -32,17 +32,6 @@ export async function getConversation(id, isLoadStates = false) {
     return response.data;
 }
 
-/**
- * Get conversation user
- * @param {string} id
- * @returns {Promise<import('$userTypes').UserModel>}
- */
-export async function getConversationUser(id) {
-    let url = replaceUrl(endpoints.conversationUserUrl, {conversationId: id});
-    const response = await axios.get(url);
-    return response.data;
-}
-
 /**
  * Get conversation list
  * @param {import('$conversationTypes').ConversationFilter} filter
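
With getConversationUser removed, callers read the user off the conversation itself. A minimal sketch of the replacement pattern, assuming ConversationModel carries a `user` field (chat-box.svelte below relies on this):

    // Before: conversationUser = await getConversationUser(conversationId);
    const conversation = await getConversation(conversationId, true);
    const conversationUser = conversation?.user;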

src/lib/services/web-speech.js

Lines changed: 96 additions & 55 deletions
@@ -1,75 +1,116 @@
 // // https://developer.mozilla.org/en-US/docs/Web/API/Web_Speech_API/Using_the_Web_Speech_API
+// @ts-ignore
 const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
-const SpeechRecognitionEvent = window.SpeechRecognitionEvent || window.webkitSpeechRecognitionEvent;
 
-const recognition = !navigator.userAgent.includes('Firefox') ? new SpeechRecognition() : null;
-if (recognition) {
-    recognition.continuous = false;
-    recognition.lang = "en-US";
-    recognition.interimResults = false;
-    recognition.maxAlternatives = 1;
-}
-
-
-const synth = window.speechSynthesis;
-
-const utterThis = new SpeechSynthesisUtterance();
-utterThis.pitch = 1;
-utterThis.rate = 1;
 
 export const SPEECH_VOICES = [
     "Microsoft Michelle Online (Natural) - English (United States)",
     "Google US English"
 ];
 
 export const webSpeech = {
-    /** @type {import('$conversationTypes').OnSpeechToTextDetected} */
+    /** @type {SpeechRecognition | null} */
+    recognition: null,
+
+    /** @type {import('$conversationTypes').OnSpeechToTextDetected} */
     onSpeechToTextDetected: () => {},
 
-    start() {
-        if (recognition) {
-            recognition.start();
-            console.log("Ready to receive a voice command.");
+    onRecognitionStarted: () => {},
+
+    onRecognitionEnded: () => {},
+
+    /** @param {{continuous?: boolean, lang?: string, interimResults?: boolean, maxAlternatives?: number}} options */
+    start(options = {
+        continuous: false,
+        lang: "en-US",
+        interimResults: false,
+        maxAlternatives: 1
+    }) {
+        this.recognition = !navigator.userAgent.includes('Firefox') ? new SpeechRecognition() : null;
+        if (this.recognition == null) return;
+
+        this.recognition.continuous = options.continuous || false;
+        this.recognition.lang = options.lang || "en-US";
+        this.recognition.interimResults = options.interimResults || false;
+        this.recognition.maxAlternatives = options.maxAlternatives || 1;
+
+        this.recognition.onstart = () => {
+            console.log('Recognition starts...');
+            this.onRecognitionStarted?.();
+        };
+
+        this.recognition.onresult = (/** @type {any} */ event) => {
+            const len = event.results.length;
+            const text = event.results[len - 1][0].transcript;
+            console.log(`Confidence: ${text} ${event.results[len - 1][0].confidence}`);
+            this.onSpeechToTextDetected?.(text);
+        };
+
+        this.recognition.onsoundstart = () => {
+            console.log('Recognition sound start...');
+        };
+
+        this.recognition.onaudiostart = () => {
+            console.log('Recognition audio start...');
+        };
+
+        this.recognition.onspeechstart = () => {
+            console.log('Recognition speech start...');
+        };
+
+        this.recognition.onnomatch = () => {
+            console.log("I didn't recognize the voice.");
+        };
+
+        this.recognition.onerror = (/** @type {any} */ event) => {
+            console.log(`Error occurred in recognition: ${event.error}`);
+        };
+
+        this.recognition.onend = () => {
+            console.log('Recognition is ended.');
+            this.onRecognitionEnded?.();
+        };
+
+        try {
+            this.recognition.start();
+        } catch (err) {
+            console.log('Error when starting speech recognition...');
+            setTimeout(() => {
+                this.recognition.start();
+            }, 500);
         }
     },
 
+    abort() {
+        if (this.recognition) {
+            this.recognition.abort();
+        }
+    }
+};
+
+export const webSpeaker = {
+    /** @type {SpeechSynthesisUtterance | null} */
+    utter: null,
+
+    synth: window.speechSynthesis,
+
     /** @param {string} transcript */
-    utter(transcript) {
-        setVoiceSynthesis();
-        utterThis.text = transcript
-        synth.speak(utterThis);
+    speak(transcript) {
+        this.utter = new SpeechSynthesisUtterance();
+        this.utter.pitch = 1;
+        this.utter.rate = 1;
+        this.utter.text = transcript;
+
+        // set voice
+        if (this.utter.voice == null) {
+            this.utter.voice = this.synth.getVoices().find(x => SPEECH_VOICES.includes(x.name)) || null;
+        }
+
+        this.synth.speak(this.utter);
     },
 
     stop() {
-        synth.cancel();
+        this.synth.cancel();
+        this.utter = null;
     }
-}
-
-function setVoiceSynthesis() {
-    if (utterThis.voice == null) {
-        const voices = synth.getVoices();
-        for (let i = 0; i < voices.length; i++) {
-            if (SPEECH_VOICES.includes(voices[i].name)) {
-                utterThis.voice = voices[i];
-                console.log(voices[i].name);
-                break;
-            }
-        }
-    }
-}
-
-if (recognition) {
-    recognition.onresult = (/** @type {any} */ event) => {
-        const text = event.results[0][0].transcript;
-        console.log(`Confidence: ${text} ${event.results[0][0].confidence}`);
-        webSpeech.onSpeechToTextDetected(text);
-    };
-
-    recognition.onnomatch = (/** @type {any} */ event) => {
-        console.log("I didn't recognize that color.");
-    };
-
-    recognition.onerror = (/** @type {any} */ event) => {
-        console.log(`Error occurred in recognition: ${event.error}`);
-    };
-}
+};
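
The rewritten module exposes two stateful objects: webSpeech owns a per-session SpeechRecognition instance, and webSpeaker owns speech synthesis. A minimal usage sketch, assuming a browser that supports the Web Speech API (the user-agent check leaves recognition null on Firefox, so start() becomes a no-op there):

    import { webSpeech, webSpeaker } from '$lib/services/web-speech';

    // Wire callbacks before starting a recognition session.
    webSpeech.onSpeechToTextDetected = (text) => console.log('heard:', text);
    webSpeech.onRecognitionStarted = () => console.log('listening...');
    webSpeech.onRecognitionEnded = () => console.log('stopped.');

    webSpeech.start({ continuous: true, lang: 'en-US' });
    // ...later, tear the session down:
    webSpeech.abort();

    // Speak a reply with the first matching voice from SPEECH_VOICES.
    webSpeaker.speak('Hello there!');
    webSpeaker.stop();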

src/routes/chat/[agentId]/[conversationId]/chat-box.svelte

Lines changed: 32 additions & 12 deletions
@@ -31,7 +31,6 @@
     updateConversationMessage,
     updateConversationTags,
     getConversationFiles,
-    getConversationUser,
     uploadConversationFiles,
     getAddressOptions,
     pinConversationToDashboard,
@@ -71,6 +70,7 @@
 import InstantLog from './instant-log/instant-log.svelte';
 import LocalStorageManager from '$lib/helpers/utils/storage-manager';
 import { realtimeChat } from '$lib/services/realtime-chat-service';
+import { webSpeech } from '$lib/services/web-speech';
 
 
 const options = {
@@ -132,7 +132,6 @@
 
 /** @type {any[]} */
 let scrollbars = [];
-let microphoneIcon = "microphone-off";
 
 /** @type {import('$conversationTypes').ConversationModel} */
 let conversation;
@@ -222,7 +221,7 @@
     disableSpeech = navigator.userAgent.includes('Firefox');
     conversation = await getConversation(params.conversationId, true);
     dialogs = await getDialogs(params.conversationId, dialogCount);
-    conversationUser = await getConversationUser(params.conversationId);
+    conversationUser = conversation?.user;
     selectedTags = conversation?.tags || [];
     latestStateLog = conversation?.states;
     initUserSentMessages(dialogs);
@@ -669,17 +668,38 @@
     }
 }
 
-async function startListen() {
+function startListen() {
     if (disableSpeech) return;
 
-    if (!isListening) {
-        realtimeChat.start(params.agentId, params.conversationId);
-        isListening = true;
-        microphoneIcon = "microphone";
+    isListening = !isListening;
+    if (conversation?.is_realtime_enabled) {
+        if (isListening) {
+            realtimeChat.start(params.agentId, params.conversationId);
+        } else {
+            realtimeChat.stop();
+        }
     } else {
-        realtimeChat.stop();
-        isListening = false;
-        microphoneIcon = "microphone-off";
+        webSpeech.onSpeechToTextDetected = (transcript) => {
+            if (!!!_.trim(transcript) || isSendingMsg) {
+                return;
+            }
+
+            sendChatMessage(transcript);
+        };
+        webSpeech.onRecognitionStarted = () => {
+            isListening = true;
+        };
+        webSpeech.onRecognitionEnded = () => {
+            isListening = false;
+        };
+
+        if (isListening) {
+            webSpeech.start({ continuous: true });
+        } else {
+            webSpeech.abort();
+        }
     }
 }
@@ -1819,7 +1839,7 @@
     disabled={isSendingMsg || isThinking || disableAction}
     on:click={() => startListen()}
 >
-    <i class="mdi mdi-{microphoneIcon} md-36" />
+    <i class="mdi mdi-{isListening ? 'microphone' : 'microphone-off'} md-36" />
 </button>
 {/if}
 </div>
