Commit 77a5ace

BOT-2749 added Azure Speech

Author: Florian Treml
1 parent 14a8883 · commit 77a5ace

File tree

12 files changed: +321 -25 lines changed

connectors/ws/simple.js

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ const main = async () => {
   ws.on('message', (data) => {
     try {
       const dj = JSON.parse(data)
-      if (dj.final) console.log('received: %s', dj.text)
+      if (dj.final) console.log('received %s-%s: %s ', dj.start, dj.end, dj.text)
     } catch (err) {
     }
   })
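For engines that report timestamps (the new Azure engine below emits start and end offsets in seconds), a final websocket message now looks roughly like this, with illustrative values:

  { "text": "good morning", "final": true, "start": 0.52, "end": 1.87 }

which the updated client prints as: received 0.52-1.87: good morning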

frontend/package.json

Lines changed: 1 addition & 0 deletions
@@ -26,6 +26,7 @@
     "express-winston": "^4.2.0",
     "ibm-watson": "^6.2.1",
     "lodash": "^4.17.21",
+    "microsoft-cognitiveservices-speech-sdk": "^1.19.0",
     "mkdirp": "^1.0.4",
     "multer": "^1.4.3",
     "mustache": "^4.2.0",

frontend/sample.wav

-103 KB · Binary file not shown.

frontend/src/routes.js

Lines changed: 22 additions & 13 deletions
@@ -37,13 +37,15 @@ if (tmpPath) {
 const ttsEngines = {
   google: new (require('./tts/google'))(),
   ibm: new (require('./tts/ibm'))(),
+  azure: new (require('./tts/azure'))(),
   marytts: new (require('./tts/marytts'))(),
   picotts: new (require('./tts/picotts'))()
 }
 const sttEngines = {
   google: new (require('./stt/google'))(),
   kaldi: new (require('./stt/kaldi'))(),
-  ibm: new (require('./stt/ibm'))()
+  ibm: new (require('./stt/ibm'))(),
+  azure: new (require('./stt/azure'))()
 }
 
 const multerMemoryStorage = multer.memoryStorage()
@@ -121,7 +123,7 @@ const router = express.Router()
  *         required: false
  *         schema:
  *           type: string
- *           enum: [kaldi, google, ibm]
+ *           enum: [kaldi, google, ibm, azure]
  *     responses:
  *       200:
  *         description: List of supported STT languages
@@ -156,7 +158,13 @@ const router = express.Router()
  *         schema:
  *           type: string
  *       - name: hint
- *         description: Hint text for calculating the Levenshtein edit distance for the result text (word error rate)
+ *         description: Hint text for the Speech-to-text backend (supported by google and azure)
+ *         in: query
+ *         required: false
+ *         schema:
+ *           type: string
+ *       - name: wer
+ *         description: Text for calculating the Levenshtein edit distance for the result text (word error rate)
  *         in: query
  *         required: false
  *         schema:
@@ -167,9 +175,9 @@ const router = express.Router()
  *         required: false
  *         schema:
  *           type: string
- *           enum: [kaldi, google, ibm]
+ *           enum: [kaldi, google, ibm, azure]
  *       - name: cache
- *         description: Disable result cache
+ *         description: Use result cache (default Y)
  *         in: query
  *         required: false
  *         schema:
@@ -227,10 +235,11 @@ router.post('/api/stt/:language', async (req, res, next) => {
 
     const result = await stt.stt(req, {
       language: req.params.language,
-      buffer: buffer
+      buffer: buffer,
+      hint: req.query.hint
     })
-    if (req.query.hint) {
-      result.wer = await wer(req.query.hint, result.text)
+    if (req.query.wer) {
+      result.wer = await wer(req.query.wer, result.text)
     }
     res.json(result).end()
 
@@ -266,7 +275,7 @@ router.post('/api/stt/:language', async (req, res, next) => {
  *         required: false
  *         schema:
  *           type: string
- *           enum: [google, ibm, marytts, picotts]
+ *           enum: [google, ibm, azure, marytts, picotts]
  *     responses:
  *       200:
 *         description: List of supported voices
@@ -307,7 +316,7 @@ router.post('/api/stt/:language', async (req, res, next) => {
  *         required: false
  *         schema:
  *           type: string
- *           enum: [google, ibm, marytts, picotts]
+ *           enum: [google, ibm, azure, marytts, picotts]
  *     responses:
  *       200:
 *         description: List of supported TTS languages
@@ -359,9 +368,9 @@ router.post('/api/stt/:language', async (req, res, next) => {
  *         required: false
  *         schema:
  *           type: string
- *           enum: [google, ibm, marytts, picotts]
+ *           enum: [google, ibm, azure, marytts, picotts]
  *       - name: cache
- *         description: Disable result cache
+ *         description: Use result cache (default Y)
  *         in: query
  *         required: false
  *         schema:
@@ -692,7 +701,7 @@ const wssStreams = {}
  *         required: false
  *         schema:
  *           type: string
- *           enum: [kaldi, google, ibm]
+ *           enum: [kaldi, google, ibm, azure]
  *     responses:
  *       200:
 *         description: Websocket Url to stream the audio to, and the uri to check status and end the stream
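With hint and wer now separated, a client passes the recognizer hint and the word-error-rate reference text independently. A minimal sketch using Node.js with axios (an assumed HTTP client, not part of this repo), posting raw WAV audio as the Swagger definition describes; host, port and file name are placeholders:

  const axios = require('axios')
  const fs = require('fs')

  const main = async () => {
    // hint biases the STT engine (google/azure), wer is only used for the edit distance
    const audio = fs.readFileSync('sample.wav')
    const url = 'http://127.0.0.1:56000/api/stt/en-US?stt=azure&hint=Botium&wer=hello%20Botium'
    const { data } = await axios.post(url, audio, { headers: { 'Content-Type': 'audio/wav' } })
    console.log(data.text, data.wer)
  }
  main().catch(console.error)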

frontend/src/stt/azure.js

Lines changed: 146 additions & 0 deletions
@@ -0,0 +1,146 @@
+const _ = require('lodash')
+const request = require('request-promise-native')
+const cheerio = require('cheerio')
+const EventEmitter = require('events')
+const { ResultReason, AudioInputStream, AudioStreamFormat, AudioConfig, SpeechRecognizer, PhraseListGrammar, OutputFormat } = require('microsoft-cognitiveservices-speech-sdk')
+const debug = require('debug')('botium-speech-processing-azure')
+
+const { azureSpeechConfig, applyExtraAzureSpeechConfig, getAzureErrorDetails } = require('../utils')
+
+const AZURE_STT_LANGUAGES_URL = 'https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support'
+const downloadLanguageCodes = async () => {
+  debug(`Downloading language codes from ${AZURE_STT_LANGUAGES_URL}`)
+  const htmlString = await request(AZURE_STT_LANGUAGES_URL)
+  const $ = cheerio.load(htmlString)
+
+  const languageCodes = []
+  $('table:first-of-type tbody tr').each(function () {
+    const tds = $(this).find('td')
+    const languageCode = $(tds[1]).text().trim()
+    if (languageCode) {
+      languageCodes.push(languageCode)
+    }
+  })
+  return languageCodes
+}
+
+let languageCodes = null
+
+class AzureSTT {
+  async languages (req) {
+    if (!languageCodes) {
+      languageCodes = _.uniq(await downloadLanguageCodes()).sort()
+    }
+    return languageCodes
+  }
+
+  async stt_OpenStream (req, { language }) {
+    const speechConfig = azureSpeechConfig(req)
+
+    speechConfig.outputFormat = OutputFormat.Detailed
+    if (language) speechConfig.speechRecognitionLanguage = language
+
+    applyExtraAzureSpeechConfig(speechConfig, req)
+
+    let audioFormat = AudioStreamFormat.getDefaultInputFormat()
+    const extraAzureFormatConfig = _.get(req, 'body.azure.config.audioStreamFormat')
+    if (extraAzureFormatConfig) {
+      audioFormat = AudioStreamFormat.getWaveFormatPCM(extraAzureFormatConfig.samplesPerSecond || 16000, extraAzureFormatConfig.bitsPerSample || 16, extraAzureFormatConfig.channels || 1)
+    }
+    const pushStream = AudioInputStream.createPushStream(audioFormat)
+    const audioConfig = AudioConfig.fromStreamInput(pushStream)
+    const recognizer = new SpeechRecognizer(speechConfig, audioConfig)
+
+    const events = new EventEmitter()
+
+    const recognizedHandler = (s, e) => {
+      if (e.result.reason === ResultReason.RecognizedSpeech || e.result.reason === ResultReason.RecognizingSpeech) {
+        const event = {
+          text: e.result.text,
+          final: e.result.reason === ResultReason.RecognizedSpeech,
+          debug: e.result
+        }
+        event.start = _.round(e.result.offset / 10000000, 3)
+        event.end = _.round((e.result.offset + e.result.duration) / 10000000, 3)
+        events.emit('data', event)
+      }
+    }
+    recognizer.recognizing = recognizedHandler
+    recognizer.recognized = recognizedHandler
+    recognizer.sessionStopped = (s, e) => {
+      recognizer.stopContinuousRecognitionAsync()
+      events.emit('close')
+    }
+    recognizer.startContinuousRecognitionAsync()
+
+    return new Promise((resolve, reject) => {
+      recognizer.canceled = (s, e) => {
+        recognizer.stopContinuousRecognitionAsync()
+        reject(new Error(`Azure STT failed: ${getAzureErrorDetails(e)}`))
+      }
+      recognizer.sessionStarted = (s, e) => {
+        resolve({
+          events,
+          write: (buffer) => {
+            pushStream.write(buffer)
+          },
+          end: () => {
+          },
+          close: () => {
+            recognizer.stopContinuousRecognitionAsync()
+            pushStream.close()
+          }
+        })
+      }
+    })
+  }
+
+  async stt (req, { language, buffer, hint }) {
+    const speechConfig = azureSpeechConfig(req)
+
+    speechConfig.outputFormat = OutputFormat.Detailed
+    if (language) speechConfig.speechRecognitionLanguage = language
+
+    applyExtraAzureSpeechConfig(speechConfig, req)
+
+    let audioFormat = AudioStreamFormat.getDefaultInputFormat()
+    const extraAzureFormatConfig = _.get(req, 'body.azure.config.audioStreamFormat')
+    if (extraAzureFormatConfig) {
+      audioFormat = AudioStreamFormat.getWaveFormatPCM(extraAzureFormatConfig.samplesPerSecond || 16000, extraAzureFormatConfig.bitsPerSample || 16, extraAzureFormatConfig.channels || 1)
+    }
+
+    const pushStream = AudioInputStream.createPushStream(audioFormat)
+    pushStream.write(buffer)
+    pushStream.close()
+
+    return new Promise((resolve, reject) => {
+      const audioConfig = AudioConfig.fromStreamInput(pushStream)
+      const recognizer = new SpeechRecognizer(speechConfig, audioConfig)
+
+      if (hint && hint.length > 0) {
+        const phraseList = PhraseListGrammar.fromRecognizer(recognizer)
+        phraseList.addPhrase(hint)
+      }
+
+      recognizer.recognizeOnceAsync(
+        result => {
+          if (result.errorDetails) {
+            reject(new Error(`Azure STT failed: ${getAzureErrorDetails(result)}`))
+          } else {
+            resolve({
+              text: result.text || '',
+              debug: result
+            })
+          }
+          recognizer.close()
+        },
+        error => {
+          debug(error)
+          recognizer.close()
+          reject(new Error(`Azure STT failed: ${error}`))
+        })
    })
  }
}

module.exports = AzureSTT
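Callers can override the default PCM input format through the request body read by the `_.get(req, 'body.azure.config.audioStreamFormat')` lookup above. A sketch of such a body, with illustrative values (8 kHz mono, 16-bit PCM):

  // illustrative request body for a non-default input format
  const body = {
    azure: {
      config: {
        audioStreamFormat: {
          samplesPerSecond: 8000,
          bitsPerSample: 16,
          channels: 1
        }
      }
    }
  }

Omitted fields fall back to 16000 samples/second, 16 bits and 1 channel, as coded in both stt methods.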

frontend/src/stt/google.js

Lines changed: 6 additions & 1 deletion
@@ -117,7 +117,7 @@ class GoogleSTT {
     }
   }
 
-  async stt (req, { language, buffer }) {
+  async stt (req, { language, buffer, hint }) {
     const speechClient = new speech.SpeechClient(googleOptions(req))
     const storageClient = new storage.Storage(googleOptions(req))
 
@@ -128,6 +128,11 @@ class GoogleSTT {
       audio: {
       }
     }
+    if (hint && hint.length > 0) {
+      request.config.speechContexts = [{
+        phrases: [hint]
+      }]
+    }
     if (process.env.BOTIUM_SPEECH_GOOGLE_CONFIG) {
       try {
         const defaultConfig = JSON.parse(process.env.BOTIUM_SPEECH_GOOGLE_CONFIG)
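For reference, with a hint the recognition request assembled by GoogleSTT.stt ends up carrying a speech context (speechContexts is a repeated field in Google's RecognitionConfig, hence the array). A rough sketch with illustrative values, assuming languageCode is derived from the :language path parameter:

  const request = {
    config: {
      languageCode: 'en-US', // assumed mapping from the :language path parameter
      speechContexts: [{ phrases: ['Botium'] }] // phrase biasing from the hint query parameter
    },
    audio: {} // filled with the uploaded audio content
  }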

frontend/src/stt/kaldi.js

Lines changed: 0 additions & 1 deletion
@@ -47,7 +47,6 @@ class KaldiSTT {
     ws.on('message', (data) => {
       try {
         const dj = JSON.parse(data)
-        console.log(dj)
         const hypotheses = dj.result && dj.result.hypotheses && dj.result.hypotheses[0]
         if (hypotheses && hypotheses.transcript) {
           const event = {

frontend/src/swagger.json

Lines changed: 22 additions & 7 deletions
@@ -2,7 +2,7 @@
   "openapi": "3.0.0",
   "info": {
     "title": "Botium Speech Processing API",
-    "version": "1.1.1",
+    "version": "1.2.1",
     "description": "Botium Speech Processing API"
   },
   "basePath": "/",
@@ -58,7 +58,8 @@
           "enum": [
             "kaldi",
             "google",
-            "ibm"
+            "ibm",
+            "azure"
           ]
         }
       }
@@ -99,7 +100,16 @@
       },
       {
         "name": "hint",
-        "description": "Hint text for calculating the Levenshtein edit distance for the result text (word error rate)",
+        "description": "Hint text for the Speech-to-text backend (supported by google and azure)",
+        "in": "query",
+        "required": false,
+        "schema": {
+          "type": "string"
+        }
+      },
+      {
+        "name": "wer",
+        "description": "Text for calculating the Levenshtein edit distance for the result text (word error rate)",
         "in": "query",
         "required": false,
         "schema": {
@@ -116,13 +126,14 @@
           "enum": [
             "kaldi",
             "google",
-            "ibm"
+            "ibm",
+            "azure"
           ]
         }
       },
       {
         "name": "cache",
-        "description": "Disable result cache",
+        "description": "Use result cache (default Y)",
         "in": "query",
         "required": false,
         "schema": {
@@ -181,6 +192,7 @@
           "enum": [
             "google",
             "ibm",
+            "azure",
             "marytts",
             "picotts"
           ]
@@ -237,6 +249,7 @@
           "enum": [
             "google",
             "ibm",
+            "azure",
             "marytts",
             "picotts"
           ]
@@ -305,14 +318,15 @@
           "enum": [
             "google",
             "ibm",
+            "azure",
             "marytts",
             "picotts"
           ]
         }
       },
       {
         "name": "cache",
-        "description": "Disable result cache",
+        "description": "Use result cache (default Y)",
         "in": "query",
         "required": false,
         "schema": {
@@ -587,7 +601,8 @@
           "enum": [
             "kaldi",
             "google",
-            "ibm"
+            "ibm",
+            "azure"
           ]
         }
       }
