Skip to content

Commit 99c0072

Browse files
author
Florian Treml
committed
Added support for AWS Transcribe Streaming
1 parent 96f47b5 commit 99c0072

File tree

5 files changed

+163
-7
lines changed

5 files changed

+163
-7
lines changed

frontend/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
"license": "MIT",
1515
"dependencies": {
1616
"@aws-sdk/client-polly": "^3.47.1",
17+
"@aws-sdk/client-transcribe-streaming": "^3.47.1",
1718
"@google-cloud/speech": "^4.10.0",
1819
"@google-cloud/storage": "^5.18.0",
1920
"@google-cloud/text-to-speech": "^3.4.0",

frontend/src/routes.js

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,8 @@ const sttEngines = {
4646
google: new (require('./stt/google'))(),
4747
kaldi: new (require('./stt/kaldi'))(),
4848
ibm: new (require('./stt/ibm'))(),
49-
azure: new (require('./stt/azure'))()
49+
azure: new (require('./stt/azure'))(),
50+
awstranscribe: new (require('./stt/awstranscribe'))()
5051
}
5152

5253
const multerMemoryStorage = multer.memoryStorage()
@@ -124,7 +125,7 @@ const router = express.Router()
124125
* required: false
125126
* schema:
126127
* type: string
127-
* enum: [kaldi, google, ibm, azure]
128+
* enum: [kaldi, google, ibm, azure, awstranscribe]
128129
* responses:
129130
* 200:
130131
* description: List of supported STT languages
@@ -176,7 +177,7 @@ const router = express.Router()
176177
* required: false
177178
* schema:
178179
* type: string
179-
* enum: [kaldi, google, ibm, azure]
180+
* enum: [kaldi, google, ibm, azure, awstranscribe]
180181
* - name: cache
181182
* description: Use result cache (default Y)
182183
* in: query
@@ -705,7 +706,7 @@ const wssStreams = {}
705706
* required: false
706707
* schema:
707708
* type: string
708-
* enum: [kaldi, google, ibm, azure]
709+
* enum: [kaldi, google, ibm, azure, awstranscribe]
709710
* responses:
710711
* 200:
711712
* description: Websocket Url to stream the audio to, and the uri to check status and end the stream

frontend/src/stt/awstranscribe.js

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
const _ = require('lodash')
2+
const { TranscribeStreamingClient, StartStreamTranscriptionCommand } = require('@aws-sdk/client-transcribe-streaming')
3+
const { PassThrough } = require('stream')
4+
const EventEmitter = require('events')
5+
6+
const debug = require('debug')('botium-speech-processing-awstranscribe-stt')
7+
8+
const { awstranscribeOptions } = require('../utils')
9+
10+
// Language codes returned by AwsTranscribeSTT#languages. Entries are grouped
// by language for readability; the array is sorted once when the module loads.
const languageCodes = [
  'af-ZA',
  'ar-AE', 'ar-SA',
  'zh-CN', 'zh-TW',
  'da-DK',
  'nl-NL',
  'en-AU', 'en-GB', 'en-IN', 'en-IE', 'en-NZ', 'en-AB', 'en-ZA', 'en-US', 'en-WL',
  'fr-FR', 'fr-CA',
  'fa-IR',
  'de-DE', 'de-CH',
  'he-IL',
  'hi-IN',
  'id-ID',
  'it-IT',
  'ja-JP',
  'ko-KR',
  'ms-MY',
  'pt-PT', 'pt-BR',
  'ru-RU',
  'es-ES', 'es-US',
  'ta-IN', 'te-IN',
  'th-TH',
  'tr-TR'
].sort()
49+
50+
/**
 * Speech-to-text engine backed by AWS Transcribe Streaming.
 */
class AwsTranscribeSTT {
  /**
   * Lists the language codes offered by this engine.
   *
   * @param {object} req - incoming request (not used here)
   * @returns {Promise<string[]>} the module-level `languageCodes` array
   */
  async languages (req) {
    return languageCodes
  }

  /**
   * Opens a streaming transcription session.
   *
   * Audio written to the returned handle is forwarded to AWS Transcribe as
   * 16 kHz PCM; transcription results are emitted on `events`.
   *
   * @param {object} req - incoming request; credentials are resolved by
   *   `awstranscribeOptions(req)`, and `req.body.awstranscribe.config` (if
   *   present) is merged over the default request parameters
   * @param {object} options
   * @param {string} options.language - value sent as `LanguageCode`
   * @returns {Promise<{events: EventEmitter, write: Function, end: Function, close: Function}>}
   *   `events` emits `'data'` with `{ text, final, start, end, debug }` for each
   *   result (or `{ err }` when reading results fails) and `'close'` once the
   *   result stream is finished; `write(buffer)` feeds audio, `end()` signals
   *   end of audio, `close()` tears the audio input down
   * @throws {Error} when the credentials are missing or starting the
   *   transcription stream fails
   */
  async stt_OpenStream (req, { language }) {
    const transcribeClient = new TranscribeStreamingClient(awstranscribeOptions(req))

    // Kept as a constant: the generator below reads it lazily, so it must never
    // turn into null underneath it. "Closed" is tracked via the stream's own state.
    const audioInputStream = new PassThrough()
    const maxAudioChunkBytes = 25000

    const audioStream = async function * () {
      for await (const payloadChunk of audioInputStream) {
        // Slice large payloads into bounded audio events without copying byte by byte
        for (let offset = 0; offset < payloadChunk.length; offset += maxAudioChunkBytes) {
          yield { AudioEvent: { AudioChunk: payloadChunk.subarray(offset, offset + maxAudioChunkBytes) } }
        }
      }
    }

    const request = {
      LanguageCode: language,
      MediaEncoding: 'pcm',
      MediaSampleRateHertz: 16000,
      AudioStream: audioStream()
    }
    if (req.body && req.body.awstranscribe && req.body.awstranscribe.config) {
      Object.assign(request, req.body.awstranscribe.config)
    }

    const events = new EventEmitter()

    let cmdResponse
    try {
      cmdResponse = await transcribeClient.send(new StartStreamTranscriptionCommand(request))
    } catch (err) {
      debug(err)
      // Nothing will ever consume the audio now, so release what was allocated
      audioInputStream.destroy()
      transcribeClient.destroy()
      throw new Error(`AWS Transcribe STT streaming failed: ${err.message}`)
    }

    // Deferred to a later tick - presumably so that the caller can attach its
    // listeners to `events` before the first result is emitted.
    setTimeout(async () => {
      try {
        for await (const streamEvent of cmdResponse.TranscriptResultStream) {
          const results = _.get(streamEvent, 'TranscriptEvent.Transcript.Results') || []
          for (const result of results) {
            const alternatives = result.Alternatives
            // A result without any alternative carries no text; skip it instead
            // of failing the whole stream on `Alternatives[0]`
            if (!alternatives || alternatives.length === 0) continue

            events.emit('data', {
              text: alternatives[0].Transcript,
              final: !result.IsPartial,
              start: result.StartTime,
              end: result.EndTime,
              debug: result
            })
          }
        }
      } catch (err) {
        events.emit('data', {
          err: `${err.message}`
        })
      } finally {
        // The session is over either way; release the client's connections
        transcribeClient.destroy()
      }
      events.emit('close')
    }, 0)

    // Audio can only be accepted while the input is neither ended nor destroyed
    const acceptsAudio = () => !audioInputStream.destroyed && !audioInputStream.writableEnded

    return {
      events,
      write: (buffer) => {
        if (buffer === null || buffer === undefined) return
        if (acceptsAudio()) {
          audioInputStream.write(buffer)
        }
      },
      end: () => {
        if (acceptsAudio()) {
          audioInputStream.end()
        }
      },
      close: () => {
        audioInputStream.destroy()
      }
    }
  }

  /**
   * Non-streaming transcription - not implemented for this engine: the body is
   * empty, so the returned promise resolves to `undefined`.
   * NOTE(review): callers that expect a transcription result need to handle
   * this - confirm how the route treats an undefined result.
   *
   * @param {object} req - incoming request (unused)
   * @param {object} options
   * @param {string} options.language - unused
   * @param {Buffer} options.buffer - unused
   * @param {string} options.hint - unused
   * @returns {Promise<undefined>}
   */
  async stt (req, { language, buffer, hint }) {

  }
}
132+
133+
module.exports = AwsTranscribeSTT

frontend/src/swagger.json

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,8 @@
5959
"kaldi",
6060
"google",
6161
"ibm",
62-
"azure"
62+
"azure",
63+
"awstranscribe"
6364
]
6465
}
6566
}
@@ -127,7 +128,8 @@
127128
"kaldi",
128129
"google",
129130
"ibm",
130-
"azure"
131+
"azure",
132+
"awstranscribe"
131133
]
132134
}
133135
},
@@ -605,7 +607,8 @@
605607
"kaldi",
606608
"google",
607609
"ibm",
608-
"azure"
610+
"azure",
611+
"awstranscribe"
609612
]
610613
}
611614
}

frontend/src/utils.js

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,23 @@ const pollyOptions = (req) => {
100100
throw new Error('AWS Polly credentials not found')
101101
}
102102

103+
/**
 * Builds the client options for AWS Transcribe.
 *
 * Each value is taken from `req.body.awstranscribe.credentials` first and
 * falls back to the matching BOTIUM_SPEECH_AWS_* environment variable.
 *
 * @param {object} req - incoming request
 * @returns {{region: string, credentials: {accessKeyId: string, secretAccessKey: string}}}
 * @throws {Error} when region, access key id or secret access key is missing
 */
const awstranscribeOptions = (req) => {
  const env = process.env

  const region = _.get(req, 'body.awstranscribe.credentials.region') || env.BOTIUM_SPEECH_AWS_REGION
  const accessKeyId = _.get(req, 'body.awstranscribe.credentials.accessKeyId') || env.BOTIUM_SPEECH_AWS_ACCESS_KEY_ID
  const secretAccessKey = _.get(req, 'body.awstranscribe.credentials.secretAccessKey') || env.BOTIUM_SPEECH_AWS_SECRET_ACCESS_KEY

  // Guard clause: all three settings are mandatory
  const isComplete = Boolean(region && accessKeyId && secretAccessKey)
  if (!isComplete) {
    throw new Error('AWS Transcribe credentials not found')
  }

  const credentials = { accessKeyId, secretAccessKey }
  return { region, credentials }
}
119+
103120
const azureSpeechConfig = (req) => {
104121
const subscriptionKey = _.get(req, 'body.azure.credentials.subscriptionKey') || process.env.BOTIUM_SPEECH_AZURE_SUBSCRIPTION_KEY
105122
const region = _.get(req, 'body.azure.credentials.region') || process.env.BOTIUM_SPEECH_AZURE_REGION
@@ -158,6 +175,7 @@ module.exports = {
158175
ibmSttOptions,
159176
ibmTtsOptions,
160177
pollyOptions,
178+
awstranscribeOptions,
161179
azureSpeechConfig,
162180
applyExtraAzureSpeechConfig,
163181
getAzureErrorDetails,

0 commit comments

Comments
 (0)