Skip to content

Commit 96f47b5

Browse files
author
Florian Treml
committed
BOT-2463 added Amazon Polly
1 parent ea7ee05 commit 96f47b5

File tree

8 files changed

+156
-32
lines changed

8 files changed

+156
-32
lines changed

frontend/package.json

Lines changed: 19 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -7,46 +7,49 @@
77
"eslint-fix": "eslint --fix src",
88
"start-dev": "cross-env DOTENV_FLOW_PATH=./resources nodemon -w ./resources/.env -w ./resources/.env.local -w ./src/ -x \"node -r dotenv-flow/config\" ./src/server.js",
99
"start-dist": "node -r dotenv-flow/config ./src/server.js",
10-
"jsdoc": "swagger-jsdoc -d ./src/swaggerDef.json -o ./src/swagger.json src/routes.js"
10+
"jsdoc": "swagger-jsdoc -d ./src/swaggerDef.json -o ./src/swagger.json src/routes.js",
11+
"update-dependencies": "npm-check-updates -u --timeout 120000"
1112
},
1213
"author": "Botium GmbH",
1314
"license": "MIT",
1415
"dependencies": {
15-
"@google-cloud/speech": "^4.9.0",
16-
"@google-cloud/storage": "^5.16.0",
17-
"@google-cloud/text-to-speech": "^3.3.1",
18-
"body-parser": "^1.19.0",
16+
"@aws-sdk/client-polly": "^3.47.1",
17+
"@google-cloud/speech": "^4.10.0",
18+
"@google-cloud/storage": "^5.18.0",
19+
"@google-cloud/text-to-speech": "^3.4.0",
20+
"body-parser": "^1.19.1",
1921
"cheerio": "^1.0.0-rc.10",
20-
"content-disposition": "^0.5.3",
22+
"content-disposition": "^0.5.4",
2123
"cors": "^2.8.5",
2224
"cross-env": "^7.0.3",
2325
"debug": "^4.3.3",
2426
"dotenv-flow": "^3.2.0",
25-
"express": "^4.17.1",
27+
"express": "^4.17.2",
2628
"express-winston": "^4.2.0",
27-
"ibm-watson": "^6.2.1",
29+
"ibm-watson": "^6.2.2",
2830
"lodash": "^4.17.21",
2931
"microsoft-cognitiveservices-speech-sdk": "^1.19.0",
3032
"mkdirp": "^1.0.4",
31-
"multer": "^1.4.3",
33+
"multer": "^1.4.4",
3234
"mustache": "^4.2.0",
3335
"nodemon": "^2.0.15",
3436
"request": "^2.88.2",
3537
"request-promise-native": "^1.0.9",
3638
"sanitize-filename": "^1.6.3",
3739
"swagger-jsdoc": "^6.1.0",
38-
"swagger-ui-express": "^4.2.0",
40+
"swagger-ui-express": "^4.3.0",
3941
"uuid": "^8.3.2",
40-
"winston": "^3.3.3",
42+
"winston": "^3.4.0",
4143
"word-error-rate": "^0.0.7",
42-
"ws": "^8.3.0"
44+
"ws": "^8.4.2"
4345
},
4446
"devDependencies": {
45-
"eslint": "^8.3.0",
47+
"eslint": "^8.7.0",
4648
"eslint-config-standard": "^16.0.3",
47-
"eslint-plugin-import": "^2.25.3",
49+
"eslint-plugin-import": "^2.25.4",
4850
"eslint-plugin-node": "^11.1.0",
49-
"eslint-plugin-promise": "^5.2.0",
50-
"eslint-plugin-standard": "^5.0.0"
51+
"eslint-plugin-promise": "^6.0.0",
52+
"eslint-plugin-standard": "^5.0.0",
53+
"npm-check-updates": "^12.2.0"
5154
}
5255
}

frontend/resources/.env

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,11 @@ BOTIUM_SPEECH_IBM_TTS_SERVICEURL=
5353
BOTIUM_SPEECH_AZURE_SUBSCRIPTION_KEY=
5454
BOTIUM_SPEECH_AZURE_REGION=
5555

56+
# STT/TTS Provider AWS Polly/Transcribe
57+
BOTIUM_SPEECH_AWS_REGION=
58+
BOTIUM_SPEECH_AWS_ACCESS_KEY_ID=
59+
BOTIUM_SPEECH_AWS_SECRET_ACCESS_KEY=
60+
5661
# WAV Conversion Command Line
5762
BOTIUM_SPEECH_CONVERT_PROFILE_WAVTOMONOWAV_CMD=sox -t wav - -r 16k -t wav -c 1 -b 16 -e signed {{{output}}}
5863
BOTIUM_SPEECH_CONVERT_PROFILE_WAVTOMONOWAV_DESC=Converts WAV file to a Mono Wav, 16khz, 16bit

frontend/src/convert/convert.js

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -26,13 +26,18 @@ const _isMP3 = (buf) => {
2626
)
2727
}
2828

29-
const runconvert = async (cmdLine, outputName, { inputBuffer, start, end }) => {
29+
/**
 * Converts headerless PCM audio to WAV by handing a sox command line to runconvert.
 *
 * @param {Buffer} inputBuffer raw PCM audio, passed to runconvert with input type 'raw'
 * @param {object} [options] description of the PCM data; every field is optional
 * @param {number|string} [options.sampleRate=16000] value for the sox -r option
 * @param {number|string} [options.bitDepth=16] value for the sox -b option
 * @param {number|string} [options.channelCount=1] value for the sox -c option
 * @returns {Promise<Buffer>} the outputBuffer reported by runconvert
 * @throws {Error} if one of the options is not a positive integer
 */
const pcmtowav = async (inputBuffer, { sampleRate = 16000, bitDepth = 16, channelCount = 1 } = {}) => {
  const soxSettings = {
    sampleRate: Number(sampleRate),
    bitDepth: Number(bitDepth),
    channelCount: Number(channelCount)
  }
  // The values end up inside a command line, so accept nothing but plain positive integers.
  for (const [name, value] of Object.entries(soxSettings)) {
    if (!Number.isInteger(value) || value <= 0) {
      throw new Error(`pcmtowav: ${name} must be a positive integer`)
    }
  }

  const cmdLine = `sox -r ${soxSettings.sampleRate} -e signed -b ${soxSettings.bitDepth} -c ${soxSettings.channelCount} {{{input}}} {{{output}}}`
  // The input type is given explicitly because raw PCM has no header to detect it from.
  const result = await runconvert(cmdLine, 'output.wav', { inputBuffer, inputType: 'raw' })
  return result.outputBuffer
}
33+
34+
const runconvert = async (cmdLine, outputName, { inputBuffer, inputType, start, end }) => {
3035
const jobId = uuidv1()
3136

3237
const writeInput = !outputName || cmdLine.indexOf('{{{input}}}') >= 0 || cmdLine.indexOf('{{{inputtype}}}') >= 0
3338

3439
let input = null
35-
let inputtype = null
40+
let inputtype = inputType || null
3641

3742
if (writeInput) {
3843
input = `${process.env.BOTIUM_SPEECH_TMP_DIR || '/tmp'}/${jobId}_input`
@@ -42,15 +47,17 @@ const runconvert = async (cmdLine, outputName, { inputBuffer, start, end }) => {
4247
debug(`conversion process input file ${input} not writable: ${err.message}`)
4348
throw new Error('conversion process input file not writable')
4449
}
45-
if (_isMP3(inputBuffer)) {
46-
inputtype = 'mp3'
47-
} else {
48-
try {
49-
inputtype = await _getSoxFileType(input)
50-
debug(`Identified input type: ${inputtype}`)
51-
} catch (err) {
52-
debug(`identification of input file type ${input} failed: ${err.message}`)
53-
throw new Error('identification of input file type failed')
50+
if (!inputtype) {
51+
if (_isMP3(inputBuffer)) {
52+
inputtype = 'mp3'
53+
} else {
54+
try {
55+
inputtype = await _getSoxFileType(input)
56+
debug(`Identified input type: ${inputtype}`)
57+
} catch (err) {
58+
debug(`identification of input file type ${input} failed: ${err.message}`)
59+
throw new Error('identification of input file type failed')
60+
}
5461
}
5562
}
5663
if (inputtype) {
@@ -142,5 +149,6 @@ const runconvert = async (cmdLine, outputName, { inputBuffer, start, end }) => {
142149
}
143150

144151
module.exports = {
152+
pcmtowav,
145153
runconvert
146154
}

frontend/src/routes.js

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ const ttsEngines = {
3838
google: new (require('./tts/google'))(),
3939
ibm: new (require('./tts/ibm'))(),
4040
azure: new (require('./tts/azure'))(),
41+
polly: new (require('./tts/polly'))(),
4142
marytts: new (require('./tts/marytts'))(),
4243
picotts: new (require('./tts/picotts'))()
4344
}
@@ -278,7 +279,7 @@ router.post('/api/stt/:language', async (req, res, next) => {
278279
* required: false
279280
* schema:
280281
* type: string
281-
* enum: [google, ibm, azure, marytts, picotts]
282+
* enum: [google, ibm, azure, polly, marytts, picotts]
282283
* responses:
283284
* 200:
284285
* description: List of supported voices
@@ -319,7 +320,7 @@ router.post('/api/stt/:language', async (req, res, next) => {
319320
* required: false
320321
* schema:
321322
* type: string
322-
* enum: [google, ibm, azure, marytts, picotts]
323+
* enum: [google, ibm, azure, polly, marytts, picotts]
323324
* responses:
324325
* 200:
325326
* description: List of supported TTS languages
@@ -371,7 +372,7 @@ router.post('/api/stt/:language', async (req, res, next) => {
371372
* required: false
372373
* schema:
373374
* type: string
374-
* enum: [google, ibm, azure, marytts, picotts]
375+
* enum: [google, ibm, azure, polly, marytts, picotts]
375376
* - name: cache
376377
* description: Use result cache (default Y)
377378
* in: query

frontend/src/swagger.json

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"openapi": "3.0.0",
33
"info": {
44
"title": "Botium Speech Processing API",
5-
"version": "1.2.2",
5+
"version": "1.3.0",
66
"description": "Botium Speech Processing API"
77
},
88
"basePath": "/",
@@ -193,6 +193,7 @@
193193
"google",
194194
"ibm",
195195
"azure",
196+
"polly",
196197
"marytts",
197198
"picotts"
198199
]
@@ -250,6 +251,7 @@
250251
"google",
251252
"ibm",
252253
"azure",
254+
"polly",
253255
"marytts",
254256
"picotts"
255257
]
@@ -319,6 +321,7 @@
319321
"google",
320322
"ibm",
321323
"azure",
324+
"polly",
322325
"marytts",
323326
"picotts"
324327
]

frontend/src/swaggerDef.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"openapi": "3.0.0",
33
"info": {
44
"title": "Botium Speech Processing API",
5-
"version": "1.2.2",
5+
"version": "1.3.0",
66
"description": "Botium Speech Processing API"
77
},
88
"basePath": "/"

frontend/src/tts/polly.js

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
const _ = require('lodash')
2+
const { PollyClient, DescribeVoicesCommand, SynthesizeSpeechCommand } = require('@aws-sdk/client-polly')
3+
const debug = require('debug')('botium-speech-processing-polly-tts')
4+
5+
const { pollyOptions, ttsFilename } = require('../utils')
6+
const { pcmtowav } = require('../convert/convert')
7+
8+
const genderMap = {
9+
Male: 'male',
10+
Female: 'female'
11+
}
12+
13+
// File extension of the audio returned by tts() for each supported Polly OutputFormat.
const pollyFileExtensions = {
  pcm: 'wav',
  mp3: 'mp3',
  ogg_vorbis: 'ogg'
}

/**
 * Text-to-speech engine backed by Amazon Polly.
 * The client configuration is resolved per request through pollyOptions(req).
 */
class PollyTTS {
  /**
   * Lists the voices returned by Polly's DescribeVoices command.
   *
   * @param {object} req incoming request, handed to pollyOptions
   * @returns {Promise<Array<{name: string, gender: (string|undefined), language: string}>>}
   *   one entry per voice; gender is undefined for values missing from genderMap
   */
  async voices (req) {
    const pollyClient = new PollyClient(pollyOptions(req))

    const voices = await pollyClient.send(new DescribeVoicesCommand({}))

    return (voices.Voices || []).map(voice => ({
      name: voice.Id,
      gender: genderMap[voice.Gender],
      language: voice.LanguageCode
    }))
  }

  /**
   * Lists the distinct language codes of all available voices, sorted.
   *
   * @param {object} req incoming request, handed to pollyOptions
   * @returns {Promise<string[]>}
   */
  async languages (req) {
    const voicesList = await this.voices(req)
    return _.uniq(voicesList.map(v => v.language)).sort()
  }

  /**
   * Synthesizes text with Polly.
   *
   * Entries in req.body.polly.config are merged over the default synthesis parameters,
   * so they can override OutputFormat, SampleRate and the other request fields.
   *
   * @param {object} req incoming request
   * @param {object} params
   * @param {string} params.language language code; also used to pick a voice when none is given
   * @param {string} [params.voice] Polly voice id; defaults to the first voice listed for the language
   * @param {string} params.text text to synthesize
   * @returns {Promise<{buffer: Buffer, name: string}>} audio data and a file name with matching extension
   * @throws {Error} if no voice exists for the language, the output format is unsupported,
   *   or synthesis/conversion fails
   */
  async tts (req, { language, voice, text }) {
    const pollyClient = new PollyClient(pollyOptions(req))

    let voiceId = voice
    if (!voiceId) {
      const voicesList = await this.voices(req)
      const matchingVoice = voicesList.find(v => v.language === language)
      if (!matchingVoice) {
        throw new Error(`Polly TTS failed: no voice found for language ${language}`)
      }
      voiceId = matchingVoice.name
    }

    const synthesizeParams = {
      OutputFormat: 'pcm',
      Text: text,
      LanguageCode: language,
      VoiceId: voiceId
    }

    // _.get keeps a request without body or polly section from throwing here.
    const extraConfig = _.get(req, 'body.polly.config')
    if (extraConfig) {
      Object.assign(synthesizeParams, extraConfig)
    }

    // Reject formats that cannot be returned as audio before spending a synthesis call on them.
    const outputFormat = synthesizeParams.OutputFormat
    if (!Object.prototype.hasOwnProperty.call(pollyFileExtensions, outputFormat)) {
      throw new Error(`Polly TTS failed: unsupported output format ${outputFormat}`)
    }
    const name = `${ttsFilename(text)}.${pollyFileExtensions[outputFormat]}`

    try {
      const synthResult = await pollyClient.send(new SynthesizeSpeechCommand(synthesizeParams))

      const chunks = []
      for await (const chunk of synthResult.AudioStream) {
        chunks.push(chunk)
      }
      const bufferRaw = Buffer.concat(chunks)

      if (outputFormat !== 'pcm') {
        return { buffer: bufferRaw, name }
      }

      // The WAV header has to carry the rate that was actually requested from Polly;
      // 16000 is used when the request did not set SampleRate.
      const sampleRate = Number(synthesizeParams.SampleRate) || 16000
      const bufferWav = await pcmtowav(bufferRaw, { sampleRate, bitDepth: 16, channelCount: 1 })
      return { buffer: bufferWav, name }
    } catch (err) {
      debug(err)
      throw new Error(`Polly TTS failed: ${err.message}`)
    }
  }
}
85+
86+
module.exports = PollyTTS

frontend/src/utils.js

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,23 @@ const ibmTtsOptions = (req) => {
8383
throw new Error('IBM Cloud credentials not found')
8484
}
8585

86+
/**
 * Builds the configuration object for the AWS Polly client.
 *
 * Each setting is taken from req.body.polly.credentials when present and falls back to the
 * BOTIUM_SPEECH_AWS_REGION / BOTIUM_SPEECH_AWS_ACCESS_KEY_ID / BOTIUM_SPEECH_AWS_SECRET_ACCESS_KEY
 * environment variables otherwise.
 *
 * @param {object} req incoming request; may lack a body
 * @returns {{region: string, credentials: {accessKeyId: string, secretAccessKey: string}}}
 * @throws {Error} if any of the three settings is missing; the message names the missing ones
 */
const pollyOptions = (req) => {
  const settings = {
    region: _.get(req, 'body.polly.credentials.region') || process.env.BOTIUM_SPEECH_AWS_REGION,
    accessKeyId: _.get(req, 'body.polly.credentials.accessKeyId') || process.env.BOTIUM_SPEECH_AWS_ACCESS_KEY_ID,
    secretAccessKey: _.get(req, 'body.polly.credentials.secretAccessKey') || process.env.BOTIUM_SPEECH_AWS_SECRET_ACCESS_KEY
  }

  // Only the names of missing settings are reported, never their values.
  const missing = Object.keys(settings).filter(key => !settings[key])
  if (missing.length > 0) {
    throw new Error(`AWS Polly credentials not found (missing: ${missing.join(', ')})`)
  }

  const { region, accessKeyId, secretAccessKey } = settings
  return {
    region,
    credentials: {
      accessKeyId,
      secretAccessKey
    }
  }
}
102+
86103
const azureSpeechConfig = (req) => {
87104
const subscriptionKey = _.get(req, 'body.azure.credentials.subscriptionKey') || process.env.BOTIUM_SPEECH_AZURE_SUBSCRIPTION_KEY
88105
const region = _.get(req, 'body.azure.credentials.region') || process.env.BOTIUM_SPEECH_AZURE_REGION
@@ -140,6 +157,7 @@ module.exports = {
140157
googleOptions,
141158
ibmSttOptions,
142159
ibmTtsOptions,
160+
pollyOptions,
143161
azureSpeechConfig,
144162
applyExtraAzureSpeechConfig,
145163
getAzureErrorDetails,

0 commit comments

Comments
 (0)