BOT-2463 added Amazon Polly

Florian Treml · Florian Treml · commit 96f47b56ab03 · 2022-01-21T15:03:36.000+01:00
diff --git a/frontend/package.json b/frontend/package.json
@@ -7,46 +7,49 @@
     "eslint-fix": "eslint --fix src",
     "start-dev": "cross-env DOTENV_FLOW_PATH=./resources nodemon -w ./resources/.env -w ./resources/.env.local -w ./src/ -x \"node -r dotenv-flow/config\" ./src/server.js",
     "start-dist": "node -r dotenv-flow/config ./src/server.js",
-    "jsdoc": "swagger-jsdoc -d ./src/swaggerDef.json -o ./src/swagger.json src/routes.js"
+    "jsdoc": "swagger-jsdoc -d ./src/swaggerDef.json -o ./src/swagger.json src/routes.js",
+    "update-dependencies": "npm-check-updates -u --timeout 120000"
   },
   "author": "Botium GmbH",
   "license": "MIT",
   "dependencies": {
-    "@google-cloud/speech": "^4.9.0",
-    "@google-cloud/storage": "^5.16.0",
-    "@google-cloud/text-to-speech": "^3.3.1",
-    "body-parser": "^1.19.0",
+    "@aws-sdk/client-polly": "^3.47.1",
+    "@google-cloud/speech": "^4.10.0",
+    "@google-cloud/storage": "^5.18.0",
+    "@google-cloud/text-to-speech": "^3.4.0",
+    "body-parser": "^1.19.1",
     "cheerio": "^1.0.0-rc.10",
-    "content-disposition": "^0.5.3",
+    "content-disposition": "^0.5.4",
     "cors": "^2.8.5",
     "cross-env": "^7.0.3",
     "debug": "^4.3.3",
     "dotenv-flow": "^3.2.0",
-    "express": "^4.17.1",
+    "express": "^4.17.2",
     "express-winston": "^4.2.0",
-    "ibm-watson": "^6.2.1",
+    "ibm-watson": "^6.2.2",
     "lodash": "^4.17.21",
     "microsoft-cognitiveservices-speech-sdk": "^1.19.0",
     "mkdirp": "^1.0.4",
-    "multer": "^1.4.3",
+    "multer": "^1.4.4",
     "mustache": "^4.2.0",
     "nodemon": "^2.0.15",
     "request": "^2.88.2",
     "request-promise-native": "^1.0.9",
     "sanitize-filename": "^1.6.3",
     "swagger-jsdoc": "^6.1.0",
-    "swagger-ui-express": "^4.2.0",
+    "swagger-ui-express": "^4.3.0",
     "uuid": "^8.3.2",
-    "winston": "^3.3.3",
+    "winston": "^3.4.0",
     "word-error-rate": "^0.0.7",
-    "ws": "^8.3.0"
+    "ws": "^8.4.2"
   },
   "devDependencies": {
-    "eslint": "^8.3.0",
+    "eslint": "^8.7.0",
     "eslint-config-standard": "^16.0.3",
-    "eslint-plugin-import": "^2.25.3",
+    "eslint-plugin-import": "^2.25.4",
     "eslint-plugin-node": "^11.1.0",
-    "eslint-plugin-promise": "^5.2.0",
-    "eslint-plugin-standard": "^5.0.0"
+    "eslint-plugin-promise": "^6.0.0",
+    "eslint-plugin-standard": "^5.0.0",
+    "npm-check-updates": "^12.2.0"
   }
 }
diff --git a/frontend/resources/.env b/frontend/resources/.env
@@ -53,6 +53,11 @@ BOTIUM_SPEECH_IBM_TTS_SERVICEURL=
 BOTIUM_SPEECH_AZURE_SUBSCRIPTION_KEY=
 BOTIUM_SPEECH_AZURE_REGION=
 
+# STT/TTS Provider AWS Polly/Transcribe
+BOTIUM_SPEECH_AWS_REGION=
+BOTIUM_SPEECH_AWS_ACCESS_KEY_ID=
+BOTIUM_SPEECH_AWS_SECRET_ACCESS_KEY=
+
 # WAV Conversion Command Line
 BOTIUM_SPEECH_CONVERT_PROFILE_WAVTOMONOWAV_CMD=sox -t wav - -r 16k -t wav -c 1 -b 16 -e signed {{{output}}}
 BOTIUM_SPEECH_CONVERT_PROFILE_WAVTOMONOWAV_DESC=Converts WAV file to a Mono Wav, 16khz, 16bit
diff --git a/frontend/src/convert/convert.js b/frontend/src/convert/convert.js
@@ -26,13 +26,18 @@ const _isMP3 = (buf) => {
   )
 }
 
-const runconvert = async (cmdLine, outputName, { inputBuffer, start, end }) => {
+// Converts raw signed PCM audio to WAV by running sox via runconvert; the options argument may be omitted (16000 Hz, 16 bit, 1 channel).
+const pcmtowav = async (inputBuffer, { sampleRate = 16000, bitDepth = 16, channelCount = 1 } = {}) => {
+  return (await runconvert(`sox -r ${sampleRate} -e signed -b ${bitDepth} -c ${channelCount} {{{input}}} {{{output}}}`, 'output.wav', { inputBuffer, inputType: 'raw' })).outputBuffer
+}
+
+const runconvert = async (cmdLine, outputName, { inputBuffer, inputType, start, end }) => {
   const jobId = uuidv1()
 
   const writeInput = !outputName || cmdLine.indexOf('{{{input}}}') >= 0 || cmdLine.indexOf('{{{inputtype}}}') >= 0
 
   let input = null
-  let inputtype = null
+  let inputtype = inputType || null
 
   if (writeInput) {
     input = `${process.env.BOTIUM_SPEECH_TMP_DIR || '/tmp'}/${jobId}_input`
@@ -42,15 +47,17 @@ const runconvert = async (cmdLine, outputName, { inputBuffer, start, end }) => {
       debug(`conversion process input file ${input} not writable: ${err.message}`)
       throw new Error('conversion process input file not writable')
     }
-    if (_isMP3(inputBuffer)) {
-      inputtype = 'mp3'
-    } else {
-      try {
-        inputtype = await _getSoxFileType(input)
-        debug(`Identified input type: ${inputtype}`)
-      } catch (err) {
-        debug(`identification of input file type ${input} failed: ${err.message}`)
-        throw new Error('identification of input file type failed')
+    if (!inputtype) {
+      if (_isMP3(inputBuffer)) {
+        inputtype = 'mp3'
+      } else {
+        try {
+          inputtype = await _getSoxFileType(input)
+          debug(`Identified input type: ${inputtype}`)
+        } catch (err) {
+          debug(`identification of input file type ${input} failed: ${err.message}`)
+          throw new Error('identification of input file type failed')
+        }
       }
     }
     if (inputtype) {
@@ -142,5 +149,6 @@ const runconvert = async (cmdLine, outputName, { inputBuffer, start, end }) => {
 }
 
 module.exports = {
+  pcmtowav,
   runconvert
 }
diff --git a/frontend/src/routes.js b/frontend/src/routes.js
@@ -38,6 +38,7 @@ const ttsEngines = {
   google: new (require('./tts/google'))(),
   ibm: new (require('./tts/ibm'))(),
   azure: new (require('./tts/azure'))(),
+  polly: new (require('./tts/polly'))(),
   marytts: new (require('./tts/marytts'))(),
   picotts: new (require('./tts/picotts'))()
 }
@@ -278,7 +279,7 @@ router.post('/api/stt/:language', async (req, res, next) => {
  *         required: false
  *         schema:
  *           type: string
- *           enum: [google, ibm, azure, marytts, picotts]
+ *           enum: [google, ibm, azure, polly, marytts, picotts]
  *     responses:
  *       200:
  *         description: List of supported voices
@@ -319,7 +320,7 @@ router.post('/api/stt/:language', async (req, res, next) => {
  *         required: false
  *         schema:
  *           type: string
- *           enum: [google, ibm, azure, marytts, picotts]
+ *           enum: [google, ibm, azure, polly, marytts, picotts]
  *     responses:
  *       200:
  *         description: List of supported TTS languages
@@ -371,7 +372,7 @@ router.post('/api/stt/:language', async (req, res, next) => {
  *         required: false
  *         schema:
  *           type: string
- *           enum: [google, ibm, azure, marytts, picotts]
+ *           enum: [google, ibm, azure, polly, marytts, picotts]
  *       - name: cache
  *         description: Use result cache (default Y)
  *         in: query
diff --git a/frontend/src/swagger.json b/frontend/src/swagger.json
@@ -2,7 +2,7 @@
   "openapi": "3.0.0",
   "info": {
     "title": "Botium Speech Processing API",
-    "version": "1.2.2",
+    "version": "1.3.0",
     "description": "Botium Speech Processing API"
   },
   "basePath": "/",
@@ -193,6 +193,7 @@
                 "google",
                 "ibm",
                 "azure",
+                "polly",
                 "marytts",
                 "picotts"
               ]
@@ -250,6 +251,7 @@
                 "google",
                 "ibm",
                 "azure",
+                "polly",
                 "marytts",
                 "picotts"
               ]
@@ -319,6 +321,7 @@
                 "google",
                 "ibm",
                 "azure",
+                "polly",
                 "marytts",
                 "picotts"
               ]
diff --git a/frontend/src/swaggerDef.json b/frontend/src/swaggerDef.json
@@ -2,7 +2,7 @@
     "openapi": "3.0.0",
     "info": {
       "title": "Botium Speech Processing API",
-      "version": "1.2.2",
+      "version": "1.3.0",
       "description": "Botium Speech Processing API"
     },
     "basePath": "/"
diff --git a/frontend/src/tts/polly.js b/frontend/src/tts/polly.js
@@ -0,0 +1,121 @@
+const _ = require('lodash')
+const { PollyClient, DescribeVoicesCommand, SynthesizeSpeechCommand } = require('@aws-sdk/client-polly')
+const debug = require('debug')('botium-speech-processing-polly-tts')
+
+const { pollyOptions, ttsFilename } = require('../utils')
+const { pcmtowav } = require('../convert/convert')
+
+// Translates the gender labels reported by Polly into the lower-case labels returned by voices()
+const genderMap = {
+  Male: 'male',
+  Female: 'female'
+}
+
+// File extension of the returned audio per supported Polly output format (pcm is converted to WAV)
+const fileExtensions = {
+  pcm: 'wav',
+  mp3: 'mp3',
+  ogg_vorbis: 'ogg'
+}
+
+// Sample rate Polly documents as the default for pcm output when SampleRate is not requested
+const DEFAULT_PCM_SAMPLE_RATE = 16000
+
+/**
+ * Text-to-speech engine backed by Amazon Polly.
+ */
+class PollyTTS {
+  /**
+   * Lists the voices reported by Polly.
+   * @param {object} req - request, passed to pollyOptions to build the Polly client options
+   * @returns {Promise<Array<{name: string, gender: string, language: string}>>} voice id, gender and language code per voice
+   */
+  async voices (req) {
+    const pollyClient = new PollyClient(pollyOptions(req))
+
+    const voices = await pollyClient.send(new DescribeVoicesCommand({}))
+
+    return (voices.Voices || []).map(voice => ({
+      name: voice.Id,
+      gender: genderMap[voice.Gender],
+      language: voice.LanguageCode
+    }))
+  }
+
+  /**
+   * Lists the distinct language codes of all Polly voices.
+   * @param {object} req - request, passed on to voices()
+   * @returns {Promise<string[]>} sorted language codes
+   */
+  async languages (req) {
+    const voicesList = await this.voices(req)
+    return _.uniq(voicesList.map(v => v.language)).sort()
+  }
+
+  /**
+   * Synthesizes text with Polly.
+   * Values from req.body.polly.config are merged over the default SynthesizeSpeech parameters.
+   * @param {object} req - request, used for the Polly client options and the optional config
+   * @param {object} params
+   * @param {string} params.language - language code, also used to pick a voice if none is given
+   * @param {string} [params.voice] - Polly voice id
+   * @param {string} params.text - text to synthesize
+   * @returns {Promise<{buffer: Buffer, name: string}>} audio data and a file name with matching extension
+   * @throws {Error} if no voice matches the language, the output format is not supported or Polly fails
+   */
+  async tts (req, { language, voice, text }) {
+    const pollyClient = new PollyClient(pollyOptions(req))
+
+    let voiceId = voice
+    if (!voiceId) {
+      const voicesList = await this.voices(req)
+      const match = voicesList.find(v => v.language === language)
+      if (!match) {
+        throw new Error(`Polly TTS failed: no voice found for language ${language}`)
+      }
+      voiceId = match.name
+    }
+
+    const synthesizeParams = {
+      OutputFormat: 'pcm',
+      Text: text,
+      LanguageCode: language,
+      VoiceId: voiceId
+    }
+
+    const extraConfig = _.get(req, 'body.polly.config')
+    if (extraConfig) {
+      Object.assign(synthesizeParams, extraConfig)
+    }
+
+    // Reject formats without audio handling (e.g. json speech marks) up front instead of silently returning undefined
+    const outputFormat = synthesizeParams.OutputFormat
+    if (!_.has(fileExtensions, outputFormat)) {
+      throw new Error(`Polly TTS failed: output format ${outputFormat} not supported`)
+    }
+
+    try {
+      const synthResult = await pollyClient.send(new SynthesizeSpeechCommand(synthesizeParams))
+
+      const chunks = []
+      for await (const chunk of synthResult.AudioStream) {
+        chunks.push(chunk)
+      }
+      let buffer = Buffer.concat(chunks)
+      if (outputFormat === 'pcm') {
+        // The WAV header has to carry the sample rate actually requested from Polly, which the config may override
+        const sampleRate = Number(synthesizeParams.SampleRate) || DEFAULT_PCM_SAMPLE_RATE
+        buffer = await pcmtowav(buffer, { sampleRate, bitDepth: 16, channelCount: 1 })
+      }
+      return {
+        buffer,
+        name: `${ttsFilename(text)}.${fileExtensions[outputFormat]}`
+      }
+    } catch (err) {
+      debug(err)
+      throw new Error(`Polly TTS failed: ${err.message}`)
+    }
+  }
+}
+
+module.exports = PollyTTS
diff --git a/frontend/src/utils.js b/frontend/src/utils.js
@@ -83,6 +83,23 @@ const ibmTtsOptions = (req) => {
   throw new Error('IBM Cloud credentials not found')
 }
 
+// Builds the options for the Polly client. Each value is taken from the request body
+// (body.polly.credentials.*) if set there, otherwise from the BOTIUM_SPEECH_AWS_* environment.
+// Throws if region, access key id or secret access key cannot be resolved.
+const pollyOptions = (req) => {
+  const resolved = {
+    region: _.get(req, 'body.polly.credentials.region') || process.env.BOTIUM_SPEECH_AWS_REGION,
+    accessKeyId: _.get(req, 'body.polly.credentials.accessKeyId') || process.env.BOTIUM_SPEECH_AWS_ACCESS_KEY_ID,
+    secretAccessKey: _.get(req, 'body.polly.credentials.secretAccessKey') || process.env.BOTIUM_SPEECH_AWS_SECRET_ACCESS_KEY
+  }
+
+  if (!resolved.region || !resolved.accessKeyId || !resolved.secretAccessKey) {
+    throw new Error('AWS Polly credentials not found')
+  }
+  const { region, ...credentials } = resolved
+  return { region, credentials }
+}
+
 const azureSpeechConfig = (req) => {
   const subscriptionKey = _.get(req, 'body.azure.credentials.subscriptionKey') || process.env.BOTIUM_SPEECH_AZURE_SUBSCRIPTION_KEY
   const region = _.get(req, 'body.azure.credentials.region') || process.env.BOTIUM_SPEECH_AZURE_REGION
@@ -140,6 +157,7 @@ module.exports = {
   googleOptions,
   ibmSttOptions,
   ibmTtsOptions,
+  pollyOptions,
   azureSpeechConfig,
   applyExtraAzureSpeechConfig,
   getAzureErrorDetails,