Added support for AWS Transcribe Streaming

Florian Treml · Florian Treml · commit 99c007249d27 · 2022-01-21T17:43:28.000+01:00
diff --git a/frontend/package.json b/frontend/package.json
@@ -14,6 +14,7 @@
   "license": "MIT",
   "dependencies": {
     "@aws-sdk/client-polly": "^3.47.1",
+    "@aws-sdk/client-transcribe-streaming": "^3.47.1",
     "@google-cloud/speech": "^4.10.0",
     "@google-cloud/storage": "^5.18.0",
     "@google-cloud/text-to-speech": "^3.4.0",
diff --git a/frontend/src/routes.js b/frontend/src/routes.js
@@ -46,7 +46,8 @@ const sttEngines = {
   google: new (require('./stt/google'))(),
   kaldi: new (require('./stt/kaldi'))(),
   ibm: new (require('./stt/ibm'))(),
-  azure: new (require('./stt/azure'))()
+  azure: new (require('./stt/azure'))(),
+  awstranscribe: new (require('./stt/awstranscribe'))()
 }
 
 const multerMemoryStorage = multer.memoryStorage()
@@ -124,7 +125,7 @@ const router = express.Router()
  *         required: false
  *         schema:
  *           type: string
- *           enum: [kaldi, google, ibm, azure]
+ *           enum: [kaldi, google, ibm, azure, awstranscribe]
  *     responses:
  *       200:
  *         description: List of supported STT languages
@@ -176,7 +177,7 @@ const router = express.Router()
  *         required: false
  *         schema:
  *           type: string
- *           enum: [kaldi, google, ibm, azure]
+ *           enum: [kaldi, google, ibm, azure, awstranscribe]
  *       - name: cache
  *         description: Use result cache (default Y)
  *         in: query
@@ -705,7 +706,7 @@ const wssStreams = {}
  *         required: false
  *         schema:
  *           type: string
- *           enum: [kaldi, google, ibm, azure]
+ *           enum: [kaldi, google, ibm, azure, awstranscribe]
  *     responses:
  *       200:
  *         description: Websocket Url to stream the audio to, and the uri to check status and end the stream
diff --git a/frontend/src/stt/awstranscribe.js b/frontend/src/stt/awstranscribe.js
@@ -0,0 +1,133 @@
+const _ = require('lodash')
+const { TranscribeStreamingClient, StartStreamTranscriptionCommand } = require('@aws-sdk/client-transcribe-streaming')
+const { PassThrough } = require('stream')
+const EventEmitter = require('events')
+
+const debug = require('debug')('botium-speech-processing-awstranscribe-stt')
+
+const { awstranscribeOptions } = require('../utils')
+
+// Language codes reported by AwsTranscribeSTT.languages(), grouped by language
+// for readability and sorted once at module load time.
+const languageCodes = [
+  'af-ZA',
+  'ar-AE', 'ar-SA',
+  'zh-CN', 'zh-TW',
+  'da-DK',
+  'nl-NL',
+  'en-AU', 'en-GB', 'en-IN', 'en-IE', 'en-NZ', 'en-AB', 'en-ZA', 'en-US', 'en-WL',
+  'fr-FR', 'fr-CA',
+  'fa-IR',
+  'de-DE', 'de-CH',
+  'he-IL',
+  'hi-IN',
+  'id-ID',
+  'it-IT',
+  'ja-JP',
+  'ko-KR',
+  'ms-MY',
+  'pt-PT', 'pt-BR',
+  'ru-RU',
+  'es-ES', 'es-US',
+  'ta-IN', 'te-IN',
+  'th-TH',
+  'tr-TR'
+].sort()
+
+/**
+ * Speech-to-text engine backed by AWS Transcribe Streaming.
+ * Only the streaming interface (stt_OpenStream) is implemented.
+ */
+class AwsTranscribeSTT {
+  /**
+   * Lists the language codes offered by this engine.
+   *
+   * @param {object} req - incoming request (not used here)
+   * @returns {Promise<string[]>} the sorted module-level language code list
+   */
+  async languages (req) {
+    return languageCodes
+  }
+
+  /**
+   * Opens a streaming transcription session.
+   *
+   * The request defaults to 16 kHz PCM audio; any properties found in
+   * req.body.awstranscribe.config are merged over these defaults.
+   *
+   * @param {object} req - incoming request; credentials are resolved by awstranscribeOptions(req)
+   * @param {object} options
+   * @param {string} options.language - value sent as LanguageCode
+   * @returns {Promise<{events: EventEmitter, write: Function, end: Function, close: Function}>}
+   *   events emits 'data' with { text, final, start, end, debug } for every result,
+   *   'data' with { err } when reading the result stream fails, and 'close' once the
+   *   result stream is finished or has failed.
+   *   write(buffer) feeds audio, end() signals the end of the audio,
+   *   close() releases the audio stream and the client.
+   * @throws {Error} when the credentials are missing or the transcription cannot be started
+   */
+  async stt_OpenStream (req, { language }) {
+    const transcribeClient = new TranscribeStreamingClient(awstranscribeOptions(req))
+
+    const audioInputStream = new PassThrough()
+    // Set by close(); afterwards write/end/close are no-ops instead of failing
+    let closed = false
+
+    // Upper bound for the payload of a single AudioEvent
+    // (presumably chosen to stay below the service's per-event size limit - TODO confirm)
+    const maxAudioChunkBytes = 25000
+    const audioStream = async function * () {
+      for await (const payloadChunk of audioInputStream) {
+        for (let offset = 0; offset < payloadChunk.length; offset += maxAudioChunkBytes) {
+          // Copy the slice so the event does not alias the caller's buffer
+          const audioChunk = Buffer.from(payloadChunk.subarray(offset, offset + maxAudioChunkBytes))
+          yield { AudioEvent: { AudioChunk: audioChunk } }
+        }
+      }
+    }
+
+    const request = {
+      LanguageCode: language,
+      MediaEncoding: 'pcm',
+      MediaSampleRateHertz: 16000,
+      AudioStream: audioStream()
+    }
+    // Caller-supplied settings take precedence over the defaults above
+    const extraConfig = _.get(req, 'body.awstranscribe.config')
+    if (extraConfig) {
+      Object.assign(request, extraConfig)
+    }
+
+    const events = new EventEmitter()
+    let cmdResponse
+    try {
+      cmdResponse = await transcribeClient.send(new StartStreamTranscriptionCommand(request))
+    } catch (err) {
+      debug(err)
+      // Nothing will ever consume the audio stream or the client now, release both
+      audioInputStream.destroy()
+      transcribeClient.destroy()
+      throw new Error(`AWS Transcribe STT streaming failed: ${err.message}`)
+    }
+
+    // Deferred to a later tick so the caller can attach its listeners to
+    // "events" before the first result is emitted
+    setTimeout(async () => {
+      try {
+        for await (const streamEvent of cmdResponse.TranscriptResultStream) {
+          const results = _.get(streamEvent, 'TranscriptEvent.Transcript.Results') || []
+          for (const result of results) {
+            events.emit('data', {
+              // A result without alternatives yields an empty text instead of aborting the stream
+              text: _.get(result, 'Alternatives[0].Transcript', ''),
+              final: !result.IsPartial,
+              start: result.StartTime,
+              end: result.EndTime,
+              debug: result
+            })
+          }
+        }
+      } catch (err) {
+        debug(err)
+        events.emit('data', {
+          err: `${err.message}`
+        })
+      }
+      events.emit('close')
+    }, 0)
+
+    return {
+      events,
+      write: (buffer) => {
+        if (!closed) {
+          audioInputStream.push(buffer)
+        }
+      },
+      end: () => {
+        if (!closed) {
+          audioInputStream.end()
+        }
+      },
+      close: () => {
+        if (closed) {
+          return
+        }
+        closed = true
+        audioInputStream.destroy()
+        // Release the connection held by this per-session client
+        transcribeClient.destroy()
+      }
+    }
+  }
+
+  /**
+   * Non-streaming transcription is not implemented for this engine.
+   *
+   * @param {object} req - incoming request
+   * @param {object} options - language, audio buffer and hint (all unused)
+   * @throws {Error} always, instead of silently resolving to undefined
+   */
+  async stt (req, { language, buffer, hint }) {
+    throw new Error('AWS Transcribe STT supports streaming only, please use the streaming interface')
+  }
+}
+
+module.exports = AwsTranscribeSTT
diff --git a/frontend/src/swagger.json b/frontend/src/swagger.json
@@ -59,7 +59,8 @@
                 "kaldi",
                 "google",
                 "ibm",
-                "azure"
+                "azure",
+                "awstranscribe"
               ]
             }
           }
@@ -127,7 +128,8 @@
                 "kaldi",
                 "google",
                 "ibm",
-                "azure"
+                "azure",
+                "awstranscribe"
               ]
             }
           },
@@ -605,7 +607,8 @@
                 "kaldi",
                 "google",
                 "ibm",
-                "azure"
+                "azure",
+                "awstranscribe"
               ]
             }
           }
diff --git a/frontend/src/utils.js b/frontend/src/utils.js
@@ -100,6 +100,23 @@ const pollyOptions = (req) => {
   throw new Error('AWS Polly credentials not found')
 }
 
+/**
+ * Assembles the client options (region and credentials) for AWS Transcribe.
+ * Each value is taken from the request body when present there, otherwise
+ * from the matching BOTIUM_SPEECH_AWS_* environment variable.
+ *
+ * @param {object} req - incoming request, read at body.awstranscribe.credentials
+ * @returns {{region: string, credentials: {accessKeyId: string, secretAccessKey: string}}}
+ * @throws {Error} when region, access key id or secret access key cannot be resolved
+ */
+const awstranscribeOptions = (req) => {
+  const fromRequestOrEnv = (requestPath, envName) => _.get(req, requestPath) || process.env[envName]
+
+  const region = fromRequestOrEnv('body.awstranscribe.credentials.region', 'BOTIUM_SPEECH_AWS_REGION')
+  const accessKeyId = fromRequestOrEnv('body.awstranscribe.credentials.accessKeyId', 'BOTIUM_SPEECH_AWS_ACCESS_KEY_ID')
+  const secretAccessKey = fromRequestOrEnv('body.awstranscribe.credentials.secretAccessKey', 'BOTIUM_SPEECH_AWS_SECRET_ACCESS_KEY')
+
+  // All three values are mandatory
+  if (!region || !accessKeyId || !secretAccessKey) {
+    throw new Error('AWS Transcribe credentials not found')
+  }
+
+  return {
+    region,
+    credentials: { accessKeyId, secretAccessKey }
+  }
+}
+
 const azureSpeechConfig = (req) => {
   const subscriptionKey = _.get(req, 'body.azure.credentials.subscriptionKey') || process.env.BOTIUM_SPEECH_AZURE_SUBSCRIPTION_KEY
   const region = _.get(req, 'body.azure.credentials.region') || process.env.BOTIUM_SPEECH_AZURE_REGION
@@ -158,6 +175,7 @@ module.exports = {
   ibmSttOptions,
   ibmTtsOptions,
   pollyOptions,
+  awstranscribeOptions,
   azureSpeechConfig,
   applyExtraAzureSpeechConfig,
   getAzureErrorDetails,

Original file line number	Diff line number	Diff line change
`@@ -59,7 +59,8 @@`
`59`	`59`	`"kaldi",`
`60`	`60`	`"google",`
`61`	`61`	`"ibm",`
`62`		`- "azure"`
	`62`	`+ "azure",`
	`63`	`+ "awstranscribe"`
`63`	`64`	`]`
`64`	`65`	`}`
`65`	`66`	`}`
`@@ -127,7 +128,8 @@`
`127`	`128`	`"kaldi",`
`128`	`129`	`"google",`
`129`	`130`	`"ibm",`
`130`		`- "azure"`
	`131`	`+ "azure",`
	`132`	`+ "awstranscribe"`
`131`	`133`	`]`
`132`	`134`	`}`
`133`	`135`	`},`
`@@ -605,7 +607,8 @@`
`605`	`607`	`"kaldi",`
`606`	`608`	`"google",`
`607`	`609`	`"ibm",`
`608`		`- "azure"`
	`610`	`+ "azure",`
	`611`	`+ "awstranscribe"`
`609`	`612`	`]`
`610`	`613`	`}`
`611`	`614`	`}`