Commit 77a5ace

BOT-2749 added Azure Speech

Author: Florian Treml
1 parent 14a8883 · commit 77a5ace

File tree

12 files changed: +321 -25 lines changed

connectors/ws/simple.js

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ const main = async () => {
   ws.on('message', (data) => {
     try {
       const dj = JSON.parse(data)
-      if (dj.final) console.log('received: %s', dj.text)
+      if (dj.final) console.log('received %s-%s: %s ', dj.start, dj.end, dj.text)
     } catch (err) {
     }
   })
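For engines that report timestamps (the new Azure engine below emits start and end offsets in seconds), a final websocket message now looks roughly like this, with illustrative values:

  { "text": "good morning", "final": true, "start": 0.52, "end": 1.87 }

which the updated client prints as: received 0.52-1.87: good morning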

frontend/package.json

Lines changed: 1 addition & 0 deletions
@@ -26,6 +26,7 @@
     "express-winston": "^4.2.0",
     "ibm-watson": "^6.2.1",
     "lodash": "^4.17.21",
+    "microsoft-cognitiveservices-speech-sdk": "^1.19.0",
     "mkdirp": "^1.0.4",
     "multer": "^1.4.3",
     "mustache": "^4.2.0",

frontend/sample.wav

-103 KB · Binary file not shown.

frontend/src/routes.js

Lines changed: 22 additions & 13 deletions
@@ -37,13 +37,15 @@ if (tmpPath) {
 const ttsEngines = {
   google: new (require('./tts/google'))(),
   ibm: new (require('./tts/ibm'))(),
+  azure: new (require('./tts/azure'))(),
   marytts: new (require('./tts/marytts'))(),
   picotts: new (require('./tts/picotts'))()
 }
 const sttEngines = {
   google: new (require('./stt/google'))(),
   kaldi: new (require('./stt/kaldi'))(),
-  ibm: new (require('./stt/ibm'))()
+  ibm: new (require('./stt/ibm'))(),
+  azure: new (require('./stt/azure'))()
 }
 
 const multerMemoryStorage = multer.memoryStorage()
@@ -121,7 +123,7 @@ const router = express.Router()
  *         required: false
  *         schema:
  *           type: string
- *           enum: [kaldi, google, ibm]
+ *           enum: [kaldi, google, ibm, azure]
  *     responses:
  *       200:
  *         description: List of supported STT languages
@@ -156,7 +158,13 @@ const router = express.Router()
  *         schema:
  *           type: string
  *       - name: hint
- *         description: Hint text for calculating the Levenshtein edit distance for the result text (word error rate)
+ *         description: Hint text for the Speech-to-text backend (supported by google and azure)
+ *         in: query
+ *         required: false
+ *         schema:
+ *           type: string
+ *       - name: wer
+ *         description: Text for calculating the Levenshtein edit distance for the result text (word error rate)
  *         in: query
  *         required: false
  *         schema:
@@ -167,9 +175,9 @@ const router = express.Router()
  *         required: false
  *         schema:
  *           type: string
- *           enum: [kaldi, google, ibm]
+ *           enum: [kaldi, google, ibm, azure]
  *       - name: cache
- *         description: Disable result cache
+ *         description: Use result cache (default Y)
  *         in: query
  *         required: false
  *         schema:
@@ -227,10 +235,11 @@ router.post('/api/stt/:language', async (req, res, next) => {
 
     const result = await stt.stt(req, {
       language: req.params.language,
-      buffer: buffer
+      buffer: buffer,
+      hint: req.query.hint
     })
-    if (req.query.hint) {
-      result.wer = await wer(req.query.hint, result.text)
+    if (req.query.wer) {
+      result.wer = await wer(req.query.wer, result.text)
     }
     res.json(result).end()
 
@@ -266,7 +275,7 @@ router.post('/api/stt/:language', async (req, res, next) => {
  *         required: false
  *         schema:
  *           type: string
- *           enum: [google, ibm, marytts, picotts]
+ *           enum: [google, ibm, azure, marytts, picotts]
  *     responses:
  *       200:
 *         description: List of supported voices
@@ -307,7 +316,7 @@ router.post('/api/stt/:language', async (req, res, next) => {
  *         required: false
  *         schema:
  *           type: string
- *           enum: [google, ibm, marytts, picotts]
+ *           enum: [google, ibm, azure, marytts, picotts]
  *     responses:
  *       200:
 *         description: List of supported TTS languages
@@ -359,9 +368,9 @@ router.post('/api/stt/:language', async (req, res, next) => {
  *         required: false
  *         schema:
  *           type: string
- *           enum: [google, ibm, marytts, picotts]
+ *           enum: [google, ibm, azure, marytts, picotts]
  *       - name: cache
- *         description: Disable result cache
+ *         description: Use result cache (default Y)
  *         in: query
  *         required: false
  *         schema:
@@ -692,7 +701,7 @@ const wssStreams = {}
  *         required: false
  *         schema:
  *           type: string
- *           enum: [kaldi, google, ibm]
+ *           enum: [kaldi, google, ibm, azure]
  *     responses:
  *       200:
 *         description: Websocket Url to stream the audio to, and the uri to check status and end the stream
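With hint and wer now separated, a client passes the recognizer hint and the word-error-rate reference text independently. A minimal sketch using Node.js with axios (an assumed HTTP client, not part of this repo), posting raw WAV audio as the Swagger definition describes; host, port and file name are placeholders:

  const axios = require('axios')
  const fs = require('fs')

  const main = async () => {
    // hint biases the STT engine (google/azure), wer is only used for the edit distance
    const audio = fs.readFileSync('sample.wav')
    const url = 'http://127.0.0.1:56000/api/stt/en-US?stt=azure&hint=Botium&wer=hello%20Botium'
    const { data } = await axios.post(url, audio, { headers: { 'Content-Type': 'audio/wav' } })
    console.log(data.text, data.wer)
  }
  main().catch(console.error)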

frontend/src/stt/azure.js

Lines changed: 146 additions & 0 deletions
@@ -0,0 +1,146 @@
+const _ = require('lodash')
+const request = require('request-promise-native')
+const cheerio = require('cheerio')
+const EventEmitter = require('events')
+const { ResultReason, AudioInputStream, AudioStreamFormat, AudioConfig, SpeechRecognizer, PhraseListGrammar, OutputFormat } = require('microsoft-cognitiveservices-speech-sdk')
+const debug = require('debug')('botium-speech-processing-azure')
+
+const { azureSpeechConfig, applyExtraAzureSpeechConfig, getAzureErrorDetails } = require('../utils')
+
+const AZURE_STT_LANGUAGES_URL = 'https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support'
+const downloadLanguageCodes = async () => {
+  debug(`Downloading language codes from ${AZURE_STT_LANGUAGES_URL}`)
+  const htmlString = await request(AZURE_STT_LANGUAGES_URL)
+  const $ = cheerio.load(htmlString)
+
+  const languageCodes = []
+  $('table:first-of-type tbody tr').each(function () {
+    const tds = $(this).find('td')
+    const languageCode = $(tds[1]).text().trim()
+    if (languageCode) {
+      languageCodes.push(languageCode)
+    }
+  })
+  return languageCodes
+}
+
+let languageCodes = null
+
+class AzureSTT {
+  async languages (req) {
+    if (!languageCodes) {
+      languageCodes = _.uniq(await downloadLanguageCodes()).sort()
+    }
+    return languageCodes
+  }
+
+  async stt_OpenStream (req, { language }) {
+    const speechConfig = azureSpeechConfig(req)
+
+    speechConfig.outputFormat = OutputFormat.Detailed
+    if (language) speechConfig.speechRecognitionLanguage = language
+
+    applyExtraAzureSpeechConfig(speechConfig, req)
+
+    let audioFormat = AudioStreamFormat.getDefaultInputFormat()
+    const extraAzureFormatConfig = _.get(req, 'body.azure.config.audioStreamFormat')
+    if (extraAzureFormatConfig) {
+      audioFormat = AudioStreamFormat.getWaveFormatPCM(extraAzureFormatConfig.samplesPerSecond || 16000, extraAzureFormatConfig.bitsPerSample || 16, extraAzureFormatConfig.channels || 1)
+    }
+    const pushStream = AudioInputStream.createPushStream(audioFormat)
+    const audioConfig = AudioConfig.fromStreamInput(pushStream)
+    const recognizer = new SpeechRecognizer(speechConfig, audioConfig)
+
+    const events = new EventEmitter()
+
+    const recognizedHandler = (s, e) => {
+      if (e.result.reason === ResultReason.RecognizedSpeech || e.result.reason === ResultReason.RecognizingSpeech) {
+        const event = {
+          text: e.result.text,
+          final: e.result.reason === ResultReason.RecognizedSpeech,
+          debug: e.result
+        }
+        event.start = _.round(e.result.offset / 10000000, 3)
+        event.end = _.round((e.result.offset + e.result.duration) / 10000000, 3)
+        events.emit('data', event)
+      }
+    }
+    recognizer.recognizing = recognizedHandler
+    recognizer.recognized = recognizedHandler
+    recognizer.sessionStopped = (s, e) => {
+      recognizer.stopContinuousRecognitionAsync()
+      events.emit('close')
+    }
+    recognizer.startContinuousRecognitionAsync()
+
+    return new Promise((resolve, reject) => {
+      recognizer.canceled = (s, e) => {
+        recognizer.stopContinuousRecognitionAsync()
+        reject(new Error(`Azure STT failed: ${getAzureErrorDetails(e)}`))
+      }
+      recognizer.sessionStarted = (s, e) => {
+        resolve({
+          events,
+          write: (buffer) => {
+            pushStream.write(buffer)
+          },
+          end: () => {
+          },
+          close: () => {
+            recognizer.stopContinuousRecognitionAsync()
+            pushStream.close()
+          }
+        })
+      }
+    })
+  }
+
+  async stt (req, { language, buffer, hint }) {
+    const speechConfig = azureSpeechConfig(req)
+
+    speechConfig.outputFormat = OutputFormat.Detailed
+    if (language) speechConfig.speechRecognitionLanguage = language
+
+    applyExtraAzureSpeechConfig(speechConfig, req)
+
+    let audioFormat = AudioStreamFormat.getDefaultInputFormat()
+    const extraAzureFormatConfig = _.get(req, 'body.azure.config.audioStreamFormat')
+    if (extraAzureFormatConfig) {
+      audioFormat = AudioStreamFormat.getWaveFormatPCM(extraAzureFormatConfig.samplesPerSecond || 16000, extraAzureFormatConfig.bitsPerSample || 16, extraAzureFormatConfig.channels || 1)
+    }
+
+    const pushStream = AudioInputStream.createPushStream(audioFormat)
+    pushStream.write(buffer)
+    pushStream.close()
+
+    return new Promise((resolve, reject) => {
+      const audioConfig = AudioConfig.fromStreamInput(pushStream)
+      const recognizer = new SpeechRecognizer(speechConfig, audioConfig)
+
+      if (hint && hint.length > 0) {
+        const phraseList = PhraseListGrammar.fromRecognizer(recognizer)
+        phraseList.addPhrase(hint)
+      }
+
+      recognizer.recognizeOnceAsync(
+        result => {
+          if (result.errorDetails) {
+            reject(new Error(`Azure STT failed: ${getAzureErrorDetails(result)}`))
+          } else {
+            resolve({
+              text: result.text || '',
+              debug: result
+            })
+          }
+          recognizer.close()
+        },
+        error => {
+          debug(error)
+          recognizer.close()
+          reject(new Error(`Azure STT failed: ${error}`))
+        })
    })
  }
}

module.exports = AzureSTT
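Callers can override the default PCM input format through the request body read by the `_.get(req, 'body.azure.config.audioStreamFormat')` lookup above. A sketch of such a body, with illustrative values (8 kHz mono, 16-bit PCM):

  // illustrative request body for a non-default input format
  const body = {
    azure: {
      config: {
        audioStreamFormat: {
          samplesPerSecond: 8000,
          bitsPerSample: 16,
          channels: 1
        }
      }
    }
  }

Omitted fields fall back to 16000 samples/second, 16 bits and 1 channel, as coded in both stt methods.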

frontend/src/stt/google.js

Lines changed: 6 additions & 1 deletion
@@ -117,7 +117,7 @@ class GoogleSTT {
     }
   }
 
-  async stt (req, { language, buffer }) {
+  async stt (req, { language, buffer, hint }) {
     const speechClient = new speech.SpeechClient(googleOptions(req))
     const storageClient = new storage.Storage(googleOptions(req))
 
@@ -128,6 +128,11 @@ class GoogleSTT {
       audio: {
       }
     }
+    if (hint && hint.length > 0) {
+      request.config.speechContexts = [{
+        phrases: [hint]
+      }]
+    }
     if (process.env.BOTIUM_SPEECH_GOOGLE_CONFIG) {
       try {
         const defaultConfig = JSON.parse(process.env.BOTIUM_SPEECH_GOOGLE_CONFIG)
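For reference, with a hint the recognition request assembled by GoogleSTT.stt ends up carrying a speech context (speechContexts is a repeated field in Google's RecognitionConfig, hence the array). A rough sketch with illustrative values, assuming languageCode is derived from the :language path parameter:

  const request = {
    config: {
      languageCode: 'en-US', // assumed mapping from the :language path parameter
      speechContexts: [{ phrases: ['Botium'] }] // phrase biasing from the hint query parameter
    },
    audio: {} // filled with the uploaded audio content
  }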

frontend/src/stt/kaldi.js

Lines changed: 0 additions & 1 deletion
@@ -47,7 +47,6 @@ class KaldiSTT {
     ws.on('message', (data) => {
       try {
         const dj = JSON.parse(data)
-        console.log(dj)
         const hypotheses = dj.result && dj.result.hypotheses && dj.result.hypotheses[0]
         if (hypotheses && hypotheses.transcript) {
           const event = {

frontend/src/swagger.json

Lines changed: 22 additions & 7 deletions
@@ -2,7 +2,7 @@
   "openapi": "3.0.0",
   "info": {
     "title": "Botium Speech Processing API",
-    "version": "1.1.1",
+    "version": "1.2.1",
     "description": "Botium Speech Processing API"
   },
   "basePath": "/",
@@ -58,7 +58,8 @@
           "enum": [
             "kaldi",
             "google",
-            "ibm"
+            "ibm",
+            "azure"
           ]
         }
       }
@@ -99,7 +100,16 @@
       },
       {
         "name": "hint",
-        "description": "Hint text for calculating the Levenshtein edit distance for the result text (word error rate)",
+        "description": "Hint text for the Speech-to-text backend (supported by google and azure)",
+        "in": "query",
+        "required": false,
+        "schema": {
+          "type": "string"
+        }
+      },
+      {
+        "name": "wer",
+        "description": "Text for calculating the Levenshtein edit distance for the result text (word error rate)",
         "in": "query",
         "required": false,
         "schema": {
@@ -116,13 +126,14 @@
           "enum": [
             "kaldi",
             "google",
-            "ibm"
+            "ibm",
+            "azure"
           ]
         }
       },
       {
         "name": "cache",
-        "description": "Disable result cache",
+        "description": "Use result cache (default Y)",
         "in": "query",
         "required": false,
         "schema": {
@@ -181,6 +192,7 @@
           "enum": [
             "google",
             "ibm",
+            "azure",
             "marytts",
             "picotts"
           ]
@@ -237,6 +249,7 @@
           "enum": [
             "google",
             "ibm",
+            "azure",
             "marytts",
             "picotts"
           ]
@@ -305,14 +318,15 @@
           "enum": [
             "google",
             "ibm",
+            "azure",
             "marytts",
             "picotts"
           ]
         }
       },
       {
         "name": "cache",
-        "description": "Disable result cache",
+        "description": "Use result cache (default Y)",
         "in": "query",
         "required": false,
         "schema": {
@@ -587,7 +601,8 @@
           "enum": [
             "kaldi",
             "google",
-            "ibm"
+            "ibm",
+            "azure"
           ]
         }
       }
