improving element handling to avoid false starts before the audio is fully ready to stream

nfriedly · nfriedly · commit 2a20d93b33e8 · 2016-02-02T10:08:38.000-05:00
diff --git a/README.md b/README.md
@@ -113,3 +113,5 @@ In addition to the standard [Node.js stream events](https://nodejs.org/api/strea
 * run integration tests on travis (fall back to offline server for pull requests)
 * more tests in general
 * update node-sdk to use current version of this lib's RecognizeStream (and also provide the FormatStream + anything else that might be handy)
+* improve docs
+* look at supporting/migrating to https://streams.spec.whatwg.org/ / https://github.com/whatwg/streams once it's ready
diff --git a/dist/watson-speech.js b/dist/watson-speech.js
diff --git a/speech-to-text/format-stream.js b/speech-to-text/format-stream.js
@@ -38,12 +38,12 @@ var reRepeatedCharacter = /(.)\1{2,}/g; // detect the same character repeated th
 var reDUnderscoreWords = /D_[^\s]+/g; // replace D_(anything)
 
 /**
- * Formats a single alternative of a final or interim result
+ * Formats one or more words, removing special symbols, junk, and spacing for some languages
  * @param text
  * @param isFinal
  * @returns {String}
  */
-FormatStream.prototype.format = function format(text, isFinal) {
+FormatStream.prototype.clean = function clean(text) {
   // clean out "junk"
   text = text.trim().replace(reHesitation, this.opts.hesitation)
     .replace(reRepeatedCharacter, '')
@@ -54,24 +54,35 @@ FormatStream.prototype.format = function format(text, isFinal) {
     return text;
   }
 
-  // capitalize first word
-  text = text.charAt(0).toUpperCase() + text.substring(1);
-
   // remove spaces for Japanese and Chinese
   if (this.isJaCn) {
     text = text.replace(/ /g,'');
   }
 
-  // if final, insert a period and restore the trailing space
-  if (isFinal) {
-      text = text + (this.isJaCn ? '。' : '. ');
-  }
   return text;
 };
 
+/**
+ * Capitalizes the first word of a sentence
+ * @param text
+ * @returns {string}
+ */
+FormatStream.prototype.capitalize = function capitalize(text) {
+  // capitalize first word, returns '' in the case of an empty word
+  return text.charAt(0).toUpperCase() + text.substring(1);
+};
+
+/**
+ * Appends a sentence-ending period to the text ('. ' with a trailing space, or '。' for Japanese and Chinese)
+ * @param text
+ * @returns {string}
+ */
+FormatStream.prototype.period = function period(text) {
+  return text + (this.isJaCn ? '。' : '. ')
+};
 
 FormatStream.prototype._transform = function(chunk, encoding, next) {
-  this.push(this.format(chunk.toString(), true));
+  this.push(this.period(this.capitalize(this.clean(chunk.toString()))));
   next();
 };
 
@@ -82,9 +93,25 @@ FormatStream.prototype._transform = function(chunk, encoding, next) {
  */
 FormatStream.prototype.handleResult = function handleResult(result) {
   result = clone(result);
-  result.alternatives = result.alternatives.map(function(alternative) {
-    alternative.transcript = this.format(alternative.transcript, result.final);
-    return alternative;
+  result.alternatives = result.alternatives.map(function(alt) {
+    alt.transcript = this.capitalize(this.clean(alt.transcript));
+    if (result.final) {
+      alt.transcript = this.period(alt.transcript)
+    }
+    if (alt.timestamps) {
+      alt.timestamps = alt.timestamps.map(function(ts, i, arr) {
+        // timestamps is an array of arrays, each sub-array is in the form ["word", startTime, endTime]
+        ts[0] = this.clean(ts[0]);
+        if (i===0) {
+          ts[0] = this.capitalize(ts[0])
+        }
+        if (i == arr.length-1 && result.final) {
+          ts[0] = this.period(ts[0])
+        }
+        return ts;
+      }, this);
+    }
+    return alt;
   }, this);
   this.emit('result', result);
 };
diff --git a/speech-to-text/media-element-audio-stream.js b/speech-to-text/media-element-audio-stream.js
@@ -22,20 +22,24 @@ function MediaElementAudioStream(source, opts) {
     // Possible values: null, 256, 512, 1024, 2048, 4096, 8192, 16384
     bufferSize:null,
     muteSource: false,
-    autoplay: true
+    autoplay: true,
+    crossOrigin: "anonymous" // required for cross-domain audio playback
   }, opts);
 
   // We can only emit one channel's worth of audio, so only one input. (Who has multiple microphones anyways?)
   var inputChannels = 1;
 
-  // we shouldn't need any output channels (going back to the browser), but chrome is buggy and won't give us any audio without one
+  // we shouldn't need any output channels (going back to the browser - that's what the gain node is for), but chrome is buggy and won't give us any audio without one
   var outputChannels = 1;
 
   Readable.call(this, opts);
 
   var self = this;
   var recording = true;
 
+  // I can't seem to find any documentation for this on <audio> elements, but it seems to be required for cross-domain usage (in addition to CORS headers)
+  //source.crossOrigin = opts.crossOrigin;
+
   /**
    * Convert and emit the raw audio data
    * @see https://developer.mozilla.org/en-US/docs/Web/API/ScriptProcessorNode/onaudioprocess
@@ -85,15 +89,29 @@ function MediaElementAudioStream(source, opts) {
   // other half of workaround for chrome bugs
   scriptProcessor.connect(context.destination);
 
-  this.stop = function() {
+  // https://developer.mozilla.org/en-US/docs/Web/Guide/Events/Media_events
+  function start() {
+    source.play();
+    source.removeEventListener("canplaythrough", start);
+  }
+  if (opts.autoplay) {
+    source.addEventListener("canplaythrough", start);
+  }
+
+  function end() {
     recording = false;
-    source.pause();
-    source.currentTime = 0;
+    scriptProcessor.disconnect();
     self.push(null);
     self.emit('close');
+  }
+  source.addEventListener("ended", end);
+
+  this.stop = function() {
+    source.pause();
+    end();
   };
 
-  source.addEventListener("ended", this.stop);
+  source.addEventListener("error", this.emit.bind(this, 'error'));
 
   process.nextTick(function() {
     self.emit('format', {
@@ -103,16 +121,13 @@ function MediaElementAudioStream(source, opts) {
       signed: true,
       float: true
     });
-    if (opts.autoplay) {
-      source.play();
-    }
   });
 
 }
 util.inherits(MediaElementAudioStream, Readable);
 
 MediaElementAudioStream.prototype._read = function(/* bytes */) {
-  // no-op, (flow-control doesn't really work on sound)
+  // no-op, (back-pressure flow-control doesn't really work on sound)
 };
 
 /**
diff --git a/test/spec.js b/test/spec.js
@@ -29,18 +29,18 @@ function getAudio() {
 // integration = testing against actual watson servers
 var offline = process.env.TEST_MODE !== 'integration';
 var chrome = navigator.userAgent.indexOf('Chrome') >=0;
+var firefox = navigator.userAgent.indexOf('Firefox') >=0;
 var travis = !!process.env.TRAVIS;
 
 describe("WatsonSpeechToText", function() {
 
   this.timeout(30*1000);
 
-  // not sure why, but I can't convince firefox or chrome to actually play <audio> elements during tests
-  // also, on travis, the element never appears to stop playing (or, more likely, it nevers starts in the first place)
+  // firefox on travis always times out for this test, not sure why (it might be due to travis's older version of ff)
   it('should transcribe <audio> elements', function(done) {
     getConfig().then(function(cfg) {
       var audioElement = new Audio();
-      audioElement.crossOrigin = true;
+      audioElement.crossOrigin = "anonymous";
       audioElement.src = "http://localhost:9877/audio.wav";
       cfg.element = audioElement;
       cfg.muteSource = true;