
Commit 04336b8

switch sources to audiobuffers, split internals of l16 stream
The l16 stream (formerly the wav stream) now does the downsampling and 16-bit conversion in separate steps; this is to allow for a future change to native downsampling. I also removed the wav header because it's not needed.
1 parent bb48e63 commit 04336b8
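
Roughly, the split looks like this (an illustrative sketch only; the function names and the naive nearest-sample downsampling are assumptions, not the actual webaudio-l16-stream internals):

```js
// Illustrative sketch only - not the actual webaudio-l16-stream internals.
// Step 1: naive downsample from the AudioContext's rate (e.g. 44100 Hz) to 16000 Hz.
function downsample(samples, sourceRate, targetRate) {
  var ratio = sourceRate / targetRate;
  var out = new Float32Array(Math.floor(samples.length / ratio));
  for (var i = 0; i < out.length; i++) {
    out[i] = samples[Math.floor(i * ratio)]; // nearest-sample pick, no low-pass filtering
  }
  return out;
}

// Step 2: convert Float32 samples (-1..1) to 16-bit signed little-endian PCM (l16).
function floatTo16BitPCM(samples) {
  var view = new DataView(new ArrayBuffer(samples.length * 2));
  for (var i = 0; i < samples.length; i++) {
    var s = Math.max(-1, Math.min(1, samples[i]));
    view.setInt16(i * 2, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
  }
  return new Buffer(view.buffer); // node-style Buffer for the stream pipeline
}
```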

7 files changed: +247 -214 lines

README.md

Lines changed: 11 additions & 5 deletions
@@ -3,17 +3,22 @@ IBM Watson Speech To Text Browser Client Library
 
 Allows you to easily add voice recognition to any web app with minimal code.
 
+**Warning** This library is still early-stage and may see significant breaking changes.
+
 **For Web Browsers Only** This library is primarily intended for use in browsers.
 Check out [watson-developer-cloud](https://www.npmjs.com/package/watson-developer-cloud) to use Watson services (speech and others) from Node.js.
 
-**Warning** This library is still early-stage and may see significant breaking changes.
+However, a server-side component is required to generate auth tokens.
+The examples/ folder includes a node.js one, and SDKs are available for [Node.js](https://github.com/watson-developer-cloud/node-sdk#authorization),
+[Java](https://github.com/watson-developer-cloud/java-sdk),
+[Python](https://github.com/watson-developer-cloud/python-sdk/blob/master/examples/authorization_v1.py),
+and there is also a [REST API](http://www.ibm.com/smarterplanet/us/en/ibmwatson/developercloud/doc/getting_started/gs-tokens.shtml).
 
 See several examples at https://github.com/watson-developer-cloud/speech-javascript-sdk/tree/master/examples
 
 This library is built with [browserify](http://browserify.org/) and easy to use in browserify-based projects (`npm install --save watson-speech`), but you can also grab the compiled bundle from the
 `dist/` folder and use it as a standalone library.
 
-
 ## `WatsonSpeech.SpeechToText` Basic API
 
 Complete API docs should be published at http://watson-developer-cloud.github.io/speech-javascript-sdk/
@@ -101,7 +106,8 @@ Inherits `.stop()` method and `result` event from the `RecognizeStream`.
 
 * Fix bugs around `.stop()`
 * Solidify API
-* (eventually) add text-to-speech support
+* support objectMode instead of having random events
+* add text-to-speech support
 * add an example that includes alternatives and word confidence scores
 * automate dist/ generation (and possibly move it)
 * enable eslint
@@ -111,5 +117,5 @@ Inherits `.stop()` method and `result` event from the `RecognizeStream`.
 * more tests in general
 * update node-sdk to use current version of this lib's RecognizeStream (and also provide the FormatStream + anything else that might be handy)
 * improve docs
-* look at supporting/migrating to https://streams.spec.whatwg.org/ / https://github.com/whatwg/streams once it's ready
-* Add Text to Speech once CORS support is available.
+
+
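
The token note added to the README implies a small client-side step; a hedged sketch (the `/api/token` endpoint is an assumption; the examples/ folder includes a node.js server for this):

```js
// Hedged sketch: fetch an auth token from your own server-side endpoint,
// then start recognition. The '/api/token' path is an assumption.
var xhr = new XMLHttpRequest();
xhr.open('GET', '/api/token');
xhr.onload = function() {
  var stream = WatsonSpeech.SpeechToText.recognizeMicrophone({token: xhr.responseText});
  stream.on('result', console.log.bind(console)); // 'result' event per the README above
};
xhr.send();
```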

speech-to-text/index.js

Lines changed: 3 additions & 3 deletions
@@ -7,15 +7,15 @@ module.exports = {
   recognizeElement: require('./recognize-element'),
 
   // individual components to build more customized solutions
-  WebAudioWavStream: require('./webaudio-wav-stream'),
+  WebAudioL16Stream: require('./webaudio-l16-stream'),
   MediaElementAudioStream: require('./media-element-audio-stream'),
   RecognizeStream: require('./recognize-stream'),
   FilePlayer: require('./file-player'),
   getUserMedia: require('./getusermedia'),
   FormatStream: require('./format-stream'),
   TimingStream: require('./timing-stream'),
 
-  // external (provided here to allow the lib to be used standalone w/out browserify)
+  // external components provided here to allow the lib to be used standalone (w/out browserify)
   MicrophoneStream: require('microphone-stream'),
-  Buffer: Buffer // may be needed to send data to the streams
+  Buffer: Buffer
 };
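
For standalone use, these exports can be wired together by hand. A minimal sketch, assuming the dist/ bundle is loaded as `WatsonSpeech` and a `token` has already been fetched:

```js
// Hand-assembled pipeline sketch (assumes a `WatsonSpeech` global and a valid `token`).
var stt = WatsonSpeech.SpeechToText;

stt.getUserMedia({video: false, audio: true}).then(function(mic) {
  var recognizeStream = new stt.RecognizeStream({
    token: token,
    'content-type': 'audio/l16;rate=16000' // matches the L16 stream's output
  });

  new stt.MicrophoneStream(mic, {objectMode: true})
    .pipe(new stt.WebAudioL16Stream()) // downsampling + 16-bit conversion
    .pipe(recognizeStream);

  recognizeStream.on('result', console.log.bind(console));
});
```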

speech-to-text/media-element-audio-stream.js

Lines changed: 34 additions & 31 deletions
@@ -20,10 +20,12 @@ function MediaElementAudioStream(source, opts) {
     // buffer size to balance between latency and audio quality."
     // https://developer.mozilla.org/en-US/docs/Web/API/AudioContext/createScriptProcessor
     // Possible values: null, 256, 512, 1024, 2048, 4096, 8192, 16384
-    bufferSize: null,
+    // however, webkitAudioContext (safari) requires it to be set
+    bufferSize: (typeof AudioContext != "undefined" ? null : 4096),
     muteSource: false,
     autoplay: true,
-    crossOrigin: "anonymous" // required for cross-domain audio playback
+    crossOrigin: "anonymous", // required for cross-domain audio playback
+    objectMode: true // true = emit AudioBuffers w/ audio + some metadata, false = emit node.js Buffers (with binary data only)
   }, opts);
 
   // We can only emit one channel's worth of audio, so only one input. (Who has multiple microphones anyways?)
@@ -38,7 +40,7 @@ function MediaElementAudioStream(source, opts) {
   var recording = true;
 
   // I can't seem to find any documentation for this on <audio> elements, but it seems to be required for cross-domain usage (in addition to CORS headers)
-  //source.crossOrigin = opts.crossOrigin;
+  source.crossOrigin = opts.crossOrigin;
 
   /**
    * Convert and emit the raw audio data
@@ -48,32 +50,15 @@ function MediaElementAudioStream(source, opts) {
   function processAudio(e) {
     // onaudioprocess can be called at least once after we've stopped
     if (recording) {
-
-      var raw = e.inputBuffer.getChannelData(0);
-
-      /**
-       * @event MicrophoneStream#raw
-       * @param {Float32Array} data raw audio data from browser - each sample is a number from -1 to 1
-       */
-      self.emit('raw', raw);
-
-      // Standard (non-object mode) Node.js streams only accept Buffers or Strings
-      var nodebuffer = new Buffer(raw.buffer);
-
-      /**
-       * Emit the readable/data event with a node-style buffer.
-       * Note: this is essentially a new DataView on the same underlying ArrayBuffer.
-       * The raw audio data is not actually copied or changed.
-       *
-       * @event MicrophoneStream#data
-       * @param {Buffer} chunk node-style buffer with audio data; buffers are essentially a Uint8Array
-       */
-      self.push(nodebuffer);
+      // todo: interleave channels in binary mode
+      self.push( opts.objectMode ? e.inputBuffer : new Buffer(e.inputBuffer.getChannelData(0)) );
     }
   }
 
-  var context = new AudioContext();
-  var audioInput = context.createMediaElementSource(source);
+  var AudioContext = window.AudioContext || window.webkitAudioContext;
+  // cache the source node & context since it's not possible to recreate it later
+  var context = source.context = source.context || new AudioContext();
+  var audioInput = source.node = source.node || context.createMediaElementSource(source);
   var scriptProcessor = context.createScriptProcessor(opts.bufferSize, inputChannels, outputChannels);
 
   scriptProcessor.onaudioprocess = processAudio;
@@ -84,23 +69,40 @@ function MediaElementAudioStream(source, opts) {
     gain.connect(context.destination);
   }
 
-  audioInput.connect(scriptProcessor);
-
-  // other half of workaround for chrome bugs
-  scriptProcessor.connect(context.destination);
+  /**
+   * Setup script processor to extract audio and also re-connect it via a no-op gain node if desired
+   *
+   * Delayed to avoid processing the stream of silence received before the file begins playing
+   */
+  function connect() {
+    audioInput.connect(scriptProcessor);
+    // other half of workaround for chrome bugs
+    scriptProcessor.connect(context.destination);
+    source.removeEventListener("playing", connect);
+  }
+  source.addEventListener("playing", connect);
 
   // https://developer.mozilla.org/en-US/docs/Web/Guide/Events/Media_events
+  // https://developer.mozilla.org/en-US/docs/Web/API/HTMLMediaElement/readyState
   function start() {
     source.play();
     source.removeEventListener("canplaythrough", start);
   }
   if (opts.autoplay) {
-    source.addEventListener("canplaythrough", start);
+    // play immediately if we have enough data, otherwise wait for the canplaythrough event
+    if (source.readyState === source.HAVE_ENOUGH_DATA) {
+      source.play();
+    } else {
+      source.addEventListener("canplaythrough", start);
+    }
   }
 
   function end() {
     recording = false;
     scriptProcessor.disconnect();
+    audioInput.disconnect();
+    //context.close(); // this prevents us from re-using the same audio element until the page is refreshed
     self.push(null);
     self.emit('close');
   }
@@ -114,6 +116,7 @@ function MediaElementAudioStream(source, opts) {
   source.addEventListener("error", this.emit.bind(this, 'error'));
 
   process.nextTick(function() {
+    // this is more useful for binary mode than object mode, but it won't hurt either way
     self.emit('format', {
       channels: 1,
       bitDepth: 32,
speech-to-text/recognize-element.js

Lines changed: 3 additions & 3 deletions
@@ -16,7 +16,7 @@
 
 'use strict';
 var MediaElementAudioStream = require('./media-element-audio-stream');
-var WebAudioTo16leStream = require('./webaudio-wav-stream');
+var L16 = require('./webaudio-l16-stream');
 var RecognizeStream = require('./recognize-stream.js');
 
 /**
@@ -33,13 +33,13 @@ module.exports = function recognizeElement(options) {
     throw new Error("WatsonSpeechToText: missing required parameter: opts.token");
   }
 
-  //options['content-type'] = 'audio/l16;rate=16000';
+  options['content-type'] = 'audio/l16;rate=16000'; // raw PCM audio (no wav header)
   var recognizeStream = new RecognizeStream(options);
 
   var sourceStream = new MediaElementAudioStream(options.element, options);
 
   sourceStream
-    .pipe(new WebAudioTo16leStream())
+    .pipe(new L16())
     .pipe(recognizeStream);
 
   recognizeStream.on('stop', sourceStream.stop.bind(sourceStream));
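
For reference, a hedged usage sketch of `recognizeElement` (the `token` variable and the element selector are assumptions):

```js
// Hedged sketch: transcribe a playing <audio> element.
// `token` is assumed to come from your server-side token endpoint.
var stream = WatsonSpeech.SpeechToText.recognizeElement({
  token: token,
  element: document.querySelector('audio')
});
stream.on('result', console.log.bind(console));
stream.on('error', console.error.bind(console));
```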

speech-to-text/recognize-microphone.js

Lines changed: 7 additions & 5 deletions
@@ -20,8 +20,7 @@
 var getUserMedia = require('./getusermedia');
 var MicrophoneStream = require('microphone-stream');
 var RecognizeStream = require('./recognize-stream.js');
-var WebAudioTo16leStream = require('./webaudio-wav-stream.js');
-
+var L16 = require('./webaudio-l16-stream.js');
 
 /**
  * Create and return a RecognizeStream from the user's microphone
@@ -37,13 +36,16 @@ module.exports = function recognizeMicrophone(options) {
     throw new Error("WatsonSpeechToText: missing required parameter: opts.token");
   }
 
-  //options['content-type'] = 'audio/l16;rate=16000';
+  options['content-type'] = 'audio/l16;rate=16000'; // raw PCM audio (no wav header)
   var recognizeStream = new RecognizeStream(options);
 
   getUserMedia({video: false, audio: true}).then(function(mic) {
-    var micStream = new MicrophoneStream(mic, options);
+    var micStream = new MicrophoneStream(mic, {
+      objectMode: true,
+      bufferSize: options.bufferSize
+    });
     micStream
-      .pipe(new WebAudioTo16leStream())
+      .pipe(new L16())
       .pipe(recognizeStream);
 
     recognizeStream.on('stop', micStream.stop.bind(micStream));
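
And the corresponding hedged sketch for `recognizeMicrophone`; note that `bufferSize` is now forwarded to the underlying `MicrophoneStream` as shown above:

```js
// Hedged sketch: the browser will prompt for microphone access.
// `token` is assumed to come from a server-side token endpoint.
var stream = WatsonSpeech.SpeechToText.recognizeMicrophone({
  token: token,
  bufferSize: 4096 // optional; forwarded to the underlying MicrophoneStream
});
stream.on('result', console.log.bind(console));
// later: stream.stop() to end recognition
```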
