SpeakerStream: handle early speaker_labels, include keywords, alternatives, etc

nfriedly · nfriedly · commit 244bd2d63534 · 2016-12-18T14:31:56.000-05:00
* Can now gracefully handle early speaker_labels and recover once the matching result arrives
* Includes original result w/ keywords, alternatives, etc
  * may not be matched to the exact correct result currently
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,12 @@
 # Changelog
 
+### v0.27.0
+* TimingStream rewrite - now emits exact results received from the service, always in the exact order received
+  * old version created extra interim results and could emit speaker_labels before their matching final result in certain circumstances
+  * emitAt now defaults to END to allow for interim results even when final is cached
+* SpeakerStream now emits keywords, alternatives, etc, although sometimes on a slightly earlier result than where the word is mentioned
+* SpeakerStream now gracefully handles situations where labels arrive before the matching final result
+
 ### v0.26.0
 * Renamed RecognizeStream 'connect' event to 'open' to match 'close' event
 * Removed deprecated connection-close event
diff --git a/speech-to-text/speaker-stream.js b/speech-to-text/speaker-stream.js
@@ -21,18 +21,19 @@ var Transform = require('stream').Transform;
 var util = require('util');
 var pullAllWith = require('lodash.pullallwith');
 var noTimestamps = require('./no-timestamps');
+var clone = require('clone');
 
 /**
  * Object-Mode stream that splits up results by speaker.
  *
  * Output format is similar to existing results formats, but with an extra speaker field,
  *
  * Output results array will usually contain multiple results.
- * All results are interim until the final batch; the text will not change, but the speaker may, and so the text may move from one interim result to another.
+ * All results are interim until the final batch; the text may change (if options.speakerlessInterim is enabled) or move from one interim result to another.
  *
- * Note: when combined with a TimingStream, data events may contain a combination of final and interim results (with the last one sometimes being interim)
+ * Keywords, word_alternatives, and other features may appear on results that come slightly earlier than the timestamp due to the way things are split up.
  *
- * Ignores interim results from the service.
+ * Ignores interim results from the service unless options.speakerlessInterim is enabled.
  *
  * @constructor
  * @param {Object} options
@@ -55,7 +56,7 @@ function SpeakerStream(options) {
    * @type {Array<Array>}
    * @private
    */
-  this.timestamps = [];
+  this.results = [];
   /**
    * speaker_labels is an array of objects.
    * Example:
@@ -82,6 +83,12 @@ function SpeakerStream(options) {
    * @private
    */
   this.speaker_labels = [];
+
+  this.mismatchErrorEmitted = false;
+
+  // flag to signal that labels were received before results, and therefore
+  // the stream needs to emit on the next batch of final results
+  this.extraLabels = false;
 }
 util.inherits(SpeakerStream, Transform);
 
@@ -103,60 +110,111 @@ SpeakerStream.ERROR_MISMATCH = 'MISMATCH';
  */
 SpeakerStream.prototype.buildMessage = function() {
   var final = this.isFinal();
-  var errored = false;
+  this.extraLabels = false;
 
+  // first match all speaker_labels to the appropriate word and result
   // assumes that each speaker_label will have a matching word timestamp at the same index
   // stops processing and emits an error if this assumption is violated
-  var pairs = this.speaker_labels.map(function(label, i) {
-    var timestamp = this.timestamps[i];
-    if (!timestamp || timestamp[FROM] !== label.from || timestamp[TO] !== label.to) {
-      if (!errored) {
+  var resultIndex = 0;
+  var timestampIndex = -1;
+  // eslint-disable-next-line camelcase
+  var words = this.speaker_labels.map(function(speaker_label) {
+    var result = this.results[resultIndex];
+    timestampIndex++;
+    var timestamp = result.alternatives[0].timestamps[timestampIndex];
+    if (!timestamp) {
+      timestampIndex = 0;
+      resultIndex++;
+      result = this.results[resultIndex];
+      timestamp = result && result.alternatives[0].timestamps[timestampIndex];
+    }
+    if (!timestamp) {
+      // this shouldn't happen normally, but the TimingStream could inadvertently cause a
+      // speaker_labels to be emitted before a result
+      this.extraLabels = true;
+      return null;
+    }
+    if (timestamp[FROM] !== speaker_label.from || timestamp[TO] !== speaker_label.to) {
+      if (!this.mismatchErrorEmitted) {
         var err = new Error('Mismatch between speaker_label and word timestamp');
         err.name = SpeakerStream.ERROR_MISMATCH;
-        err.speaker_label = label;
+        // eslint-disable-next-line camelcase
+        err.speaker_label = speaker_label;
         err.timestamp = timestamp;
+        // eslint-disable-next-line camelcase
         err.speaker_labels = this.speaker_labels;
-        err.timestamps = this.timestamps;
+        err.results = this.results;
         this.emit('error', err);
-        errored = true;
+        this.mismatchErrorEmitted = true; // If one is off, then a bunch probably are. Just emit one error.
       }
       return null;
     }
-    return [timestamp, label];
+    return {
+      timestamp: timestamp,
+      speaker: speaker_label.speaker,
+      result: result
+    };
   }, this);
 
-  if (errored) {
+  // assume that there's nothing new to emit right now,
+  // wait for new results to match our new labels
+  if (this.extraLabels) {
     return;
   }
 
-  var results = pairs.reduce(function(arr, pair) {
-    // this turns our pairs into something that looks like a regular results object, only with a speaker field
-    // each result represents a single "line" from a particular speaker
-    // todo: consider also splitting results up at pauses (where they are split when they arrive from the service) - FormatStream helps here
-    var currentResult = arr[arr.length - 1];
-    if (!currentResult || currentResult.speaker !== pair[1].speaker) {
-      // new speaker - start a new result
-      // todo: consider trying to include word alternatives and other features in these results
-      currentResult = {
-        speaker: pair[1].speaker,
-        alternatives: [{
-          transcript: pair[0][WORD] + ' ',
-          timestamps: [
-            pair[0]
-          ]
-        }],
-        final: final
+  // filter out any nulls
+  words = words.filter(function(w) {
+    return w;
+  });
+
+  // group the words together into utterances by speaker
+  var utterances = words.reduce(function(arr, word) {
+    var utterance = arr[arr.length - 1];
+    // any time the speaker changes or the (original) result changes, create a new utterance
+    if (!utterance || utterance.speaker !== word.speaker || utterance.result !== word.result) {
+      utterance = {
+        speaker: word.speaker,
+        timestamps: [word.timestamp],
+        result: word.result
       };
       // and add it to the list
-      arr.push(currentResult);
+      arr.push(utterance);
     } else {
       // otherwise just append the current word to the current result
-      currentResult.alternatives[0].transcript += pair[0][WORD] + ' ';
-      currentResult.alternatives[0].timestamps.push(pair[0]);
+      utterance.timestamps.push(word.timestamp);
     }
     return arr;
   }, []);
 
+  // create new results
+  var results = utterances.map(function(utterance, i) {
+
+    // if this is the first usage of this result, clone the original (to keep keywords and such)
+    // otherwise create a new one
+    var result;
+    var lastUtterance = utterances[i - 1] || {};
+    if (utterance.result === lastUtterance.result) {
+      result = {alternatives: [{}]};
+    } else {
+      result = clone(utterance.result);
+    }
+
+    // update the result object
+    // set the speaker
+    result.speaker = utterance.speaker;
+    // overwrite the transcript and timestamps on the first alternative
+    var alt = result.alternatives[0];
+    alt.transcript = utterance.timestamps.map(function(ts) {
+      return ts[WORD];
+    }).join(' ') + ' ';
+    alt.timestamps = utterance.timestamps;
+    // overwrite the final value
+    result.final = final;
+    // todo: split up words_alternatives, keywords, etc and copy to appropriate result for time
+
+    return result;
+  });
+
   // result_index is always 0 because the results always includes the entire conversation so far.
   return {results: results, result_index: 0};
 };
@@ -175,7 +233,7 @@ SpeakerStream.prototype.handleResults = function(data) {
   data.results.filter(function(result) {
     return result.final;
   }).forEach(function(result) {
-    this.timestamps = this.timestamps.concat(result.alternatives[0].timestamps);
+    this.results.push(result);
   }, this);
 };
 
@@ -220,6 +278,10 @@ SpeakerStream.prototype._transform = function(data, encoding, next) {
       message = this.buildMessage();
       message.results = message.results.concat(data.results);
     }
+    // clean up if things got out of order
+    if (this.extraLabels && data.results.length && data.results[0].final === true) {
+      message = this.buildMessage();
+    }
   }
   if (Array.isArray(data.speaker_labels)) {
     this.handleSpeakerLabels(data);
@@ -247,18 +309,23 @@ SpeakerStream.prototype._transform = function(data, encoding, next) {
  * @private
  */
 SpeakerStream.prototype._flush = function(done) {
-  if (this.timestamps.length !== this.speaker_labels.length) {
+  var timestamps = this.results.map(function(r) {
+    return r.alternatives[0].timestamps;
+  }).reduce(function(a,b) {
+    return a.concat(b);
+  }, []);
+  if (timestamps.length !== this.speaker_labels.length) {
     var msg;
-    if (this.timestamps.length && !this.speaker_labels.length) {
+    if (timestamps.length && !this.speaker_labels.length) {
       msg = 'No speaker_labels found. SpeakerStream requires speaker_labels to be enabled.';
     } else {
-      msg = 'Mismatch between number of word timestamps (' + this.timestamps.length + ') and number of speaker_labels (' +
+      msg = 'Mismatch between number of word timestamps (' + timestamps.length + ') and number of speaker_labels (' +
         this.speaker_labels.length + ') - some data may be lost.';
     }
     var err = new Error(msg);
     err.name = SpeakerStream.ERROR_MISMATCH;
     err.speaker_labels = this.speaker_labels;
-    err.timestamps = this.timestamps;
+    err.timestamps = this.results;
     this.emit('error', err);
   }
   done();
diff --git a/speech-to-text/timing-stream.js b/speech-to-text/timing-stream.js
@@ -14,13 +14,13 @@ var noTimestamps = require('./no-timestamps');
  * @todo: fix TimingStream to work with the output of the SpeakerStream
  *
  * @param {Object} [opts]
- * @param {*} [opts.emitAt=TimingStream.START] - set to TimingStream.END to only emit text that has been completely spoken.
+ * @param {*} [opts.emitAt=TimingStream.END] - set to TimingStream.START for a more subtitles-like output where results are returned as soon as the utterance begins
  * @param {Number} [opts.delay=0] - Additional delay (in seconds) to apply before emitting words, useful for precise syncing to audio tracks. May be negative
  * @constructor
  */
 function TimingStream(opts) {
   this.options = defaults(opts, {
-    emitAt: TimingStream.START,
+    emitAt: TimingStream.END,
     delay: 0,
     allowHalfOpen: true, // keep the readable side open after the source closes
     writableObjectMode: true
diff --git a/test/speaker-stream-spec.js b/test/speaker-stream-spec.js
@@ -242,7 +242,105 @@ describe('SpeakerStream', function() {
     stream.end();
   });
 
+  it('should handle early speaker_labels gracefully', function(done) {
+    // there is/was a bug in the timing stream that could cause this in certain scenarios
+    var stream = new SpeakerStream();
+    stream.on('error', done);
+    var actual = [];
+    stream.on('data', function(data) {
+      actual.push(data);
+    });
+
+    var expected = [{
+      results: [{
+        speaker: 0,
+        alternatives: [{
+          timestamps: [
+            ['hi', 0.06, 0.28],
+          ],
+          transcript: 'hi '
+        }],
+        final: false
+      }],
+      result_index: 0
+    }, {
+      results: [{
+        speaker: 0,
+        alternatives: [{
+          timestamps: [
+            ['hi', 0.06, 0.28],
+          ],
+          transcript: 'hi '
+        }],
+        final: true
+      },
+        {
+          speaker: 1,
+          alternatives: [{
+            timestamps: [
+              ['hello', 0.28, 0.37],
+            ],
+            transcript: 'hello '
+          }],
+          final: true
+        }],
+      result_index: 0
+    }];
+
+    stream.on('end', function() {
+      assert.deepEqual(actual, expected);
+      done();
+    });
+
+    stream.write({
+      results: [{
+        alternatives: [{
+          timestamps: [
+            ['hi', 0.06, 0.28],
+          ],
+          transcript: 'hi '
+        }],
+        final: true,
+      }],
+      result_index: 0
+    });
+    stream.write({
+      speaker_labels: [{
+        from: 0.06,
+        to: 0.28,
+        speaker: 0,
+        confidence: 0.512,
+        final: false
+      }]
+    });
+    // this one is early
+    stream.write({
+      speaker_labels: [{
+        from: 0.28,
+        to: 0.37,
+        speaker: 1,
+        confidence: 0.512,
+        final: true
+      }]
+    });
+    // or, this is late
+    stream.write({
+      results: [{
+        alternatives: [{
+          timestamps: [
+            ['hello', 0.28, 0.37],
+          ],
+          transcript: 'hello '
+        }],
+        final: true
+      }],
+      result_index: 0
+    });
+    stream.end();
+  });
+
   describe('with TimingStream', function() {
+    var TimingStream = require('../speech-to-text/timing-stream.js');
     var clock;
     beforeEach(function() {
       clock = sinon.useFakeTimers();
@@ -252,9 +350,8 @@ describe('SpeakerStream', function() {
       clock.restore();
     });
 
-    it('should produce the same output with results from a TimingStream', function(done) {
+    it('should produce the same output with and without a TimingStream', function(done) {
       var inputMessages = require('./resources/car_loan_stream.json');
-      var TimingStream = require('../speech-to-text/timing-stream.js');
       var actualSpeakerStream = new SpeakerStream();
       var expectedSpeakerStream = new SpeakerStream();
       var timingStream = new TimingStream({objectMode: true});
@@ -290,7 +387,6 @@ describe('SpeakerStream', function() {
     });
   });
 
-
   it('should provide early results when options.speakerlessInterim=true', function(done) {
     var stream = new SpeakerStream({speakerlessInterim: true});
     stream.on('error', done);
diff --git a/test/timing-stream-spec.js b/test/timing-stream-spec.js