@@ -16,14 +16,14 @@ var defaults = require('defaults');
16
16
*
17
17
* @param {Object } opts
18
18
* @param {String } opts.model - some models / languages need special handling
19
- * @param {String } [opts.hesitation='\u2026 '] - what to put down for a "hesitation" event, defaults to an ellipsis ( ...)
19
+ * @param {String } [opts.hesitation=''] - what to put down for a "hesitation" event, also consider \u2026 ( ellipsis: ...)
20
20
* @param {Boolean } [opts.objectMode=false] - emit `result` objects instead of string Buffers for the `data` events.
21
21
* @constructor
22
22
*/
23
23
function FormatStream ( opts ) {
24
24
this . options = defaults ( opts , {
25
25
model : '' , // some models should have all spaces removed
26
- hesitation : '\u2026' , // ellipsis
26
+ hesitation : '' ,
27
27
decodeStrings : false // false = don't convert strings to buffers before passing to _write
28
28
} ) ;
29
29
Transform . call ( this , this . options ) ;
@@ -33,7 +33,7 @@ function FormatStream(opts) {
33
33
}
34
34
util . inherits ( FormatStream , Transform ) ;
35
35
36
- var reHesitation = / % H E S I T A T I O N / g; // when the service detects a " hesitation" pause, it literally puts the string "%HESITATION" into the transcription
36
+ var reHesitation = / % H E S I T A T I O N ? / g; // http://www.ibm.com/watson/developercloud/doc/speech-to-text/output.shtml# hesitation - D_ is handled below
37
37
var reRepeatedCharacter = / ( [ a - z ] ) \1{ 2 , } / ig; // detect the same character repeated three or more times and remove it
38
38
var reDUnderscoreWords = / D _ [ ^ \s ] + / g; // replace D_(anything)
39
39
@@ -45,7 +45,7 @@ var reDUnderscoreWords = /D_[^\s]+/g; // replace D_(anything)
45
45
*/
46
46
FormatStream . prototype . clean = function clean ( text ) {
47
47
// clean out "junk"
48
- text = text . replace ( reHesitation , this . options . hesitation )
48
+ text = text . replace ( reHesitation , this . options . hesitation ? this . options . hesitation . trim ( ) + ' ' : this . options . hesitation )
49
49
. replace ( reRepeatedCharacter , '' )
50
50
. replace ( reDUnderscoreWords , '' ) ;
51
51
@@ -54,7 +54,7 @@ FormatStream.prototype.clean = function clean(text) {
54
54
text = text . replace ( / / g, '' ) ;
55
55
}
56
56
57
- return text . trim ( ) ;
57
+ return text . trim ( ) + ' ' ; // we want exactly 1 space at the end
58
58
} ;
59
59
60
60
/**
@@ -73,12 +73,13 @@ FormatStream.prototype.capitalize = function capitalize(text) {
73
73
* @returns {string }
74
74
*/
75
75
FormatStream . prototype . period = function period ( text ) {
76
+ text = text . trim ( ) ;
76
77
// don't put a period down if the clean stage removed all of the text
77
78
if ( ! text ) {
78
79
return ' ' ;
79
80
}
80
81
// just add a space if the sentence ends in an ellipsis
81
- if ( this . options . hesitation && text . substr ( - 1 ) === this . options . hesitation ) {
82
+ if ( text . substr ( - 1 ) === '\u2026' ) {
82
83
return text + ' ' ;
83
84
}
84
85
return text + ( this . isJaCn ? '。' : '. ' ) ;
@@ -119,17 +120,23 @@ FormatStream.prototype.formatString = function(str, isInterim) {
119
120
FormatStream . prototype . formatResult = function formatResult ( data ) {
120
121
data = clone ( data ) ;
121
122
if ( Array . isArray ( data . results ) ) {
122
- data . results . forEach ( function ( result ) {
123
+ data . results . forEach ( function ( result , i ) {
124
+
125
+ // if there are multiple interim results (as produced by the speaker stream),
126
+ // treat the text as final in all but the last result
127
+ var textFinal = result . final || ( i !== ( data . results . length - 1 ) ) ;
128
+
123
129
result . alternatives = result . alternatives . map ( function ( alt ) {
124
- alt . transcript = this . formatString ( alt . transcript , ! result . final ) ;
130
+ alt . transcript = this . formatString ( alt . transcript , ! textFinal ) ;
125
131
if ( alt . timestamps ) {
126
- alt . timestamps = alt . timestamps . map ( function ( ts , i , arr ) {
132
+ alt . timestamps = alt . timestamps . map ( function ( ts , j , arr ) {
127
133
// timestamps is an array of arrays; each sub-array is in the form ["word", startTime, endTime]
128
134
ts [ 0 ] = this . clean ( ts [ 0 ] ) ;
129
- if ( i === 0 ) {
135
+ if ( j === 0 ) {
130
136
ts [ 0 ] = this . capitalize ( ts [ 0 ] ) ;
131
137
}
132
- if ( i === arr . length - 1 && result . final ) {
138
+
139
+ if ( j === arr . length - 1 && textFinal ) {
133
140
ts [ 0 ] = this . period ( ts [ 0 ] ) ;
134
141
}
135
142
return ts ;
0 commit comments