Skip to content

Commit 258488a

Browse files
committed
Fixed FormatStream & tests around repeated characters to avoid breaking numbers.
1 parent 8189ebf commit 258488a

File tree

2 files changed

+51
-12
lines changed

2 files changed

+51
-12
lines changed

speech-to-text/format-stream.js

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,8 @@ function FormatStream(opts) {
3131
}
3232
util.inherits(FormatStream, Transform);
3333

34-
var reHesitation = /%HESITATION\s/g; // when the service tetects a "hesitation" pause, it literally puts the string "%HESITATION" into the transcription
35-
var reRepeatedCharacter = /(.)\1{2,}/g; // detect the same character repeated three or more times and remove it
34+
var reHesitation = /%HESITATION\s/g; // when the service detects a "hesitation" pause, it literally puts the string "%HESITATION" into the transcription
35+
var reRepeatedCharacter = /([a-z])\1{2,}/ig; // detect the same letter repeated three or more times and remove it (letters only, so digits in numbers are left intact)
3636
var reDUnderscoreWords = /D_[^\s]+/g; // replace D_(anything)
3737

3838
/**
@@ -43,21 +43,16 @@ var reDUnderscoreWords = /D_[^\s]+/g; // replace D_(anything)
4343
*/
4444
FormatStream.prototype.clean = function clean(text) {
4545
// clean out "junk"
46-
text = text.trim().replace(reHesitation, this.options.hesitation)
46+
text = text.replace(reHesitation, this.options.hesitation)
4747
.replace(reRepeatedCharacter, '')
4848
.replace(reDUnderscoreWords,'');
4949

50-
// short-circuit if there's no actual text (avoids getting multiple periods after a pause)
51-
if (!text) {
52-
return text;
53-
}
54-
5550
// remove spaces for Japanese and Chinese
5651
if (this.isJaCn) {
5752
text = text.replace(/ /g,'');
5853
}
5954

60-
return text;
55+
return text.trim();
6156
};
6257

6358
/**
@@ -71,11 +66,15 @@ FormatStream.prototype.capitalize = function capitalize(text) {
7166
};
7267

7368
/**
74-
* puts a period on the end of a sentence
69+
* Puts a period on the end of a sentence
7570
* @param {String} text
7671
* @returns {string}
7772
*/
7873
FormatStream.prototype.period = function period(text) {
74+
// don't put a period down if the clean stage removed all of the text
75+
if (!text) {
76+
return ' ';
77+
}
7978
return text + (this.isJaCn ? '。' : '. ');
8079
};
8180

test/format-stream-spec.js

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,11 @@ describe('FormatStream', function() {
88

99
it('should format strings', function(done) {
1010
var stream = new FormatStream();
11+
stream.setEncoding('utf8');
1112
var source = 'foo bar ';
1213
var expected = 'Foo bar. ';
1314
stream.on('data', function(actual) {
14-
assert(actual, expected);
15+
assert.equal(actual, expected);
1516
done();
1617
});
1718
stream.on('error', done);
@@ -20,6 +21,7 @@ describe('FormatStream', function() {
2021

2122
it('should format objects', function(done) {
2223
var stream = new FormatStream({objectMode: true});
24+
stream.setEncoding('utf8');
2325
var source = {alternatives:
2426
[{
2527
confidence: 0.881,
@@ -33,13 +35,51 @@ describe('FormatStream', function() {
3335
final: true}],
3436
result_index: 0};
3537
stream.on('data', function(actual) {
36-
assert(actual, expected);
38+
assert.equal(actual, expected);
3739
done();
3840
});
3941
stream.on('error', done);
4042
stream.write(source);
4143
});
4244

45+
it('should drop repeated characters', function(done) {
46+
var stream = new FormatStream();
47+
stream.setEncoding('utf8');
48+
var source = 'I, uh mmmmmmmmm ';
49+
var expected = 'I, uh. ';
50+
stream.on('data', function(actual) {
51+
assert.equal(actual, expected);
52+
done();
53+
});
54+
stream.on('error', done);
55+
stream.write(source);
56+
});
57+
58+
it('should not add a period to empty text', function(done) {
59+
var stream = new FormatStream();
60+
stream.setEncoding('utf8');
61+
var source = 'mmmmmmmmm '; // this will be stripped by the repeated character check
62+
var expected = ' ';
63+
stream.on('data', function(actual) {
64+
assert.equal(actual, expected);
65+
done();
66+
});
67+
stream.on('error', done);
68+
stream.write(source);
69+
});
70+
71+
it('should not drop portions of numbers when smart formatting is enabled', function(done) {
72+
var stream = new FormatStream();
73+
stream.setEncoding('utf8');
74+
var source = '1000101 ';
75+
var expected = '1000101. ';
76+
stream.on('data', function(actual) {
77+
assert.equal(actual, expected);
78+
done();
79+
});
80+
stream.on('error', done);
81+
stream.write(source);
82+
});
4383

4484
/*
4585
{ results:

0 commit comments

Comments
 (0)