Skip to content

Commit 2a20d93

Browse files
committed
improving element handling to avoid false starts before the audio is fully ready to stream
1 parent faf27d0 commit 2a20d93

File tree

5 files changed

+155
-49
lines changed

5 files changed

+155
-49
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,3 +113,5 @@ In addition to the standard [Node.js stream events](https://nodejs.org/api/strea
113113
* run integration tests on travis (fall back to offline server for pull requests)
114114
* more tests in general
115115
* update node-sdk to use current version of this lib's RecognizeStream (and also provide the FormatStream + anything else that might be handy)
116+
* improve docs
117+
* look at supporting/migrating to https://streams.spec.whatwg.org/ / https://github.com/whatwg/streams once it's ready

dist/watson-speech.js

Lines changed: 85 additions & 23 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

speech-to-text/format-stream.js

Lines changed: 40 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -38,12 +38,12 @@ var reRepeatedCharacter = /(.)\1{2,}/g; // detect the same character repeated th
3838
var reDUnderscoreWords = /D_[^\s]+/g; // replace D_(anything)
3939

4040
/**
41-
* Formats a single alternative of a final or interim result
41+
* Formats one or more words, removing special symbols, junk, and spacing for some languages
4242
* @param text
4343
* @param isFinal
4444
* @returns {String}
4545
*/
46-
FormatStream.prototype.format = function format(text, isFinal) {
46+
FormatStream.prototype.clean = function clean(text) {
4747
// clean out "junk"
4848
text = text.trim().replace(reHesitation, this.opts.hesitation)
4949
.replace(reRepeatedCharacter, '')
@@ -54,24 +54,35 @@ FormatStream.prototype.format = function format(text, isFinal) {
5454
return text;
5555
}
5656

57-
// capitalize first word
58-
text = text.charAt(0).toUpperCase() + text.substring(1);
59-
6057
// remove spaces for Japanese and Chinese
6158
if (this.isJaCn) {
6259
text = text.replace(/ /g,'');
6360
}
6461

65-
// if final, insert a period and restore the trailing space
66-
if (isFinal) {
67-
text = text + (this.isJaCn ? '。' : '. ');
68-
}
6962
return text;
7063
};
7164

65+
/**
66+
* Capitalizes the first word of a sentence
67+
* @param text
68+
* @returns {string}
69+
*/
70+
FormatStream.prototype.capitalize = function capitalize(text) {
71+
// capitalize first word, returns '' in the case of an empty word
72+
return text.charAt(0).toUpperCase() + text.substring(1);
73+
};
74+
75+
/**
76+
* Puts a period on the end of a sentence (an ideographic full stop "。" for Japanese and Chinese)
77+
* @param text
78+
* @returns {string}
79+
*/
80+
FormatStream.prototype.period = function period(text) {
81+
return text + (this.isJaCn ? '。' : '. ')
82+
};
7283

7384
FormatStream.prototype._transform = function(chunk, encoding, next) {
74-
this.push(this.format(chunk.toString(), true));
85+
this.push(this.period(this.capitalize(this.clean(chunk.toString()))));
7586
next();
7687
};
7788

@@ -82,9 +93,25 @@ FormatStream.prototype._transform = function(chunk, encoding, next) {
8293
*/
8394
FormatStream.prototype.handleResult = function handleResult(result) {
8495
result = clone(result);
85-
result.alternatives = result.alternatives.map(function(alternative) {
86-
alternative.transcript = this.format(alternative.transcript, result.final);
87-
return alternative;
96+
result.alternatives = result.alternatives.map(function(alt) {
97+
alt.transcript = this.capitalize(this.clean(alt.transcript));
98+
if (result.final) {
99+
alt.transcript = this.period(alt.transcript)
100+
}
101+
if (alt.timestamps) {
102+
alt.timestamps = alt.timestamps.map(function(ts, i, arr) {
103+
// timestamps is an array of arrays; each sub-array is in the form ["word", startTime, endTime]
104+
ts[0] = this.clean(ts[0]);
105+
if (i===0) {
106+
ts[0] = this.capitalize(ts[0])
107+
}
108+
if (i == arr.length-1 && result.final) {
109+
ts[0] = this.period(ts[0])
110+
}
111+
return ts;
112+
}, this);
113+
}
114+
return alt;
88115
}, this);
89116
this.emit('result', result);
90117
};

speech-to-text/media-element-audio-stream.js

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -22,20 +22,24 @@ function MediaElementAudioStream(source, opts) {
2222
// Possible values: null, 256, 512, 1024, 2048, 4096, 8192, 16384
2323
bufferSize:null,
2424
muteSource: false,
25-
autoplay: true
25+
autoplay: true,
26+
crossOrigin: "anonymous" // required for cross-domain audio playback
2627
}, opts);
2728

2829
// We can only emit one channel's worth of audio, so only one input. (Who has multiple microphones anyways?)
2930
var inputChannels = 1;
3031

31-
// we shouldn't need any output channels (going back to the browser), but chrome is buggy and won't give us any audio without one
32+
// we shouldn't need any output channels (going back to the browser - that's what the gain node is for), but chrome is buggy and won't give us any audio without one
3233
var outputChannels = 1;
3334

3435
Readable.call(this, opts);
3536

3637
var self = this;
3738
var recording = true;
3839

40+
// I can't seem to find any documentation for this on <audio> elements, but it seems to be required for cross-domain usage (in addition to CORS headers)
41+
//source.crossOrigin = opts.crossOrigin;
42+
3943
/**
4044
* Convert and emit the raw audio data
4145
* @see https://developer.mozilla.org/en-US/docs/Web/API/ScriptProcessorNode/onaudioprocess
@@ -85,15 +89,29 @@ function MediaElementAudioStream(source, opts) {
8589
// other half of workaround for chrome bugs
8690
scriptProcessor.connect(context.destination);
8791

88-
this.stop = function() {
92+
// https://developer.mozilla.org/en-US/docs/Web/Guide/Events/Media_events
93+
function start() {
94+
source.play();
95+
source.removeEventListener("canplaythrough", start);
96+
}
97+
if (opts.autoplay) {
98+
source.addEventListener("canplaythrough", start);
99+
}
100+
101+
function end() {
89102
recording = false;
90-
source.pause();
91-
source.currentTime = 0;
103+
scriptProcessor.disconnect();
92104
self.push(null);
93105
self.emit('close');
106+
}
107+
source.addEventListener("ended", end);
108+
109+
this.stop = function() {
110+
source.pause();
111+
end();
94112
};
95113

96-
source.addEventListener("ended", this.stop);
114+
source.addEventListener("error", this.emit.bind(this, 'error'));
97115

98116
process.nextTick(function() {
99117
self.emit('format', {
@@ -103,16 +121,13 @@ function MediaElementAudioStream(source, opts) {
103121
signed: true,
104122
float: true
105123
});
106-
if (opts.autoplay) {
107-
source.play();
108-
}
109124
});
110125

111126
}
112127
util.inherits(MediaElementAudioStream, Readable);
113128

114129
MediaElementAudioStream.prototype._read = function(/* bytes */) {
115-
// no-op, (flow-control doesn't really work on sound)
130+
// no-op, (back-pressure flow-control doesn't really work on sound)
116131
};
117132

118133
/**

test/spec.js

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,18 +29,18 @@ function getAudio() {
2929
// integration = testing against actual watson servers
3030
var offline = process.env.TEST_MODE !== 'integration';
3131
var chrome = navigator.userAgent.indexOf('Chrome') >=0;
32+
var firefox = navigator.userAgent.indexOf('Firefox') >=0;
3233
var travis = !!process.env.TRAVIS;
3334

3435
describe("WatsonSpeechToText", function() {
3536

3637
this.timeout(30*1000);
3738

38-
// not sure why, but I can't convince firefox or chrome to actually play <audio> elements during tests
39-
// also, on travis, the element never appears to stop playing (or, more likely, it never starts in the first place)
39+
// firefox on travis always times out for this test, not sure why (it might be due to travis's older version of ff)
4040
it('should transcribe <audio> elements', function(done) {
4141
getConfig().then(function(cfg) {
4242
var audioElement = new Audio();
43-
audioElement.crossOrigin = true;
43+
audioElement.crossOrigin = "anonymous";
4444
audioElement.src = "http://localhost:9877/audio.wav";
4545
cfg.element = audioElement;
4646
cfg.muteSource = true;

0 commit comments

Comments
 (0)