
Commit 04336b8

switch sources to audiobuffers, split internals of l16 stream
The l16 stream (formerly the wav stream) now does the downsampling and 16-bit conversion in separate steps; this is to allow for a future change to native downsampling. I also removed the wav header because it's not needed.
1 parent bb48e63 commit 04336b8
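
Roughly, the split looks like this (an illustrative sketch only; the function names and the naive nearest-sample downsampling are assumptions, not the actual webaudio-l16-stream internals):

```js
// Illustrative sketch only - not the actual webaudio-l16-stream internals.
// Step 1: naive downsample from the AudioContext's rate (e.g. 44100 Hz) to 16000 Hz.
function downsample(samples, sourceRate, targetRate) {
  var ratio = sourceRate / targetRate;
  var out = new Float32Array(Math.floor(samples.length / ratio));
  for (var i = 0; i < out.length; i++) {
    out[i] = samples[Math.floor(i * ratio)]; // nearest-sample pick, no low-pass filtering
  }
  return out;
}

// Step 2: convert Float32 samples (-1..1) to 16-bit signed little-endian PCM (l16).
function floatTo16BitPCM(samples) {
  var view = new DataView(new ArrayBuffer(samples.length * 2));
  for (var i = 0; i < samples.length; i++) {
    var s = Math.max(-1, Math.min(1, samples[i]));
    view.setInt16(i * 2, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
  }
  return new Buffer(view.buffer); // node-style Buffer for the stream pipeline
}
```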

7 files changed: +247 -214 lines

README.md

Lines changed: 11 additions & 5 deletions
@@ -3,17 +3,22 @@ IBM Watson Speech To Text Browser Client Library
 
 Allows you to easily add voice recognition to any web app with minimal code.
 
+**Warning** This library is still early-stage and may see significant breaking changes.
+
 **For Web Browsers Only** This library is primarily intended for use in browsers.
 Check out [watson-developer-cloud](https://www.npmjs.com/package/watson-developer-cloud) to use Watson services (speech and others) from Node.js.
 
-**Warning** This library is still early-stage and may see significant breaking changes.
+However, a server-side component is required to generate auth tokens.
+The examples/ folder includes a node.js one, and SDKs are available for [Node.js](https://github.com/watson-developer-cloud/node-sdk#authorization),
+[Java](https://github.com/watson-developer-cloud/java-sdk),
+[Python](https://github.com/watson-developer-cloud/python-sdk/blob/master/examples/authorization_v1.py),
+and there is also a [REST API](http://www.ibm.com/smarterplanet/us/en/ibmwatson/developercloud/doc/getting_started/gs-tokens.shtml).
 
 See several examples at https://github.com/watson-developer-cloud/speech-javascript-sdk/tree/master/examples
 
 This library is built with [browserify](http://browserify.org/) and easy to use in browserify-based projects (`npm install --save watson-speech`), but you can also grab the compiled bundle from the
 `dist/` folder and use it as a standalone library.
 
-
 ## `WatsonSpeech.SpeechToText` Basic API
 
 Complete API docs should be published at http://watson-developer-cloud.github.io/speech-javascript-sdk/
@@ -101,7 +106,8 @@ Inherits `.stop()` method and `result` event from the `RecognizeStream`.
 
 * Fix bugs around `.stop()`
 * Solidify API
-* (eventually) add text-to-speech support
+* support objectMode instead of having random events
+* add text-to-speech support
 * add an example that includes alternatives and word confidence scores
 * automate dist/ generation (and possibly move it)
 * enable eslint
@@ -111,5 +117,5 @@ Inherits `.stop()` method and `result` event from the `RecognizeStream`.
 * more tests in general
 * update node-sdk to use current version of this lib's RecognizeStream (and also provide the FormatStream + anything else that might be handy)
 * improve docs
-* look at supporting/migrating to https://streams.spec.whatwg.org/ / https://github.com/whatwg/streams once it's ready
-* Add Text to Speech once CORS support is available.
+
+
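
The token note added to the README implies a small client-side step; a hedged sketch (the `/api/token` endpoint is an assumption; the examples/ folder includes a node.js server for this):

```js
// Hedged sketch: fetch an auth token from your own server-side endpoint,
// then start recognition. The '/api/token' path is an assumption.
var xhr = new XMLHttpRequest();
xhr.open('GET', '/api/token');
xhr.onload = function() {
  var stream = WatsonSpeech.SpeechToText.recognizeMicrophone({token: xhr.responseText});
  stream.on('result', console.log.bind(console)); // 'result' event per the README above
};
xhr.send();
```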

speech-to-text/index.js

Lines changed: 3 additions & 3 deletions
@@ -7,15 +7,15 @@ module.exports = {
   recognizeElement: require('./recognize-element'),
 
   // individual components to build more customized solutions
-  WebAudioWavStream: require('./webaudio-wav-stream'),
+  WebAudioL16Stream: require('./webaudio-l16-stream'),
   MediaElementAudioStream: require('./media-element-audio-stream'),
   RecognizeStream: require('./recognize-stream'),
   FilePlayer: require('./file-player'),
   getUserMedia: require('./getusermedia'),
   FormatStream: require('./format-stream'),
   TimingStream: require('./timing-stream'),
 
-  // external (provided here to allow the lib to be used standalone w/out browserify)
+  // external components provided here to allow the lib to be used standalone (w/out browserify)
   MicrophoneStream: require('microphone-stream'),
-  Buffer: Buffer // may be needed to send data to the streams
+  Buffer: Buffer
 };
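
For standalone use, these exports can be wired together by hand. A minimal sketch, assuming the dist/ bundle is loaded as `WatsonSpeech` and a `token` has already been fetched:

```js
// Hand-assembled pipeline sketch (assumes a `WatsonSpeech` global and a valid `token`).
var stt = WatsonSpeech.SpeechToText;

stt.getUserMedia({video: false, audio: true}).then(function(mic) {
  var recognizeStream = new stt.RecognizeStream({
    token: token,
    'content-type': 'audio/l16;rate=16000' // matches the L16 stream's output
  });

  new stt.MicrophoneStream(mic, {objectMode: true})
    .pipe(new stt.WebAudioL16Stream()) // downsampling + 16-bit conversion
    .pipe(recognizeStream);

  recognizeStream.on('result', console.log.bind(console));
});
```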

speech-to-text/media-element-audio-stream.js

Lines changed: 34 additions & 31 deletions
@@ -20,10 +20,12 @@ function MediaElementAudioStream(source, opts) {
     // buffer size to balance between latency and audio quality."
     // https://developer.mozilla.org/en-US/docs/Web/API/AudioContext/createScriptProcessor
     // Possible values: null, 256, 512, 1024, 2048, 4096, 8192, 16384
-    bufferSize: null,
+    // however, webkitAudioContext (safari) requires it to be set
+    bufferSize: (typeof AudioContext != "undefined" ? null : 4096),
     muteSource: false,
     autoplay: true,
-    crossOrigin: "anonymous" // required for cross-domain audio playback
+    crossOrigin: "anonymous", // required for cross-domain audio playback
+    objectMode: true // true = emit AudioBuffers w/ audio + some metadata, false = emit node.js Buffers (with binary data only)
   }, opts);
 
   // We can only emit one channel's worth of audio, so only one input. (Who has multiple microphones anyways?)
@@ -38,7 +40,7 @@ function MediaElementAudioStream(source, opts) {
   var recording = true;
 
   // I can't seem to find any documentation for this on <audio> elements, but it seems to be required for cross-domain usage (in addition to CORS headers)
-  //source.crossOrigin = opts.crossOrigin;
+  source.crossOrigin = opts.crossOrigin;
 
   /**
    * Convert and emit the raw audio data
@@ -48,32 +50,15 @@ function MediaElementAudioStream(source, opts) {
   function processAudio(e) {
     // onaudioprocess can be called at least once after we've stopped
     if (recording) {
-
-      var raw = e.inputBuffer.getChannelData(0);
-
-      /**
-       * @event MicrophoneStream#raw
-       * @param {Float32Array} data raw audio data from browser - each sample is a number from -1 to 1
-       */
-      self.emit('raw', raw);
-
-      // Standard (non-object mode) Node.js streams only accept Buffers or Strings
-      var nodebuffer = new Buffer(raw.buffer);
-
-      /**
-       * Emit the readable/data event with a node-style buffer.
-       * Note: this is essentially a new DataView on the same underlying ArrayBuffer.
-       * The raw audio data is not actually copied or changed.
-       *
-       * @event MicrophoneStream#data
-       * @param {Buffer} chunk node-style buffer with audio data; buffers are essentially a Uint8Array
-       */
-      self.push(nodebuffer);
+      // todo: interleave channels in binary mode
+      self.push( opts.objectMode ? e.inputBuffer : new Buffer(e.inputBuffer.getChannelData(0)) );
     }
   }
 
-  var context = new AudioContext();
-  var audioInput = context.createMediaElementSource(source);
+  var AudioContext = window.AudioContext || window.webkitAudioContext;
+  // cache the source node & context since it's not possible to recreate it later
+  var context = source.context = source.context || new AudioContext();
+  var audioInput = source.node = source.node || context.createMediaElementSource(source);
   var scriptProcessor = context.createScriptProcessor(opts.bufferSize, inputChannels, outputChannels);
 
   scriptProcessor.onaudioprocess = processAudio;
@@ -84,23 +69,40 @@ function MediaElementAudioStream(source, opts) {
     gain.connect(context.destination);
   }
 
-  audioInput.connect(scriptProcessor);
-
-  // other half of workaround for chrome bugs
-  scriptProcessor.connect(context.destination);
+  /**
+   * Setup script processor to extract audio and also re-connect it via a no-op gain node if desired
+   *
+   * Delayed to avoid processing the stream of silence received before the file begins playing
+   */
+  function connect() {
+    audioInput.connect(scriptProcessor);
+    // other half of workaround for chrome bugs
+    scriptProcessor.connect(context.destination);
+    source.removeEventListener("playing", connect);
+  }
+  source.addEventListener("playing", connect);
 
   // https://developer.mozilla.org/en-US/docs/Web/Guide/Events/Media_events
+  // https://developer.mozilla.org/en-US/docs/Web/API/HTMLMediaElement/readyState
   function start() {
     source.play();
     source.removeEventListener("canplaythrough", start);
   }
   if (opts.autoplay) {
-    source.addEventListener("canplaythrough", start);
+    // play immediately if we have enough data, otherwise wait for the canplaythrough event
+    if (source.readyState === source.HAVE_ENOUGH_DATA) {
+      source.play();
+    } else {
+      source.addEventListener("canplaythrough", start);
+    }
   }
 
   function end() {
     recording = false;
     scriptProcessor.disconnect();
+    audioInput.disconnect();
+    //context.close(); // this prevents us from re-using the same audio element until the page is refreshed
     self.push(null);
     self.emit('close');
   }
@@ -114,6 +116,7 @@ function MediaElementAudioStream(source, opts) {
   source.addEventListener("error", this.emit.bind(this, 'error'));
 
   process.nextTick(function() {
+    // this is more useful for binary mode than object mode, but it won't hurt either way
     self.emit('format', {
       channels: 1,
       bitDepth: 32,
speech-to-text/recognize-element.js

Lines changed: 3 additions & 3 deletions
@@ -16,7 +16,7 @@
 
 'use strict';
 var MediaElementAudioStream = require('./media-element-audio-stream');
-var WebAudioTo16leStream = require('./webaudio-wav-stream');
+var L16 = require('./webaudio-l16-stream');
 var RecognizeStream = require('./recognize-stream.js');
 
 /**
@@ -33,13 +33,13 @@ module.exports = function recognizeElement(options) {
     throw new Error("WatsonSpeechToText: missing required parameter: opts.token");
   }
 
-  //options['content-type'] = 'audio/l16;rate=16000';
+  options['content-type'] = 'audio/l16;rate=16000'; // raw PCM audio (no wav header)
   var recognizeStream = new RecognizeStream(options);
 
   var sourceStream = new MediaElementAudioStream(options.element, options);
 
   sourceStream
-    .pipe(new WebAudioTo16leStream())
+    .pipe(new L16())
     .pipe(recognizeStream);
 
   recognizeStream.on('stop', sourceStream.stop.bind(sourceStream));
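
For reference, a hedged usage sketch of `recognizeElement` (the `token` variable and the element selector are assumptions):

```js
// Hedged sketch: transcribe a playing <audio> element.
// `token` is assumed to come from your server-side token endpoint.
var stream = WatsonSpeech.SpeechToText.recognizeElement({
  token: token,
  element: document.querySelector('audio')
});
stream.on('result', console.log.bind(console));
stream.on('error', console.error.bind(console));
```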

speech-to-text/recognize-microphone.js

Lines changed: 7 additions & 5 deletions
@@ -20,8 +20,7 @@
 var getUserMedia = require('./getusermedia');
 var MicrophoneStream = require('microphone-stream');
 var RecognizeStream = require('./recognize-stream.js');
-var WebAudioTo16leStream = require('./webaudio-wav-stream.js');
-
+var L16 = require('./webaudio-l16-stream.js');
 
 /**
  * Create and return a RecognizeStream from the user's microphone
@@ -37,13 +36,16 @@ module.exports = function recognizeMicrophone(options) {
     throw new Error("WatsonSpeechToText: missing required parameter: opts.token");
   }
 
-  //options['content-type'] = 'audio/l16;rate=16000';
+  options['content-type'] = 'audio/l16;rate=16000'; // raw PCM audio (no wav header)
   var recognizeStream = new RecognizeStream(options);
 
   getUserMedia({video: false, audio: true}).then(function(mic) {
-    var micStream = new MicrophoneStream(mic, options);
+    var micStream = new MicrophoneStream(mic, {
+      objectMode: true,
+      bufferSize: options.bufferSize
+    });
     micStream
-      .pipe(new WebAudioTo16leStream())
+      .pipe(new L16())
       .pipe(recognizeStream);
 
     recognizeStream.on('stop', micStream.stop.bind(micStream));
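
And the corresponding hedged sketch for `recognizeMicrophone`; note that `bufferSize` is now forwarded to the underlying `MicrophoneStream` as shown above:

```js
// Hedged sketch: the browser will prompt for microphone access.
// `token` is assumed to come from a server-side token endpoint.
var stream = WatsonSpeech.SpeechToText.recognizeMicrophone({
  token: token,
  bufferSize: 4096 // optional; forwarded to the underlying MicrophoneStream
});
stream.on('result', console.log.bind(console));
// later: stream.stop() to end recognition
```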
