Skip to content

Commit b48a85c

Browse files
authored
Merge pull request #807 from watson-developer-cloud/tts-websockets
feat(text-to-speech): add support for using `synthesize`
2 parents b2f2e96 + 062179d commit b48a85c

File tree

7 files changed

+339
-10
lines changed

7 files changed

+339
-10
lines changed

README.md

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -602,7 +602,7 @@ fs.createReadStream('./resources/speech.wav')
602602
603603
### Text to Speech
604604
605-
Use the [Text to Speech][text_to_speech] service to synthesize text into a .wav file.
605+
Use the [Text to Speech][text_to_speech] service to synthesize text into an audio file.
606606
607607
```js
608608
var TextToSpeechV1 = require('watson-developer-cloud/text-to-speech/v1');
@@ -632,8 +632,16 @@ textToSpeech
632632
fs.writeFileSync('audio.wav', audio);
633633
console.log('audio.wav written with a corrected wav header');
634634
});
635+
636+
637+
// or, using WebSockets
638+
var synthStream = textToSpeech.synthesizeUsingWebSocket(params);
639+
synthStream.pipe(fs.createWriteStream('./audio.ogg'));
640+
// see more information in examples/text_to_speech_websocket.js
635641
```
636642
643+
644+
637645
### Tone Analyzer
638646
639647
Use the [Tone Analyzer][tone_analyzer] service to analyze the

examples/.eslintrc.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
module.exports = {
2-
"parserOptions": { "ecmaVersion": 5 },
2+
"parserOptions": { "ecmaVersion": 6 },
33
"rules": {
44
"no-console": "off",
55
"node/no-missing-require": "off",
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
'use strict';
2+
3+
const fs = require('fs');
4+
const TextToSpeechV1 = require('watson-developer-cloud/text-to-speech/v1');
5+
6+
const textToSpeech = new TextToSpeechV1({
7+
// if left unspecified here, the SDK will fall back to the TEXT_TO_SPEECH_USERNAME and TEXT_TO_SPEECH_PASSWORD
8+
// environment properties, and then IBM Cloud's VCAP_SERVICES environment property
9+
// username: 'INSERT YOUR USERNAME FOR THE SERVICE HERE',
10+
// password: 'INSERT YOUR PASSWORD FOR THE SERVICE HERE'
11+
});
12+
13+
// specify the text to synthesize
14+
const params = {
15+
text: 'Hello, world.',
16+
accept: 'audio/ogg;codecs=opus',
17+
};
18+
19+
// synthesizeUsingWebSocket returns a Readable Stream that can be piped or listened to
20+
const synthesizeStream = textToSpeech.synthesizeUsingWebSocket(params);
21+
22+
// the output of the stream can be piped to any writable stream, like an audio file
23+
synthesizeStream.pipe(fs.createWriteStream('./speech.ogg'));
24+
25+
// if the stream is not being piped anywhere and is only being listened to, the stream needs
26+
// to be explicitly set to flowing mode:
27+
28+
// synthesizeStream.resume();
29+
30+
// the 'message' event is emitted when data is processed and returned from the service
31+
// the 'message' parameter is the entire response frame of information returned from the
32+
// service. it is mainly useful for debugging
33+
// the 'data' parameter is the data payload contained within the message. it is typically
34+
// binary audio data, but if the text includes SSML marks or the request includes the
35+
// 'timings' parameter, 'data' could be a string containing marks or timing information
36+
synthesizeStream.on('message', (message, data) => {
37+
console.log(data);
38+
});
39+
40+
// the 'error' event is emitted if there is an error during the connection
41+
// 'err' is the Error object describing the error
42+
synthesizeStream.on('error', err => {
43+
console.log(err);
44+
});
45+
46+
// the 'close' event is emitted once, when the connection is terminated by the service
47+
// the 'code' parameter is the status code. 1000 is the code for a normal termination
48+
// the 'reason' parameter provides a string description of how the connection closed
49+
synthesizeStream.on('close', (code, reason) => {
50+
console.log(code);
51+
});

lib/recognize-stream.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ interface RecognizeStream extends Duplex {
5454
}
5555

5656
/**
57-
* pipe()-able Node.js Readable/Writeable stream - accepts binary audio and emits text in it's `data` events.
57+
* pipe()-able Node.js Readable/Writeable stream - accepts binary audio and emits text in its `data` events.
5858
* Also emits `results` events with interim results and other data.
5959
*
6060
* Cannot be instantiated directly, instead created by calling #recognizeUsingWebSocket()

lib/synthesize-stream.ts

Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
/**
2+
* Copyright 2014 IBM Corp. All Rights Reserved.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License
15+
*/
16+
17+
import extend = require('extend');
18+
import pick = require('object.pick');
19+
import { Readable } from 'stream';
20+
import websocket = require ('websocket');
21+
import qs = require('./querystring');
22+
23+
const w3cWebSocket = websocket.w3cwebsocket;
24+
25+
const PAYLOAD_PARAMS_ALLOWED = [
26+
'text',
27+
'accept',
28+
'timings'
29+
];
30+
31+
const QUERY_PARAMS_ALLOWED = [
32+
'watson-token',
33+
'voice',
34+
'customization_id',
35+
'x-watson-learning-opt-out',
36+
'x-watson-metadata'
37+
];
38+
39+
interface SynthesizeStream extends Readable {
40+
_readableState;
41+
}
42+
43+
/**
44+
* pipe()-able Node.js Readable stream - accepts text in the constructor and emits binary audio data in its 'message' events
45+
*
46+
* Cannot be instantiated directly, instead created by calling #synthesizeUsingWebSocket()
47+
*
48+
* Uses WebSockets under the hood.
49+
* @param {Object} options
50+
* @constructor
51+
*/
52+
class SynthesizeStream extends Readable {
53+
54+
static WEBSOCKET_CONNECTION_ERROR: string = 'WebSocket connection error';
55+
56+
private options;
57+
private socket;
58+
private initialized: boolean;
59+
private authenticated: boolean;
60+
61+
62+
/**
63+
* pipe()-able Node.js Readable stream - accepts text and emits binary audio data in its 'message' events
64+
*
65+
* Uses WebSockets under the hood.
66+
*
67+
*
68+
* Note that the WebSocket connection is not established until the stream is first read from (i.e. the first `_read()` call). This allows for IAM token request management by the SDK.
69+
*
70+
* @param {Object} options
71+
* @param {String} options.text - The text that is to be synthesized. Provide plain text or text that is annotated with SSML. SSML input can include the SSML <mark> element. Pass a maximum of 5 KB of text.
72+
* @param {String} options.accept - The requested audio format (MIME type) of the audio.
73+
* @param {String[]} [options.timings] - An array that specifies whether the service is to return word timing information for all strings of the input text
74+
* @param {String} [options.voice='en-US_MichaelVoice'] - The voice that is to be used for the synthesis.
75+
* @param {String} [options.customization_id] - The customization ID (GUID) of a custom voice model that is to be used for the synthesis.
76+
* @param {String} [options.url='wss://stream.watsonplatform.net/text-to-speech/api'] base URL for service
77+
* @param {String} [options.watson-token] - Auth token
78+
* @param {Object} [options.headers] - Only works in Node.js, not in browsers. Allows for custom headers to be set, including an Authorization header (preventing the need for auth tokens)
79+
* @param {Boolean} [options.x-watson-learning-opt-out=false] - set to true to opt-out of allowing Watson to use this request to improve its services
80+
* @param {String} [options.x-watson-metadata] - Associates a customer ID with data that is passed over the connection.
81+
* @param {IamTokenManagerV1} [options.token_manager] - Token manager for authenticating with IAM
82+
* @param {Boolean} [options.rejectUnauthorized] - If false, disable SSL certificate verification for the WebSocket connection (passed through as the TLS `rejectUnauthorized` option)
83+
*
84+
* @constructor
85+
*/
86+
constructor(options) {
87+
super(options);
88+
this.options = options;
89+
this.initialized = false;
90+
this.authenticated = options.token_manager ? false : true;
91+
}
92+
93+
initialize() {
94+
const options = this.options;
95+
96+
const queryParams = pick(options, QUERY_PARAMS_ALLOWED);
97+
const queryString = qs.stringify(queryParams);
98+
99+
const url =
100+
(options.url || 'wss://stream.watsonplatform.net/text-to-speech/api')
101+
.replace(/^http/, 'ws') +
102+
'/v1/synthesize' +
103+
queryString;
104+
105+
const socket = (this.socket = new w3cWebSocket(
106+
url,
107+
null,
108+
null,
109+
options.headers,
110+
null,
111+
{ tlsOptions: { rejectUnauthorized: options.rejectUnauthorized }}
112+
));
113+
114+
// use class context within arrow functions
115+
const self = this;
116+
117+
socket.onopen = () => {
118+
const payload = pick(options, PAYLOAD_PARAMS_ALLOWED);
119+
socket.send(JSON.stringify(payload));
120+
/**
121+
* emitted once the WebSocket connection has been established
122+
* @event SynthesizeStream#open
123+
*/
124+
self.emit('open');
125+
};
126+
127+
socket.onmessage = message => {
128+
const chunk = message.data;
129+
// some messages are strings - emit those unencoded, but push them to
130+
// the stream as binary
131+
const data = typeof chunk === 'string' ? chunk : Buffer.from(chunk);
132+
/**
133+
* Emit any messages received over the wire, mainly used for debugging.
134+
*
135+
* @event SynthesizeStream#message
136+
* @param {Object} message - frame object received from service
137+
* @param {Object} data - a data attribute of the frame that's either a string or a Buffer/TypedArray
138+
*/
139+
self.emit('message', message, data);
140+
self.push(Buffer.from(chunk));
141+
};
142+
143+
socket.onerror = event => {
144+
const err = new Error('WebSocket connection error');
145+
err.name = SynthesizeStream.WEBSOCKET_CONNECTION_ERROR;
146+
err['event'] = event;
147+
self.emit('error', err);
148+
self.push(null);
149+
};
150+
151+
socket.onclose = event => {
152+
self.push(null);
153+
/**
154+
* @event SynthesizeStream#close
155+
* @param {Number} reasonCode
156+
* @param {String} description
157+
*/
158+
self.emit('close', event.code, event.reason);
159+
};
160+
161+
this.initialized = true;
162+
}
163+
164+
_read() {
165+
// even though we aren't controlling the read from websocket,
166+
// we can take advantage of the fact that _read is async and hack
167+
// this function to retrieve a token if the service is using IAM auth
168+
this.setAuthorizationHeaderToken(err => {
169+
if (err) {
170+
this.emit('error', err);
171+
this.push(null);
172+
return;
173+
}
174+
175+
if (!this.initialized) {
176+
this.initialize();
177+
}
178+
});
179+
}
180+
181+
/**
182+
* This function retrieves an IAM access token and stores it in the
183+
* request header before calling the callback function, which will
184+
* execute the next iteration of `_read()`
185+
*
186+
*
187+
* @private
188+
* @param {Function} callback
189+
*/
190+
setAuthorizationHeaderToken(callback) {
191+
if (!this.authenticated) {
192+
this.options.token_manager.getToken((err, token) => {
193+
if (err) {
194+
return callback(err);
195+
}
196+
const authHeader = { authorization: 'Bearer ' + token };
197+
this.options.headers = extend(authHeader, this.options.headers);
198+
this.authenticated = true;
199+
callback(null);
200+
});
201+
} else {
202+
callback(null);
203+
}
204+
}
205+
}
206+
207+
export = SynthesizeStream;

test/integration/text_to_speech.test.js

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,17 +16,39 @@ describe('text_to_speech_integration', function() {
1616
text_to_speech.voices(null, done);
1717
});
1818

19-
it('synthesize()', function(done) {
19+
describe('synthesize', function() {
2020
const params = {
2121
text: 'test',
2222
accept: 'audio/wav',
2323
};
24-
// wav.Reader parses the wav header and will throw if it isn't valid
25-
const reader = new wav.Reader();
26-
text_to_speech
27-
.synthesize(params)
28-
.pipe(reader)
29-
.on('format', done.bind(null, null));
24+
25+
it('synthesize using http', function(done) {
26+
// wav.Reader parses the wav header and will throw if it isn't valid
27+
const reader = new wav.Reader();
28+
text_to_speech
29+
.synthesize(params)
30+
.pipe(reader)
31+
.on('format', done.bind(null, null));
32+
});
33+
34+
it('synthesize using websocket', function(done) {
35+
const synthStream = text_to_speech.synthesizeUsingWebSocket(params);
36+
synthStream.resume();
37+
38+
synthStream.on('message', function(message, data) {
39+
expect(data).not.toBeNull();
40+
});
41+
42+
synthStream.on('error', function(err) {
43+
// fail assertion
44+
throw err;
45+
});
46+
47+
synthStream.on('close', function(code, reason) {
48+
expect(code).toBe(1000);
49+
done();
50+
});
51+
});
3052
});
3153

3254
it('pronunciation()', function(done) {

0 commit comments

Comments
 (0)