Merge pull request #807 from watson-developer-cloud/tts-websockets

dpopp07 · web-flow · commit b48a85c68414 · 2018-12-06T09:44:53.000-06:00
feat(text-to-speech): add support for using `synthesize`
diff --git a/README.md b/README.md
@@ -602,7 +602,7 @@ fs.createReadStream('./resources/speech.wav')
 
 ### Text to Speech
 
-Use the [Text to Speech][text_to_speech] service to synthesize text into a .wav file.
+Use the [Text to Speech][text_to_speech] service to synthesize text into an audio file.
 
 ```js
 var TextToSpeechV1 = require('watson-developer-cloud/text-to-speech/v1');
@@ -632,8 +632,16 @@ textToSpeech
     fs.writeFileSync('audio.wav', audio);
     console.log('audio.wav written with a corrected wav header');
 });
+
+
+// or, using WebSockets
+var synthStream = textToSpeech.synthesizeUsingWebSocket(params);
+synthStream.pipe(fs.createWriteStream('./audio.ogg'));
+// see more information in examples/text_to_speech_websocket.js
 ```
 
+
+
 ### Tone Analyzer
 
 Use the [Tone Analyzer][tone_analyzer] service to analyze the
diff --git a/examples/.eslintrc.js b/examples/.eslintrc.js
@@ -1,5 +1,5 @@
 module.exports = {
-  "parserOptions": { "ecmaVersion": 5 },
+  "parserOptions": { "ecmaVersion": 6 },
   "rules": {
     "no-console": "off",
     "node/no-missing-require": "off",
diff --git a/examples/text_to_speech_websocket.js b/examples/text_to_speech_websocket.js
@@ -0,0 +1,51 @@
+'use strict';
+
+const fs = require('fs');
+const TextToSpeechV1 = require('watson-developer-cloud/text-to-speech/v1');
+
+// Credentials may be omitted: the SDK then falls back to the TEXT_TO_SPEECH_USERNAME
+// and TEXT_TO_SPEECH_PASSWORD environment properties, and after that to IBM Cloud's
+// VCAP_SERVICES environment property.
+const textToSpeech = new TextToSpeechV1({
+  // username: 'INSERT YOUR USERNAME FOR THE SERVICE HERE',
+  // password: 'INSERT YOUR PASSWORD FOR THE SERVICE HERE'
+});
+
+// what to synthesize, and the audio format the service should return
+const params = {
+  text: 'Hello, world.',
+  accept: 'audio/ogg;codecs=opus',
+};
+
+// Called for every 'message' event, i.e. whenever data is processed and returned
+// from the service.
+//   message - the entire response frame returned from the service (mainly useful
+//             for debugging)
+//   data    - the payload contained within the message. Typically binary audio, but
+//             if the text includes SSML marks or the request includes the 'timings'
+//             parameter it could be a string containing marks or timing information
+function onMessage(message, data) {
+  console.log(data);
+}
+
+// Called for the 'error' event, emitted if there is an error during the connection.
+//   err - the Error object describing the error
+function onError(err) {
+  console.log(err);
+}
+
+// Called for the 'close' event, emitted once when the connection is terminated by
+// the service.
+//   code   - the status code; 1000 is the code for a normal termination
+//   reason - a string description of how the connection closed
+function onClose(code, reason) {
+  console.log(code);
+}
+
+// synthesizeUsingWebSocket returns a Readable Stream that can be piped or listened to
+const synthesizeStream = textToSpeech.synthesizeUsingWebSocket(params);
+
+// pipe the output to any writable stream - here, an audio file
+const audioFile = fs.createWriteStream('./speech.ogg');
+synthesizeStream.pipe(audioFile);
+
+// a stream that is only listened to (not piped anywhere) has to be switched to
+// flowing mode explicitly:
+
+// synthesizeStream.resume();
+
+synthesizeStream.on('message', onMessage);
+synthesizeStream.on('error', onError);
+synthesizeStream.on('close', onClose);
diff --git a/lib/recognize-stream.ts b/lib/recognize-stream.ts
@@ -54,7 +54,7 @@ interface RecognizeStream extends Duplex {
 }
 
 /**
- * pipe()-able Node.js Readable/Writeable stream - accepts binary audio and emits text in it's `data` events.
+ * pipe()-able Node.js Readable/Writeable stream - accepts binary audio and emits text in its `data` events.
  * Also emits `results` events with interim results and other data.
  *
  * Cannot be instantiated directly, instead created by calling #recognizeUsingWebSocket()
diff --git a/lib/synthesize-stream.ts b/lib/synthesize-stream.ts
@@ -0,0 +1,207 @@
+/**
+ * Copyright 2014 IBM Corp. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+import extend = require('extend');
+import pick = require('object.pick');
+import { Readable } from 'stream';
+import websocket = require ('websocket');
+import qs = require('./querystring');
+
+// the `w3cwebsocket` client exported by the `websocket` package
+const w3cWebSocket = websocket.w3cwebsocket;
+
+// option names that initialize() copies into the JSON payload sent once the
+// socket has opened
+const PAYLOAD_PARAMS_ALLOWED = [
+  'text',
+  'accept',
+  'timings'
+];
+
+// option names that initialize() copies into the query string of the
+// connection URL
+const QUERY_PARAMS_ALLOWED = [
+  'watson-token',
+  'voice',
+  'customization_id',
+  'x-watson-learning-opt-out',
+  'x-watson-metadata'
+];
+
+// Shares its name with the SynthesizeStream class below, so TypeScript merges the
+// two declarations; this adds an untyped `_readableState` member to the class type.
+interface SynthesizeStream extends Readable {
+  _readableState;
+}
+
+/**
+ * pipe()-able Node.js Readable stream - accepts text in the constructor, emits the
+ * synthesized audio in its 'message' events and pushes it to the readable side of the stream
+ *
+ * Cannot be instantiated directly, instead created by calling #synthesizeUsingWebSocket()
+ *
+ * Uses WebSockets under the hood.
+ * @param {Object} options
+ * @constructor
+ */
+class SynthesizeStream extends Readable {
+
+  static WEBSOCKET_CONNECTION_ERROR: string = 'WebSocket connection error';
+
+  private options;
+  private socket;
+  private initialized: boolean;
+  private authenticated: boolean;
+
+
+  /**
+   * pipe()-able Node.js Readable stream - accepts text and emits binary audio data in its 'message' events
+   *
+   * Uses WebSockets under the hood.
+   *
+   *
+   * Note that the WebSocket connection is not established until the stream is first read from (the first `_read()` call). This allows for IAM token request management by the SDK.
+   *
+   * @param {Object} options
+   * @param {String} options.text - The text that is to be synthesized. Provide plain text or text that is annotated with SSML. SSML input can include the SSML <mark> element. Pass a maximum of 5 KB of text.
+   * @param {String} options.accept - The requested audio format (MIME type) of the audio.
+   * @param {String[]} [options.timings] - An array that specifies whether the service is to return word timing information for all strings of the input text
+   * @param {String} [options.voice='en-US_MichaelVoice'] - The voice that is to be used for the synthesis.
+   * @param {String} [options.customization_id] - The customization ID (GUID) of a custom voice model that is to be used for the synthesis.
+   * @param {String} [options.url='wss://stream.watsonplatform.net/text-to-speech/api'] base URL for service
+   * @param {String} [options.watson-token] - Auth token
+   * @param {Object} [options.headers] - Only works in Node.js, not in browsers. Allows for custom headers to be set, including an Authorization header (preventing the need for auth tokens)
+   * @param {Boolean} [options.x-watson-learning-opt-out=false] - set to true to opt-out of allowing Watson to use this request to improve its services
+   * @param {String} [options.x-watson-metadata] - Associates a customer ID with data that is passed over the connection.
+   * @param {IamTokenManagerV1} [options.token_manager] - Token manager for authenticating with IAM
+   * @param {Boolean} [options.rejectUnauthorized] - Passed through as the `rejectUnauthorized` TLS option of the WebSocket connection; set to false to disable SSL certificate verification
+   *
+   * @constructor
+   */
+  constructor(options) {
+    super(options);
+    this.options = options;
+    this.initialized = false;
+    // without a token manager there is no IAM token to fetch before connecting
+    this.authenticated = options.token_manager ? false : true;
+  }
+
+  /**
+   * Opens the WebSocket connection and wires its events to this stream.
+   *
+   * The URL is built from `options.url` (or the default service URL), the
+   * `/v1/synthesize` path and the options listed in QUERY_PARAMS_ALLOWED.
+   * Once the socket opens, the options listed in PAYLOAD_PARAMS_ALLOWED are
+   * sent as a JSON message. Sets `this.initialized` so that `_read()` only
+   * connects once.
+   */
+  initialize() {
+    const options = this.options;
+
+    const queryParams = pick(options, QUERY_PARAMS_ALLOWED);
+    const queryString = qs.stringify(queryParams);
+
+    // the '?' separates the path from the serialized query parameters
+    // NOTE(review): assumes qs.stringify does not emit its own leading '?' -
+    // confirm against ./querystring
+    const url =
+      (options.url || 'wss://stream.watsonplatform.net/text-to-speech/api')
+        .replace(/^http/, 'ws') +
+        '/v1/synthesize?' +
+        queryString;
+
+    const socket = (this.socket = new w3cWebSocket(
+      url,
+      null,
+      null,
+      options.headers,
+      null,
+      { tlsOptions: { rejectUnauthorized: options.rejectUnauthorized }}
+    ));
+
+    // use class context within arrow functions
+    const self = this;
+
+    socket.onopen = () => {
+      const payload = pick(options, PAYLOAD_PARAMS_ALLOWED);
+      socket.send(JSON.stringify(payload));
+      /**
+       * emitted once the WebSocket connection has been established
+       * @event SynthesizeStream#open
+       */
+      self.emit('open');
+    };
+
+    socket.onmessage = message => {
+      const chunk = message.data;
+      // some messages are strings - emit those unencoded, but push them to
+      // the stream as binary
+      const data = typeof chunk === 'string' ? chunk : Buffer.from(chunk);
+      /**
+       * Emit any messages received over the wire, mainly used for debugging.
+       *
+       * @event SynthesizeStream#message
+       * @param {Object} message - frame object received from service
+       * @param {Object} data - a data attribute of the frame that's either a string or a Buffer/TypedArray
+       */
+      self.emit('message', message, data);
+      self.push(Buffer.from(chunk));
+    };
+
+    socket.onerror = event => {
+      const err = new Error('WebSocket connection error');
+      err.name = SynthesizeStream.WEBSOCKET_CONNECTION_ERROR;
+      err['event'] = event;
+      self.emit('error', err);
+      // signal end-of-stream to readers
+      self.push(null);
+    };
+
+    socket.onclose = event => {
+      // signal end-of-stream to readers
+      self.push(null);
+      /**
+       * @event SynthesizeStream#close
+       * @param {Number} reasonCode
+       * @param {String} description
+       */
+      self.emit('close', event.code, event.reason);
+    };
+
+    this.initialized = true;
+  }
+
+  /**
+   * Readable#_read implementation. No data is pulled here - audio is pushed
+   * from the socket's message handler. The call is only used to make sure a
+   * token has been obtained (when IAM auth is used) and to open the
+   * connection the first time the stream is read from.
+   */
+  _read() {
+    // even though we aren't controlling the read from websocket,
+    // we can take advantage of the fact that _read is async and hack
+    // this function to retrieve a token if the service is using IAM auth
+    this.setAuthorizationHeaderToken(err => {
+      if (err) {
+        this.emit('error', err);
+        this.push(null);
+        return;
+      }
+
+      if (!this.initialized) {
+        this.initialize();
+      }
+    });
+  }
+
+  /**
+   * Makes sure an IAM access token is stored in the request headers before
+   * calling the callback function, which will execute the next iteration of
+   * `_read()`. When the stream is already authenticated (or no token manager
+   * was given) the callback is called right away.
+   *
+   * If the token request fails, the callback is called exactly once with the
+   * error and neither the headers nor the `authenticated` flag are touched.
+   *
+   * @private
+   * @param {Function} callback - called with an Error on failure, otherwise with null
+   */
+  setAuthorizationHeaderToken(callback) {
+    if (!this.authenticated) {
+      this.options.token_manager.getToken((err, token) => {
+        if (err) {
+          callback(err);
+          // stop here: falling through would store a bogus token and
+          // call the callback a second time
+          return;
+        }
+        const authHeader = { authorization: 'Bearer ' + token };
+        // headers supplied by the caller take precedence over the generated one
+        this.options.headers = extend(authHeader, this.options.headers);
+        this.authenticated = true;
+        callback(null);
+      });
+    } else {
+      callback(null);
+    }
+  }
+}
+
+export = SynthesizeStream;
diff --git a/test/integration/text_to_speech.test.js b/test/integration/text_to_speech.test.js
@@ -16,17 +16,39 @@ describe('text_to_speech_integration', function() {
     text_to_speech.voices(null, done);
   });
 
-  it('synthesize()', function(done) {
+  describe('synthesize', function() {
     const params = {
       text: 'test',
       accept: 'audio/wav',
     };
-    // wav.Reader parses the wav header and will throw if it isn't valid
-    const reader = new wav.Reader();
-    text_to_speech
-      .synthesize(params)
-      .pipe(reader)
-      .on('format', done.bind(null, null));
+
+    it('synthesize using http', function(done) {
+      // wav.Reader parses the wav header and will throw if it isn't valid
+      const reader = new wav.Reader();
+      text_to_speech
+        .synthesize(params)
+        .pipe(reader)
+        .on('format', done.bind(null, null));
+    });
+
+    it('synthesize using websocket', function(done) {
+      const synthStream = text_to_speech.synthesizeUsingWebSocket(params);
+      // nothing is piped from the stream, so switch it to flowing mode explicitly
+      synthStream.resume();
+
+      synthStream.on('message', function(message, data) {
+        expect(data).not.toBeNull();
+      });
+
+      synthStream.on('error', function(err) {
+        // fail the test
+        throw err;
+      });
+
+      // the test finishes once the connection has closed; 1000 is the expected close code
+      synthStream.on('close', function(code, reason) {
+        expect(code).toBe(1000);
+        done();
+      });
+    });
   });
 
   it('pronunciation()', function(done) {
diff --git a/text-to-speech/v1.ts b/text-to-speech/v1.ts

Original file line number	Diff line number	Diff line change
`@@ -54,7 +54,7 @@ interface RecognizeStream extends Duplex {`
`54`	`54`	`}`
`55`	`55`
`56`	`56`	`/**`
`57`		- * pipe()-able Node.js Readable/Writeable stream - accepts binary audio and emits text in it's `data` events.
	`57`	+ * pipe()-able Node.js Readable/Writeable stream - accepts binary audio and emits text in its `data` events.
`58`	`58`	* Also emits `results` events with interim results and other data.
`59`	`59`	`*`
`60`	`60`	`* Cannot be instantiated directly, instead created by calling #recognizeUsingWebSocket()`