Merge pull request #604 from watson-developer-cloud/tts-websocket

ehdsouza · web-flow · commit abaffc76e185 · 2018-11-30T10:33:50.000-05:00
feat(Text to Speech): Synthesize using web sockets
diff --git a/README.md b/README.md
@@ -2,7 +2,6 @@
 
 [![Build Status](https://travis-ci.org/watson-developer-cloud/python-sdk.svg?branch=master)](https://travis-ci.org/watson-developer-cloud/python-sdk)
 [![Slack](https://wdc-slack-inviter.mybluemix.net/badge.svg)](https://wdc-slack-inviter.mybluemix.net)
-[![codecov.io](https://codecov.io/github/watson-developer-cloud/python-sdk/coverage.svg?branch=master)](https://codecov.io/github/watson-developer-cloud/python-sdk?branch=master)
 [![Latest Stable Version](https://img.shields.io/pypi/v/watson-developer-cloud.svg)](https://pypi.python.org/pypi/watson-developer-cloud)
 [![CLA assistant](https://cla-assistant.io/readme/badge/watson-developer-cloud/python-sdk)](https://cla-assistant.io/watson-developer-cloud/python-sdk)
 
@@ -251,13 +250,37 @@ This would give an output of `DetailedResponse` having the structure:
 ```
 You can use the `get_result()`, `get_headers()` and get_status_code() to return the result, headers and status code respectively.
 
+## Using Websockets
+The Text to Speech service supports synthesizing text to spoken audio over web sockets with the `synthesize_using_websocket` method. The Speech to Text service supports transcribing speech to text over web sockets with the `recognize_using_websocket` method. Both methods require a custom callback class to listen for events. Below is an example of `synthesize_using_websocket`. Note: The service accepts one request per connection.
+
+```py
+from watson_developer_cloud.websocket import SynthesizeCallback
+
+class MySynthesizeCallback(SynthesizeCallback):
+    def __init__(self):
+        SynthesizeCallback.__init__(self)
+
+    def on_audio_stream(self, audio_stream):
+        return audio_stream
+
+    def on_data(self, data):
+        return data
+
+my_callback = MySynthesizeCallback()
+service.synthesize_using_websocket('I like to pet dogs',
+                                   my_callback,
+                                   accept='audio/wav',
+                                   voice='en-US_AllisonVoice'
+                                  )
+```
+
 ## Dependencies
 
 * [requests]
 * `python_dateutil` >= 2.5.3
 * [responses] for testing
 * Following for web sockets support in speech to text
-   * `websocket-client` 0.47.0
+   * `websocket-client` 0.52.0
 
 ## Contributing
 
diff --git a/examples/speaker_text_to_speech.py b/examples/speaker_text_to_speech.py
@@ -0,0 +1,114 @@
+# You need to install pyaudio to run this example
+# pip install pyaudio
+
+# In this example, the websocket connection is opened with the text
+# passed in the request. When the service responds with the synthesized
+# audio, pyaudio plays it in blocking mode
+
+from watson_developer_cloud import TextToSpeechV1
+from watson_developer_cloud.websocket import SynthesizeCallback
+import pyaudio
+
+# If service instance provides API key authentication
+service = TextToSpeechV1(
+    ## url is optional, and defaults to the URL below. Use the correct URL for your region.
+    url='https://stream.watsonplatform.net/text-to-speech/api',
+    iam_apikey='your_apikey')
+
+# service = TextToSpeechV1(
+#     ## url is optional, and defaults to the URL below. Use the correct URL for your region.
+#     # url='https://stream.watsonplatform.net/text-to-speech/api',
+#     username='YOUR SERVICE USERNAME',
+#     password='YOUR SERVICE PASSWORD')
+
+class Play(object):
+    """
+    Wrapper to play the audio in a blocking mode
+    """
+    def __init__(self):
+        self.format = pyaudio.paInt16
+        self.channels = 1
+        self.rate = 22050
+        self.chunk = 1024
+        self.pyaudio = None
+        self.stream = None
+
+    def start_streaming(self):
+        self.pyaudio = pyaudio.PyAudio()
+        self.stream = self._open_stream()
+        self._start_stream()
+
+    def _open_stream(self):
+        stream = self.pyaudio.open(
+            format=self.format,
+            channels=self.channels,
+            rate=self.rate,
+            output=True,
+            frames_per_buffer=self.chunk,
+            start=False
+        )
+        return stream
+
+    def _start_stream(self):
+        self.stream.start_stream()
+
+    def write_stream(self, audio_stream):
+        self.stream.write(audio_stream)
+
+    def complete_playing(self):
+        self.stream.stop_stream()
+        self.stream.close()
+        self.pyaudio.terminate()
+
+class MySynthesizeCallback(SynthesizeCallback):  # forwards synthesized audio to a Play instance
+    def __init__(self):
+        SynthesizeCallback.__init__(self)
+        self.play = Play()
+
+    def on_connected(self):  # opens and starts the pyaudio output stream
+        print('Opening stream to play')
+        self.play.start_streaming()
+
+    def on_error(self, error):  # reports the error; playback state is left untouched
+        print('Error received: {}'.format(error))
+
+    def on_timing_information(self, timing_information):
+        print(timing_information)
+
+    def on_audio_stream(self, audio_stream):  # writes the received audio to the output stream
+        self.play.write_stream(audio_stream)
+
+    def on_close(self):  # stops and closes the stream, then terminates pyaudio
+        print('Completed synthesizing')
+        self.play.complete_playing()
+
+test_callback = MySynthesizeCallback()
+
+# An example SSML text
+SSML_sorry_text = """<speak version=\"1.0\">
+        <emphasis> I am sorry, I know how it feels.</emphasis>
+        </speak>"""
+
+# Another example of SSML text
+SSML_text = """
+   <speak>
+        I have been assigned to handle your order status request.
+       <express-as type=\"Apology\">
+        I am sorry to inform you that the items you requested are backordered.
+        We apologize for the inconvenience.
+       </express-as>
+      <express-as type=\"Uncertainty\">
+        We don't know when the items will become available. Maybe next week,
+        but we are not sure at this time.
+      </express-as>
+      <express-as type=\"GoodNews\">
+        But because we want you to be a satisfied customer, we are giving you
+        a 50% discount on your order!
+      </express-as>
+   </speak>"""
+
+service.synthesize_using_websocket(SSML_text,
+                                   test_callback,
+                                   accept='audio/wav',
+                                   voice="en-US_AllisonVoice"
+                                  )
diff --git a/examples/text_to_speech_v1.py b/examples/text_to_speech_v1.py
@@ -3,6 +3,7 @@
 import json
 from os.path import join, dirname
 from watson_developer_cloud import TextToSpeechV1
+from watson_developer_cloud.websocket import SynthesizeCallback
 
 # If service instance provides API key authentication
 # service = TextToSpeechV1(
@@ -63,3 +64,36 @@
 
 # response = service.delete_voice_model('YOUR CUSTOMIZATION ID').get_result()
 # print(response)
+
+# Synthesize using websocket. Note: The service accepts one request per connection
+file_path = join(dirname(__file__), "../resources/dog.wav")
+class MySynthesizeCallback(SynthesizeCallback):  # writes the synthesized audio to file_path
+    def __init__(self):
+        SynthesizeCallback.__init__(self)
+        self.fd = open(file_path, 'wb')  # truncate: appending would stack a second WAV onto an existing file
+
+    def on_connected(self):
+        print('Connection was successful')
+
+    def on_error(self, error):
+        print('Error received: {}'.format(error))
+
+    def on_content_type(self, content_type):
+        print('Content type: {}'.format(content_type))
+
+    def on_timing_information(self, timing_information):
+        print(timing_information)
+
+    def on_audio_stream(self, audio_stream):  # audio_stream is written to the file opened in binary mode
+        self.fd.write(audio_stream)
+
+    def on_close(self):  # closes the output file
+        self.fd.close()
+        print('Done synthesizing. Closing the connection')
+
+my_callback = MySynthesizeCallback()
+service.synthesize_using_websocket('I like to pet dogs',
+                                   my_callback,
+                                   accept='audio/wav',
+                                   voice='en-US_AllisonVoice'
+                                  )
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -17,4 +17,4 @@ Sphinx>=1.3.1
 bumpversion>=0.5.3
 
 # Web sockets
-websocket-client==0.47.0
+websocket-client==0.52.0
diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,3 @@
 requests>=2.0,<3.0
 python_dateutil>=2.5.3
-websocket-client==0.47.0
+websocket-client==0.52.0
diff --git a/setup.py b/setup.py
@@ -64,7 +64,7 @@ def run_tests(self):
       version=__version__,
       description='Client library to use the IBM Watson Services',
       license='Apache 2.0',
-      install_requires=['requests>=2.0, <3.0', 'python_dateutil>=2.5.3', 'websocket-client==0.47.0'],
+      install_requires=['requests>=2.0, <3.0', 'python_dateutil>=2.5.3', 'websocket-client==0.52.0'],
       tests_require=['responses', 'pytest', 'python_dotenv', 'pytest-rerunfailures', 'tox'],
       cmdclass={'test': PyTest},
       author='Jeffrey Stylos',
diff --git a/test/integration/test_speech_to_text_v1.py b/test/integration/test_speech_to_text_v1.py
@@ -97,12 +97,12 @@ def on_error(self, error):
             def on_transcription(self, transcript):
                 self.transcript = transcript
 
-        testCallback = MyRecognizeCallback()
+        test_callback = MyRecognizeCallback()
         with open(os.path.join(os.path.dirname(__file__), '../../resources/speech.wav'), 'rb') as audio_file:
             audio_source = AudioSource(audio_file, False)
-            t = threading.Thread(target=self.speech_to_text.recognize_using_websocket, args=(audio_source, "audio/l16; rate=44100", testCallback))
+            t = threading.Thread(target=self.speech_to_text.recognize_using_websocket, args=(audio_source, "audio/l16; rate=44100", test_callback))
             t.start()
             t.join()
-        assert testCallback.error is None
-        assert testCallback.transcript is not None
-        assert testCallback.transcript[0]['transcript'] == 'thunderstorms could produce large hail isolated tornadoes and heavy rain '
+        assert test_callback.error is None
+        assert test_callback.transcript is not None
+        assert test_callback.transcript[0]['transcript'] == 'thunderstorms could produce large hail isolated tornadoes and heavy rain '
diff --git a/test/integration/test_text_to_speech_v1.py b/test/integration/test_text_to_speech_v1.py
@@ -1,6 +1,7 @@
 # coding: utf-8
 import unittest
 import watson_developer_cloud
+from watson_developer_cloud.websocket import SynthesizeCallback
 import pytest
 import os
 
@@ -67,3 +68,34 @@ def test_custom_words(self):
         self.text_to_speech.delete_word(customization_id, 'ACLs')
         word = self.text_to_speech.get_word(customization_id, 'MACLs').get_result()
         assert word['translation'] == 'mackles'
+
+    def test_synthesize_using_websocket(self):  # synthesizes a phrase over a websocket and checks audio was written
+        file_name = 'tongue_twister.wav'
+        class MySynthesizeCallback(SynthesizeCallback):
+            def __init__(self):
+                SynthesizeCallback.__init__(self)
+                self.fd = None
+                self.error = None
+
+            def on_connected(self):
+                self.fd = open(file_name, 'wb')  # truncate leftovers from an earlier failed run
+
+            def on_error(self, error):
+                self.error = error
+
+            def on_audio_stream(self, audio_stream):
+                self.fd.write(audio_stream)
+
+            def on_close(self):
+                self.fd.close()
+
+        test_callback = MySynthesizeCallback()
+        self.text_to_speech.synthesize_using_websocket('She sells seashells by the seashore',
+                                                       test_callback,
+                                                       accept='audio/wav',
+                                                       voice='en-GB_KateVoice'
+                                                      )
+        assert test_callback.error is None
+        assert test_callback.fd is not None
+        assert os.stat(file_name).st_size > 0  # only audio from this run can make the file non-empty
+        os.remove(file_name)
diff --git a/watson_developer_cloud/__init__.py b/watson_developer_cloud/__init__.py
@@ -31,5 +31,6 @@
 from .discovery_v1 import DiscoveryV1
 from .version import __version__
 from .speech_to_text_v1_adapter import SpeechToTextV1Adapter as SpeechToTextV1
+from .text_to_speech_adapter_v1 import TextToSpeechV1Adapter as TextToSpeechV1
 from .visual_recognition_v3_adapter import VisualRecognitionV3Adapter as VisualRecognitionV3
 from .discovery_v1_adapter import DiscoveryV1Adapter as DiscoveryV1
diff --git a/watson_developer_cloud/text_to_speech_adapter_v1.py b/watson_developer_cloud/text_to_speech_adapter_v1.py
diff --git a/watson_developer_cloud/websocket/__init__.py b/watson_developer_cloud/websocket/__init__.py
diff --git a/watson_developer_cloud/websocket/synthesize_callback.py b/watson_developer_cloud/websocket/synthesize_callback.py
diff --git a/watson_developer_cloud/websocket/synthesize_listener.py b/watson_developer_cloud/websocket/synthesize_listener.py