Skip to content

Commit abaffc7

Browse files
authored
Merge pull request #604 from watson-developer-cloud/tts-websocket
feat(Text to Speech): Synthesize using web sockets
2 parents f2c1acb + 28d67d2 commit abaffc7

File tree

13 files changed

+521
-10
lines changed

13 files changed

+521
-10
lines changed

README.md

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
[![Build Status](https://travis-ci.org/watson-developer-cloud/python-sdk.svg?branch=master)](https://travis-ci.org/watson-developer-cloud/python-sdk)
44
[![Slack](https://wdc-slack-inviter.mybluemix.net/badge.svg)](https://wdc-slack-inviter.mybluemix.net)
5-
[![codecov.io](https://codecov.io/github/watson-developer-cloud/python-sdk/coverage.svg?branch=master)](https://codecov.io/github/watson-developer-cloud/python-sdk?branch=master)
65
[![Latest Stable Version](https://img.shields.io/pypi/v/watson-developer-cloud.svg)](https://pypi.python.org/pypi/watson-developer-cloud)
76
[![CLA assistant](https://cla-assistant.io/readme/badge/watson-developer-cloud/python-sdk)](https://cla-assistant.io/watson-developer-cloud/python-sdk)
87

@@ -251,13 +250,37 @@ This would give an output of `DetailedResponse` having the structure:
251250
```
252251
You can use `get_result()`, `get_headers()` and `get_status_code()` to return the result, headers and status code respectively.
253252

253+
## Using Websockets
254+
The Text to Speech service supports synthesizing text to spoken audio using web sockets with the `synthesize_using_websocket` method. The Speech to Text service supports recognizing speech to text using web sockets with the `recognize_using_websocket` method. These methods need a custom callback class to listen for events. Below is an example of `synthesize_using_websocket`. Note: The service accepts one request per connection.
255+
256+
```py
257+
from watson_developer_cloud.websocket import SynthesizeCallback
258+
259+
class MySynthesizeCallback(SynthesizeCallback):
260+
def __init__(self):
261+
SynthesizeCallback.__init__(self)
262+
263+
def on_audio_stream(self, audio_stream):
264+
return audio_stream
265+
266+
def on_data(self, data):
267+
return data
268+
269+
my_callback = MySynthesizeCallback()
270+
service.synthesize_using_websocket('I like to pet dogs',
271+
my_callback,
272+
accept='audio/wav',
273+
voice='en-US_AllisonVoice'
274+
)
275+
```
276+
254277
## Dependencies
255278

256279
* [requests]
257280
* `python_dateutil` >= 2.5.3
258281
* [responses] for testing
259282
* The following for web sockets support in Speech to Text and Text to Speech
260-
* `websocket-client` 0.47.0
283+
* `websocket-client` 0.52.0
261284

262285
## Contributing
263286

examples/speaker_text_to_speech.py

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
# You need to install pyaudio to run this example
2+
# pip install pyaudio
3+
4+
# In this example, the websocket connection is opened with a text
5+
# passed in the request. When the service responds with the synthesized
6+
# audio, pyaudio plays it in blocking mode
7+
8+
from watson_developer_cloud import TextToSpeechV1
9+
from watson_developer_cloud.websocket import SynthesizeCallback
10+
import pyaudio
11+
12+
# If service instance provides API key authentication
13+
service = TextToSpeechV1(
14+
## url is optional, and defaults to the URL below. Use the correct URL for your region.
15+
url='https://stream.watsonplatform.net/text-to-speech/api',
16+
iam_apikey='your_apikey')
17+
18+
# service = TextToSpeechV1(
19+
# ## url is optional, and defaults to the URL below. Use the correct URL for your region.
20+
# # url='https://stream.watsonplatform.net/text-to-speech/api',
21+
# username='YOUR SERVICE USERNAME',
22+
# password='YOUR SERVICE PASSWORD')
23+
24+
class Play(object):
25+
"""
26+
Wrapper to play the audio in a blocking mode
27+
"""
28+
def __init__(self):
29+
self.format = pyaudio.paInt16
30+
self.channels = 1
31+
self.rate = 22050
32+
self.chunk = 1024
33+
self.pyaudio = None
34+
self.stream = None
35+
36+
def start_streaming(self):
37+
self.pyaudio = pyaudio.PyAudio()
38+
self.stream = self._open_stream()
39+
self._start_stream()
40+
41+
def _open_stream(self):
42+
stream = self.pyaudio.open(
43+
format=self.format,
44+
channels=self.channels,
45+
rate=self.rate,
46+
output=True,
47+
frames_per_buffer=self.chunk,
48+
start=False
49+
)
50+
return stream
51+
52+
def _start_stream(self):
53+
self.stream.start_stream()
54+
55+
def write_stream(self, audio_stream):
56+
self.stream.write(audio_stream)
57+
58+
def complete_playing(self):
59+
self.stream.stop_stream()
60+
self.stream.close()
61+
self.pyaudio.terminate()
62+
63+
class MySynthesizeCallback(SynthesizeCallback):
64+
def __init__(self):
65+
SynthesizeCallback.__init__(self)
66+
self.play = Play()
67+
68+
def on_connected(self):
69+
print 'Opening stream to play'
70+
self.play.start_streaming()
71+
72+
def on_error(self, error):
73+
print 'Error received: {}'.format(error)
74+
75+
def on_timing_information(self, timing_information):
76+
print timing_information
77+
78+
def on_audio_stream(self, audio_stream):
79+
self.play.write_stream(audio_stream)
80+
81+
def on_close(self):
82+
print 'Completed synthesizing'
83+
self.play.complete_playing()
84+
85+
test_callback = MySynthesizeCallback()
86+
87+
# An example SSML text
88+
SSML_sorry_text = """<speak version=\"1.0\">
89+
<emphasis> I am sorry, I know how it feels.</emphasis>
90+
</speak>"""
91+
92+
# Another example of SSML text
93+
SSML_text = """
94+
<speak>
95+
I have been assigned to handle your order status request.
96+
<express-as type=\"Apology\">
97+
I am sorry to inform you that the items you requested are backordered.
98+
We apologize for the inconvenience.
99+
</express-as>
100+
<express-as type=\"Uncertainty\">
101+
We don't know when the items will become available. Maybe next week,
102+
but we are not sure at this time.
103+
</express-as>
104+
<express-as type=\"GoodNews\">
105+
But because we want you to be a satisfied customer, we are giving you
106+
a 50% discount on your order!
107+
</express-as>
108+
</speak>"""
109+
110+
service.synthesize_using_websocket(SSML_text,
111+
test_callback,
112+
accept='audio/wav',
113+
voice="en-US_AllisonVoice"
114+
)

examples/text_to_speech_v1.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import json
44
from os.path import join, dirname
55
from watson_developer_cloud import TextToSpeechV1
6+
from watson_developer_cloud.websocket import SynthesizeCallback
67

78
# If service instance provides API key authentication
89
# service = TextToSpeechV1(
@@ -63,3 +64,36 @@
6364

6465
# response = service.delete_voice_model('YOUR CUSTOMIZATION ID').get_result()
6566
# print(response)
67+
68+
# Synthesize using websocket. Note: The service accepts one request per connection
69+
file_path = join(dirname(__file__), "../resources/dog.wav")
70+
class MySynthesizeCallback(SynthesizeCallback):
71+
def __init__(self):
72+
SynthesizeCallback.__init__(self)
73+
self.fd = open(file_path, 'ab')
74+
75+
def on_connected(self):
76+
print('Connection was successful')
77+
78+
def on_error(self, error):
79+
print('Error received: {}'.format(error))
80+
81+
def on_content_type(self, content_type):
82+
print('Content type: {}'.format(content_type))
83+
84+
def on_timing_information(self, timing_information):
85+
print(timing_information)
86+
87+
def on_audio_stream(self, audio_stream):
88+
self.fd.write(audio_stream)
89+
90+
def on_close(self):
91+
self.fd.close()
92+
print('Done synthesizing. Closing the connection')
93+
94+
my_callback = MySynthesizeCallback()
95+
service.synthesize_using_websocket('I like to pet dogs',
96+
my_callback,
97+
accept='audio/wav',
98+
voice='en-US_AllisonVoice'
99+
)

requirements-dev.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,4 @@ Sphinx>=1.3.1
1717
bumpversion>=0.5.3
1818

1919
# Web sockets
20-
websocket-client==0.47.0
20+
websocket-client==0.52.0

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
requests>=2.0,<3.0
22
python_dateutil>=2.5.3
3-
websocket-client==0.47.0
3+
websocket-client==0.52.0

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ def run_tests(self):
6464
version=__version__,
6565
description='Client library to use the IBM Watson Services',
6666
license='Apache 2.0',
67-
install_requires=['requests>=2.0, <3.0', 'python_dateutil>=2.5.3', 'websocket-client==0.47.0'],
67+
install_requires=['requests>=2.0, <3.0', 'python_dateutil>=2.5.3', 'websocket-client==0.52.0'],
6868
tests_require=['responses', 'pytest', 'python_dotenv', 'pytest-rerunfailures', 'tox'],
6969
cmdclass={'test': PyTest},
7070
author='Jeffrey Stylos',

test/integration/test_speech_to_text_v1.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -97,12 +97,12 @@ def on_error(self, error):
9797
def on_transcription(self, transcript):
9898
self.transcript = transcript
9999

100-
testCallback = MyRecognizeCallback()
100+
test_callback = MyRecognizeCallback()
101101
with open(os.path.join(os.path.dirname(__file__), '../../resources/speech.wav'), 'rb') as audio_file:
102102
audio_source = AudioSource(audio_file, False)
103-
t = threading.Thread(target=self.speech_to_text.recognize_using_websocket, args=(audio_source, "audio/l16; rate=44100", testCallback))
103+
t = threading.Thread(target=self.speech_to_text.recognize_using_websocket, args=(audio_source, "audio/l16; rate=44100", test_callback))
104104
t.start()
105105
t.join()
106-
assert testCallback.error is None
107-
assert testCallback.transcript is not None
108-
assert testCallback.transcript[0]['transcript'] == 'thunderstorms could produce large hail isolated tornadoes and heavy rain '
106+
assert test_callback.error is None
107+
assert test_callback.transcript is not None
108+
assert test_callback.transcript[0]['transcript'] == 'thunderstorms could produce large hail isolated tornadoes and heavy rain '

test/integration/test_text_to_speech_v1.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# coding: utf-8
22
import unittest
33
import watson_developer_cloud
4+
from watson_developer_cloud.websocket import SynthesizeCallback
45
import pytest
56
import os
67

@@ -67,3 +68,34 @@ def test_custom_words(self):
6768
self.text_to_speech.delete_word(customization_id, 'ACLs')
6869
word = self.text_to_speech.get_word(customization_id, 'MACLs').get_result()
6970
assert word['translation'] == 'mackles'
71+
72+
def test_synthesize_using_websocket(self):
73+
file = 'tongue_twister.wav'
74+
class MySynthesizeCallback(SynthesizeCallback):
75+
def __init__(self):
76+
SynthesizeCallback.__init__(self)
77+
self.fd = None
78+
self.error = None
79+
80+
def on_connected(self):
81+
self.fd = open(file, 'ab')
82+
83+
def on_error(self, error):
84+
self.error = error
85+
86+
def on_audio_stream(self, audio_stream):
87+
self.fd.write(audio_stream)
88+
89+
def on_close(self):
90+
self.fd.close()
91+
92+
test_callback = MySynthesizeCallback()
93+
self.text_to_speech.synthesize_using_websocket('She sells seashells by the seashore',
94+
test_callback,
95+
accept='audio/wav',
96+
voice='en-GB_KateVoice'
97+
)
98+
assert test_callback.error is None
99+
assert test_callback.fd is not None
100+
assert os.stat(file).st_size > 0
101+
os.remove(file)

watson_developer_cloud/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,5 +31,6 @@
3131
from .discovery_v1 import DiscoveryV1
3232
from .version import __version__
3333
from .speech_to_text_v1_adapter import SpeechToTextV1Adapter as SpeechToTextV1
34+
from .text_to_speech_adapter_v1 import TextToSpeechV1Adapter as TextToSpeechV1
3435
from .visual_recognition_v3_adapter import VisualRecognitionV3Adapter as VisualRecognitionV3
3536
from .discovery_v1_adapter import DiscoveryV1Adapter as DiscoveryV1

0 commit comments

Comments
 (0)