Skip to content

Commit ee540fd

Browse files
Merge pull request #1015 from watson-developer-cloud/tts-websocket
Add WebSocket support for Text to Speech synthesize endpoint
2 parents 1970def + 377ec45 commit ee540fd

File tree

14 files changed

+740
-0
lines changed

14 files changed

+740
-0
lines changed

text-to-speech/README.md

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,4 +27,44 @@ Voices voices = service.listVoices().execute();
2727
System.out.println(voices);
2828
```
2929

30+
## Usage with WebSockets
31+
The Watson Text to Speech service supports the use of WebSockets as an alternative to the `synthesize()` method, which converts text to speech. Here is an example of using the WebSocket version of the method to get an audio file:
32+
```java
33+
TextToSpeech service = new TextToSpeech();
34+
service.setUsernameAndPassword("<username>", "<password>");
35+
36+
String text = "It's beginning to look a lot like Christmas";
37+
SynthesizeOptions synthesizeOptions = new SynthesizeOptions.Builder()
38+
.text(text)
39+
.accept(SynthesizeOptions.Accept.AUDIO_OGG_CODECS_OPUS)
40+
.build();
41+
42+
// a callback is defined to handle certain events, like an audio transmission or a timing marker
43+
// in this case, we'll build up a byte array of all the received bytes to build the resulting file
44+
final ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
45+
service.synthesizeUsingWebSocket(synthesizeOptions, new BaseSynthesizeCallback() {
46+
@Override
47+
public void onAudioStream(byte[] bytes) {
48+
// append to our byte array
49+
try {
50+
byteArrayOutputStream.write(bytes);
51+
} catch (IOException e) {
52+
e.printStackTrace();
53+
}
54+
}
55+
});
56+
57+
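// quick way to wait for synthesis to complete, since synthesizeUsingWebSocket() runs asynchronously
// (a fixed sleep keeps this example short; real code should wait for a completion signal instead of guessing a delay)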
58+
Thread.sleep(5000);
59+
60+
// create file with audio data
61+
String filename = "synthesize_websocket_test.ogg";
62+
OutputStream fileOutputStream = new FileOutputStream(filename);
63+
byteArrayOutputStream.writeTo(fileOutputStream);
64+
65+
// clean up
66+
byteArrayOutputStream.close();
67+
fileOutputStream.close();
68+
```
69+
3070
[text_to_speech]: https://console.bluemix.net/docs/services/text-to-speech/index.html

text-to-speech/src/main/java/com/ibm/watson/developer_cloud/text_to_speech/v1/TextToSpeech.java

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,16 @@
3939
import com.ibm.watson.developer_cloud.text_to_speech.v1.model.VoiceModels;
4040
import com.ibm.watson.developer_cloud.text_to_speech.v1.model.Voices;
4141
import com.ibm.watson.developer_cloud.text_to_speech.v1.model.Words;
42+
import com.ibm.watson.developer_cloud.text_to_speech.v1.websocket.SynthesizeCallback;
43+
import com.ibm.watson.developer_cloud.text_to_speech.v1.websocket.TextToSpeechWebSocketListener;
4244
import com.ibm.watson.developer_cloud.util.GsonSingleton;
4345
import com.ibm.watson.developer_cloud.util.ResponseConverterUtils;
4446
import com.ibm.watson.developer_cloud.util.Validator;
47+
import okhttp3.HttpUrl;
48+
import okhttp3.OkHttpClient;
49+
import okhttp3.Request;
50+
import okhttp3.WebSocket;
51+
4552
import java.io.InputStream;
4653

4754
/**
@@ -260,6 +267,29 @@ public ServiceCall<InputStream> synthesize(SynthesizeOptions synthesizeOptions)
260267
return createServiceCall(builder.build(), ResponseConverterUtils.getInputStream());
261268
}
262269

270+
public WebSocket synthesizeUsingWebSocket(SynthesizeOptions synthesizeOptions, SynthesizeCallback callback) {
271+
Validator.notNull(synthesizeOptions, "synthesizeOptions cannot be null");
272+
Validator.notNull(callback, "callback cannot be null");
273+
274+
HttpUrl.Builder urlBuilder = HttpUrl.parse(getEndPoint() + "/v1/synthesize").newBuilder();
275+
276+
if (synthesizeOptions.voice() != null) {
277+
urlBuilder.addQueryParameter("voice", synthesizeOptions.voice());
278+
}
279+
if (synthesizeOptions.customizationId() != null) {
280+
urlBuilder.addQueryParameter("customization_id", synthesizeOptions.customizationId());
281+
}
282+
283+
String url = urlBuilder.toString().replace("https://", "wss://");
284+
Request.Builder builder = new Request.Builder().url(url);
285+
286+
setAuthentication(builder);
287+
setDefaultHeaders(builder);
288+
289+
OkHttpClient client = configureHttpClient();
290+
return client.newWebSocket(builder.build(), new TextToSpeechWebSocketListener(synthesizeOptions, callback));
291+
}
292+
263293
/**
264294
* Get pronunciation.
265295
*
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
package com.ibm.watson.developer_cloud.text_to_speech.v1.model;
2+
3+
import com.google.gson.annotations.JsonAdapter;
4+
import com.ibm.watson.developer_cloud.service.model.GenericModel;
5+
import com.ibm.watson.developer_cloud.text_to_speech.v1.util.MarkTimingTypeAdapter;
6+
7+
@JsonAdapter(MarkTimingTypeAdapter.class)
8+
public class MarkTiming extends GenericModel {
9+
private String mark;
10+
private Double time;
11+
12+
public String getMark() {
13+
return mark;
14+
}
15+
16+
public Double getTime() {
17+
return time;
18+
}
19+
20+
public void setMark(String mark) {
21+
this.mark = mark;
22+
}
23+
24+
public void setTime(Double time) {
25+
this.time = time;
26+
}
27+
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
package com.ibm.watson.developer_cloud.text_to_speech.v1.model;
2+
3+
import com.ibm.watson.developer_cloud.service.model.GenericModel;
4+
5+
import java.util.List;
6+
7+
public class Marks extends GenericModel {
8+
private List<MarkTiming> marks;
9+
10+
public List<MarkTiming> getMarks() {
11+
return marks;
12+
}
13+
}

text-to-speech/src/main/java/com/ibm/watson/developer_cloud/text_to_speech/v1/model/SynthesizeOptions.java

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
import com.ibm.watson.developer_cloud.service.model.GenericModel;
1616
import com.ibm.watson.developer_cloud.util.Validator;
1717

18+
import java.util.List;
19+
1820
/**
1921
* The synthesize options.
2022
*/
@@ -94,6 +96,7 @@ public interface Voice {
9496
private String accept;
9597
private String voice;
9698
private String customizationId;
99+
private List<String> timings;
97100

98101
/**
99102
* Builder.
@@ -103,12 +106,14 @@ public static class Builder {
103106
private String accept;
104107
private String voice;
105108
private String customizationId;
109+
private List<String> timings;
106110

107111
private Builder(SynthesizeOptions synthesizeOptions) {
108112
text = synthesizeOptions.text;
109113
accept = synthesizeOptions.accept;
110114
voice = synthesizeOptions.voice;
111115
customizationId = synthesizeOptions.customizationId;
116+
timings = synthesizeOptions.timings;
112117
}
113118

114119
/**
@@ -178,6 +183,17 @@ public Builder customizationId(String customizationId) {
178183
this.customizationId = customizationId;
179184
return this;
180185
}
186+
187+
/**
188+
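* Set the timings. Specify `words` as the element of the list to request word timing information; pass an empty
* list or omit the parameter to receive none. This parameter only works for the `synthesizeUsingWebSocket` method.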
189+
*
190+
* @param timings the timings
191+
* @return the SynthesizeOptions builder
192+
*/
193+
public Builder timings(List<String> timings) {
194+
this.timings = timings;
195+
return this;
196+
}
181197
}
182198

183199
private SynthesizeOptions(Builder builder) {
@@ -186,6 +202,7 @@ private SynthesizeOptions(Builder builder) {
186202
accept = builder.accept;
187203
voice = builder.voice;
188204
customizationId = builder.customizationId;
205+
timings = builder.timings;
189206
}
190207

191208
/**
@@ -247,4 +264,20 @@ public String voice() {
247264
public String customizationId() {
248265
return customizationId;
249266
}
267+
268+
/**
269+
* Gets the timings.
270+
*
271+
* An array that specifies whether the service is to return word timing information for all strings of the input
272+
* text. Specify `words` as the element of the array to request word timing information. The service returns the
273+
* start and end time of each word of the input. Specify an empty array or omit the parameter to receive no word
274+
* timing information. Not supported for Japanese input text.
275+
*
276+
* NOTE: This parameter only works for the `synthesizeUsingWebSocket` method.
277+
*
278+
* @return the timings
279+
*/
280+
public List<String> timings() {
281+
return timings;
282+
}
250283
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
package com.ibm.watson.developer_cloud.text_to_speech.v1.model;
2+
3+
import com.ibm.watson.developer_cloud.service.model.GenericModel;
4+
5+
import java.util.List;
6+
7+
public class Timings extends GenericModel {
8+
private List<WordTiming> words;
9+
10+
public List<WordTiming> getWords() {
11+
return words;
12+
}
13+
}
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
package com.ibm.watson.developer_cloud.text_to_speech.v1.model;
2+
3+
import com.google.gson.annotations.JsonAdapter;
4+
import com.ibm.watson.developer_cloud.service.model.GenericModel;
5+
import com.ibm.watson.developer_cloud.text_to_speech.v1.util.WordTimingTypeAdapter;
6+
7+
@JsonAdapter(WordTimingTypeAdapter.class)
8+
public class WordTiming extends GenericModel {
9+
private String word;
10+
private Double startTime;
11+
private Double endTime;
12+
13+
public String getWord() {
14+
return word;
15+
}
16+
17+
public Double getStartTime() {
18+
return startTime;
19+
}
20+
21+
public Double getEndTime() {
22+
return endTime;
23+
}
24+
25+
public void setWord(String word) {
26+
this.word = word;
27+
}
28+
29+
public void setStartTime(Double startTime) {
30+
this.startTime = startTime;
31+
}
32+
33+
public void setEndTime(Double endTime) {
34+
this.endTime = endTime;
35+
}
36+
}
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
package com.ibm.watson.developer_cloud.text_to_speech.v1.util;
2+
3+
import com.google.gson.TypeAdapter;
4+
import com.google.gson.stream.JsonReader;
5+
import com.google.gson.stream.JsonToken;
6+
import com.google.gson.stream.JsonWriter;
7+
import com.ibm.watson.developer_cloud.text_to_speech.v1.model.MarkTiming;
8+
9+
import java.io.IOException;
10+
11+
public class MarkTimingTypeAdapter extends TypeAdapter<MarkTiming> {
12+
/*
13+
* (non-Javadoc)
14+
* @see com.google.gson.TypeAdapter#read(com.google.gson.stream.JsonReader)
15+
*/
16+
@Override
17+
public MarkTiming read(JsonReader in) throws IOException {
18+
if (in.peek() == JsonToken.NULL) {
19+
in.nextNull();
20+
return null;
21+
}
22+
23+
final MarkTiming markTiming = new MarkTiming();
24+
in.beginArray();
25+
26+
if (in.peek() == JsonToken.STRING) {
27+
markTiming.setMark(in.nextString());
28+
}
29+
if (in.peek() == JsonToken.NUMBER) {
30+
markTiming.setTime(in.nextDouble());
31+
}
32+
33+
in.endArray();
34+
return markTiming;
35+
}
36+
37+
/*
38+
* (non-Javadoc)
39+
* @see com.google.gson.TypeAdapter#write(com.google.gson.stream.JsonWriter, java.lang.Object)
40+
*/
41+
@Override
42+
public void write(JsonWriter out, MarkTiming markTiming) throws IOException {
43+
out.beginArray();
44+
45+
out.value(markTiming.getMark());
46+
out.value(markTiming.getTime());
47+
48+
out.endArray();
49+
out.flush();
50+
}
51+
}
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
package com.ibm.watson.developer_cloud.text_to_speech.v1.util;
2+
3+
import com.google.gson.TypeAdapter;
4+
import com.google.gson.stream.JsonReader;
5+
import com.google.gson.stream.JsonToken;
6+
import com.google.gson.stream.JsonWriter;
7+
import com.ibm.watson.developer_cloud.text_to_speech.v1.model.WordTiming;
8+
9+
import java.io.IOException;
10+
11+
public class WordTimingTypeAdapter extends TypeAdapter<WordTiming> {
12+
/*
13+
* (non-Javadoc)
14+
* @see com.google.gson.TypeAdapter#read(com.google.gson.stream.JsonReader)
15+
*/
16+
@Override
17+
public WordTiming read(JsonReader in) throws IOException {
18+
if (in.peek() == JsonToken.NULL) {
19+
in.nextNull();
20+
return null;
21+
}
22+
23+
final WordTiming wordTiming = new WordTiming();
24+
in.beginArray();
25+
26+
if (in.peek() == JsonToken.STRING) {
27+
wordTiming.setWord(in.nextString());
28+
}
29+
if (in.peek() == JsonToken.NUMBER) {
30+
wordTiming.setStartTime(in.nextDouble());
31+
}
32+
if (in.peek() == JsonToken.NUMBER) {
33+
wordTiming.setEndTime(in.nextDouble());
34+
}
35+
36+
in.endArray();
37+
return wordTiming;
38+
}
39+
40+
/*
41+
* (non-Javadoc)
42+
* @see com.google.gson.TypeAdapter#write(com.google.gson.stream.JsonWriter, java.lang.Object)
43+
*/
44+
@Override
45+
public void write(JsonWriter out, WordTiming wordTiming) throws IOException {
46+
out.beginArray();
47+
48+
out.value(wordTiming.getWord());
49+
out.value(wordTiming.getStartTime());
50+
out.value(wordTiming.getEndTime());
51+
52+
out.endArray();
53+
out.flush();
54+
}
55+
}

0 commit comments

Comments
 (0)