Skip to content

Commit 62f8cc6

Browse files
committed
java synthesizer events etc
1 parent cc3e7fe commit 62f8cc6

File tree

2 files changed

+117
-20
lines changed
  • articles/cognitive-services/Speech-Service/includes/how-to/speech-synthesis

2 files changed

+117
-20
lines changed

articles/cognitive-services/Speech-Service/includes/how-to/speech-synthesis/java.md

Lines changed: 101 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -198,17 +198,113 @@ public static void main(String[] args) {
198198
199199
## Subscribe to synthesizer events
200200

201+
You might want more insight into text-to-speech processing and results. For example, you might want to know when the synthesizer starts and stops, or you might want to learn about other events encountered during synthesis.
202+
201203
While using the [SpeechSynthesizer](/java/api/com.microsoft.cognitiveservices.speech.speechsynthesizer) for text-to-speech, you can subscribe to the events in this table:
202204

203205
[!INCLUDE [Event types](events.md)]
204206

205-
Here's an example that shows how to subscribe to the `BookmarkReached` event for speech synthesis.
207+
Here's an example that shows how to subscribe to events for speech synthesis. You can follow the instructions in the [quickstart](../../../get-started-text-to-speech.md?pivots=java), but replace the contents of the `SpeechSynthesis.java` file from the quickstart with the following Java code.
206208

207209
```java
208-
speechSynthesizer.BookmarkReached.addEventListener((o, e) -> {
209-
System.out.print("Bookmark reached. Audio offset: " + e.getAudioOffset() / 10000 + "ms, ");
210-
System.out.println("bookmark text: " + e.getText() + ".");
211-
});
210+
import com.microsoft.cognitiveservices.speech.*;
211+
import com.microsoft.cognitiveservices.speech.audio.*;
212+
213+
import java.util.Scanner;
214+
import java.util.concurrent.ExecutionException;
215+
216+
public class SpeechSynthesis {
217+
private static String speechKey = System.getenv("SPEECH_KEY");
218+
private static String speechRegion = System.getenv("SPEECH_REGION");
219+
220+
public static void main(String[] args) throws InterruptedException, ExecutionException {
221+
222+
SpeechConfig speechConfig = SpeechConfig.fromSubscription(speechKey, speechRegion);
223+
224+
String speechSynthesisVoiceName = "en-US-JennyNeural";
225+
226+
String ssml = String.format("<speak version='1.0' xml:lang='en-US' xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='http://www.w3.org/2001/mstts'>"
227+
.concat(String.format("<voice name='%s'>", speechSynthesisVoiceName))
228+
.concat("<mstts:viseme type='redlips_front'/>")
229+
.concat("The rainbow has seven colors: <bookmark mark='colors_list_begin'/>Red, orange, yellow, green, blue, indigo, and violet.<bookmark mark='colors_list_end'/>.")
230+
.concat("</voice>")
231+
.concat("</speak>"));
232+
233+
SpeechSynthesizer speechSynthesizer = new SpeechSynthesizer(speechConfig);
234+
{
235+
// Subscribe to events
236+
237+
speechSynthesizer.BookmarkReached.addEventListener((o, e) -> {
238+
System.out.println("BookmarkReached event:");
239+
System.out.println("\tAudioOffset: " + ((e.getAudioOffset() + 5000) / 10000) + "ms");
240+
System.out.println("\tText: " + e.getText());
241+
});
242+
243+
speechSynthesizer.SynthesisCanceled.addEventListener((o, e) -> {
244+
System.out.println("SynthesisCanceled event");
245+
});
246+
247+
speechSynthesizer.SynthesisCompleted.addEventListener((o, e) -> {
248+
SpeechSynthesisResult result = e.getResult();
249+
byte[] audioData = result.getAudioData();
250+
System.out.println("SynthesisCompleted event:");
251+
System.out.println("\tAudioData: " + audioData.length + " bytes");
252+
System.out.println("\tAudioDuration: " + result.getAudioDuration());
253+
result.close();
254+
});
255+
256+
speechSynthesizer.SynthesisStarted.addEventListener((o, e) -> {
257+
System.out.println("SynthesisStarted event");
258+
});
259+
260+
speechSynthesizer.Synthesizing.addEventListener((o, e) -> {
261+
SpeechSynthesisResult result = e.getResult();
262+
byte[] audioData = result.getAudioData();
263+
System.out.println("Synthesizing event:");
264+
System.out.println("\tAudioData: " + audioData.length + " bytes");
265+
result.close();
266+
});
267+
268+
speechSynthesizer.VisemeReceived.addEventListener((o, e) -> {
269+
System.out.println("VisemeReceived event:");
270+
System.out.println("\tAudioOffset: " + ((e.getAudioOffset() + 5000) / 10000) + "ms");
271+
System.out.println("\tVisemeId: " + e.getVisemeId());
272+
});
273+
274+
speechSynthesizer.WordBoundary.addEventListener((o, e) -> {
275+
System.out.println("WordBoundary event:");
276+
System.out.println("\tBoundaryType: " + e.getBoundaryType());
277+
System.out.println("\tAudioOffset: " + ((e.getAudioOffset() + 5000) / 10000) + "ms");
278+
System.out.println("\tDuration: " + e.getDuration());
279+
System.out.println("\tText: " + e.getText());
280+
System.out.println("\tTextOffset: " + e.getTextOffset());
281+
System.out.println("\tWordLength: " + e.getWordLength());
282+
});
283+
284+
// Synthesize the SSML
285+
System.out.println("SSML to synthesize:");
286+
System.out.println(ssml);
287+
SpeechSynthesisResult speechRecognitionResult = speechSynthesizer.SpeakSsmlAsync(ssml).get();
288+
289+
if (speechRecognitionResult.getReason() == ResultReason.SynthesizingAudioCompleted) {
290+
System.out.println("SynthesizingAudioCompleted result");
291+
}
292+
else if (speechRecognitionResult.getReason() == ResultReason.Canceled) {
293+
SpeechSynthesisCancellationDetails cancellation = SpeechSynthesisCancellationDetails.fromResult(speechRecognitionResult);
294+
System.out.println("CANCELED: Reason=" + cancellation.getReason());
295+
296+
if (cancellation.getReason() == CancellationReason.Error) {
297+
System.out.println("CANCELED: ErrorCode=" + cancellation.getErrorCode());
298+
System.out.println("CANCELED: ErrorDetails=" + cancellation.getErrorDetails());
299+
System.out.println("CANCELED: Did you set the speech resource key and region values?");
300+
}
301+
}
302+
}
303+
speechSynthesizer.close();
304+
305+
System.exit(0);
306+
}
307+
}
212308
```
213309

214310
You can find more text-to-speech samples at [GitHub](https://aka.ms/csspeech/samples).

articles/cognitive-services/Speech-Service/includes/how-to/speech-synthesis/python.md

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -164,40 +164,44 @@ import os
164164
import azure.cognitiveservices.speech as speechsdk
165165

166166
def speech_synthesizer_bookmark_reached_cb(evt: speechsdk.SessionEventArgs):
167-
print('BookmarkReached event')
167+
print('BookmarkReached event:')
168168
print('\tAudioOffset: {}ms'.format((evt.audio_offset + 5000) / 10000))
169169
print('\tText: {}'.format(evt.text))
170170

171171
def speech_synthesizer_synthesis_canceled_cb(evt: speechsdk.SessionEventArgs):
172172
print('SynthesisCanceled event')
173173

174174
def speech_synthesizer_synthesis_completed_cb(evt: speechsdk.SessionEventArgs):
175-
print('SynthesisCompleted event')
175+
print('SynthesisCompleted event:')
176176
print('\tAudioData: {} bytes'.format(len(evt.result.audio_data)))
177177
print('\tAudioDuration: {}'.format(evt.result.audio_duration))
178178

179179
def speech_synthesizer_synthesis_started_cb(evt: speechsdk.SessionEventArgs):
180180
print('SynthesisStarted event')
181181

182+
def speech_synthesizer_synthesizing_cb(evt: speechsdk.SessionEventArgs):
183+
print('Synthesizing event:')
184+
print('\tAudioData: {} bytes'.format(len(evt.result.audio_data)))
185+
186+
def speech_synthesizer_viseme_received_cb(evt: speechsdk.SessionEventArgs):
187+
print('VisemeReceived event:')
188+
print('\tAudioOffset: {}ms'.format((evt.audio_offset + 5000) / 10000))
189+
print('\tVisemeId: {}'.format(evt.viseme_id))
190+
182191
def speech_synthesizer_word_boundary_cb(evt: speechsdk.SessionEventArgs):
183-
print('WordBoundary event')
192+
print('WordBoundary event:')
184193
print('\tBoundaryType: {}'.format(evt.boundary_type))
185194
print('\tAudioOffset: {}ms'.format((evt.audio_offset + 5000) / 10000))
186195
print('\tDuration: {}'.format(evt.duration))
187196
print('\tText: {}'.format(evt.text))
188197
print('\tTextOffset: {}'.format(evt.text_offset))
189198
print('\tWordLength: {}'.format(evt.word_length))
190199

191-
def speech_synthesizer_synthesizing_cb(evt: speechsdk.SessionEventArgs):
192-
print('Synthesizing event')
193-
print('\tAudioData: {} bytes'.format(len(evt.result.audio_data)))
200+
speech_config = speechsdk.SpeechConfig(subscription=os.environ.get('SPEECH_KEY'), region=os.environ.get('SPEECH_REGION'))
194201

195-
def speech_synthesizer_viseme_received_cb(evt: speechsdk.SessionEventArgs):
196-
print('VisemeReceived event')
197-
print('\tAudioOffset: {}ms'.format((evt.audio_offset + 5000) / 10000))
198-
print('\tVisemeId: {}'.format(evt.viseme_id))
202+
# Required for WordBoundary event sentences.
203+
speech_config.set_property(property_id=speechsdk.PropertyId.SpeechServiceResponse_RequestSentenceBoundary, value='true')
199204

200-
speech_config = speechsdk.SpeechConfig(subscription=os.environ.get('SPEECH_KEY'), region=os.environ.get('SPEECH_REGION'))
201205
audio_config = speechsdk.audio.AudioOutputConfig(use_default_speaker=True)
202206
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
203207

@@ -206,12 +210,9 @@ speech_synthesizer.bookmark_reached.connect(speech_synthesizer_bookmark_reached_
206210
speech_synthesizer.synthesis_canceled.connect(speech_synthesizer_synthesis_canceled_cb)
207211
speech_synthesizer.synthesis_completed.connect(speech_synthesizer_synthesis_completed_cb)
208212
speech_synthesizer.synthesis_started.connect(speech_synthesizer_synthesis_started_cb)
209-
speech_synthesizer.synthesis_word_boundary.connect(speech_synthesizer_word_boundary_cb)
210213
speech_synthesizer.synthesizing.connect(speech_synthesizer_synthesizing_cb)
211214
speech_synthesizer.viseme_received.connect(speech_synthesizer_viseme_received_cb)
212-
213-
# Required for WordBoundary event sentences.
214-
speech_config.set_property(property_id=speechsdk.PropertyId.SpeechServiceResponse_RequestSentenceBoundary, value='true')
215+
speech_synthesizer.synthesis_word_boundary.connect(speech_synthesizer_word_boundary_cb)
215216

216217
# The language of the voice that speaks.
217218
speech_synthesis_voice_name='en-US-JennyNeural'

0 commit comments

Comments
 (0)