Commit a162667 — chore(Speech to Text): Apply manual changes
1 parent: 845120a

File tree: 3 files changed (+166 additions, −11 deletions)

speech-to-text/src/main/java/com/ibm/watson/speech_to_text/v1/SpeechToText.java

Lines changed: 56 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -81,9 +81,15 @@
8181
import com.ibm.watson.speech_to_text.v1.model.UpgradeLanguageModelOptions;
8282
import com.ibm.watson.speech_to_text.v1.model.Word;
8383
import com.ibm.watson.speech_to_text.v1.model.Words;
84+
import com.ibm.watson.speech_to_text.v1.websocket.RecognizeCallback;
85+
import com.ibm.watson.speech_to_text.v1.websocket.SpeechToTextWebSocketListener;
86+
import okhttp3.HttpUrl;
87+
import okhttp3.OkHttpClient;
88+
import okhttp3.Request;
89+
import okhttp3.WebSocket;
90+
8491
import java.util.Map;
8592
import java.util.Map.Entry;
86-
import okhttp3.MultipartBody;
8793

8894
/**
8995
* The IBM® Speech to Text service provides APIs that use IBM's speech-recognition capabilities to produce
@@ -354,6 +360,54 @@ public ServiceCall<SpeechRecognitionResults> recognize(RecognizeOptions recogniz
354360
return createServiceCall(builder.build(), responseConverter);
355361
}
356362

363+
/**
364+
* Sends audio and returns transcription results for recognition requests over a WebSocket connection. Requests and
365+
* responses are enabled over a single TCP connection that abstracts much of the complexity of the request to offer
366+
* efficient implementation, low latency, high throughput, and an asynchronous response. By default, only final
367+
* results are returned for any request; to enable interim results, set the interimResults parameter to true.
368+
*
369+
* The service imposes a data size limit of 100 MB per utterance (per recognition request). You can send multiple
370+
* utterances over a single WebSocket connection. The service automatically detects the endianness of the incoming
371+
* audio and, for audio that includes multiple channels, downmixes the audio to one-channel mono during transcoding.
372+
* (For the audio/l16 format, you can specify the endianness.)
373+
*
374+
* @param recognizeOptions the recognize options
375+
* @param callback the {@link RecognizeCallback} instance where results will be sent
376+
* @return the {@link WebSocket}
377+
*/
378+
public WebSocket recognizeUsingWebSocket(RecognizeOptions recognizeOptions, RecognizeCallback callback) {
379+
com.ibm.cloud.sdk.core.util.Validator.notNull(recognizeOptions, "recognizeOptions cannot be null");
380+
com.ibm.cloud.sdk.core.util.Validator.notNull(recognizeOptions.audio(), "audio cannot be null");
381+
com.ibm.cloud.sdk.core.util.Validator.notNull(callback, "callback cannot be null");
382+
383+
HttpUrl.Builder urlBuilder = HttpUrl.parse(getEndPoint() + "/v1/recognize").newBuilder();
384+
385+
if (recognizeOptions.model() != null) {
386+
urlBuilder.addQueryParameter("model", recognizeOptions.model());
387+
}
388+
if (recognizeOptions.customizationId() != null) {
389+
urlBuilder.addQueryParameter("customization_id", recognizeOptions.customizationId());
390+
}
391+
if (recognizeOptions.languageCustomizationId() != null) {
392+
urlBuilder.addQueryParameter("language_customization_id", recognizeOptions.languageCustomizationId());
393+
}
394+
if (recognizeOptions.acousticCustomizationId() != null) {
395+
urlBuilder.addQueryParameter("acoustic_customization_id", recognizeOptions.acousticCustomizationId());
396+
}
397+
if (recognizeOptions.baseModelVersion() != null) {
398+
urlBuilder.addQueryParameter("base_model_version", recognizeOptions.baseModelVersion());
399+
}
400+
401+
String url = urlBuilder.toString().replace("https://", "wss://");
402+
Request.Builder builder = new Request.Builder().url(url);
403+
404+
setAuthentication(builder);
405+
setDefaultHeaders(builder);
406+
407+
OkHttpClient client = configureHttpClient();
408+
return client.newWebSocket(builder.build(), new SpeechToTextWebSocketListener(recognizeOptions, callback));
409+
}
410+
357411
/**
358412
* Register a callback.
359413
*
@@ -1096,11 +1150,7 @@ public ServiceCall<Void> addCorpus(AddCorpusOptions addCorpusOptions) {
10961150
if (addCorpusOptions.allowOverwrite() != null) {
10971151
builder.query("allow_overwrite", String.valueOf(addCorpusOptions.allowOverwrite()));
10981152
}
1099-
MultipartBody.Builder multipartBuilder = new MultipartBody.Builder();
1100-
multipartBuilder.setType(MultipartBody.FORM);
1101-
okhttp3.RequestBody corpusFileBody = RequestUtils.inputStreamBody(addCorpusOptions.corpusFile(), "text/plain");
1102-
multipartBuilder.addFormDataPart("corpus_file", "filename", corpusFileBody);
1103-
builder.body(multipartBuilder.build());
1153+
builder.body(RequestUtils.inputStreamBody(addCorpusOptions.corpusFile(), "text/plain"));
11041154
ResponseConverter<Void> responseConverter = ResponseConverterUtils.getVoid();
11051155
return createServiceCall(builder.build(), responseConverter);
11061156
}

speech-to-text/src/main/java/com/ibm/watson/speech_to_text/v1/model/RecognizeOptions.java

Lines changed: 106 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@
1919
import java.util.ArrayList;
2020
import java.util.List;
2121

22+
import com.google.gson.annotations.SerializedName;
2223
import com.ibm.cloud.sdk.core.service.model.GenericModel;
2324

2425
/**
@@ -132,7 +133,8 @@ public interface Model {
132133
String ZH_CN_NARROWBANDMODEL = "zh-CN_NarrowbandModel";
133134
}
134135

135-
private InputStream audio;
136+
private transient InputStream audio;
137+
@SerializedName("content-type")
136138
private String contentType;
137139
private String model;
138140
private String languageCustomizationId;
@@ -153,6 +155,9 @@ public interface Model {
153155
private String grammarName;
154156
private Boolean redaction;
155157
private Boolean audioMetrics;
158+
private Boolean interimResults;
159+
private Boolean processingMetrics;
160+
private Float processingMetricsInterval;
156161

157162
/**
158163
* Builder.
@@ -179,6 +184,9 @@ public static class Builder {
179184
private String grammarName;
180185
private Boolean redaction;
181186
private Boolean audioMetrics;
187+
private Boolean interimResults;
188+
private Boolean processingMetrics;
189+
private Float processingMetricsInterval;
182190

183191
private Builder(RecognizeOptions recognizeOptions) {
184192
this.audio = recognizeOptions.audio;
@@ -202,6 +210,9 @@ private Builder(RecognizeOptions recognizeOptions) {
202210
this.grammarName = recognizeOptions.grammarName;
203211
this.redaction = recognizeOptions.redaction;
204212
this.audioMetrics = recognizeOptions.audioMetrics;
213+
this.interimResults = recognizeOptions.interimResults;
214+
this.processingMetrics = recognizeOptions.processingMetrics;
215+
this.processingMetricsInterval = recognizeOptions.processingMetricsInterval;
205216
}
206217

207218
/**
@@ -488,6 +499,45 @@ public Builder audio(File audio) throws FileNotFoundException {
488499
this.audio = new FileInputStream(audio);
489500
return this;
490501
}
502+
503+
/**
504+
* Set the interimResults.
505+
*
506+
* NOTE: This parameter only works for the `recognizeUsingWebSocket` method.
507+
*
508+
* @param interimResults the interimResults
509+
* @return the interimResults
510+
*/
511+
public Builder interimResults(Boolean interimResults) {
512+
this.interimResults = interimResults;
513+
return this;
514+
}
515+
516+
/**
517+
* Set the processingMetrics.
518+
*
519+
* NOTE: This parameter only works for the `recognizeUsingWebSocket` method.
520+
*
521+
* @param processingMetrics the processingMetrics
522+
* @return the processingMetrics
523+
*/
524+
public Builder processingMetrics(Boolean processingMetrics) {
525+
this.processingMetrics = processingMetrics;
526+
return this;
527+
}
528+
529+
/**
530+
* Set the processingMetricsInterval.
531+
*
532+
* NOTE: This parameter only works for the `recognizeUsingWebSocket` method.
533+
*
534+
* @param processingMetricsInterval the processingMetricsInterval
535+
* @return the processingMetricsInterval
536+
*/
537+
public Builder processingMetricsInterval(Float processingMetricsInterval) {
538+
this.processingMetricsInterval = processingMetricsInterval;
539+
return this;
540+
}
491541
}
492542

493543
private RecognizeOptions(Builder builder) {
@@ -514,6 +564,9 @@ private RecognizeOptions(Builder builder) {
514564
grammarName = builder.grammarName;
515565
redaction = builder.redaction;
516566
audioMetrics = builder.audioMetrics;
567+
interimResults = builder.interimResults;
568+
processingMetrics = builder.processingMetrics;
569+
processingMetricsInterval = builder.processingMetricsInterval;
517570
}
518571

519572
/**
@@ -840,4 +893,56 @@ public Boolean redaction() {
840893
public Boolean audioMetrics() {
841894
return audioMetrics;
842895
}
896+
897+
/**
898+
* Gets the interimResults.
899+
*
900+
* If `true`, the service returns interim results as a stream of `SpeechRecognitionResults` objects. By default,
901+
* the service returns a single `SpeechRecognitionResults` object with final results only.
902+
*
903+
* NOTE: This parameter only works for the `recognizeUsingWebSocket` method.
904+
*
905+
* @return the interimResults
906+
*/
907+
public Boolean interimResults() {
908+
return interimResults;
909+
}
910+
911+
/**
912+
* Gets the processingMetrics.
913+
*
914+
* If `true`, requests processing metrics about the service's transcription of the input audio. The service returns
915+
* processing metrics at the interval specified by the `processing_metrics_interval` parameter. It also returns
916+
* processing metrics for transcription events, for example, for final and interim results. By default, the service
917+
* returns no processing metrics.
918+
*
919+
* NOTE: This parameter only works for the `recognizeUsingWebSocket` method.
920+
*
921+
* @return the processingMetrics
922+
*/
923+
public Boolean processingMetrics() {
924+
return processingMetrics;
925+
}
926+
927+
/**
928+
* Gets the processingMetricsInterval.
929+
*
930+
* Specifies the interval in real wall-clock seconds at which the service is to return processing metrics. The
931+
* parameter is ignored unless the `processing_metrics` parameter is set to `true`.
932+
*
933+
* The parameter accepts a minimum value of 0.1 seconds. The level of precision is not restricted, so you can
934+
* specify values such as 0.25 and 0.125.
935+
*
936+
* The service does not impose a maximum value. If you want to receive processing metrics only for transcription
937+
* events instead of at periodic intervals, set the value to a large number. If the value is larger than the
938+
* duration of the audio, the service returns processing metrics only for transcription events.
939+
*
940+
* NOTE: This parameter only works for the `recognizeUsingWebSocket` method.
941+
*
942+
* @return the processingMetricsInterval
943+
*/
944+
public Float processingMetricsInterval() {
945+
return processingMetricsInterval;
946+
}
947+
843948
}

speech-to-text/src/main/java/com/ibm/watson/speech_to_text/v1/model/SpeechRecognitionAlternative.java

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -24,9 +24,9 @@ public class SpeechRecognitionAlternative extends GenericModel {
2424

2525
private String transcript;
2626
private Double confidence;
27-
private List<String> timestamps;
27+
private List<SpeechTimestamp> timestamps;
2828
@SerializedName("word_confidence")
29-
private List<String> wordConfidence;
29+
private List<SpeechWordConfidence> wordConfidence;
3030

3131
/**
3232
* Gets the transcript.
@@ -60,7 +60,7 @@ public Double getConfidence() {
6060
*
6161
* @return the timestamps
6262
*/
63-
public List<String> getTimestamps() {
63+
public List<SpeechTimestamp> getTimestamps() {
6464
return timestamps;
6565
}
6666

@@ -73,7 +73,7 @@ public List<String> getTimestamps() {
7373
*
7474
* @return the wordConfidence
7575
*/
76-
public List<String> getWordConfidence() {
76+
public List<SpeechWordConfidence> getWordConfidence() {
7777
return wordConfidence;
7878
}
7979
}

Comments (0)