Skip to content

Commit f733552

Browse files
committed
chore(Speech to Text): Apply manual changes
1 parent faefce2 commit f733552

File tree

4 files changed: +167 additions, −21 deletions

speech-to-text/src/main/java/com/ibm/watson/speech_to_text/v1/SpeechToText.java

Lines changed: 56 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -83,10 +83,15 @@
8383
import com.ibm.watson.speech_to_text.v1.model.UpgradeLanguageModelOptions;
8484
import com.ibm.watson.speech_to_text.v1.model.Word;
8585
import com.ibm.watson.speech_to_text.v1.model.Words;
86+
import com.ibm.watson.speech_to_text.v1.websocket.RecognizeCallback;
87+
import com.ibm.watson.speech_to_text.v1.websocket.SpeechToTextWebSocketListener;
88+
import okhttp3.HttpUrl;
89+
import okhttp3.OkHttpClient;
90+
import okhttp3.Request;
91+
import okhttp3.WebSocket;
92+
8693
import java.util.Map;
8794
import java.util.Map.Entry;
88-
import okhttp3.MultipartBody;
89-
import okhttp3.RequestBody;
9095

9196
/**
9297
* The IBM® Speech to Text service provides APIs that use IBM's speech-recognition capabilities to produce
@@ -690,6 +695,54 @@ public ServiceCall<RecognitionJobs> checkJobs() {
690695
return checkJobs(null);
691696
}
692697

698+
/**
699+
* Sends audio and returns transcription results for recognition requests over a WebSocket connection. Requests and
700+
* responses are enabled over a single TCP connection that abstracts much of the complexity of the request to offer
701+
* efficient implementation, low latency, high throughput, and an asynchronous response. By default, only final
702+
* results are returned for any request; to enable interim results, set the interimResults parameter to true.
703+
*
704+
* The service imposes a data size limit of 100 MB per utterance (per recognition request). You can send multiple
705+
* utterances over a single WebSocket connection. The service automatically detects the endianness of the incoming
706+
* audio and, for audio that includes multiple channels, downmixes the audio to one-channel mono during transcoding.
707+
* (For the audio/l16 format, you can specify the endianness.)
708+
*
709+
* @param recognizeOptions the recognize options
710+
* @param callback the {@link RecognizeCallback} instance where results will be sent
711+
* @return the {@link WebSocket}
712+
*/
713+
public WebSocket recognizeUsingWebSocket(RecognizeOptions recognizeOptions, RecognizeCallback callback) {
714+
Validator.notNull(recognizeOptions, "recognizeOptions cannot be null");
715+
Validator.notNull(recognizeOptions.audio(), "audio cannot be null");
716+
Validator.notNull(callback, "callback cannot be null");
717+
718+
HttpUrl.Builder urlBuilder = HttpUrl.parse(getEndPoint() + "/v1/recognize").newBuilder();
719+
720+
if (recognizeOptions.model() != null) {
721+
urlBuilder.addQueryParameter("model", recognizeOptions.model());
722+
}
723+
if (recognizeOptions.customizationId() != null) {
724+
urlBuilder.addQueryParameter("customization_id", recognizeOptions.customizationId());
725+
}
726+
if (recognizeOptions.languageCustomizationId() != null) {
727+
urlBuilder.addQueryParameter("language_customization_id", recognizeOptions.languageCustomizationId());
728+
}
729+
if (recognizeOptions.acousticCustomizationId() != null) {
730+
urlBuilder.addQueryParameter("acoustic_customization_id", recognizeOptions.acousticCustomizationId());
731+
}
732+
if (recognizeOptions.baseModelVersion() != null) {
733+
urlBuilder.addQueryParameter("base_model_version", recognizeOptions.baseModelVersion());
734+
}
735+
736+
String url = urlBuilder.toString().replace("https://", "wss://");
737+
Request.Builder builder = new Request.Builder().url(url);
738+
739+
setAuthentication(builder);
740+
setDefaultHeaders(builder);
741+
742+
OkHttpClient client = configureHttpClient();
743+
return client.newWebSocket(builder.build(), new SpeechToTextWebSocketListener(recognizeOptions, callback));
744+
}
745+
693746
/**
694747
* Check a job.
695748
*
@@ -926,10 +979,6 @@ public ServiceCall<Void> deleteLanguageModel(DeleteLanguageModelOptions deleteLa
926979
* * The service is currently handling another request for the custom model, such as another training request or a
927980
* request to add a corpus or grammar to the model.
928981
* * No training data have been added to the custom model.
929-
* * The custom model contains one or more invalid corpora, grammars, or words (for example, a custom word has an
930-
* invalid sounds-like pronunciation). You can correct the invalid resources or set the `strict` parameter to `false`
931-
* to exclude the invalid resources from the training. The model must contain at least one valid resource for training
932-
* to succeed.
933982
*
934983
* @param trainLanguageModelOptions the {@link TrainLanguageModelOptions} containing the options for the call
935984
* @return a {@link ServiceCall} with a response type of {@link TrainingResponse}
@@ -1109,11 +1158,7 @@ public ServiceCall<Void> addCorpus(AddCorpusOptions addCorpusOptions) {
11091158
if (addCorpusOptions.allowOverwrite() != null) {
11101159
builder.query("allow_overwrite", String.valueOf(addCorpusOptions.allowOverwrite()));
11111160
}
1112-
MultipartBody.Builder multipartBuilder = new MultipartBody.Builder();
1113-
multipartBuilder.setType(MultipartBody.FORM);
1114-
RequestBody corpusFileBody = RequestUtils.inputStreamBody(addCorpusOptions.corpusFile(), "text/plain");
1115-
multipartBuilder.addFormDataPart("corpus_file", "filename", corpusFileBody);
1116-
builder.body(multipartBuilder.build());
1161+
builder.body(RequestUtils.inputStreamBody(addCorpusOptions.corpusFile(), "text/plain"));
11171162
ResponseConverter<Void> responseConverter = ResponseConverterUtils.getVoid();
11181163
return createServiceCall(builder.build(), responseConverter);
11191164
}
@@ -1736,9 +1781,6 @@ public ServiceCall<Void> deleteAcousticModel(DeleteAcousticModelOptions deleteAc
17361781
* * The custom model contains less than 10 minutes or more than 200 hours of audio data.
17371782
* * You passed an incompatible custom language model with the `custom_language_model_id` query parameter. Both custom
17381783
* models must be based on the same version of the same base model.
1739-
* * The custom model contains one or more invalid audio resources. You can correct the invalid audio resources or set
1740-
* the `strict` parameter to `false` to exclude the invalid resources from the training. The model must contain at
1741-
* least one valid resource for training to succeed.
17421784
*
17431785
* @param trainAcousticModelOptions the {@link TrainAcousticModelOptions} containing the options for the call
17441786
* @return a {@link ServiceCall} with a response type of {@link TrainingResponse}

speech-to-text/src/main/java/com/ibm/watson/speech_to_text/v1/model/RecognizeOptions.java

Lines changed: 105 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import java.util.ArrayList;
2020
import java.util.List;
2121

22+
import com.google.gson.annotations.SerializedName;
2223
import com.ibm.cloud.sdk.core.service.model.GenericModel;
2324
import com.ibm.cloud.sdk.core.util.Validator;
2425

@@ -113,7 +114,7 @@ public interface ContentType {
113114
String AUDIO_WEBM_CODECS_VORBIS = "audio/webm;codecs=vorbis";
114115
}
115116

116-
private InputStream audio;
117+
private transient InputStream audio;
117118
private String model;
118119
private String languageCustomizationId;
119120
private String acousticCustomizationId;
@@ -133,7 +134,11 @@ public interface ContentType {
133134
private String grammarName;
134135
private Boolean redaction;
135136
private Boolean audioMetrics;
137+
@SerializedName("content-type")
136138
private String contentType;
139+
private Boolean interimResults;
140+
private Boolean processingMetrics;
141+
private Float processingMetricsInterval;
137142

138143
/**
139144
* Builder.
@@ -160,6 +165,9 @@ public static class Builder {
160165
private Boolean redaction;
161166
private Boolean audioMetrics;
162167
private String contentType;
168+
private Boolean interimResults;
169+
private Boolean processingMetrics;
170+
private Float processingMetricsInterval;
163171

164172
private Builder(RecognizeOptions recognizeOptions) {
165173
this.audio = recognizeOptions.audio;
@@ -183,6 +191,9 @@ private Builder(RecognizeOptions recognizeOptions) {
183191
this.redaction = recognizeOptions.redaction;
184192
this.audioMetrics = recognizeOptions.audioMetrics;
185193
this.contentType = recognizeOptions.contentType;
194+
this.interimResults = recognizeOptions.interimResults;
195+
this.processingMetrics = recognizeOptions.processingMetrics;
196+
this.processingMetricsInterval = recognizeOptions.processingMetricsInterval;
186197
}
187198

188199
/**
@@ -468,6 +479,45 @@ public Builder audio(File audio) throws FileNotFoundException {
468479
this.audio = new FileInputStream(audio);
469480
return this;
470481
}
482+
483+
/**
484+
* Set the interimResults.
485+
*
486+
* NOTE: This parameter only works for the `recognizeUsingWebSocket` method.
487+
*
488+
* @param interimResults the interimResults
489+
* @return the interimResults
490+
*/
491+
public Builder interimResults(Boolean interimResults) {
492+
this.interimResults = interimResults;
493+
return this;
494+
}
495+
496+
/**
497+
* Set the processingMetrics.
498+
*
499+
* NOTE: This parameter only works for the `recognizeUsingWebSocket` method.
500+
*
501+
* @param processingMetrics the processingMetrics
502+
* @return the processingMetrics
503+
*/
504+
public Builder processingMetrics(Boolean processingMetrics) {
505+
this.processingMetrics = processingMetrics;
506+
return this;
507+
}
508+
509+
/**
510+
* Set the processingMetricsInterval.
511+
*
512+
* NOTE: This parameter only works for the `recognizeUsingWebSocket` method.
513+
*
514+
* @param processingMetricsInterval the processingMetricsInterval
515+
* @return the processingMetricsInterval
516+
*/
517+
public Builder processingMetricsInterval(Float processingMetricsInterval) {
518+
this.processingMetricsInterval = processingMetricsInterval;
519+
return this;
520+
}
471521
}
472522

473523
private RecognizeOptions(Builder builder) {
@@ -493,6 +543,9 @@ private RecognizeOptions(Builder builder) {
493543
redaction = builder.redaction;
494544
audioMetrics = builder.audioMetrics;
495545
contentType = builder.contentType;
546+
interimResults = builder.interimResults;
547+
processingMetrics = builder.processingMetrics;
548+
processingMetricsInterval = builder.processingMetricsInterval;
496549
}
497550

498551
/**
@@ -819,4 +872,55 @@ public Boolean audioMetrics() {
819872
public String contentType() {
820873
return contentType;
821874
}
875+
876+
/**
877+
* Gets the interimResults.
878+
*
879+
* If `true`, the service returns interim results as a stream of `SpeechRecognitionResults` objects. By default,
880+
* the service returns a single `SpeechRecognitionResults` object with final results only.
881+
*
882+
* NOTE: This parameter only works for the `recognizeUsingWebSocket` method.
883+
*
884+
* @return the interimResults
885+
*/
886+
public Boolean interimResults() {
887+
return interimResults;
888+
}
889+
890+
/**
891+
* Gets the processingMetrics.
892+
*
893+
* If `true`, requests processing metrics about the service's transcription of the input audio. The service returns
894+
* processing metrics at the interval specified by the `processing_metrics_interval` parameter. It also returns
895+
* processing metrics for transcription events, for example, for final and interim results. By default, the service
896+
* returns no processing metrics.
897+
*
898+
* NOTE: This parameter only works for the `recognizeUsingWebSocket` method.
899+
*
900+
* @return the processingMetrics
901+
*/
902+
public Boolean processingMetrics() {
903+
return processingMetrics;
904+
}
905+
906+
/**
907+
* Gets the processingMetricsInterval.
908+
*
909+
* Specifies the interval in real wall-clock seconds at which the service is to return processing metrics. The
910+
* parameter is ignored unless the `processing_metrics` parameter is set to `true`.
911+
*
912+
* The parameter accepts a minimum value of 0.1 seconds. The level of precision is not restricted, so you can
913+
* specify values such as 0.25 and 0.125.
914+
*
915+
* The service does not impose a maximum value. If you want to receive processing metrics only for transcription
916+
* events instead of at periodic intervals, set the value to a large number. If the value is larger than the
917+
* duration of the audio, the service returns processing metrics only for transcription events.
918+
*
919+
* NOTE: This parameter only works for the `recognizeUsingWebSocket` method.
920+
*
921+
* @return the processingMetricsInterval
922+
*/
923+
public Float processingMetricsInterval() {
924+
return processingMetricsInterval;
925+
}
822926
}

speech-to-text/src/main/java/com/ibm/watson/speech_to_text/v1/model/SpeechRecognitionAlternative.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,9 @@ public class SpeechRecognitionAlternative extends GenericModel {
2424

2525
private String transcript;
2626
private Double confidence;
27-
private List<String> timestamps;
27+
private List<SpeechTimestamp> timestamps;
2828
@SerializedName("word_confidence")
29-
private List<String> wordConfidence;
29+
private List<SpeechWordConfidence> wordConfidence;
3030

3131
/**
3232
* Gets the transcript.
@@ -60,7 +60,7 @@ public Double getConfidence() {
6060
*
6161
* @return the timestamps
6262
*/
63-
public List<String> getTimestamps() {
63+
public List<SpeechTimestamp> getTimestamps() {
6464
return timestamps;
6565
}
6666

@@ -73,7 +73,7 @@ public List<String> getTimestamps() {
7373
*
7474
* @return the wordConfidence
7575
*/
76-
public List<String> getWordConfidence() {
76+
public List<SpeechWordConfidence> getWordConfidence() {
7777
return wordConfidence;
7878
}
7979
}

speech-to-text/src/main/java/com/ibm/watson/speech_to_text/v1/model/SpeechRecognitionResult.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ public class SpeechRecognitionResult extends GenericModel {
2727
private Boolean finalResults;
2828
private List<SpeechRecognitionAlternative> alternatives;
2929
@SerializedName("keywords_result")
30-
private Map keywordsResult;
30+
private Map<String, List<KeywordResult>> keywordsResult;
3131
@SerializedName("word_alternatives")
3232
private List<WordAlternativeResults> wordAlternatives;
3333

@@ -65,7 +65,7 @@ public List<SpeechRecognitionAlternative> getAlternatives() {
6565
*
6666
* @return the keywordsResult
6767
*/
68-
public Map getKeywordsResult() {
68+
public Map<String, List<KeywordResult>> getKeywordsResult() {
6969
return keywordsResult;
7070
}
7171

0 commit comments

Comments (0)