
Commit bea493c (1 parent: a1f251f)

feat(STT): add support for end_of_phrase_silence_time and split_transcript_at_phrase_end parameters

1 file changed

lib/ibm_watson/speech_to_text_v1.rb

Lines changed: 120 additions & 16 deletions
@@ -1,6 +1,6 @@
 # frozen_string_literal: true
 
-# (C) Copyright IBM Corp. 2019.
+# (C) Copyright IBM Corp. 2020.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -135,7 +135,7 @@ def get_model(model_id:)
 #########################
 
 ##
-# @!method recognize(audio:, content_type: nil, model: nil, language_customization_id: nil, acoustic_customization_id: nil, base_model_version: nil, customization_weight: nil, inactivity_timeout: nil, keywords: nil, keywords_threshold: nil, max_alternatives: nil, word_alternatives_threshold: nil, word_confidence: nil, timestamps: nil, profanity_filter: nil, smart_formatting: nil, speaker_labels: nil, customization_id: nil, grammar_name: nil, redaction: nil, audio_metrics: nil)
+# @!method recognize(audio:, content_type: nil, model: nil, language_customization_id: nil, acoustic_customization_id: nil, base_model_version: nil, customization_weight: nil, inactivity_timeout: nil, keywords: nil, keywords_threshold: nil, max_alternatives: nil, word_alternatives_threshold: nil, word_confidence: nil, timestamps: nil, profanity_filter: nil, smart_formatting: nil, speaker_labels: nil, customization_id: nil, grammar_name: nil, redaction: nil, audio_metrics: nil, end_of_phrase_silence_time: nil, split_transcript_at_phrase_end: nil)
 # Recognize audio.
 # Sends audio and returns transcription results for a recognition request. You can
 # pass a maximum of 100 MB and a minimum of 100 bytes of audio with a request. The
@@ -358,8 +358,38 @@ def get_model(model_id:)
 # @param audio_metrics [Boolean] If `true`, requests detailed information about the signal characteristics of the
 # input audio. The service returns audio metrics with the final transcription
 # results. By default, the service returns no audio metrics.
+#
+# See [Audio
+# metrics](https://cloud.ibm.com/docs/services/speech-to-text?topic=speech-to-text-metrics#audio_metrics).
+# @param end_of_phrase_silence_time [Float] Specifies the duration of the pause interval at which the service
+# splits a transcript into multiple final results. If the service detects pauses or
+# extended silence before it reaches the end of the audio stream, its response can
+# include multiple final results. Silence indicates a point at which the speaker
+# pauses between spoken words or phrases.
+#
+# Specify a value for the pause interval in the range of 0.0 to 120.0.
+# * A value greater than 0 specifies the interval that the service is to use for
+# speech recognition.
+# * A value of 0 indicates that the service is to use the default interval. It is
+# equivalent to omitting the parameter.
+#
+# The default pause interval for most languages is 0.8 seconds; the default for
+# Chinese is 0.6 seconds.
+#
+# See [End of phrase silence
+# time](https://cloud.ibm.com/docs/services/speech-to-text?topic=speech-to-text-output#silence_time).
+# @param split_transcript_at_phrase_end [Boolean] If `true`, directs the service to split the transcript into multiple final results
+# based on semantic features of the input, for example, at the conclusion of
+# meaningful phrases such as sentences. The service bases its understanding of
+# semantic features on the base language model that you use with a request. Custom
+# language models and grammars can also influence how and where the service splits a
+# transcript. By default, the service splits transcripts based solely on the pause
+# interval.
+#
+# See [Split transcript at phrase
+# end](https://cloud.ibm.com/docs/services/speech-to-text?topic=speech-to-text-output#split_transcript).
 # @return [IBMCloudSdkCore::DetailedResponse] A `IBMCloudSdkCore::DetailedResponse` object representing the response.
-def recognize(audio:, content_type: nil, model: nil, language_customization_id: nil, acoustic_customization_id: nil, base_model_version: nil, customization_weight: nil, inactivity_timeout: nil, keywords: nil, keywords_threshold: nil, max_alternatives: nil, word_alternatives_threshold: nil, word_confidence: nil, timestamps: nil, profanity_filter: nil, smart_formatting: nil, speaker_labels: nil, customization_id: nil, grammar_name: nil, redaction: nil, audio_metrics: nil)
+def recognize(audio:, content_type: nil, model: nil, language_customization_id: nil, acoustic_customization_id: nil, base_model_version: nil, customization_weight: nil, inactivity_timeout: nil, keywords: nil, keywords_threshold: nil, max_alternatives: nil, word_alternatives_threshold: nil, word_confidence: nil, timestamps: nil, profanity_filter: nil, smart_formatting: nil, speaker_labels: nil, customization_id: nil, grammar_name: nil, redaction: nil, audio_metrics: nil, end_of_phrase_silence_time: nil, split_transcript_at_phrase_end: nil)
 raise ArgumentError.new("audio must be provided") if audio.nil?
 
 headers = {
@@ -388,7 +418,9 @@ def recognize(audio:, content_type: nil, model: nil, language_customization_id:
 "customization_id" => customization_id,
 "grammar_name" => grammar_name,
 "redaction" => redaction,
-"audio_metrics" => audio_metrics
+"audio_metrics" => audio_metrics,
+"end_of_phrase_silence_time" => end_of_phrase_silence_time,
+"split_transcript_at_phrase_end" => split_transcript_at_phrase_end
 }
 
 data = audio
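The two hunks above wire the new parameters through the synchronous `recognize` method. A minimal sketch of how a caller might use them; the apikey and audio file name are placeholders, not part of this commit:

require "ibm_watson/authenticators"
require "ibm_watson/speech_to_text_v1"

# Placeholder credentials; substitute your own.
authenticator = IBMWatson::Authenticators::IamAuthenticator.new(apikey: "{apikey}")
speech_to_text = IBMWatson::SpeechToTextV1.new(authenticator: authenticator)

File.open("audio-file.flac") do |audio_file|
  response = speech_to_text.recognize(
    audio: audio_file,
    content_type: "audio/flac",
    end_of_phrase_silence_time: 0.2,     # split on pauses as short as 0.2 seconds
    split_transcript_at_phrase_end: true # also split at semantic phrase ends
  )
  puts response.result["results"]
end

With both options set, the response tends to contain more, shorter final results than the default 0.8-second pause interval would produce.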
@@ -407,7 +439,7 @@ def recognize(audio:, content_type: nil, model: nil, language_customization_id:
 end
 
 ##
-# @!method recognize_using_websocket(content_type: nil,recognize_callback:,audio: nil,chunk_data: false,model: nil,customization_id: nil,acoustic_customization_id: nil,customization_weight: nil,base_model_version: nil,inactivity_timeout: nil,interim_results: nil,keywords: nil,keywords_threshold: nil,max_alternatives: nil,word_alternatives_threshold: nil,word_confidence: nil,timestamps: nil,profanity_filter: nil,smart_formatting: nil,speaker_labels: nil)
+# @!method recognize_using_websocket(content_type: nil,recognize_callback:,audio: nil,chunk_data: false,model: nil,customization_id: nil,acoustic_customization_id: nil,customization_weight: nil,base_model_version: nil,inactivity_timeout: nil,interim_results: nil,keywords: nil,keywords_threshold: nil,max_alternatives: nil,word_alternatives_threshold: nil,word_confidence: nil,timestamps: nil,profanity_filter: nil,smart_formatting: nil,speaker_labels: nil, end_of_phrase_silence_time: nil, split_transcript_at_phrase_end: nil)
 # Sends audio for speech recognition using web sockets.
 # @param content_type [String] The type of the input: audio/basic, audio/flac, audio/l16, audio/mp3, audio/mpeg, audio/mulaw, audio/ogg, audio/ogg;codecs=opus, audio/ogg;codecs=vorbis, audio/wav, audio/webm, audio/webm;codecs=opus, audio/webm;codecs=vorbis, or multipart/form-data.
 # @param recognize_callback [RecognizeCallback] The instance handling events returned from the service.
@@ -469,6 +501,36 @@ def recognize(audio:, content_type: nil, model: nil, language_customization_id:
 # input audio. The service returns audio metrics with the final transcription
 # results. By default, the service returns no audio metrics.
 # @return [WebSocketClient] Returns a new WebSocketClient object
+#
+# See [Audio
+# metrics](https://cloud.ibm.com/docs/services/speech-to-text?topic=speech-to-text-metrics#audio_metrics).
+# @param end_of_phrase_silence_time [Float] Specifies the duration of the pause interval at which the service
+# splits a transcript into multiple final results. If the service detects pauses or
+# extended silence before it reaches the end of the audio stream, its response can
+# include multiple final results. Silence indicates a point at which the speaker
+# pauses between spoken words or phrases.
+#
+# Specify a value for the pause interval in the range of 0.0 to 120.0.
+# * A value greater than 0 specifies the interval that the service is to use for
+# speech recognition.
+# * A value of 0 indicates that the service is to use the default interval. It is
+# equivalent to omitting the parameter.
+#
+# The default pause interval for most languages is 0.8 seconds; the default for
+# Chinese is 0.6 seconds.
+#
+# See [End of phrase silence
+# time](https://cloud.ibm.com/docs/services/speech-to-text?topic=speech-to-text-output#silence_time).
+# @param split_transcript_at_phrase_end [Boolean] If `true`, directs the service to split the transcript into multiple final results
+# based on semantic features of the input, for example, at the conclusion of
+# meaningful phrases such as sentences. The service bases its understanding of
+# semantic features on the base language model that you use with a request. Custom
+# language models and grammars can also influence how and where the service splits a
+# transcript. By default, the service splits transcripts based solely on the pause
+# interval.
+#
+# See [Split transcript at phrase
+# end](https://cloud.ibm.com/docs/services/speech-to-text?topic=speech-to-text-output#split_transcript).
 def recognize_using_websocket(
 content_type: nil,
 recognize_callback:,
@@ -495,7 +557,9 @@ def recognize_using_websocket(
 redaction: nil,
 processing_metrics: nil,
 processing_metrics_interval: nil,
-audio_metrics: nil
+audio_metrics: nil,
+end_of_phrase_silence_time: nil,
+split_transcript_at_phrase_end: nil
 )
 raise ArgumentError("Audio must be provided") if audio.nil? && !chunk_data
 raise ArgumentError("Recognize callback must be provided") if recognize_callback.nil?
@@ -532,7 +596,9 @@ def recognize_using_websocket(
 "redaction" => redaction,
 "processing_metrics" => processing_metrics,
 "processing_metrics_interval" => processing_metrics_interval,
-"audio_metrics" => audio_metrics
+"audio_metrics" => audio_metrics,
+"end_of_phrase_silence_time" => end_of_phrase_silence_time,
+"split_transcript_at_phrase_end" => split_transcript_at_phrase_end
 }
 options.delete_if { |_, v| v.nil? }
 WebSocketClient.new(audio: audio, chunk_data: chunk_data, options: options, recognize_callback: recognize_callback, service_url: service_url, headers: headers, disable_ssl_verification: @disable_ssl_verification)
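The same two options flow into the websocket path. A sketch of how they might be exercised there, reusing the `speech_to_text` client from the earlier sketch; `MyRecognizeCallback` is a hypothetical handler, not part of this commit:

require "ibm_watson/speech_to_text_v1"

# Hypothetical handler; IBMWatson::RecognizeCallback defines the event hooks.
class MyRecognizeCallback < IBMWatson::RecognizeCallback
  def on_transcription(transcription)
    puts transcription # one entry per final result; the new options produce more splits
  end

  def on_error(error)
    warn "Error: #{error}"
  end
end

File.open("audio-file.flac") do |audio_file|
  ws_client = speech_to_text.recognize_using_websocket(
    audio: audio_file,
    content_type: "audio/flac",
    recognize_callback: MyRecognizeCallback.new,
    end_of_phrase_silence_time: 0.2,
    split_transcript_at_phrase_end: true
  )
  ws_client.start # runs the websocket session until the service closes it
end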
@@ -650,7 +716,7 @@ def unregister_callback(callback_url:)
 end
 
 ##
-# @!method create_job(audio:, content_type: nil, model: nil, callback_url: nil, events: nil, user_token: nil, results_ttl: nil, language_customization_id: nil, acoustic_customization_id: nil, base_model_version: nil, customization_weight: nil, inactivity_timeout: nil, keywords: nil, keywords_threshold: nil, max_alternatives: nil, word_alternatives_threshold: nil, word_confidence: nil, timestamps: nil, profanity_filter: nil, smart_formatting: nil, speaker_labels: nil, customization_id: nil, grammar_name: nil, redaction: nil, processing_metrics: nil, processing_metrics_interval: nil, audio_metrics: nil)
+# @!method create_job(audio:, content_type: nil, model: nil, callback_url: nil, events: nil, user_token: nil, results_ttl: nil, language_customization_id: nil, acoustic_customization_id: nil, base_model_version: nil, customization_weight: nil, inactivity_timeout: nil, keywords: nil, keywords_threshold: nil, max_alternatives: nil, word_alternatives_threshold: nil, word_confidence: nil, timestamps: nil, profanity_filter: nil, smart_formatting: nil, speaker_labels: nil, customization_id: nil, grammar_name: nil, redaction: nil, processing_metrics: nil, processing_metrics_interval: nil, audio_metrics: nil, end_of_phrase_silence_time: nil, split_transcript_at_phrase_end: nil)
 # Create a job.
 # Creates a job for a new asynchronous recognition request. The job is owned by the
 # instance of the service whose credentials are used to create it. How you learn the
@@ -919,6 +985,9 @@ def unregister_callback(callback_url:)
 # the `processing_metrics_interval` parameter. It also returns processing metrics
 # for transcription events, for example, for final and interim results. By default,
 # the service returns no processing metrics.
+#
+# See [Processing
+# metrics](https://cloud.ibm.com/docs/services/speech-to-text?topic=speech-to-text-metrics#processing_metrics).
 # @param processing_metrics_interval [Float] Specifies the interval in real wall-clock seconds at which the service is to
 # return processing metrics. The parameter is ignored unless the
 # `processing_metrics` parameter is set to `true`.
@@ -930,11 +999,44 @@ def unregister_callback(callback_url:)
 # metrics only for transcription events instead of at periodic intervals, set the
 # value to a large number. If the value is larger than the duration of the audio,
 # the service returns processing metrics only for transcription events.
+#
+# See [Processing
+# metrics](https://cloud.ibm.com/docs/services/speech-to-text?topic=speech-to-text-metrics#processing_metrics).
 # @param audio_metrics [Boolean] If `true`, requests detailed information about the signal characteristics of the
 # input audio. The service returns audio metrics with the final transcription
 # results. By default, the service returns no audio metrics.
+#
+# See [Audio
+# metrics](https://cloud.ibm.com/docs/services/speech-to-text?topic=speech-to-text-metrics#audio_metrics).
+# @param end_of_phrase_silence_time [Float] Specifies the duration of the pause interval at which the service
+# splits a transcript into multiple final results. If the service detects pauses or
+# extended silence before it reaches the end of the audio stream, its response can
+# include multiple final results. Silence indicates a point at which the speaker
+# pauses between spoken words or phrases.
+#
+# Specify a value for the pause interval in the range of 0.0 to 120.0.
+# * A value greater than 0 specifies the interval that the service is to use for
+# speech recognition.
+# * A value of 0 indicates that the service is to use the default interval. It is
+# equivalent to omitting the parameter.
+#
+# The default pause interval for most languages is 0.8 seconds; the default for
+# Chinese is 0.6 seconds.
+#
+# See [End of phrase silence
+# time](https://cloud.ibm.com/docs/services/speech-to-text?topic=speech-to-text-output#silence_time).
+# @param split_transcript_at_phrase_end [Boolean] If `true`, directs the service to split the transcript into multiple final results
+# based on semantic features of the input, for example, at the conclusion of
+# meaningful phrases such as sentences. The service bases its understanding of
+# semantic features on the base language model that you use with a request. Custom
+# language models and grammars can also influence how and where the service splits a
+# transcript. By default, the service splits transcripts based solely on the pause
+# interval.
+#
+# See [Split transcript at phrase
+# end](https://cloud.ibm.com/docs/services/speech-to-text?topic=speech-to-text-output#split_transcript).
 # @return [IBMCloudSdkCore::DetailedResponse] A `IBMCloudSdkCore::DetailedResponse` object representing the response.
-def create_job(audio:, content_type: nil, model: nil, callback_url: nil, events: nil, user_token: nil, results_ttl: nil, language_customization_id: nil, acoustic_customization_id: nil, base_model_version: nil, customization_weight: nil, inactivity_timeout: nil, keywords: nil, keywords_threshold: nil, max_alternatives: nil, word_alternatives_threshold: nil, word_confidence: nil, timestamps: nil, profanity_filter: nil, smart_formatting: nil, speaker_labels: nil, customization_id: nil, grammar_name: nil, redaction: nil, processing_metrics: nil, processing_metrics_interval: nil, audio_metrics: nil)
+def create_job(audio:, content_type: nil, model: nil, callback_url: nil, events: nil, user_token: nil, results_ttl: nil, language_customization_id: nil, acoustic_customization_id: nil, base_model_version: nil, customization_weight: nil, inactivity_timeout: nil, keywords: nil, keywords_threshold: nil, max_alternatives: nil, word_alternatives_threshold: nil, word_confidence: nil, timestamps: nil, profanity_filter: nil, smart_formatting: nil, speaker_labels: nil, customization_id: nil, grammar_name: nil, redaction: nil, processing_metrics: nil, processing_metrics_interval: nil, audio_metrics: nil, end_of_phrase_silence_time: nil, split_transcript_at_phrase_end: nil)
 raise ArgumentError.new("audio must be provided") if audio.nil?
 
 headers = {
@@ -969,7 +1071,9 @@ def create_job(audio:, content_type: nil, model: nil, callback_url: nil, events:
 "redaction" => redaction,
 "processing_metrics" => processing_metrics,
 "processing_metrics_interval" => processing_metrics_interval,
-"audio_metrics" => audio_metrics
+"audio_metrics" => audio_metrics,
+"end_of_phrase_silence_time" => end_of_phrase_silence_time,
+"split_transcript_at_phrase_end" => split_transcript_at_phrase_end
 }
 
 data = audio
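Finally, the asynchronous path. A sketch under the same assumptions (the `speech_to_text` client from the first sketch; file name and poll interval are placeholders), using simple polling via `check_job` to collect the results:

File.open("audio-file.flac") do |audio_file|
  job = speech_to_text.create_job(
    audio: audio_file,
    content_type: "audio/flac",
    end_of_phrase_silence_time: 1.5,     # tolerate longer pauses before splitting
    split_transcript_at_phrase_end: true
  ).result

  # Poll for completion; registering a callback_url avoids polling in production.
  loop do
    status = speech_to_text.check_job(id: job["id"]).result
    if status["status"] == "completed"
      puts status["results"]
      break
    end
    sleep 5
  end
end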
@@ -1104,9 +1208,9 @@ def delete_job(id:)
 # model is owned by the instance of the service whose credentials are used to create
 # it.
 #
-# You can create a maximum of 1024 custom language models, per credential. The
-# service returns an error if you attempt to create more than 1024 models. You do
-# not lose any models, but you cannot create any more until your model count is
+# You can create a maximum of 1024 custom language models per owning credentials.
+# The service returns an error if you attempt to create more than 1024 models. You
+# do not lose any models, but you cannot create any more until your model count is
 # below the limit.
 #
 # **See also:** [Create a custom language
@@ -2230,9 +2334,9 @@ def delete_grammar(customization_id:, grammar_name:)
 # model is owned by the instance of the service whose credentials are used to create
 # it.
 #
-# You can create a maximum of 1024 custom acoustic models, per credential. The
-# service returns an error if you attempt to create more than 1024 models. You do
-# not lose any models, but you cannot create any more until your model count is
+# You can create a maximum of 1024 custom acoustic models per owning credentials.
+# The service returns an error if you attempt to create more than 1024 models. You
+# do not lose any models, but you cannot create any more until your model count is
 # below the limit.
 #
 # **See also:** [Create a custom acoustic
