diff --git a/lib/recognize-stream.ts b/lib/recognize-stream.ts index 091f91e50b..6d09d20882 100644 --- a/lib/recognize-stream.ts +++ b/lib/recognize-stream.ts @@ -106,6 +106,7 @@ class RecognizeStream extends Duplex { * @param {number} [options.speechDetectorSensitivity] - The sensitivity of speech activity detection that the service is to perform * @param {number} [options.backgroundAudioSuppression] - The level to which the service is to suppress background audio based on its volume to prevent it from being transcribed as speech * @param {boolean} [params.lowLatency] - If `true` for next-generation `Multimedia` and `Telephony` models that support low latency, directs the service to produce results even more quickly than it usually does + * @param {number} [params.sadModule] - Detects speech boundaries within the audio stream with better performance, improved noise suppression, faster responsiveness, and increased accuracy. * @constructor */ constructor(options: RecognizeStream.Options) { @@ -182,7 +183,8 @@ class RecognizeStream extends Duplex { 'split_transcript_at_phrase_end', 'speech_detector_sensitivity', 'background_audio_suppression', - 'low_latency' + 'low_latency', + 'sad_module' ]; const openingMessage = processUserParameters(options, openingMessageParamsAllowed); openingMessage.action = 'start'; diff --git a/speech-to-text/v1-generated.ts b/speech-to-text/v1-generated.ts index b0f214362a..0468864ceb 100644 --- a/speech-to-text/v1-generated.ts +++ b/speech-to-text/v1-generated.ts @@ -339,8 +339,9 @@ class SpeechToTextV1 extends BaseService { * @param {boolean} [params.speechBeginEvent] - If `true`, the service returns a response object `SpeechActivity` * which contains the time when a speech activity is detected in the stream. This can be used both in standard and low * latency mode. This feature enables client applications to know that some words/speech has been detected and the - * service is in the process of decoding. This can be used in lieu of interim results in standard mode. See [Using - * speech recognition + * service is in the process of decoding. This can be used in lieu of interim results in standard mode. Use + * `sad_module: 2` to increase accuracy and performance in detecting speech boundaries within the audio stream. See + * [Using speech recognition * parameters](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-service-features#features-parameters). * @param {string} [params.languageCustomizationId] - The customization ID (GUID) of a custom language model that is * to be used with the recognition request. The base model of the specified custom language model must match the model @@ -508,6 +509,13 @@ class SpeechToTextV1 extends BaseService { * sensitivity](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-parameters-sensitivity) * and [Language model * support](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-support). + * @param {number} [params.sadModule] - Detects speech boundaries within the audio stream with better performance, + * improved noise suppression, faster responsiveness, and increased accuracy. + * + * Specify `sad_module: 2` + * + * See [Speech Activity Detection + * (SAD)](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#sad). * @param {number} [params.backgroundAudioSuppression] - The level to which the service is to suppress background * audio based on its volume to prevent it from being transcribed as speech. Use the parameter to suppress side * conversations or background noise. @@ -561,7 +569,7 @@ class SpeechToTextV1 extends BaseService { ): Promise> { const _params = { ...params }; const _requiredParams = ['audio']; - const _validParams = ['audio', 'contentType', 'model', 'speechBeginEvent', 'languageCustomizationId', 'acousticCustomizationId', 'baseModelVersion', 'customizationWeight', 'inactivityTimeout', 'keywords', 'keywordsThreshold', 'maxAlternatives', 'wordAlternativesThreshold', 'wordConfidence', 'timestamps', 'profanityFilter', 'smartFormatting', 'smartFormattingVersion', 'speakerLabels', 'grammarName', 'redaction', 'audioMetrics', 'endOfPhraseSilenceTime', 'splitTranscriptAtPhraseEnd', 'speechDetectorSensitivity', 'backgroundAudioSuppression', 'lowLatency', 'characterInsertionBias', 'signal', 'headers']; + const _validParams = ['audio', 'contentType', 'model', 'speechBeginEvent', 'languageCustomizationId', 'acousticCustomizationId', 'baseModelVersion', 'customizationWeight', 'inactivityTimeout', 'keywords', 'keywordsThreshold', 'maxAlternatives', 'wordAlternativesThreshold', 'wordConfidence', 'timestamps', 'profanityFilter', 'smartFormatting', 'smartFormattingVersion', 'speakerLabels', 'grammarName', 'redaction', 'audioMetrics', 'endOfPhraseSilenceTime', 'splitTranscriptAtPhraseEnd', 'speechDetectorSensitivity', 'sadModule', 'backgroundAudioSuppression', 'lowLatency', 'characterInsertionBias', 'signal', 'headers']; const _validationErrors = validateParams(_params, _requiredParams, _validParams); if (_validationErrors) { return Promise.reject(_validationErrors); @@ -592,6 +600,7 @@ class SpeechToTextV1 extends BaseService { 'end_of_phrase_silence_time': _params.endOfPhraseSilenceTime, 'split_transcript_at_phrase_end': _params.splitTranscriptAtPhraseEnd, 'speech_detector_sensitivity': _params.speechDetectorSensitivity, + 'sad_module': _params.sadModule, 'background_audio_suppression': _params.backgroundAudioSuppression, 'low_latency': _params.lowLatency, 'character_insertion_bias': _params.characterInsertionBias, @@ -1116,6 +1125,13 @@ class SpeechToTextV1 extends BaseService { * sensitivity](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-parameters-sensitivity) * and [Language model * support](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-support). + * @param {number} [params.sadModule] - Detects speech boundaries within the audio stream with better performance, + * improved noise suppression, faster responsiveness, and increased accuracy. + * + * Specify `sad_module: 2` + * + * See [Speech Activity Detection + * (SAD)](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#sad). * @param {number} [params.backgroundAudioSuppression] - The level to which the service is to suppress background * audio based on its volume to prevent it from being transcribed as speech. Use the parameter to suppress side * conversations or background noise. @@ -1169,7 +1185,7 @@ class SpeechToTextV1 extends BaseService { ): Promise> { const _params = { ...params }; const _requiredParams = ['audio']; - const _validParams = ['audio', 'contentType', 'model', 'callbackUrl', 'events', 'userToken', 'resultsTtl', 'languageCustomizationId', 'acousticCustomizationId', 'baseModelVersion', 'customizationWeight', 'inactivityTimeout', 'keywords', 'keywordsThreshold', 'maxAlternatives', 'wordAlternativesThreshold', 'wordConfidence', 'timestamps', 'profanityFilter', 'smartFormatting', 'smartFormattingVersion', 'speakerLabels', 'grammarName', 'redaction', 'processingMetrics', 'processingMetricsInterval', 'audioMetrics', 'endOfPhraseSilenceTime', 'splitTranscriptAtPhraseEnd', 'speechDetectorSensitivity', 'backgroundAudioSuppression', 'lowLatency', 'characterInsertionBias', 'signal', 'headers']; + const _validParams = ['audio', 'contentType', 'model', 'callbackUrl', 'events', 'userToken', 'resultsTtl', 'languageCustomizationId', 'acousticCustomizationId', 'baseModelVersion', 'customizationWeight', 'inactivityTimeout', 'keywords', 'keywordsThreshold', 'maxAlternatives', 'wordAlternativesThreshold', 'wordConfidence', 'timestamps', 'profanityFilter', 'smartFormatting', 'smartFormattingVersion', 'speakerLabels', 'grammarName', 'redaction', 'processingMetrics', 'processingMetricsInterval', 'audioMetrics', 'endOfPhraseSilenceTime', 'splitTranscriptAtPhraseEnd', 'speechDetectorSensitivity', 'sadModule', 'backgroundAudioSuppression', 'lowLatency', 'characterInsertionBias', 'signal', 'headers']; const _validationErrors = validateParams(_params, _requiredParams, _validParams); if (_validationErrors) { return Promise.reject(_validationErrors); @@ -1205,6 +1221,7 @@ class SpeechToTextV1 extends BaseService { 'end_of_phrase_silence_time': _params.endOfPhraseSilenceTime, 'split_transcript_at_phrase_end': _params.splitTranscriptAtPhraseEnd, 'speech_detector_sensitivity': _params.speechDetectorSensitivity, + 'sad_module': _params.sadModule, 'background_audio_suppression': _params.backgroundAudioSuppression, 'low_latency': _params.lowLatency, 'character_insertion_bias': _params.characterInsertionBias, @@ -4334,7 +4351,8 @@ namespace SpeechToTextV1 { /** If `true`, the service returns a response object `SpeechActivity` which contains the time when a speech * activity is detected in the stream. This can be used both in standard and low latency mode. This feature enables * client applications to know that some words/speech has been detected and the service is in the process of - * decoding. This can be used in lieu of interim results in standard mode. See [Using speech recognition + * decoding. This can be used in lieu of interim results in standard mode. Use `sad_module: 2` to increase accuracy + * and performance in detecting speech boundaries within the audio stream. See [Using speech recognition * parameters](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-service-features#features-parameters). */ speechBeginEvent?: boolean; @@ -4541,6 +4559,15 @@ namespace SpeechToTextV1 { * support](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-support). */ speechDetectorSensitivity?: number; + /** Detects speech boundaries within the audio stream with better performance, improved noise suppression, + * faster responsiveness, and increased accuracy. + * + * Specify `sad_module: 2` + * + * See [Speech Activity Detection + * (SAD)](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#sad). + */ + sadModule?: number; /** The level to which the service is to suppress background audio based on its volume to prevent it from being * transcribed as speech. Use the parameter to suppress side conversations or background noise. * @@ -5009,6 +5036,15 @@ namespace SpeechToTextV1 { * support](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-support). */ speechDetectorSensitivity?: number; + /** Detects speech boundaries within the audio stream with better performance, improved noise suppression, + * faster responsiveness, and increased accuracy. + * + * Specify `sad_module: 2` + * + * See [Speech Activity Detection + * (SAD)](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#sad). + */ + sadModule?: number; /** The level to which the service is to suppress background audio based on its volume to prevent it from being * transcribed as speech. Use the parameter to suppress side conversations or background noise. * diff --git a/speech-to-text/v1.ts b/speech-to-text/v1.ts index 6fbc167a38..786606eb3b 100644 --- a/speech-to-text/v1.ts +++ b/speech-to-text/v1.ts @@ -286,6 +286,7 @@ namespace SpeechToTextV1 { speechDetectorSensitivity?: number; backgroundAudioSuppression?: number; characterInsertionBias?: number; + sadModule?: number; } } diff --git a/test/unit/speech-to-text.v1.test.js b/test/unit/speech-to-text.v1.test.js index 30eb469b9b..bfaee9ce02 100644 --- a/test/unit/speech-to-text.v1.test.js +++ b/test/unit/speech-to-text.v1.test.js @@ -1,5 +1,5 @@ /** - * (C) Copyright IBM Corp. 2018, 2024. + * (C) Copyright IBM Corp. 2025. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -295,6 +295,7 @@ describe('SpeechToTextV1', () => { const endOfPhraseSilenceTime = 0.8; const splitTranscriptAtPhraseEnd = false; const speechDetectorSensitivity = 0.5; + const sadModule = 1; const backgroundAudioSuppression = 0.0; const lowLatency = false; const characterInsertionBias = 0.0; @@ -324,6 +325,7 @@ describe('SpeechToTextV1', () => { endOfPhraseSilenceTime, splitTranscriptAtPhraseEnd, speechDetectorSensitivity, + sadModule, backgroundAudioSuppression, lowLatency, characterInsertionBias, @@ -368,6 +370,7 @@ describe('SpeechToTextV1', () => { expect(mockRequestOptions.qs.end_of_phrase_silence_time).toEqual(endOfPhraseSilenceTime); expect(mockRequestOptions.qs.split_transcript_at_phrase_end).toEqual(splitTranscriptAtPhraseEnd); expect(mockRequestOptions.qs.speech_detector_sensitivity).toEqual(speechDetectorSensitivity); + expect(mockRequestOptions.qs.sad_module).toEqual(sadModule); expect(mockRequestOptions.qs.background_audio_suppression).toEqual(backgroundAudioSuppression); expect(mockRequestOptions.qs.low_latency).toEqual(lowLatency); expect(mockRequestOptions.qs.character_insertion_bias).toEqual(characterInsertionBias); @@ -636,6 +639,7 @@ describe('SpeechToTextV1', () => { const endOfPhraseSilenceTime = 0.8; const splitTranscriptAtPhraseEnd = false; const speechDetectorSensitivity = 0.5; + const sadModule = 1; const backgroundAudioSuppression = 0.0; const lowLatency = false; const characterInsertionBias = 0.0; @@ -670,6 +674,7 @@ describe('SpeechToTextV1', () => { endOfPhraseSilenceTime, splitTranscriptAtPhraseEnd, speechDetectorSensitivity, + sadModule, backgroundAudioSuppression, lowLatency, characterInsertionBias, @@ -719,6 +724,7 @@ describe('SpeechToTextV1', () => { expect(mockRequestOptions.qs.end_of_phrase_silence_time).toEqual(endOfPhraseSilenceTime); expect(mockRequestOptions.qs.split_transcript_at_phrase_end).toEqual(splitTranscriptAtPhraseEnd); expect(mockRequestOptions.qs.speech_detector_sensitivity).toEqual(speechDetectorSensitivity); + expect(mockRequestOptions.qs.sad_module).toEqual(sadModule); expect(mockRequestOptions.qs.background_audio_suppression).toEqual(backgroundAudioSuppression); expect(mockRequestOptions.qs.low_latency).toEqual(lowLatency); expect(mockRequestOptions.qs.character_insertion_bias).toEqual(characterInsertionBias); diff --git a/text-to-speech/v1-generated.ts b/text-to-speech/v1-generated.ts index 6dc7dffb43..f4d01c11ad 100644 --- a/text-to-speech/v1-generated.ts +++ b/text-to-speech/v1-generated.ts @@ -1894,22 +1894,29 @@ namespace TextToSpeechV1 { DE_DE_ERIKAV3VOICE = 'de-DE_ErikaV3Voice', EN_AU_HEIDIEXPRESSIVE = 'en-AU_HeidiExpressive', EN_AU_JACKEXPRESSIVE = 'en-AU_JackExpressive', + EN_CA_HANNAHNATURAL = 'en-CA_HannahNatural', EN_GB_CHARLOTTEV3VOICE = 'en-GB_CharlotteV3Voice', + EN_GB_CHLOENATURAL = 'en-GB_ChloeNatural', EN_GB_GEORGEEXPRESSIVE = 'en-GB_GeorgeExpressive', EN_GB_JAMESV3VOICE = 'en-GB_JamesV3Voice', + EN_GB_GEORGENATURAL = 'en-GB_GeorgeNatural', EN_GB_KATEV3VOICE = 'en-GB_KateV3Voice', EN_US_ALLISONEXPRESSIVE = 'en-US_AllisonExpressive', EN_US_ALLISONV3VOICE = 'en-US_AllisonV3Voice', EN_US_ELLIENATURAL = 'en-US_EllieNatural', EN_US_EMILYV3VOICE = 'en-US_EmilyV3Voice', EN_US_EMMAEXPRESSIVE = 'en-US_EmmaExpressive', + EN_US_EMMANATURAL = 'en-US_EmmaNatural', + EN_US_ETHANNATURAL = 'en-US_EthanNatural', EN_US_HENRYV3VOICE = 'en-US_HenryV3Voice', + EN_US_JACKSONNATURAL = 'en-US_JacksonNatural', EN_US_KEVINV3VOICE = 'en-US_KevinV3Voice', EN_US_LISAEXPRESSIVE = 'en-US_LisaExpressive', EN_US_LISAV3VOICE = 'en-US_LisaV3Voice', EN_US_MICHAELEXPRESSIVE = 'en-US_MichaelExpressive', EN_US_MICHAELV3VOICE = 'en-US_MichaelV3Voice', EN_US_OLIVIAV3VOICE = 'en-US_OliviaV3Voice', + EN_US_VICTORIANATURAL = 'en-US_VictoriaNatural', ES_ES_ENRIQUEV3VOICE = 'es-ES_EnriqueV3Voice', ES_ES_LAURAV3VOICE = 'es-ES_LauraV3Voice', ES_LA_DANIELAEXPRESSIVE = 'es-LA_DanielaExpressive', @@ -1922,8 +1929,10 @@ namespace TextToSpeechV1 { JA_JP_EMIV3VOICE = 'ja-JP_EmiV3Voice', KO_KR_JINV3VOICE = 'ko-KR_JinV3Voice', NL_NL_MERELV3VOICE = 'nl-NL_MerelV3Voice', + PT_BR_CAMILANATURAL = 'pt-BR_CamilaNatural', PT_BR_ISABELAV3VOICE = 'pt-BR_IsabelaV3Voice', PT_BR_LUCASEXPRESSIVE = 'pt-BR_LucasExpressive', + PT_BR_LUCASNATURAL = 'pt-BR_LucasNatural', } } @@ -2024,22 +2033,29 @@ namespace TextToSpeechV1 { DE_DE_ERIKAV3VOICE = 'de-DE_ErikaV3Voice', EN_AU_HEIDIEXPRESSIVE = 'en-AU_HeidiExpressive', EN_AU_JACKEXPRESSIVE = 'en-AU_JackExpressive', + EN_CA_HANNAHNATURAL = 'en-CA_HannahNatural', EN_GB_CHARLOTTEV3VOICE = 'en-GB_CharlotteV3Voice', + EN_GB_CHLOENATURAL = 'en-GB_ChloeNatural', EN_GB_GEORGEEXPRESSIVE = 'en-GB_GeorgeExpressive', EN_GB_JAMESV3VOICE = 'en-GB_JamesV3Voice', + EN_GB_GEORGENATURAL = 'en-GB_GeorgeNatural', EN_GB_KATEV3VOICE = 'en-GB_KateV3Voice', EN_US_ALLISONEXPRESSIVE = 'en-US_AllisonExpressive', EN_US_ALLISONV3VOICE = 'en-US_AllisonV3Voice', EN_US_ELLIENATURAL = 'en-US_EllieNatural', EN_US_EMILYV3VOICE = 'en-US_EmilyV3Voice', EN_US_EMMAEXPRESSIVE = 'en-US_EmmaExpressive', + EN_US_EMMANATURAL = 'en-US_EmmaNatural', + EN_US_ETHANNATURAL = 'en-US_EthanNatural', EN_US_HENRYV3VOICE = 'en-US_HenryV3Voice', + EN_US_JACKSONNATURAL = 'en-US_JacksonNatural', EN_US_KEVINV3VOICE = 'en-US_KevinV3Voice', EN_US_LISAEXPRESSIVE = 'en-US_LisaExpressive', EN_US_LISAV3VOICE = 'en-US_LisaV3Voice', EN_US_MICHAELEXPRESSIVE = 'en-US_MichaelExpressive', EN_US_MICHAELV3VOICE = 'en-US_MichaelV3Voice', EN_US_OLIVIAV3VOICE = 'en-US_OliviaV3Voice', + EN_US_VICTORIANATURAL = 'en-US_VictoriaNatural', ES_ES_ENRIQUEV3VOICE = 'es-ES_EnriqueV3Voice', ES_ES_LAURAV3VOICE = 'es-ES_LauraV3Voice', ES_LA_DANIELAEXPRESSIVE = 'es-LA_DanielaExpressive', @@ -2052,8 +2068,10 @@ namespace TextToSpeechV1 { JA_JP_EMIV3VOICE = 'ja-JP_EmiV3Voice', KO_KR_JINV3VOICE = 'ko-KR_JinV3Voice', NL_NL_MERELV3VOICE = 'nl-NL_MerelV3Voice', + PT_BR_CAMILANATURAL = 'pt-BR_CamilaNatural', PT_BR_ISABELAV3VOICE = 'pt-BR_IsabelaV3Voice', PT_BR_LUCASEXPRESSIVE = 'pt-BR_LucasExpressive', + PT_BR_LUCASNATURAL = 'pt-BR_LucasNatural', } /** *For German voices,* indicates how the service is to spell out strings of individual letters. To indicate the pace of the spelling, specify one of the following values: * `default` - The service reads the characters at the rate at which it synthesizes speech for the request. You can also omit the parameter entirely to achieve the default behavior. * `singles` - The service reads the characters one at a time, with a brief pause between each character. * `pairs` - The service reads the characters two at a time, with a brief pause between each pair. * `triples` - The service reads the characters three at a time, with a brief pause between each triplet. For more information, see [Specifying how strings are spelled out](https://cloud.ibm.com/docs/text-to-speech?topic=text-to-speech-synthesis-params#params-spell-out-mode). */ export enum SpellOutMode { @@ -2101,22 +2119,29 @@ namespace TextToSpeechV1 { DE_DE_ERIKAV3VOICE = 'de-DE_ErikaV3Voice', EN_AU_HEIDIEXPRESSIVE = 'en-AU_HeidiExpressive', EN_AU_JACKEXPRESSIVE = 'en-AU_JackExpressive', + EN_CA_HANNAHNATURAL = 'en-CA_HannahNatural', EN_GB_CHARLOTTEV3VOICE = 'en-GB_CharlotteV3Voice', + EN_GB_CHLOENATURAL = 'en-GB_ChloeNatural', EN_GB_GEORGEEXPRESSIVE = 'en-GB_GeorgeExpressive', EN_GB_JAMESV3VOICE = 'en-GB_JamesV3Voice', + EN_GB_GEORGENATURAL = 'en-GB_GeorgeNatural', EN_GB_KATEV3VOICE = 'en-GB_KateV3Voice', EN_US_ALLISONEXPRESSIVE = 'en-US_AllisonExpressive', EN_US_ALLISONV3VOICE = 'en-US_AllisonV3Voice', EN_US_ELLIENATURAL = 'en-US_EllieNatural', EN_US_EMILYV3VOICE = 'en-US_EmilyV3Voice', EN_US_EMMAEXPRESSIVE = 'en-US_EmmaExpressive', + EN_US_EMMANATURAL = 'en-US_EmmaNatural', + EN_US_ETHANNATURAL = 'en-US_EthanNatural', EN_US_HENRYV3VOICE = 'en-US_HenryV3Voice', + EN_US_JACKSONNATURAL = 'en-US_JacksonNatural', EN_US_KEVINV3VOICE = 'en-US_KevinV3Voice', EN_US_LISAEXPRESSIVE = 'en-US_LisaExpressive', EN_US_LISAV3VOICE = 'en-US_LisaV3Voice', EN_US_MICHAELEXPRESSIVE = 'en-US_MichaelExpressive', EN_US_MICHAELV3VOICE = 'en-US_MichaelV3Voice', EN_US_OLIVIAV3VOICE = 'en-US_OliviaV3Voice', + EN_US_VICTORIANATURAL = 'en-US_VictoriaNatural', ES_ES_ENRIQUEV3VOICE = 'es-ES_EnriqueV3Voice', ES_ES_LAURAV3VOICE = 'es-ES_LauraV3Voice', ES_LA_DANIELAEXPRESSIVE = 'es-LA_DanielaExpressive', @@ -2129,8 +2154,10 @@ namespace TextToSpeechV1 { JA_JP_EMIV3VOICE = 'ja-JP_EmiV3Voice', KO_KR_JINV3VOICE = 'ko-KR_JinV3Voice', NL_NL_MERELV3VOICE = 'nl-NL_MerelV3Voice', + PT_BR_CAMILANATURAL = 'pt-BR_CamilaNatural', PT_BR_ISABELAV3VOICE = 'pt-BR_IsabelaV3Voice', PT_BR_LUCASEXPRESSIVE = 'pt-BR_LucasExpressive', + PT_BR_LUCASNATURAL = 'pt-BR_LucasNatural', } /** The phoneme format in which to return the pronunciation. Omit the parameter to obtain the pronunciation in the default format. */ export enum Format {