Commit 330aa60

rmittal-github, virajkarandikar, and sarane22 authored
Merge release/2.16.0 to main (#84)
* add list voices support to tts client (#78)

  add --list-voices parameter to tts client to query supported voices

* Add ASR endpointing stop_threshold_eou parameter (#83)

  * Exposing the 'stop_historu_eou_th' parameter
  * updating submodule
  * Updating param name
  * Updating help for VAD param
  * Adding check for stop_threshold_eou
  * Updating proto branch
  * updating the submodule

---------

Co-authored-by: Viraj Karandikar <[email protected]>
Co-authored-by: sarane22 <[email protected]>
1 parent 18d6f8f commit 330aa60
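The two changes surface as new client flags: scripts/tts/talk.py gains --list-voices to print the voices the server exposes, and the ASR scripts gain --stop-threshold-eou to tune first-pass end-of-utterance detection. Illustrative invocations only (they assume a Riva server at the default localhost:50051, and the 0.85 threshold is an arbitrary example value, not a recommendation):

    python scripts/tts/talk.py --list-voices
    python scripts/asr/transcribe_mic.py --stop-threshold-eou 0.85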

File tree

8 files changed (+73, -39 lines)


riva/client/argparse_utils.py

Lines changed: 11 additions & 5 deletions
@@ -52,7 +52,7 @@ def add_asr_config_argparse_parameters(
         "--start-history",
         default=-1,
         type=int,
-        help="Value to detect and initiate start of speech utterance",
+        help="Value (in milliseconds) to detect and initiate start of speech utterance",
     )
     parser.add_argument(
         "--start-threshold",
@@ -64,19 +64,25 @@
         "--stop-history",
         default=-1,
         type=int,
-        help="Value to reset the endpoint detection history",
+        help="Value (in milliseconds) to detect end of utterance and reset decoder",
+    )
+    parser.add_argument(
+        "--stop-threshold",
+        default=-1.0,
+        type=float,
+        help="Threshold value for detecting the end of speech utterance",
     )
     parser.add_argument(
         "--stop-history-eou",
         default=-1,
         type=int,
-        help="Value to determine the response history for endpoint detection",
+        help="Value (in milliseconds) to detect end of utterance for the 1st pass and generate an intermediate final transcript",
     )
     parser.add_argument(
-        "--stop-threshold",
+        "--stop-threshold-eou",
         default=-1.0,
         type=float,
-        help="Threshold value for detecting the end of speech utterance",
+        help="Threshold value for likelihood of blanks before detecting end of utterance",
     )
     return parser
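To see what the renamed flag looks like to a caller, here is a bare-parser sketch that mirrors the add_argument call above (illustration only, not code from this commit; the 0.85 value is arbitrary):

import argparse

# Mirror the --stop-threshold-eou argument registered by add_asr_config_argparse_parameters.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--stop-threshold-eou",
    default=-1.0,
    type=float,
    help="Threshold value for likelihood of blanks before detecting end of utterance",
)

args = parser.parse_args(["--stop-threshold-eou", "0.85"])
print(args.stop_threshold_eou)  # 0.85 -- argparse maps the dashed flag to this attribute name

The updated scripts below rely on exactly this flag-to-attribute mapping when they pass args.stop_threshold_eou through to the config helper.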

riva/client/asr.py

Lines changed: 4 additions & 1 deletion
@@ -130,8 +130,9 @@ def add_endpoint_parameters_to_config(
     stop_history: int,
     stop_history_eou: int,
     stop_threshold: float,
+    stop_threshold_eou: float,
 ) -> None:
-    if not (start_history > 0 or start_threshold > 0 or stop_history > 0 or stop_history_eou > 0 or stop_threshold > 0):
+    if not (start_history > 0 or start_threshold > 0 or stop_history > 0 or stop_history_eou > 0 or stop_threshold > 0 or stop_threshold_eou > 0):
         return

     inner_config: rasr.RecognitionConfig = config if isinstance(config, rasr.RecognitionConfig) else config.config
@@ -146,6 +147,8 @@
         endpointing_config.stop_history_eou = stop_history_eou
     if stop_threshold > 0:
         endpointing_config.stop_threshold = stop_threshold
+    if stop_threshold_eou > 0:
+        endpointing_config.stop_threshold_eou = stop_threshold_eou
     inner_config.endpointing_config.CopyFrom(endpointing_config)
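For callers that build configs in code rather than through the scripts, a minimal sketch of driving the updated helper directly (illustration only, not part of this commit; values <= 0 are skipped by the guard above, 0.85 is an arbitrary example, and the stop_threshold_eou proto field assumes the riva proto version pulled in by this release's submodule update):

import riva.client

config = riva.client.RecognitionConfig(language_code="en-US")
riva.client.add_endpoint_parameters_to_config(
    config,
    start_history=-1,         # <= 0: left unset
    start_threshold=-1.0,
    stop_history=-1,
    stop_history_eou=-1,
    stop_threshold=-1.0,
    stop_threshold_eou=0.85,  # the newly exposed first-pass EOU threshold
)
print(config.endpointing_config.stop_threshold_eou)  # 0.85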

scripts/asr/riva_streaming_asr_client.py

Lines changed: 7 additions & 6 deletions
@@ -64,12 +64,13 @@ def streaming_transcription_worker(
         interim_results=True,
     )
     riva.client.add_endpoint_parameters_to_config(
-        config,
-        args.start_history,
-        args.start_threshold,
-        args.stop_history,
-        args.stop_history_eou,
-        args.stop_threshold
+        config,
+        args.start_history,
+        args.start_threshold,
+        args.stop_history,
+        args.stop_history_eou,
+        args.stop_threshold,
+        args.stop_threshold_eou
     )
     riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score)
     for _ in range(args.num_iterations):

scripts/asr/transcribe_file.py

Lines changed: 7 additions & 6 deletions
@@ -80,12 +80,13 @@ def main() -> None:
     )
     riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score)
     riva.client.add_endpoint_parameters_to_config(
-        config,
-        args.start_history,
-        args.start_threshold,
-        args.stop_history,
-        args.stop_history_eou,
-        args.stop_threshold
+        config,
+        args.start_history,
+        args.start_threshold,
+        args.stop_history,
+        args.stop_history_eou,
+        args.stop_threshold,
+        args.stop_threshold_eou
     )
     sound_callback = None
     try:

scripts/asr/transcribe_file_offline.py

Lines changed: 7 additions & 6 deletions
@@ -39,12 +39,13 @@ def main() -> None:
     riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score)
     riva.client.add_speaker_diarization_to_config(config, args.speaker_diarization)
     riva.client.add_endpoint_parameters_to_config(
-        config,
-        args.start_history,
-        args.start_threshold,
-        args.stop_history,
-        args.stop_history_eou,
-        args.stop_threshold
+        config,
+        args.start_history,
+        args.start_threshold,
+        args.stop_history,
+        args.stop_history_eou,
+        args.stop_threshold,
+        args.stop_threshold_eou
     )
     with args.input_file.open('rb') as fh:
         data = fh.read()

scripts/asr/transcribe_mic.py

Lines changed: 7 additions & 6 deletions
@@ -58,12 +58,13 @@ def main() -> None:
     )
     riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score)
     riva.client.add_endpoint_parameters_to_config(
-        config,
-        args.start_history,
-        args.start_threshold,
-        args.stop_history,
-        args.stop_history_eou,
-        args.stop_threshold
+        config,
+        args.start_history,
+        args.start_threshold,
+        args.stop_history,
+        args.stop_history_eou,
+        args.stop_threshold,
+        args.stop_threshold_eou
     )
     with riva.client.audio_io.MicrophoneStream(
         args.sample_rate_hz,

scripts/tts/talk.py

Lines changed: 29 additions & 8 deletions
@@ -4,6 +4,7 @@
 import argparse
 import time
 import wave
+import json
 from pathlib import Path

 import riva.client
@@ -21,12 +22,12 @@ def parse_args() -> argparse.Namespace:
         help="A voice name to use. If this parameter is missing, then the server will try a first available model "
         "based on parameter `--language-code`.",
     )
-    parser.add_argument("--text", type=str, required=True, help="Text input to synthesize.")
+    parser.add_argument("--text", type=str, required=False, help="Text input to synthesize.")
     parser.add_argument(
         "--audio_prompt_file",
         type=Path,
         help="An input audio prompt (.wav) file for zero shot model. This is required to do zero shot inferencing.")
-    parser.add_argument("-o", "--output", type=Path, help="Output file .wav file to write synthesized audio.")
+    parser.add_argument("-o", "--output", type=Path, default="output.wav", help="Output file .wav file to write synthesized audio.")
     parser.add_argument("--quality", type=int, help="Number of times decoder should be run on the output audio. A higher number improves quality of the produced output but introduces latencies.")
     parser.add_argument(
         "--play-audio",
@@ -35,6 +36,7 @@ def parse_args() -> argparse.Namespace:
         "then the default output audio device will be used.",
     )
     parser.add_argument("--list-devices", action="store_true", help="List output audio devices indices.")
+    parser.add_argument("--list-voices", action="store_true", help="List available voices.")
     parser.add_argument("--output-device", type=int, help="Output device to use.")
     parser.add_argument("--language-code", default='en-US', help="A language of input text.")
     parser.add_argument(
@@ -49,11 +51,6 @@ def parse_args() -> argparse.Namespace:
     )
     parser = add_connection_argparse_parameters(parser)
     args = parser.parse_args()
-    if args.output is None and not args.play_audio and args.output_device is None and not args.list_devices:
-        parser.error(
-            f"You have to provide at least one of arguments: `--play-audio`, `--output-device`, `--output`, "
-            f"`--list-devices`."
-        )
     if args.output is not None:
         args.output = args.output.expanduser()
     if args.list_devices or args.output_device or args.play_audio:
@@ -65,12 +62,36 @@ def main() -> None:
     args = parse_args()
     if args.list_devices:
         riva.client.audio_io.list_output_devices()
-        return
+
     auth = riva.client.Auth(args.ssl_cert, args.use_ssl, args.server, args.metadata)
     service = riva.client.SpeechSynthesisService(auth)
     nchannels = 1
     sampwidth = 2
     sound_stream, out_f = None, None
+
+    if args.list_voices:
+        config_response = service.stub.GetRivaSynthesisConfig(
+            riva.client.proto.riva_tts_pb2.RivaSynthesisConfigRequest()
+        )
+        tts_models = dict()
+        for model_config in config_response.model_config:
+            language_code = model_config.parameters['language_code']
+            voice_name = model_config.parameters['voice_name']
+            subvoices = [voice.split(':')[0] for voice in model_config.parameters['subvoices'].split(',')]
+            full_voice_names = [voice_name + "." + subvoice for subvoice in subvoices]
+
+            if language_code in tts_models:
+                tts_models[language_code]['voices'].extend(full_voice_names)
+            else:
+                tts_models[language_code] = {"voices": full_voice_names}
+
+        tts_models = dict(sorted(tts_models.items()))
+        print(json.dumps(tts_models, indent=4))
+
+    if not args.text:
+        print("No input text provided")
+        return
+
     try:
         if args.output_device is not None or args.play_audio:
             sound_stream = riva.client.audio_io.SoundCallBack(
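The new --list-voices path boils down to a single RPC. A minimal sketch of issuing the same GetRivaSynthesisConfig call outside talk.py (illustration only; the Auth arguments mirror the positional order talk.py uses, and the server address is an assumed default):

import riva.client
import riva.client.proto.riva_tts_pb2 as riva_tts

# ssl_cert, use_ssl, server -- same positional order as in talk.py's main(); metadata left at its default.
auth = riva.client.Auth(None, False, "localhost:50051")
service = riva.client.SpeechSynthesisService(auth)

response = service.stub.GetRivaSynthesisConfig(riva_tts.RivaSynthesisConfigRequest())
for model_config in response.model_config:
    params = model_config.parameters
    print(params['language_code'], params['voice_name'], params['subvoices'])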
