Add ASR endpointing stop_threshold_eou parameter (#83)

sarane22 · web-flow · commit 0a7501583be9 · 2024-06-28T01:23:31.000+05:30
* Exposing the 'stop_history_eou_th' parameter

* updating submodule

* Updating param name

* Updating help for VAD param

* Adding check for stop_threshold_eou

* Updating proto branch

* updating the submodule
diff --git a/.gitmodules b/.gitmodules
@@ -1,4 +1,4 @@
 [submodule "common"]
 	path = common
 	url = https://github.com/nvidia-riva/common.git
-	branch = main
+	branch = release/2.16.0
diff --git a/common b/common
@@ -1 +1 @@
-Subproject commit 9dfc052bba9a0cc2cc4b4f156e0ca8a273e9444e
+Subproject commit a5707ad2c4e3bf904905a9c5165fdecf9fab133b
diff --git a/riva/client/argparse_utils.py b/riva/client/argparse_utils.py
@@ -52,7 +52,7 @@ def add_asr_config_argparse_parameters(
         "--start-history",
         default=-1,
         type=int,
-        help="Value to detect and initiate start of speech utterance",
+        help="Value (in milliseconds) to detect and initiate start of speech utterance",
     )
     parser.add_argument(
         "--start-threshold",
@@ -64,19 +64,25 @@ def add_asr_config_argparse_parameters(
         "--stop-history",
         default=-1,
         type=int,
-        help="Value to reset the endpoint detection history",
+        help="Value (in milliseconds) to detect end of utterance and reset decoder",
+    )
+    parser.add_argument(
+        "--stop-threshold",
+        default=-1.0,
+        type=float,
+        help="Threshold value for detecting the end of speech utterance",
     )
     parser.add_argument(
         "--stop-history-eou",
         default=-1,
         type=int,
-        help="Value to determine the response history for endpoint detection",
+        help="Value (in milliseconds) to detect end of utterance for the 1st pass and generate an intermediate final transcript",
     )
     parser.add_argument(
-        "--stop-threshold",
+        "--stop-threshold-eou",
         default=-1.0,
         type=float,
-        help="Threshold value for detecting the end of speech utterance",
+        help="Threshold value for likelihood of blanks before detecting end of utterance",
     )
     return parser
 
diff --git a/riva/client/asr.py b/riva/client/asr.py
@@ -130,8 +130,9 @@ def add_endpoint_parameters_to_config(
     stop_history: int,
     stop_history_eou: int,
     stop_threshold: float,
+    stop_threshold_eou: float,
 ) -> None:
-    if not (start_history > 0 or start_threshold > 0 or stop_history > 0 or stop_history_eou > 0 or stop_threshold > 0):
+    if not (start_history > 0 or start_threshold > 0 or stop_history > 0 or stop_history_eou > 0 or stop_threshold > 0 or stop_threshold_eou > 0):
         return 
          
     inner_config: rasr.RecognitionConfig = config if isinstance(config, rasr.RecognitionConfig) else config.config
@@ -146,6 +147,8 @@ def add_endpoint_parameters_to_config(
         endpointing_config.stop_history_eou = stop_history_eou
     if stop_threshold > 0:
         endpointing_config.stop_threshold = stop_threshold
+    if stop_threshold_eou > 0:
+        endpointing_config.stop_threshold_eou = stop_threshold_eou
     inner_config.endpointing_config.CopyFrom(endpointing_config)
 
 
diff --git a/scripts/asr/riva_streaming_asr_client.py b/scripts/asr/riva_streaming_asr_client.py
@@ -64,12 +64,13 @@ def streaming_transcription_worker(
             interim_results=True,
         )
         riva.client.add_endpoint_parameters_to_config(
-            config, 
-            args.start_history, 
-            args.start_threshold, 
-            args.stop_history, 
-            args.stop_history_eou, 
-            args.stop_threshold
+            config,
+            args.start_history,
+            args.start_threshold,
+            args.stop_history,
+            args.stop_history_eou,
+            args.stop_threshold,
+            args.stop_threshold_eou
         )
         riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score)
         for _ in range(args.num_iterations):
diff --git a/scripts/asr/transcribe_file.py b/scripts/asr/transcribe_file.py
@@ -80,12 +80,13 @@ def main() -> None:
     )
     riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score)
     riva.client.add_endpoint_parameters_to_config(
-        config, 
-        args.start_history, 
-        args.start_threshold, 
-        args.stop_history, 
-        args.stop_history_eou, 
-        args.stop_threshold
+        config,
+        args.start_history,
+        args.start_threshold,
+        args.stop_history,
+        args.stop_history_eou,
+        args.stop_threshold,
+        args.stop_threshold_eou
     )
     sound_callback = None
     try:
diff --git a/scripts/asr/transcribe_file_offline.py b/scripts/asr/transcribe_file_offline.py
@@ -39,12 +39,13 @@ def main() -> None:
     riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score)
     riva.client.add_speaker_diarization_to_config(config, args.speaker_diarization)
     riva.client.add_endpoint_parameters_to_config(
-        config, 
-        args.start_history, 
-        args.start_threshold, 
-        args.stop_history, 
-        args.stop_history_eou, 
-        args.stop_threshold
+        config,
+        args.start_history,
+        args.start_threshold,
+        args.stop_history,
+        args.stop_history_eou,
+        args.stop_threshold,
+        args.stop_threshold_eou
     )    
     with args.input_file.open('rb') as fh:
         data = fh.read()
diff --git a/scripts/asr/transcribe_mic.py b/scripts/asr/transcribe_mic.py
@@ -58,12 +58,13 @@ def main() -> None:
     )
     riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score)
     riva.client.add_endpoint_parameters_to_config(
-        config, 
-        args.start_history, 
-        args.start_threshold, 
-        args.stop_history, 
-        args.stop_history_eou, 
-        args.stop_threshold
+        config,
+        args.start_history,
+        args.start_threshold,
+        args.stop_history,
+        args.stop_history_eou,
+        args.stop_threshold,
+        args.stop_threshold_eou
     )
     with riva.client.audio_io.MicrophoneStream(
         args.sample_rate_hz,