Add endpointing config parameters to ASR clients (#80)

manishaj-nv · web-flow · commit ebc2c0517b09 · 2024-06-13T09:57:27.000+05:30
* asr: add eou param to py clients

* feat(asr): rename params

* asr: rename variable

* update default values and checks

* asr: add validation check

* asr: update gitmodule

* asr: update git submodule

* asr: update protos with main branch

* asr: update .gitmodules
diff --git a/README.md b/README.md
@@ -84,7 +84,7 @@ its purpose and parameters.
 
 #### ASR
 
-You may find a detailed documentation [here](https://docs.nvidia.com/deeplearning/riva/user-guide/docs/apis/development-cpp.html).
+You may find detailed documentation [here](https://docs.nvidia.com/deeplearning/riva/user-guide/docs/apis/cli.html).
 
 For transcribing in streaming mode you may use `scripts/asr/transcribe_file.py`.
 ```bash
diff --git a/common b/common
@@ -1 +1 @@
-Subproject commit 9f192f67f56fcc916ae6c17329394a71c780b0fb
+Subproject commit 9dfc052bba9a0cc2cc4b4f156e0ca8a273e9444e
diff --git a/riva/client/__init__.py b/riva/client/__init__.py
@@ -11,6 +11,7 @@
     print_offline,
     print_streaming,
     sleep_audio_length,
+    add_endpoint_parameters_to_config,
 )
 from riva.client.auth import Auth
 from riva.client.nlp import (
@@ -33,7 +34,7 @@
     __shortversion__,
     __version__,
 )
-from riva.client.proto.riva_asr_pb2 import RecognitionConfig, StreamingRecognitionConfig
+from riva.client.proto.riva_asr_pb2 import RecognitionConfig, StreamingRecognitionConfig, EndpointingConfig
 from riva.client.proto.riva_audio_pb2 import AudioEncoding
 from riva.client.proto.riva_nlp_pb2 import AnalyzeIntentOptions
 from riva.client.proto.riva_nmt_pb2 import StreamingTranslateSpeechToSpeechConfig, TranslationConfig, SynthesizeSpeechConfig, StreamingTranslateSpeechToTextConfig
diff --git a/riva/client/argparse_utils.py b/riva/client/argparse_utils.py
@@ -48,6 +48,36 @@ def add_asr_config_argparse_parameters(
         action='store_true',
         help="Flag that controls if speaker diarization should be performed",
     )
+    parser.add_argument(
+        "--start-history",
+        default=-1,
+        type=int,
+        help="Value to detect and initiate start of speech utterance",
+    )
+    parser.add_argument(
+        "--start-threshold",
+        default=-1.0,
+        type=float,
+        help="Threshold value for detecting the start of speech utterance",
+    )
+    parser.add_argument(
+        "--stop-history",
+        default=-1,
+        type=int,
+        help="Value to reset the endpoint detection history",
+    )
+    parser.add_argument(
+        "--stop-history-eou",
+        default=-1,
+        type=int,
+        help="Value to determine the response history for endpoint detection",
+    )
+    parser.add_argument(
+        "--stop-threshold",
+        default=-1.0,
+        type=float,
+        help="Threshold value for detecting the end of speech utterance",
+    )
     return parser
 
 
diff --git a/riva/client/asr.py b/riva/client/asr.py
@@ -123,6 +123,31 @@ def add_speaker_diarization_to_config(
         diarization_config = rasr.SpeakerDiarizationConfig(enable_speaker_diarization=True)
         inner_config.diarization_config.CopyFrom(diarization_config)
 
+def add_endpoint_parameters_to_config(
+    config: Union[rasr.RecognitionConfig, rasr.EndpointingConfig],
+    start_history: int,
+    start_threshold: float,
+    stop_history: int,
+    stop_history_eou: int,
+    stop_threshold: float,
+) -> None:
+    if not (start_history > 0 or start_threshold > 0 or stop_history > 0 or stop_history_eou > 0 or stop_threshold > 0):
+        return 
+         
+    inner_config: rasr.RecognitionConfig = config if isinstance(config, rasr.RecognitionConfig) else config.config
+    endpointing_config = rasr.EndpointingConfig()
+    if start_history > 0:
+        endpointing_config.start_history = start_history
+    if start_threshold > 0:
+        endpointing_config.start_threshold = start_threshold
+    if stop_history > 0:
+        endpointing_config.stop_history = stop_history
+    if stop_history_eou > 0:
+        endpointing_config.stop_history_eou = stop_history_eou
+    if stop_threshold > 0:
+        endpointing_config.stop_threshold = stop_threshold
+    inner_config.endpointing_config.CopyFrom(endpointing_config)
+
 
 PRINT_STREAMING_ADDITIONAL_INFO_MODES = ['no', 'time', 'confidence']
 
diff --git a/scripts/asr/riva_streaming_asr_client.py b/scripts/asr/riva_streaming_asr_client.py
@@ -63,6 +63,14 @@ def streaming_transcription_worker(
             ),
             interim_results=True,
         )
+        riva.client.add_endpoint_parameters_to_config(
+            config, 
+            args.start_history, 
+            args.start_threshold, 
+            args.stop_history, 
+            args.stop_history_eou, 
+            args.stop_threshold
+        )
         riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score)
         for _ in range(args.num_iterations):
             with riva.client.AudioChunkFileIterator(
diff --git a/scripts/asr/transcribe_file.py b/scripts/asr/transcribe_file.py
@@ -79,6 +79,14 @@ def main() -> None:
         interim_results=True,
     )
     riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score)
+    riva.client.add_endpoint_parameters_to_config(
+        config, 
+        args.start_history, 
+        args.start_threshold, 
+        args.stop_history, 
+        args.stop_history_eou, 
+        args.stop_threshold
+    )
     sound_callback = None
     try:
         if args.play_audio or args.output_device is not None:
diff --git a/scripts/asr/transcribe_file_offline.py b/scripts/asr/transcribe_file_offline.py
@@ -38,7 +38,14 @@ def main() -> None:
     )
     riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score)
     riva.client.add_speaker_diarization_to_config(config, args.speaker_diarization)
-
+    riva.client.add_endpoint_parameters_to_config(
+        config, 
+        args.start_history, 
+        args.start_threshold, 
+        args.stop_history, 
+        args.stop_history_eou, 
+        args.stop_threshold
+    )    
     with args.input_file.open('rb') as fh:
         data = fh.read()
     try:
diff --git a/scripts/asr/transcribe_mic.py b/scripts/asr/transcribe_mic.py
@@ -57,6 +57,14 @@ def main() -> None:
         interim_results=True,
     )
     riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score)
+    riva.client.add_endpoint_parameters_to_config(
+        config, 
+        args.start_history, 
+        args.start_threshold, 
+        args.stop_history, 
+        args.stop_history_eou, 
+        args.stop_threshold
+    )
     with riva.client.audio_io.MicrophoneStream(
         args.sample_rate_hz,
         args.file_streaming_chunk,