Release/2.17.0 (#99)

rmittal-github · sarane22 · mohnishparmar · web-flow · commit 2c483108b866 · 2024-09-30T23:38:07.000+05:30
* Support custom_configuration param in ASR clients (#94) * Passing AST param through custom_configuration * Added exception handling for TTS talk.py * Exposing custom-configuration to cli * Updating function name to add_custom_configuration_to_config * Updating help message --------- Co-authored-by: mohnishparmar <109233781+mohnishparmar@users.noreply.github.com> * Support setting max speakers for offline diarization (#97) * fix: accept input for max_speaker_count in asr/transcribe_file_offline * fix: rename input field to diarization_max_speakers * remove: redundant default value for max_speakers * update SHA of common repo submodule --------- Co-authored-by: sarane22 <118975230+sarane22@users.noreply.github.com> Co-authored-by: mohnishparmar <109233781+mohnishparmar@users.noreply.github.com> Co-authored-by: Prabhsimran Singh <pskrunner14@gmail.com>
diff --git a/common b/common
@@ -1 +1 @@
-Subproject commit 988f86f84bf28d028f146ee5669b998ce3442be2
+Subproject commit 1c7da5aed4e4df3a296d2672379c5099a193aaae
diff --git a/riva/client/__init__.py b/riva/client/__init__.py
@@ -12,6 +12,7 @@
     print_streaming,
     sleep_audio_length,
     add_endpoint_parameters_to_config,
+    add_custom_configuration_to_config,
 )
 from riva.client.auth import Auth
 from riva.client.nlp import (
diff --git a/riva/client/argparse_utils.py b/riva/client/argparse_utils.py
@@ -49,6 +49,12 @@ def add_asr_config_argparse_parameters(
         action='store_true',
         help="Flag that controls if speaker diarization should be performed",
     )
+    parser.add_argument(
+        "--diarization-max-speakers",
+        default=3,
+        type=int,
+        help="Max number of speakers to detect when performing speaker diarization",
+    )
     parser.add_argument(
         "--start-history",
         default=-1,
@@ -85,6 +91,12 @@ def add_asr_config_argparse_parameters(
         type=float,
         help="Threshold value for likelihood of blanks before detecting end of utterance",
     )
+    parser.add_argument(
+        "--custom-configuration",
+        default="",
+        type=str,
+        help="Custom configurations to be sent to the server as key value pairs <key:value,key:value,...>",
+    )
     return parser
 
 
diff --git a/riva/client/asr.py b/riva/client/asr.py
@@ -117,14 +117,19 @@ def add_audio_file_specs_to_config(
 def add_speaker_diarization_to_config(
     config: Union[rasr.RecognitionConfig],
     diarization_enable: bool,
+    diarization_max_speakers: int,
 ) -> None:
     inner_config: rasr.RecognitionConfig = config if isinstance(config, rasr.RecognitionConfig) else config.config
     if diarization_enable:
-        diarization_config = rasr.SpeakerDiarizationConfig(enable_speaker_diarization=True)
+        diarization_config = rasr.SpeakerDiarizationConfig(
+            enable_speaker_diarization=True,
+            max_speaker_count=diarization_max_speakers,
+        )
         inner_config.diarization_config.CopyFrom(diarization_config)
 
+
 def add_endpoint_parameters_to_config(
-    config: Union[rasr.RecognitionConfig, rasr.EndpointingConfig],
+    config: Union[rasr.StreamingRecognitionConfig, rasr.RecognitionConfig],
     start_history: int,
     start_threshold: float,
     stop_history: int,
@@ -152,6 +157,22 @@ def add_endpoint_parameters_to_config(
     inner_config.endpointing_config.CopyFrom(endpointing_config)
 
 
+def add_custom_configuration_to_config(
+    config: Union[rasr.StreamingRecognitionConfig, rasr.RecognitionConfig],
+    custom_configuration: str,
+) -> None:
+    custom_configuration = custom_configuration.strip().replace(" ", "")
+    if not custom_configuration:
+        return
+    inner_config: rasr.RecognitionConfig = config if isinstance(config, rasr.RecognitionConfig) else config.config
+    for pair in custom_configuration.split(","):
+        key_value = pair.split(":")
+        if len(key_value) == 2:
+            inner_config.custom_configuration[key_value[0]] = key_value[1]
+        else:
+            raise ValueError(f"Invalid key:value pair {key_value}")
+
+
 PRINT_STREAMING_ADDITIONAL_INFO_MODES = ['no', 'time', 'confidence']
 
 
diff --git a/scripts/asr/riva_streaming_asr_client.py b/scripts/asr/riva_streaming_asr_client.py
@@ -73,6 +73,10 @@ def streaming_transcription_worker(
             args.stop_threshold,
             args.stop_threshold_eou
         )
+        riva.client.add_custom_configuration_to_config(
+            config,
+            args.custom_configuration
+        )
         riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score)
         for _ in range(args.num_iterations):
             with riva.client.AudioChunkFileIterator(
diff --git a/scripts/asr/transcribe_file.py b/scripts/asr/transcribe_file.py
@@ -109,6 +109,10 @@ def main() -> None:
         args.stop_threshold,
         args.stop_threshold_eou
     )
+    riva.client.add_custom_configuration_to_config(
+        config,
+        args.custom_configuration
+    )
     sound_callback = None
     try:
         if args.play_audio or args.output_device is not None:
diff --git a/scripts/asr/transcribe_file_offline.py b/scripts/asr/transcribe_file_offline.py
@@ -37,7 +37,7 @@ def main() -> None:
         enable_word_time_offsets=args.word_time_offsets or args.speaker_diarization,
     )
     riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score)
-    riva.client.add_speaker_diarization_to_config(config, args.speaker_diarization)
+    riva.client.add_speaker_diarization_to_config(config, args.speaker_diarization, args.diarization_max_speakers)
     riva.client.add_endpoint_parameters_to_config(
         config,
         args.start_history,
@@ -46,7 +46,11 @@ def main() -> None:
         args.stop_history_eou,
         args.stop_threshold,
         args.stop_threshold_eou
-    )    
+    )
+    riva.client.add_custom_configuration_to_config(
+        config,
+        args.custom_configuration
+    )
     with args.input_file.open('rb') as fh:
         data = fh.read()
     try:
diff --git a/scripts/asr/transcribe_mic.py b/scripts/asr/transcribe_mic.py
@@ -67,6 +67,10 @@ def main() -> None:
         args.stop_threshold,
         args.stop_threshold_eou
     )
+    riva.client.add_custom_configuration_to_config(
+        config,
+        args.custom_configuration
+    )
     with riva.client.audio_io.MicrophoneStream(
         args.sample_rate_hz,
         args.file_streaming_chunk,
diff --git a/scripts/tts/talk.py b/scripts/tts/talk.py
@@ -157,6 +157,8 @@ def main() -> None:
                 sound_stream(resp.audio)
             if out_f is not None:
                 out_f.writeframesraw(resp.audio)
+    except Exception as e:
+        print(e.details())
     finally:
         if out_f is not None:
             out_f.close()

Original file line number	Diff line number	Diff line change
`@@ -12,6 +12,7 @@`
`12`	`12`	`print_streaming,`
`13`	`13`	`sleep_audio_length,`
`14`	`14`	`add_endpoint_parameters_to_config,`
	`15`	`+ add_custom_configuration_to_config,`
`15`	`16`	`)`
`16`	`17`	`from riva.client.auth import Auth`
`17`	`18`	`from riva.client.nlp import (`