asr: add model name parameter (#85)

virajkarandikar · web-flow · commit 157576c408dd · 2024-07-23T00:47:13.000+05:30
* asr: add model name parameter

* minor fixes to asr and tts clients

* asr: add file path check
diff --git a/riva/client/argparse_utils.py b/riva/client/argparse_utils.py
@@ -38,9 +38,10 @@ def add_asr_config_argparse_parameters(
         help="If specified, text inverse normalization will be applied",
     )
     parser.add_argument("--language-code", default="en-US", help="Language code of the model to be used.")
-    parser.add_argument("--boosted-lm-words", action='append', help="Words to boost when decoding.")
+    parser.add_argument("--model-name", default="", help="Model name to be used.")
+    parser.add_argument("--boosted-lm-words", action='append', help="Words to boost when decoding. Can be used multiple times to boost multiple words.")
     parser.add_argument(
-        "--boosted-lm-score", type=float, default=4.0, help="Value by which to boost words when decoding."
+        "--boosted-lm-score", type=float, default=4.0, help="Recommended range for the boost score is 20 to 100. The higher the boost score, the more biased the ASR engine is towards this word."
     )
     parser.add_argument(
         "--speaker-diarization",
diff --git a/scripts/asr/riva_streaming_asr_client.py b/scripts/asr/riva_streaming_asr_client.py
@@ -55,6 +55,7 @@ def streaming_transcription_worker(
         config = riva.client.StreamingRecognitionConfig(
             config=riva.client.RecognitionConfig(
                 language_code=args.language_code,
+                model=args.model_name,
                 max_alternatives=args.max_alternatives,
                 profanity_filter=args.profanity_filter,
                 enable_automatic_punctuation=args.automatic_punctuation,
diff --git a/scripts/asr/transcribe_file.py b/scripts/asr/transcribe_file.py
@@ -3,6 +3,7 @@
 
 import argparse
 
+import os
 import riva.client
 from riva.client.argparse_utils import add_asr_config_argparse_parameters, add_connection_argparse_parameters
 
@@ -15,8 +16,11 @@ def parse_args() -> argparse.Namespace:
         "`--play-audio` or `--output-device`.",
         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
     )
-    parser.add_argument("--input-file", help="A path to a local file to stream.")
-    parser.add_argument("--list-devices", action="store_true", help="List output devices indices")
+    group = parser.add_mutually_exclusive_group(required=True)
+    group.add_argument("--input-file", help="A path to a local file to stream.")
+    group.add_argument("--list-models", action="store_true", help="List available models.")
+    group.add_argument("--list-devices", action="store_true", help="List output devices indices")
+
     parser.add_argument(
         "--show-intermediate", action="store_true", help="Show intermediate transcripts as they are available."
     )
@@ -51,11 +55,6 @@ def parse_args() -> argparse.Namespace:
     parser = add_connection_argparse_parameters(parser)
     parser = add_asr_config_argparse_parameters(parser, max_alternatives=True, profanity_filter=True, word_time_offsets=True)
     args = parser.parse_args()
-    if not args.list_devices and args.input_file is None:
-        parser.error(
-            "You have to provide at least one of parameters `--input-file` and `--list-devices` whereas both "
-            "parameters are missing."
-        )
     if args.play_audio or args.output_device is not None or args.list_devices:
         import riva.client.audio_io
     return args
@@ -68,9 +67,31 @@ def main() -> None:
         return
     auth = riva.client.Auth(args.ssl_cert, args.use_ssl, args.server, args.metadata)
     asr_service = riva.client.ASRService(auth)
+
+    if args.list_models:
+        asr_models = dict()
+        config_response = asr_service.stub.GetRivaSpeechRecognitionConfig(riva.client.proto.riva_asr_pb2.RivaSpeechRecognitionConfigRequest())
+        for model_config in config_response.model_config:
+            if model_config.parameters["streaming"] and model_config.parameters["type"]:
+                language_code = model_config.parameters['language_code']
+                if language_code in asr_models:
+                    asr_models[language_code]["models"].append(model_config.model_name)
+                else:
+                    asr_models[language_code] = {"models": [model_config.model_name]}
+
+        print("Available ASR models")
+        asr_models = dict(sorted(asr_models.items()))
+        print(asr_models)
+        return
+
+    if not os.path.isfile(args.input_file):
+        print(f"Invalid input file path: {args.input_file}")
+        return
+
     config = riva.client.StreamingRecognitionConfig(
         config=riva.client.RecognitionConfig(
             language_code=args.language_code,
+            model=args.model_name,
             max_alternatives=1,
             profanity_filter=args.profanity_filter,
             enable_automatic_punctuation=args.automatic_punctuation,
diff --git a/scripts/asr/transcribe_mic.py b/scripts/asr/transcribe_mic.py
@@ -47,6 +47,7 @@ def main() -> None:
         config=riva.client.RecognitionConfig(
             encoding=riva.client.AudioEncoding.LINEAR_PCM,
             language_code=args.language_code,
+            model=args.model_name,
             max_alternatives=1,
             profanity_filter=args.profanity_filter,
             enable_automatic_punctuation=args.automatic_punctuation,
diff --git a/scripts/tts/talk.py b/scripts/tts/talk.py
@@ -13,16 +13,18 @@
 
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(
-        description="A speech synthesis via Riva AI Services. You HAVE TO provide at least one of arguments "
-        "`--output`, `--play-audio`, `--list-devices`, `--output-device`.",
+        description="Speech synthesis via Riva AI Services",
         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
     )
+    group = parser.add_mutually_exclusive_group(required=True)
+    group.add_argument("--text", type=str, help="Text input to synthesize.")
+    group.add_argument("--list-devices", action="store_true", help="List output audio devices indices.")
+    group.add_argument("--list-voices", action="store_true", help="List available voices.")
     parser.add_argument(
         "--voice",
         help="A voice name to use. If this parameter is missing, then the server will try a first available model "
         "based on parameter `--language-code`.",
     )
-    parser.add_argument("--text", type=str, required=False, help="Text input to synthesize.")
     parser.add_argument(
         "--audio_prompt_file",
         type=Path,
@@ -35,8 +37,6 @@ def parse_args() -> argparse.Namespace:
         help="Whether to play input audio simultaneously with transcribing. If `--output-device` is not provided, "
         "then the default output audio device will be used.",
     )
-    parser.add_argument("--list-devices", action="store_true", help="List output audio devices indices.")
-    parser.add_argument("--list-voices", action="store_true", help="List available voices.")
     parser.add_argument("--output-device", type=int, help="Output device to use.")
     parser.add_argument("--language-code", default='en-US', help="A language of input text.")
     parser.add_argument(
@@ -62,6 +62,7 @@ def main() -> None:
     args = parse_args()
     if args.list_devices:
         riva.client.audio_io.list_output_devices()
+        return
 
     auth = riva.client.Auth(args.ssl_cert, args.use_ssl, args.server, args.metadata)
     service = riva.client.SpeechSynthesisService(auth)
@@ -87,6 +88,7 @@ def main() -> None:
 
         tts_models = dict(sorted(tts_models.items()))
         print(json.dumps(tts_models, indent=4))
+        return
 
     if not args.text:
         print("No input text provided")

Original file line number	Diff line number	Diff line change
`@@ -38,9 +38,10 @@ def add_asr_config_argparse_parameters(`
`38`	`38`	`help="If specified, text inverse normalization will be applied",`
`39`	`39`	`)`
`40`	`40`	`parser.add_argument("--language-code", default="en-US", help="Language code of the model to be used.")`
`41`		`- parser.add_argument("--boosted-lm-words", action='append', help="Words to boost when decoding.")`
	`41`	`+ parser.add_argument("--model-name", default="", help="Model name to be used.")`
	`42`	`+ parser.add_argument("--boosted-lm-words", action='append', help="Words to boost when decoding. Can be used multiple times to boost multiple words.")`
`42`	`43`	`parser.add_argument(`
`43`		`- "--boosted-lm-score", type=float, default=4.0, help="Value by which to boost words when decoding."`
	`44`	`+ "--boosted-lm-score", type=float, default=4.0, help="Recommended range for the boost score is 20 to 100. The higher the boost score, the more biased the ASR engine is towards this word."`
`44`	`45`	`)`
`45`	`46`	`parser.add_argument(`
`46`	`47`	`"--speaker-diarization",`