support word boosting from file

virajkarandikar · virajkarandikar · commit de755af4510c · 2024-04-16T15:58:57.000+05:30
diff --git a/riva/client/argparse_utils.py b/riva/client/argparse_utils.py
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: MIT
 
 import argparse
+from pathlib import Path
 
 
 def add_asr_config_argparse_parameters(
@@ -35,16 +36,18 @@ def add_asr_config_argparse_parameters(
         help="Flag that controls if transcript should be automatically punctuated",
     )
     parser.add_argument(
-        "--no-verbatim-transcripts",
-        default=False,
-        action='store_true',
-        help="If specified, text inverse normalization will be applied",
+        "--verbatim-transcripts",
+        default=True,
+        action='store_false',
+        help="True returns text exactly as it was said. False applies Inverse text normalization",
     )
     parser.add_argument("--language-code", default="en-US", help="Language code of the model to be used.")
     parser.add_argument("--model-name", default="", help="Name of the model to be used to be used.")
-    parser.add_argument("--boosted-lm-words", action='append', help="Words to boost when decoding.")
     parser.add_argument(
-        "--boosted-lm-score", type=float, default=4.0, help="Value by which to boost words when decoding."
+        "--boosted-words-file", default=None, type=Path, help="File with a list of words to boost. One line per word."
+    )
+    parser.add_argument(
+        "--boosted-words-score", type=float, default=4.0, help="Score by which to boost the boosted words."
     )
     parser.add_argument(
         "--speaker-diarization",
diff --git a/riva/client/asr.py b/riva/client/asr.py
@@ -97,14 +97,34 @@ def __next__(self) -> bytes:
 
 def add_word_boosting_to_config(
     config: Union[rasr.StreamingRecognitionConfig, rasr.RecognitionConfig],
-    boosted_lm_words: Optional[List[str]],
-    boosted_lm_score: float,
+    boosted_words_file: Optional[Union[str, os.PathLike]],
+    boosted_words_score: float,
 ) -> None:
+    """Add a word-boosting speech context, read from a file, to an ASR config.
+
+    Args:
+        config: Recognition config to modify in place. For a streaming config
+            the nested ``config.config`` is modified.
+        boosted_words_file: Path to a text file with one word to boost per
+            line. ``None`` or an empty value leaves ``config`` untouched.
+        boosted_words_score: Boost applied to every word read from the file.
+
+    Raises:
+        OSError: If ``boosted_words_file`` cannot be opened.
+    """
     inner_config: rasr.RecognitionConfig = config if isinstance(config, rasr.RecognitionConfig) else config.config
-    if boosted_lm_words is not None:
+    boosted_words = []
+    if boosted_words_file:
+        with open(boosted_words_file, encoding="utf-8") as f:
+            # Strip whitespace and drop blank lines so empty phrases are never sent.
+            boosted_words = [word for word in map(str.strip, f) if word]
+
+    # Only attach a speech context when there is something to boost, so calls
+    # without a words file (or with an empty one) leave the config unchanged.
+    if boosted_words:
         speech_context = rasr.SpeechContext()
-        speech_context.phrases.extend(boosted_lm_words)
-        speech_context.boost = boosted_lm_score
+        speech_context.phrases.extend(boosted_words)
+        speech_context.boost = boosted_words_score
         inner_config.speech_contexts.append(speech_context)
 
 
diff --git a/scripts/asr/riva_streaming_asr_client.py b/scripts/asr/riva_streaming_asr_client.py
@@ -26,7 +26,7 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument("--num-parallel-requests", default=1, type=int, help="Number of client threads.")
     parser.add_argument("--num-iterations", default=1, type=int, help="Number of iterations over the file.")
     parser.add_argument(
-        "--input-file", required=True, type=str, help="Name of the WAV file with LINEAR_PCM encoding to transcribe."
+        "--input-file", required=True, type=Path, help="Name of the WAV file with LINEAR_PCM encoding to transcribe."
     )
     parser.add_argument(
         "--simulate-realtime",
diff --git a/scripts/asr/transcribe_file.py b/scripts/asr/transcribe_file.py
@@ -2,6 +2,8 @@
 # SPDX-License-Identifier: MIT
 
 import argparse
+import json
+from pathlib import Path
 
 import riva.client
 from riva.client.argparse_utils import add_asr_config_argparse_parameters, add_connection_argparse_parameters
@@ -15,7 +17,12 @@ def parse_args() -> argparse.Namespace:
         "`--play-audio` or `--output-device`.",
         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
     )
-    parser.add_argument("--input-file", help="A path to a local file to stream.")
+    parser.add_argument(
+        "--input-file",
+        required=True,
+        type=Path,
+        help="A path to a local file to stream or a JSONL file containing list of files. JSONL file should contain JSON entry on each line, for example: {'audio_filepath': 'audio.wav'} ",
+    )
     parser.add_argument("--list-devices", action="store_true", help="List output devices indices")
     parser.add_argument(
         "--interim-results", default=False, action='store_true', help="Print intermediate transcripts",
@@ -63,6 +70,17 @@ def main() -> None:
     if args.list_devices:
         riva.client.audio_io.list_output_devices()
         return
+    input_files = []
+    if args.input_file.suffix == ".json":
+        with open(args.input_file) as f:
+            lines = f.read().splitlines()
+            for line in lines:
+                data = json.loads(line)
+                if "audio_filepath" in data:
+                    input_files.append(data["audio_filepath"])
+    else:
+        input_files = [args.input_file]
+
     auth = riva.client.Auth(args.ssl_cert, args.use_ssl, args.server, args.metadata)
     asr_service = riva.client.ASRService(auth)
     config = riva.client.StreamingRecognitionConfig(
@@ -71,37 +89,40 @@ def main() -> None:
             max_alternatives=args.max_alternatives,
             profanity_filter=args.profanity_filter,
             enable_automatic_punctuation=args.automatic_punctuation,
-            verbatim_transcripts=not args.no_verbatim_transcripts,
+            verbatim_transcripts=args.verbatim_transcripts,
             enable_word_time_offsets=args.word_time_offsets,
             model=args.model_name,
         ),
         interim_results=args.interim_results,
     )
-    riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score)
+    riva.client.add_word_boosting_to_config(config, args.boosted_words_file, args.boosted_words_score)
     sound_callback = None
-    try:
-        if args.play_audio or args.output_device is not None:
-            wp = riva.client.get_wav_file_parameters(args.input_file)
-            sound_callback = riva.client.audio_io.SoundCallBack(
-                args.output_device, wp['sampwidth'], wp['nchannels'], wp['framerate'],
-            )
-            delay_callback = sound_callback
-        else:
-            delay_callback = riva.client.sleep_audio_length if args.simulate_realtime else None
-        with riva.client.AudioChunkFileIterator(
-            args.input_file, args.chunk_duration_ms, delay_callback,
-        ) as audio_chunk_iterator:
-            riva.client.print_streaming(
-                responses=asr_service.streaming_response_generator(
-                    audio_chunks=audio_chunk_iterator, streaming_config=config,
-                ),
-                input_file=args.input_file,
-                show_intermediate=args.interim_results,
-                additional_info="confidence" if args.print_confidence else "no",
-            )
-    finally:
-        if sound_callback is not None and sound_callback.opened:
-            sound_callback.close()
+
+    for file in input_files:
+        try:
+            if args.play_audio or args.output_device is not None:
+                wp = riva.client.get_wav_file_parameters(file)
+                sound_callback = riva.client.audio_io.SoundCallBack(
+                    args.output_device, wp['sampwidth'], wp['nchannels'], wp['framerate'],
+                )
+                delay_callback = sound_callback
+            else:
+                delay_callback = riva.client.sleep_audio_length if args.simulate_realtime else None
+
+            with riva.client.AudioChunkFileIterator(
+                file, args.chunk_duration_ms, delay_callback,
+            ) as audio_chunk_iterator:
+                riva.client.print_streaming(
+                    responses=asr_service.streaming_response_generator(
+                        audio_chunks=audio_chunk_iterator, streaming_config=config,
+                    ),
+                    input_file=file,
+                    show_intermediate=args.interim_results,
+                    additional_info="confidence" if args.print_confidence else "no",
+                )
+        finally:
+            if sound_callback is not None and sound_callback.opened:
+                sound_callback.close()
 
 
 if __name__ == "__main__":

Original file line number	Diff line number	Diff line change
`@@ -26,7 +26,7 @@ def parse_args() -> argparse.Namespace:`
`26`	`26`	`parser.add_argument("--num-parallel-requests", default=1, type=int, help="Number of client threads.")`
`27`	`27`	`parser.add_argument("--num-iterations", default=1, type=int, help="Number of iterations over the file.")`
`28`	`28`	`parser.add_argument(`
`29`		`- "--input-file", required=True, type=str, help="Name of the WAV file with LINEAR_PCM encoding to transcribe."`
	`29`	`+ "--input-file", required=True, type=Path, help="Name of the WAV file with LINEAR_PCM encoding to transcribe."`
`30`	`30`	`)`
`31`	`31`	`parser.add_argument(`
`32`	`32`	`"--simulate-realtime",`