asr: minor updates to argument names and logging

virajkarandikar · virajkarandikar · commit 2086d969369d · 2024-02-22T19:43:05.000+05:30
diff --git a/riva/client/argparse_utils.py b/riva/client/argparse_utils.py
@@ -38,6 +38,7 @@ def add_asr_config_argparse_parameters(
         help="If specified, text inverse normalization will be applied",
     )
     parser.add_argument("--language-code", default="en-US", help="Language code of the model to be used.")
+    parser.add_argument("--model-name", default=None, help="Name of the model to be used.")
     parser.add_argument("--boosted-lm-words", action='append', help="Words to boost when decoding.")
     parser.add_argument(
         "--boosted-lm-score", type=float, default=4.0, help="Value by which to boost words when decoding."
diff --git a/riva/client/asr.py b/riva/client/asr.py
@@ -46,11 +46,11 @@ class AudioChunkFileIterator:
     def __init__(
         self,
         input_file: Union[str, os.PathLike],
-        chunk_n_frames: int,
+        chunk_duration_ms: int,
         delay_callback: Optional[Callable[[bytes, float], None]] = None,
     ) -> None:
         self.input_file: Path = Path(input_file).expanduser()
-        self.chunk_n_frames = chunk_n_frames
+        self.chunk_duration_ms = chunk_duration_ms
         self.delay_callback = delay_callback
         self.file_parameters = get_wav_file_parameters(self.input_file)
         self.file_object: Optional[typing.BinaryIO] = open(str(self.input_file), 'rb')
@@ -75,9 +75,11 @@ def __iter__(self):
 
     def __next__(self) -> bytes:
         if self.file_parameters:
-            data = self.file_object.read(self.chunk_n_frames * self.file_parameters['sampwidth'] * self.file_parameters['nchannels'])
+            num_frames = int(self.chunk_duration_ms * self.file_parameters['framerate'] / 1000)
+            data = self.file_object.read(num_frames * self.file_parameters['sampwidth'] * self.file_parameters['nchannels'])
         else:
-            data = self.file_object.read(self.chunk_n_frames)
+            # Fixed chunk size when file_parameters is not available
+            data = self.file_object.read(8192)
         if not data:
             self.close()
             raise StopIteration
@@ -129,6 +131,7 @@ def add_speaker_diarization_to_config(
 
 def print_streaming(
     responses: Iterable[rasr.StreamingRecognizeResponse],
+    input_file: str,
     output_file: Optional[Union[Union[os.PathLike, str, TextIO], List[Union[os.PathLike, str, TextIO]]]] = None,
     additional_info: str = 'no',
     word_time_offsets: bool = False,
@@ -194,6 +197,9 @@ def print_streaming(
                 output_file[i] = Path(elem).expanduser().open(file_mode)
         start_time = time.time()  # used in 'time` additional_info
         num_chars_printed = 0  # used in 'no' additional_info
+        final_transcript = "" # for printing best final transcript
+        for f in output_file:
+            f.write(f"File: {input_file}\n")
         for response in responses:
             if not response.results:
                 continue
@@ -204,6 +210,7 @@ def print_streaming(
                 transcript = result.alternatives[0].transcript
                 if additional_info == 'no':
                     if result.is_final:
+                        final_transcript += transcript
                         if show_intermediate:
                             overwrite_chars = ' ' * (num_chars_printed - len(transcript))
                             for i, f in enumerate(output_file):
@@ -221,6 +228,7 @@ def print_streaming(
                         partial_transcript += transcript
                 elif additional_info == 'time':
                     if result.is_final:
+                        final_transcript += transcript
                         for i, alternative in enumerate(result.alternatives):
                             for f in output_file:
                                 f.write(
@@ -239,6 +247,7 @@ def print_streaming(
                         partial_transcript += transcript
                 else:  # additional_info == 'confidence'
                     if result.is_final:
+                        final_transcript += transcript
                         for f in output_file:
                             f.write(f'## {transcript}\n')
                             f.write(f'Confidence: {result.alternatives[0].confidence:9.4f}\n')
@@ -259,6 +268,9 @@ def print_streaming(
             else:
                 for f in output_file:
                     f.write('----\n')
+        for f in output_file:
+            f.write(f"Final transcripts:\n")
+            f.write(f"0 : {final_transcript}\n")
     finally:
         for fo, elem in zip(file_opened, output_file):
             if fo:
diff --git a/scripts/asr/riva_streaming_asr_client.py b/scripts/asr/riva_streaming_asr_client.py
@@ -23,7 +23,7 @@ def parse_args() -> argparse.Namespace:
         "which names follow a format `output_<thread_num>.txt`.",
         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
     )
-    parser.add_argument("--num-clients", default=1, type=int, help="Number of client threads.")
+    parser.add_argument("--num-parallel-requests", default=1, type=int, help="Number of client threads.")
     parser.add_argument("--num-iterations", default=1, type=int, help="Number of iterations over the file.")
     parser.add_argument(
         "--input-file", required=True, type=str, help="Name of the WAV file with LINEAR_PCM encoding to transcribe."
@@ -35,7 +35,10 @@ def parse_args() -> argparse.Namespace:
         "normal speech.",
     )
     parser.add_argument(
-        "--file-streaming-chunk", type=int, default=1600, help="Number of frames in one chunk sent to server."
+        "--chunk-duration-ms", type=int, default=100, help="Chunk duration in milliseconds."
+    )
+    parser.add_argument(
+        "--interim-results", default=False, action='store_true', help="Print intermediate transcripts",
     )
     parser = add_connection_argparse_parameters(parser)
     parser = add_asr_config_argparse_parameters(parser, max_alternatives=True, profanity_filter=True, word_time_offsets=True)
@@ -61,23 +64,22 @@ def streaming_transcription_worker(
                 verbatim_transcripts=not args.no_verbatim_transcripts,
                 enable_word_time_offsets=args.word_time_offsets,
             ),
-            interim_results=True,
+            interim_results=args.interim_results,
         )
         riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score)
         for _ in range(args.num_iterations):
             with riva.client.AudioChunkFileIterator(
                 args.input_file,
-                args.file_streaming_chunk,
+                args.chunk_duration_ms,
                 delay_callback=riva.client.sleep_audio_length if args.simulate_realtime else None,
             ) as audio_chunk_iterator:
                 riva.client.print_streaming(
                     responses=asr_service.streaming_response_generator(
                         audio_chunks=audio_chunk_iterator,
                         streaming_config=config,
                     ),
+                    input_file=args.input_file,
                     output_file=output_file,
-                    additional_info='time',
-                    file_mode='a',
                     word_time_offsets=args.word_time_offsets,
                 )
     except BaseException as e:
@@ -87,12 +89,12 @@ def streaming_transcription_worker(
 
 def main() -> None:
     args = parse_args()
-    print("Number of clients:", args.num_clients)
+    print("Number of clients:", args.num_parallel_requests)
     print("Number of iteration:", args.num_iterations)
     print("Input file:", args.input_file)
     threads = []
     exception_queue = queue.Queue()
-    for i in range(args.num_clients):
+    for i in range(args.num_parallel_requests):
         t = Thread(target=streaming_transcription_worker, args=[args, f"output_{i:d}.txt", i, exception_queue])
         t.start()
         threads.append(t)
@@ -112,7 +114,7 @@ def main() -> None:
         if all_dead:
             break
         time.sleep(0.05)
-    print(str(args.num_clients), "threads done, output written to output_<thread_id>.txt")
+    print(str(args.num_parallel_requests), "threads done, output written to output_<thread_id>.txt")
 
 
 if __name__ == "__main__":
diff --git a/scripts/asr/transcribe_file.py b/scripts/asr/transcribe_file.py
@@ -18,7 +18,7 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument("--input-file", help="A path to a local file to stream.")
     parser.add_argument("--list-devices", action="store_true", help="List output devices indices")
     parser.add_argument(
-        "--show-intermediate", action="store_true", help="Show intermediate transcripts as they are available."
+        "--interim-results", default=False, action='store_true', help="Print intermediate transcripts",
     )
     parser.add_argument(
         "--output-device",
@@ -34,10 +34,7 @@ def parse_args() -> argparse.Namespace:
         "then the default output audio device will be used.",
     )
     parser.add_argument(
-        "--file-streaming-chunk",
-        type=int,
-        default=1600,
-        help="A maximum number of frames in one chunk sent to server.",
+        "--chunk-duration-ms", type=int, default=100, help="Chunk duration in milliseconds."
     )
     parser.add_argument(
         "--simulate-realtime",
@@ -76,7 +73,7 @@ def main() -> None:
             enable_automatic_punctuation=args.automatic_punctuation,
             verbatim_transcripts=not args.no_verbatim_transcripts,
         ),
-        interim_results=True,
+        interim_results=args.interim_results,
     )
     riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score)
     sound_callback = None
@@ -90,14 +87,15 @@ def main() -> None:
         else:
             delay_callback = riva.client.sleep_audio_length if args.simulate_realtime else None
         with riva.client.AudioChunkFileIterator(
-            args.input_file, args.file_streaming_chunk, delay_callback,
+            args.input_file, args.chunk_duration_ms, delay_callback,
         ) as audio_chunk_iterator:
             riva.client.print_streaming(
                 responses=asr_service.streaming_response_generator(
                     audio_chunks=audio_chunk_iterator,
                     streaming_config=config,
                 ),
-                show_intermediate=args.show_intermediate,
+                input_file=args.input_file,
+                show_intermediate=args.interim_results,
                 additional_info="confidence" if args.print_confidence else "no",
             )
     finally:

Original file line number	Diff line number	Diff line change
`@@ -38,6 +38,7 @@ def add_asr_config_argparse_parameters(`
`38`	`38`	`help="If specified, text inverse normalization will be applied",`
`39`	`39`	`)`
`40`	`40`	`parser.add_argument("--language-code", default="en-US", help="Language code of the model to be used.")`
	`41`	`+ parser.add_argument("--model-name", default=None, help="Name of the model to be used.")`
`41`	`42`	`parser.add_argument("--boosted-lm-words", action='append', help="Words to boost when decoding.")`
`42`	`43`	`parser.add_argument(`
`43`	`44`	`"--boosted-lm-score", type=float, default=4.0, help="Value by which to boost words when decoding."`