support file formats other than LINEAR_PCM (#47)

virajkarandikar · web-flow · commit 2ab2e0e74ab2 · 2023-06-09T12:48:24.000-07:00
* Support file formats other than LINEAR_PCM

Stop setting the audio encoding parameters explicitly and let Riva
decode the audio itself.

* add ffmpeg-python requirement

* additional fixes and remove ffmpeg dependency
diff --git a/riva/client/asr.py b/riva/client/asr.py
@@ -12,23 +12,29 @@
 
 from grpc._channel import _MultiThreadedRendezvous
 
+import riva.client
 import riva.client.proto.riva_asr_pb2 as rasr
 import riva.client.proto.riva_asr_pb2_grpc as rasr_srv
 from riva.client.auth import Auth
 
 
 def get_wav_file_parameters(input_file: Union[str, os.PathLike]) -> Dict[str, Union[int, float]]:
-    input_file = Path(input_file).expanduser()
-    with wave.open(str(input_file), 'rb') as wf:
-        nframes = wf.getnframes()
-        rate = wf.getframerate()
-        parameters = {
-            'nframes': nframes,
-            'framerate': rate,
-            'duration': nframes / rate,
-            'nchannels': wf.getnchannels(),
-            'sampwidth': wf.getsampwidth(),
-        }
+    try:
+        input_file = Path(input_file).expanduser()
+        with wave.open(str(input_file), 'rb') as wf:
+            nframes = wf.getnframes()
+            rate = wf.getframerate()
+            parameters = {
+                'nframes': nframes,
+                'framerate': rate,
+                'duration': nframes / rate,
+                'nchannels': wf.getnchannels(),
+                'sampwidth': wf.getsampwidth(),
+                'data_offset': wf.getfp().size_read + wf.getfp().offset
+            }
+    except:
+        # Not a WAV file
+        return None
     return parameters
 
 
@@ -47,7 +53,11 @@ def __init__(
         self.chunk_n_frames = chunk_n_frames
         self.delay_callback = delay_callback
         self.file_parameters = get_wav_file_parameters(self.input_file)
-        self.file_object: Optional[wave.Wave_read] = wave.open(str(self.input_file), 'rb')
+        self.file_object: Optional[typing.BinaryIO] = open(str(self.input_file), 'rb')
+        if self.delay_callback and self.file_parameters is None:
+            warnings.warn(f"delay_callback not supported for encoding other than LINEAR_PCM")
+            self.delay_callback = None
+        self.first_buffer = True
 
     def close(self) -> None:
         self.file_object.close()
@@ -64,15 +74,19 @@ def __iter__(self):
         return self
 
     def __next__(self) -> bytes:
-        data = self.file_object.readframes(self.chunk_n_frames)
+        if self.file_parameters:
+            data = self.file_object.read(self.chunk_n_frames * self.file_parameters['sampwidth'] * self.file_parameters['nchannels'])
+        else:
+            data = self.file_object.read(self.chunk_n_frames)
         if not data:
             self.close()
             raise StopIteration
         if self.delay_callback is not None:
+            offset = self.file_parameters['data_offset'] if self.first_buffer else 0
             self.delay_callback(
-                data,
-                len(data) / self.file_parameters['sampwidth'] / self.file_parameters['framerate']
+                data[offset:], (len(data) - offset) / self.file_parameters['sampwidth'] / self.file_parameters['framerate']
             )
+            self.first_buffer = False
         return data
 
 
@@ -95,8 +109,9 @@ def add_audio_file_specs_to_config(
 ) -> None:
     inner_config: rasr.RecognitionConfig = config if isinstance(config, rasr.RecognitionConfig) else config.config
     wav_parameters = get_wav_file_parameters(audio_file)
-    inner_config.sample_rate_hertz = wav_parameters['framerate']
-    inner_config.audio_channel_count = wav_parameters['nchannels']
+    if wav_parameters is not None:
+        inner_config.sample_rate_hertz = wav_parameters['framerate']
+        inner_config.audio_channel_count = wav_parameters['nchannels']
 
 
 def add_speaker_diarization_to_config(
diff --git a/scripts/asr/riva_streaming_asr_client.py b/scripts/asr/riva_streaming_asr_client.py
@@ -54,7 +54,6 @@ def streaming_transcription_worker(
         asr_service = riva.client.ASRService(auth)
         config = riva.client.StreamingRecognitionConfig(
             config=riva.client.RecognitionConfig(
-                encoding=riva.client.AudioEncoding.LINEAR_PCM,
                 language_code=args.language_code,
                 max_alternatives=args.max_alternatives,
                 profanity_filter=args.profanity_filter,
@@ -64,7 +63,6 @@ def streaming_transcription_worker(
             ),
             interim_results=True,
         )
-        riva.client.add_audio_file_specs_to_config(config, args.input_file)
         riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score)
         for _ in range(args.num_iterations):
             with riva.client.AudioChunkFileIterator(
@@ -92,8 +90,6 @@ def main() -> None:
     print("Number of clients:", args.num_clients)
     print("Number of iteration:", args.num_iterations)
     print("Input file:", args.input_file)
-    wav_parameters = get_wav_file_parameters(args.input_file)
-    print(f"File duration: {wav_parameters['duration']:.2f}s")
     threads = []
     exception_queue = queue.Queue()
     for i in range(args.num_clients):
diff --git a/scripts/asr/transcribe_file.py b/scripts/asr/transcribe_file.py
@@ -70,7 +70,6 @@ def main() -> None:
     asr_service = riva.client.ASRService(auth)
     config = riva.client.StreamingRecognitionConfig(
         config=riva.client.RecognitionConfig(
-            encoding=riva.client.AudioEncoding.LINEAR_PCM,
             language_code=args.language_code,
             max_alternatives=1,
             profanity_filter=args.profanity_filter,
@@ -79,7 +78,6 @@ def main() -> None:
         ),
         interim_results=True,
     )
-    riva.client.add_audio_file_specs_to_config(config, args.input_file)
     riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score)
     sound_callback = None
     try:
diff --git a/scripts/asr/transcribe_file_offline.py b/scripts/asr/transcribe_file_offline.py
@@ -29,15 +29,13 @@ def main() -> None:
     auth = riva.client.Auth(args.ssl_cert, args.use_ssl, args.server)
     asr_service = riva.client.ASRService(auth)
     config = riva.client.RecognitionConfig(
-        encoding=riva.client.AudioEncoding.LINEAR_PCM,
         language_code=args.language_code,
         max_alternatives=args.max_alternatives,
         profanity_filter=args.profanity_filter,
         enable_automatic_punctuation=args.automatic_punctuation,
         verbatim_transcripts=not args.no_verbatim_transcripts,
         enable_word_time_offsets=args.word_time_offsets or args.speaker_diarization,
     )
-    riva.client.add_audio_file_specs_to_config(config, args.input_file)
     riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score)
     riva.client.add_speaker_diarization_to_config(config, args.speaker_diarization)