Merge pull request #55 from KunalDhawan/add_canary_flash

nithinraok · web-flow · commit afba94c73e2d · 2025-03-18T19:21:03.000-04:00
diff --git a/nemo_asr/run_canary.sh b/nemo_asr/run_canary.sh
@@ -2,16 +2,15 @@
 
 export PYTHONPATH="..":$PYTHONPATH
 
-MODEL_IDs=("nvidia/canary-1b")
-BATCH_SIZE=64
+MODEL_IDs=("nvidia/canary-1b-flash")  # options: "nvidia/canary-1b" "nvidia/canary-1b-flash" "nvidia/canary-180m-flash"
+BATCH_SIZE=128
 DEVICE_ID=0
 
 num_models=${#MODEL_IDs[@]}
 
 for (( i=0; i<${num_models}; i++ ));
 do
     MODEL_ID=${MODEL_IDs[$i]}
-
     
     python run_eval.py \
         --model_id=${MODEL_ID} \
diff --git a/nemo_asr/run_eval.py b/nemo_asr/run_eval.py
@@ -1,5 +1,6 @@
 import argparse
 
+import io
 import os
 import torch
 import evaluate
@@ -51,12 +52,33 @@ def download_audio_files(batch):
         durations = []
 
         for id, sample in zip(batch["id"], batch["audio"]):
+
+            # First, make the sample ID (and therefore the wav filename) unique.
+            # Several datasets, such as earnings22, have a hierarchical structure,
+            # e.g. earnings22/test/4432298/281.wav and earnings22/test/4450488/281.wav.
+            # lhotse uses the filename (281.wav) as the unique ID when creating and naming cuts, so the path is flattened into the ID below.
+            # ref: https://github.com/lhotse-speech/lhotse/blob/master/lhotse/dataset/collation.py#L186
+            id = id.replace('/', '_').removesuffix('.wav')
+
             audio_path = os.path.join(CACHE_DIR, f"{id}.wav")
+
+            if "array" in sample:
+                audio_array = np.float32(sample["array"])
+                sample_rate = 16000
+
+            elif "bytes" in sample: # added to be compatible with latest datasets library (3.x.x) that produces byte stream
+                with io.BytesIO(sample["bytes"]) as audio_file:
+                    audio_array, sample_rate = soundfile.read(audio_file, dtype="float32")
+
+            else:
+                raise ValueError("Sample must have either 'array' or 'bytes' key")
+
             if not os.path.exists(audio_path):
                 os.makedirs(os.path.dirname(audio_path), exist_ok=True)
-                soundfile.write(audio_path, np.float32(sample["array"]), 16_000)
+                soundfile.write(audio_path, audio_array, sample_rate)
+
             audio_paths.append(audio_path)
-            durations.append(len(sample["array"]) / 16_000)
+            durations.append(len(audio_array) / sample_rate)
 
         
         batch["references"] = batch["norm_text"]
@@ -118,7 +140,7 @@ def download_audio_files(batch):
     # normalize transcriptions with English normalizer
     if isinstance(transcriptions, tuple) and len(transcriptions) == 2:
         transcriptions = transcriptions[0]
-    predictions = [data_utils.normalizer(pred) for pred in transcriptions]
+    predictions = [data_utils.normalizer(pred.text) for pred in transcriptions]
 
     avg_time = total_time / len(all_data["audio_filepaths"])
 
diff --git a/requirements/requirements_nemo.txt b/requirements/requirements_nemo.txt
@@ -1,6 +1,6 @@
-git+https://github.com/NVIDIA/NeMo.git@d0efff087613ea2584e215969f289fed17414d8b#egg=nemo_toolkit[all] # This commit hash is a recent version of main at the time of testing.
+git+https://github.com/NVIDIA/NeMo.git@208e0da28e2ada8da84d8f7ddff8623efe1ff01c#egg=nemo_toolkit[asr] # This commit hash is a recent version of main at the time of testing.
 tqdm
 soundfile
 librosa
 IPython # Workaround for https://github.com/NVIDIA/NeMo/pull/9890#discussion_r1701028427
-cuda-python>=12.4 # Used for fast TDT and RNN-T inference
+cuda-python>=12.4 # Used for fast TDT and RNN-T inference