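Fixes for Canary-1B-Flash: default run_canary.sh to nvidia/canary-1b-flash, handle byte-stream audio samples in run_eval.py, and drop the datasets <= 2.21.0 pin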

KunalDhawan · KunalDhawan · commit 1089cb2c21a4 · 2025-03-12T00:41:27.000-07:00
Signed-off-by: Kunal Dhawan <kunaldhawan97@gmail.com>
diff --git a/nemo_asr/run_canary.sh b/nemo_asr/run_canary.sh
@@ -2,7 +2,7 @@
 
 export PYTHONPATH="..":$PYTHONPATH
 
-MODEL_IDs=("nvidia/canary-1b")
+MODEL_IDs=("nvidia/canary-1b-flash")  # options: "nvidia/canary-1b" "nvidia/canary-1b-flash"
 BATCH_SIZE=64
 DEVICE_ID=0
 
@@ -11,7 +11,6 @@ num_models=${#MODEL_IDs[@]}
 for (( i=0; i<${num_models}; i++ ));
 do
     MODEL_ID=${MODEL_IDs[$i]}
-
     
     python run_eval.py \
         --model_id=${MODEL_ID} \
diff --git a/nemo_asr/run_canary_flash.sh b/nemo_asr/run_canary_flash.sh
diff --git a/nemo_asr/run_eval.py b/nemo_asr/run_eval.py
@@ -1,5 +1,6 @@
 import argparse
 
+import io
 import os
 import torch
 import evaluate
@@ -50,15 +51,26 @@ def download_audio_files(batch):
         audio_paths = []
         durations = []
 
-        # import ipdb; ipdb.set_trace()
-
         for id, sample in zip(batch["id"], batch["audio"]):
             audio_path = os.path.join(CACHE_DIR, f"{id}.wav")
+
+            if "array" in sample:
+                audio_array = np.float32(sample["array"])
+                sample_rate = 16000
+
+            elif "bytes" in sample: # added to be compatible with latest datasets library (3.x.x) that produces byte stream
+                with io.BytesIO(sample["bytes"]) as audio_file:
+                    audio_array, sample_rate = soundfile.read(audio_file, dtype="float32")
+
+            else:
+                raise ValueError("Sample must have either 'array' or 'bytes' key")
+
             if not os.path.exists(audio_path):
                 os.makedirs(os.path.dirname(audio_path), exist_ok=True)
-                soundfile.write(audio_path, np.float32(sample["array"]), 16_000)
+                soundfile.write(audio_path, audio_array, sample_rate)
+
             audio_paths.append(audio_path)
-            durations.append(len(sample["array"]) / 16_000)
+            durations.append(len(audio_array) / sample_rate)
 
         
         batch["references"] = batch["norm_text"]
diff --git a/requirements/requirements_nemo.txt b/requirements/requirements_nemo.txt
@@ -3,5 +3,4 @@ tqdm
 soundfile
 librosa
 IPython # Workaround for https://github.com/NVIDIA/NeMo/pull/9890#discussion_r1701028427
-cuda-python>=12.4 # Used for fast TDT and RNN-T inference
-datasets <= 2.21.0
+cuda-python>=12.4 # Used for fast TDT and RNN-T inference