Skip to content

Commit c41131c

Browse files
committed
added scripts for canary-1b-flash eval
Signed-off-by: Kunal Dhawan <[email protected]>
1 parent 0dc4559 commit c41131c

File tree

3 files changed

+99
-2
lines changed

3 files changed

+99
-2
lines changed

nemo_asr/run_canary_flash.sh

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
#!/bin/bash
# Evaluate NVIDIA canary-1b-flash on the ESB test-only datasets and score the
# resulting transcripts with the shared normalizer utilities.
#
# Expects to be run from nemo_asr/ (run_eval.py in the CWD, ../normalizer present).
set -euo pipefail

# ${PYTHONPATH:-} keeps `set -u` happy when PYTHONPATH is not already set.
export PYTHONPATH="..":${PYTHONPATH:-}

MODEL_IDs=("nvidia/canary-1b-flash")
BATCH_SIZE=64
DEVICE_ID=0

# Run one dataset/split evaluation for the model currently in ${MODEL_ID}.
# Arguments: $1 - dataset name, $2 - split name.
run_dataset() {
  local dataset=$1
  local split=$2
  python run_eval.py \
    --model_id="${MODEL_ID}" \
    --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
    --dataset="${dataset}" \
    --split="${split}" \
    --device="${DEVICE_ID}" \
    --batch_size="${BATCH_SIZE}" \
    --max_eval_samples=-1
}

for MODEL_ID in "${MODEL_IDs[@]}"; do
  # Datasets currently disabled — uncomment to include them in the run.
  # run_dataset "ami" "test"
  # run_dataset "earnings22" "test"
  # run_dataset "gigaspeech" "test"

  run_dataset "librispeech" "test.clean"
  run_dataset "librispeech" "test.other"

  # run_dataset "spgispeech" "test"
  # run_dataset "tedlium" "test"
  # run_dataset "voxpopuli" "test"

  # Evaluate results. The subshell confines the `cd`, so a scoring failure
  # cannot leave the script stranded in ../normalizer.
  RUNDIR=$(pwd)
  (
    cd ../normalizer
    python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')"
  )
done

nemo_asr/run_eval.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ def download_audio_files(batch):
5050
audio_paths = []
5151
durations = []
5252

53+
# import ipdb; ipdb.set_trace()
54+
5355
for id, sample in zip(batch["id"], batch["audio"]):
5456
audio_path = os.path.join(CACHE_DIR, f"{id}.wav")
5557
if not os.path.exists(audio_path):
@@ -118,7 +120,7 @@ def download_audio_files(batch):
118120
# normalize transcriptions with English normalizer
119121
if isinstance(transcriptions, tuple) and len(transcriptions) == 2:
120122
transcriptions = transcriptions[0]
121-
predictions = [data_utils.normalizer(pred) for pred in transcriptions]
123+
predictions = [data_utils.normalizer(pred.text) for pred in transcriptions]
122124

123125
avg_time = total_time / len(all_data["audio_filepaths"])
124126

requirements/requirements_nemo.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
git+https://github.com/NVIDIA/NeMo.git@d0efff087613ea2584e215969f289fed17414d8b#egg=nemo_toolkit[all] # This commit hash is a recent version of main at the time of testing.
1+
git+https://github.com/NVIDIA/NeMo.git@cb755f5595880a56159cc9a6f4a050c20a449d0a#egg=nemo_toolkit[all] # This commit hash is a recent version of main at the time of testing.
22
tqdm
33
soundfile
44
librosa
55
IPython # Workaround for https://github.com/NVIDIA/NeMo/pull/9890#discussion_r1701028427
66
cuda-python>=12.4 # Used for fast TDT and RNN-T inference
7+
datasets <= 2.21.0

0 commit comments

Comments
 (0)