NeMo RTFx updates (#33)

galv · web-flow · commit ac455ba6aa4f · 2024-08-09T08:23:32.000-07:00
* Remove common voice from evaluation, as discussed. Pin nemo to a particular version to make sure results are reproducible. In particular, include NVIDIA-NeMo/NeMo#10054. Make sure that the optional dependency cuda-python is included to ensure that we use CUDA-graph-accelerated decoder inference in RNN-T and TDT models.
diff --git a/README.md b/README.md
@@ -9,8 +9,9 @@ Each library has its own set of requirements. We recommend using a clean conda e
 1) Clone this repository.
 2) Install PyTorch by following the instructions here: https://pytorch.org/get-started/locally/
 3) Install the common requirements for all library by running `pip install -r requirements/requirements.txt`.
-4) Install the requirements for each library you wish to evalaute by running `pip install -r requirements/requirements_<library_name>.txt`.
-5) Connect your Hugging Face account by running `huggingface-cli login`.
+4) If you wish to run NeMo, note that the benchmark currently needs CUDA 12.6 (`nvidia-smi` should output "CUDA Version: 12.6" or higher), to fix a problem in previous drivers for RNN-T inference with cooperative kernels inside of conditional nodes (see here: https://github.com/NVIDIA/NeMo/pull/9869)
+5) Install the requirements for each library you wish to evaluate by running `pip install -r requirements/requirements_<library_name>.txt`.
+6) Connect your Hugging Face account by running `huggingface-cli login`.
 
 # Evaluate a model
 
diff --git a/nemo_asr/run_canary.sh b/nemo_asr/run_canary.sh
@@ -85,15 +85,6 @@ do
         --batch_size=${BATCH_SIZE} \
         --max_eval_samples=-1 
 
-    python run_eval.py \
-        --model_id=${MODEL_ID} \
-        --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
-        --dataset="common_voice" \
-        --split="test" \
-        --device=${DEVICE_ID} \
-        --batch_size=${BATCH_SIZE} \
-        --max_eval_samples=-1 
-
     # Evaluate results
     RUNDIR=`pwd` && \
     cd ../normalizer && \
diff --git a/nemo_asr/run_eval.py b/nemo_asr/run_eval.py
@@ -101,7 +101,7 @@ def download_audio_files(batch):
     total_time = 0
     for _ in range(2): # warmup once and calculate rtf
         if _ == 0:
-            audio_files = all_data["audio_filepaths"][:256] # warmup with 4 batches
+            audio_files = all_data["audio_filepaths"][:args.batch_size * 4] # warmup with 4 batches
         else:
             audio_files = all_data["audio_filepaths"]
         start_time = time.time()
diff --git a/nemo_asr/run_fast_conformer_ctc.sh b/nemo_asr/run_fast_conformer_ctc.sh
@@ -86,15 +86,6 @@ do
         --batch_size=${BATCH_SIZE} \
         --max_eval_samples=-1 
 
-    python run_eval.py \
-        --model_id=${MODEL_ID} \
-        --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
-        --dataset="common_voice" \
-        --split="test" \
-        --device=${DEVICE_ID} \
-        --batch_size=${BATCH_SIZE} \
-        --max_eval_samples=-1 
-
     # Evaluate results
     RUNDIR=`pwd` && \
     cd ../normalizer && \
diff --git a/nemo_asr/run_fast_conformer_rnnt.sh b/nemo_asr/run_fast_conformer_rnnt.sh
@@ -86,15 +86,6 @@ do
         --batch_size=${BATCH_SIZE} \
         --max_eval_samples=-1 
 
-    python run_eval.py \
-        --model_id=${MODEL_ID} \
-        --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
-        --dataset="common_voice" \
-        --split="test" \
-        --device=${DEVICE_ID} \
-        --batch_size=${BATCH_SIZE} \
-        --max_eval_samples=-1 
-
     # Evaluate results
     RUNDIR=`pwd` && \
     cd ../normalizer && \
diff --git a/requirements/requirements_nemo.txt b/requirements/requirements_nemo.txt
@@ -1,4 +1,6 @@
-git+https://github.com/NVIDIA/NeMo.git@r2.0.0rc1#egg=nemo_toolkit[all]
+git+https://github.com/NVIDIA/NeMo.git@d0efff087613ea2584e215969f289fed17414d8b#egg=nemo_toolkit[all] # This commit hash is a recent version of main at the time of testing.
 tqdm
 soundfile
 librosa
+IPython # Workaround for https://github.com/NVIDIA/NeMo/pull/9890#discussion_r1701028427
+cuda-python>=12.4 # Used for fast TDT and RNN-T inference