Commit cc912b4

Add nemo aed models (#21)
* add wer aed

Signed-off-by: Nithin Rao Koluguri <nithinraok>

* support of nemo aed models

Signed-off-by: Nithin Rao Koluguri <nithinraok>

---------

Signed-off-by: Nithin Rao Koluguri <nithinraok>
Co-authored-by: Nithin Rao Koluguri <nithinraok>
1 parent 8609c30 commit cc912b4
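
This commit extends the repo's NeMo scripts to handle attention encoder-decoder (AED) models in addition to CTC and RNN-T. All model families load through the same NeMo entry point. A minimal usage sketch (not part of the diff; the model id below is just the script's new default, and any NeMo model name, including AED ones, loads the same way):

from nemo.collections.asr.models import ASRModel

# Resolve a pretrained model by name and transcribe one file.
asr_model = ASRModel.from_pretrained("stt_en_fastconformer_ctc_large")
asr_model.eval()
transcripts = asr_model.transcribe(["data/sample_4469669.wav"], batch_size=1)
print(transcripts[0])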

File tree

5 files changed: +70 -106 lines changed

  data/sample_4469669.wav
  data/sample_ami-es2015b.wav
  nemo_asr/calc_rtf.py
  nemo_asr/run_eval.py
  requirements/requirements_nemo.txt

data/sample_4469669.wav (18.3 MB)
Binary file not shown.

data/sample_ami-es2015b.wav (18.3 MB)
Binary file not shown.

nemo_asr/calc_rtf.py

Lines changed: 65 additions & 104 deletions
@@ -14,58 +14,44 @@
 --nbatches: Total number of batches to process.
 --warmup_batches: Number of batches to skip as warmup.
 --audio: Path to the input audio file for ASR.
---audio_maxlen: Maximum duration of audio to process (in seconds).
---precision: Model precision (16, 32, or bf16).
---cudnn_benchmark: Enable cuDNN benchmarking.
---log: Enable logging.
 
 Example:
-    python calculate_rtf.py --model stt_en_conformer_ctc_large --decoding_type ctc --gpu 0 --batch_size 1 --nbatches 5 --warmup_batches 5 --audio /path/to/audio.wav --audio_maxlen 600 --precision bf16 --cudnn_benchmark
+    python calc_rtf.py --model stt_en_conformer_ctc_large --decoding_type ctc
 """
-
+import copy
+from omegaconf import OmegaConf
 import time
 import argparse
 from tqdm import tqdm
 import torch
-from omegaconf import OmegaConf
-import copy
 import sys
 import soundfile as sf
 import numpy as np
+import librosa
 
 from nemo.utils import logging
-from contextlib import nullcontext
 from nemo.collections.asr.models import ASRModel
 
 
 
 parser = argparse.ArgumentParser(description='model forward pass profiler / performance tester.')
-parser.add_argument("--model", default='stt_en_conformer_ctc_large', type=str, help="ASR model")
-parser.add_argument("--decoding_type", default='ctc', type=str, help="Encoding type (bpe or char)")
+parser.add_argument("--model", default='stt_en_fastconformer_ctc_large', type=str, help="ASR model")
+parser.add_argument("--decoding_type", default='ctc', type=str, help="Type of model [rnnt, ctc, aed]")
 parser.add_argument("--gpu", default=0, type=int, help="GPU device to use")
 parser.add_argument("--batch_size", default=1, type=int, help="batch size to use")
-parser.add_argument("--nbatches", default=5, type=int, help="Total Number of batches to process")
-parser.add_argument("--warmup_batches", default=5, type=int, help="Number of batches to skip as warmup")
-parser.add_argument("--audio", default="/disk3/datasets/speech-datasets/earnings22/media/4469669.wav", type=str, help="wav file to use")
-parser.add_argument("--audio_maxlen", default=16, type=float, help="cut the file at given length if it is longer")
-parser.add_argument("--precision", default='bf16', type=str, help="precision: 16/32/bf16")
-parser.add_argument("--cudnn_benchmark", dest="enable_cudnn_bench", action="store_true", help="toggle cudnn benchmarking", default=True)
-parser.add_argument("--log", dest="log", action="store_true", help="toggle logging", default=True)
+parser.add_argument("--nbatches", default=3, type=int, help="Total Number of batches to process")
+parser.add_argument("--warmup_batches", default=3, type=int, help="Number of batches to skip as warmup")
+parser.add_argument("--audio", default="../data/sample_4469669.wav", type=str, help="wav file to use")
 
-args = parser.parse_args()
-
-if args.log:
-    # INFO
-    logging.setLevel(20)
-else:
-    logging.setLevel(0)
+# parser.add_argument("--audio_maxlen", default=30, type=float, help="Multiple chunks of audio of this length is used to calculate RTFX")
 
-if args.enable_cudnn_bench:
-    torch.backends.cudnn.benchmark=True
+args = parser.parse_args()
+torch.backends.cudnn.benchmark=True
 
-PRECISION = args.precision
 WAV = args.audio
-audio_maxlen = args.audio_maxlen
+SAMPLING_RATE = 16000
+chunk_len = 30
+total_audio_len = 600
 MODEL = args.model
 batch_size = args.batch_size
 nbatches = args.nbatches
@@ -74,50 +60,31 @@
 
 DEVICE=torch.device(args.gpu)
 
-if PRECISION != 'bf16' and PRECISION != '16' and PRECISION != '32':
-    logging.error(f'unknown precision: {PRECISION}')
-    sys.exit(1)
-
-logging.info(f'precision: {PRECISION}')
-logging.info(f'WAV: {WAV}')
-logging.info(f'AUDIO MAXLEN: {audio_maxlen}')
 logging.info(f'MODEL: {MODEL}')
-logging.info(f'batch_size: {batch_size}')
-logging.info(f'num batches: {nbatches}')
-logging.info(f'cudnn_benchmark: {args.enable_cudnn_bench}')
-
 
-def get_samples(audio_file, audio_maxlen, target_sr=16000):
+def get_samples(audio_file, total_audio_len, target_sr=16000):
     with sf.SoundFile(audio_file, 'r') as f:
         dtype = 'int16'
         sample_rate = f.samplerate
         samples = f.read(dtype=dtype)
         if sample_rate != target_sr:
-            samples = librosa.core.resample(samples, sample_rate, target_sr)
+            samples = librosa.core.resample(samples, orig_sr=sample_rate, target_sr=target_sr)
         samples = samples.astype('float32') / 32768
         samples = samples.transpose()
         sample_length = samples.shape[0]
-        if sample_length > audio_maxlen * target_sr:
-            logging.info(f'resizing audio sample from {sample_length / target_sr} to maxlen of {audio_maxlen}')
-            sample_length = int(audio_maxlen * target_sr)
+        if sample_length > total_audio_len * target_sr:
+            logging.info(f'resizing audio sample from {sample_length / target_sr} to maxlen of {total_audio_len}')
+            sample_length = int(total_audio_len * target_sr)
             samples = samples[:sample_length]
             logging.info(f'new sample lengh: {samples.shape[0]}')
         else:
-            pad_length = int(audio_maxlen * target_sr) - sample_length
-            logging.info(f'padding audio sample from {sample_length / target_sr} to maxlen of {audio_maxlen}')
+            pad_length = int(total_audio_len * target_sr) - sample_length
+            logging.info(f'padding audio sample from {sample_length / target_sr} to maxlen of {total_audio_len}')
             samples = np.pad(samples, (0, pad_length), 'constant', constant_values=(0, 0))
-            sample_length = int(audio_maxlen * target_sr)
+            sample_length = int(total_audio_len * target_sr)
 
     return samples, sample_length
 
-def preprocess_audio(preprocessor, audio, device):
-    audio_signal = torch.from_numpy(audio).unsqueeze_(0).to(device)
-
-    audio_signal_len = torch.Tensor([audio.shape[0]]).to(device)
-    processed_signal, processed_signal_length = preprocessor(
-        input_signal=audio_signal, length=audio_signal_len
-    )
-    return processed_signal, processed_signal_length
 
 def extract_preprocessor(model, device):
     cfg = copy.deepcopy(model._cfg)
@@ -135,64 +102,58 @@ def main():
     asr_model = ASRModel.from_pretrained(MODEL)
 
     asr_model.to(DEVICE)
-    asr_model.encoder.eval()
-    asr_model.encoder.freeze()
+    asr_model.eval()
     asr_model._prepare_for_export()
 
-    processor = extract_preprocessor(asr_model, DEVICE)
-
-    input_example, input_example_length = get_samples(WAV, audio_maxlen)
-    logging.info(f'processed example shape: {input_example.shape}')
-    logging.info(f'processed example length shape: {input_example_length}')
-    processed_example, processed_example_length = preprocess_audio(processor, input_example, DEVICE)
-    processed_example = processed_example.repeat(batch_size, 1, 1)
-    processed_example_length = processed_example_length.repeat(batch_size)
-    logging.info(f'processed example shape: {processed_example.size()}')
-    logging.info(f'processed example length shape: {processed_example_length.size()}')
-
-    profiling_context = nullcontext()
-    # if FP16:
-    if PRECISION == '16':
-        precision_context = torch.cuda.amp.autocast()
-    elif PRECISION == 'bf16':
-        precision_context = torch.cuda.amp.autocast(dtype=torch.bfloat16)
-    elif PRECISION == '32':
-        pass
-    else:
-        logging.error(f'unknown precision: {PRECISION}')
-        sys.exit(1)
-
+    preprocessor = extract_preprocessor(asr_model, DEVICE)
+    input_example, input_example_length = get_samples(WAV, total_audio_len)
+    input_example = torch.tensor(input_example).to(DEVICE)
+    input_example = input_example.repeat(batch_size, 1)
+    input_example_length = torch.tensor(input_example_length).to(DEVICE)
+    input_example_length = input_example_length.repeat(batch_size)
 
-    if decoding_type == 'ctc':
-        asr_model.change_decoding_strategy(decoding_cfg=None)
+    processed_signal, processed_signal_length = preprocessor(input_signal=input_example, length=input_example_length)
+    processed_example = processed_signal.repeat(batch_size, 1, 1)
 
+
     logging.info(f"running {nbatches} batches; with {warmup_batches} batches warmup; batch_size: {batch_size}")
     rtfs=[]
     for i in range(3): # average over 3 runs
         total_time = 0
-        with profiling_context:
-            with precision_context:
-                with torch.no_grad():
-                    for i in tqdm(range(nbatches + warmup_batches)):
-
-                        start = time.time()
-                        if decoding_type == 'rnnt':
-                            enc_out, enc_len = asr_model.encoder.forward(audio_signal=processed_example, length=processed_example_length)
-                            dec_out, dec_len = asr_model.decoding.rnnt_decoder_predictions_tensor(
-                                encoder_output=enc_out, encoded_lengths=enc_len, return_hypotheses=False
-                            )
-                        else:
-                            enc_out, enc_len, greedy_predictions = asr_model.forward(processed_signal=processed_example, processed_signal_length=processed_example_length)
-                            dec_out, dec_len = asr_model.decoding.ctc_decoder_predictions_tensor(
-                                enc_out, decoder_lengths=enc_len, return_hypotheses=False
-                            )
-                        torch.cuda.synchronize()
-                        end = time.time()
-                        if i >= warmup_batches:
-                            total_time += end - start
+        with torch.cuda.amp.autocast(dtype=torch.bfloat16):
+            with torch.no_grad():
+                for i in tqdm(range(nbatches + warmup_batches)):
+                    start = time.time()
+                    if decoding_type == 'rnnt':
+                        enc_out, enc_len = asr_model.encoder.forward(audio_signal=processed_example, length=processed_signal_length)
+                        dec_out, dec_len = asr_model.decoding.rnnt_decoder_predictions_tensor(
+                            encoder_output=enc_out, encoded_lengths=enc_len, return_hypotheses=False
+                        )
+                    elif decoding_type == 'ctc':
+                        enc_out, enc_len, greedy_predictions = asr_model.forward(input_signal=input_example, input_signal_length=input_example_length)
+                        dec_out, dec_len = asr_model.decoding.ctc_decoder_predictions_tensor(
+                            enc_out, decoder_lengths=enc_len, return_hypotheses=False
+                        )
+                    elif decoding_type == 'aed':
+                        log_probs, encoded_len, enc_states, enc_mask = asr_model.forward(input_signal=input_example, input_signal_length=input_example_length)
+                        beam_hypotheses = asr_model.decoding.decode_predictions_tensor(
+                            encoder_hidden_states=enc_states,
+                            encoder_input_mask=enc_mask,
+                            decoder_input_ids=None,  # torch.tensor([[ 3, 4, 8, 4, 11]]).to(DEVICE),
+                            return_hypotheses=False,
+                        )[0]
+
+                        beam_hypotheses = [asr_model.decoding.strip_special_tokens(text) for text in beam_hypotheses]
+                    else:
+                        raise ValueError(f'Invalid decoding type: {decoding_type}')
+
+                    torch.cuda.synchronize()
+                    end = time.time()
+                    if i >= warmup_batches:
+                        total_time += end - start
 
 
-        rtf = (total_time/nbatches) / (input_example_length / 16000)
+        rtf = (total_time/nbatches) / (float(input_example_length) / 16000)
 
         rtfs.append(rtf)
 
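For reference, the figure of merit computed at the end of each run above is the real-time factor (RTF): average compute time per batch divided by the duration of the padded input audio; its reciprocal is the RTFx throughput number. A standalone restatement with illustrative timings (a sketch, not code from this commit; rtf is our helper name):

def rtf(total_time_s, nbatches, audio_len_samples, sample_rate=16000):
    # Real-time factor: average time to process one batch divided by the
    # audio duration it covers. Values below 1.0 are faster than real time.
    avg_batch_time = total_time_s / nbatches
    audio_duration_s = audio_len_samples / sample_rate
    return avg_batch_time / audio_duration_s

# e.g. 3 timed batches taking 4.5 s in total on a 600 s padded clip:
print(rtf(4.5, 3, 600 * 16000))  # -> 0.0025, i.e. 400x faster than real time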

nemo_asr/run_eval.py

Lines changed: 4 additions & 1 deletion
@@ -92,7 +92,10 @@ def main(args):
     else:
         device = torch.device("cpu")
 
-    asr_model = ASRModel.from_pretrained(args.model_id, map_location=device)  # type: ASRModel
+    if args.model_id.endswith(".nemo"):
+        asr_model = ASRModel.restore_from(args.model_id, map_location=device)
+    else:
+        asr_model = ASRModel.from_pretrained(args.model_id, map_location=device)  # type: ASRModel
     asr_model.freeze()
 
     dataset = data_utils.load_data(args)
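
The new branch lets --model_id point either at a local .nemo checkpoint archive or at a pretrained model name. The same dispatch, pulled out as a standalone helper for clarity (a sketch; load_model is our name, the script inlines this logic):

import torch
from nemo.collections.asr.models import ASRModel

def load_model(model_id: str, device: torch.device) -> ASRModel:
    # Local .nemo archives (e.g. written by model.save_to) are restored
    # directly; anything else is treated as a pretrained model name.
    if model_id.endswith(".nemo"):
        return ASRModel.restore_from(model_id, map_location=device)
    return ASRModel.from_pretrained(model_id, map_location=device)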

requirements/requirements_nemo.txt

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-nemo_toolkit[all]
+git+https://github.com/NVIDIA/[email protected]#egg=nemo_toolkit[all]
 tqdm
 soundfile
 librosa
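
This pins nemo_toolkit[all] to the r1.21.0 branch of the NVIDIA/NeMo GitHub repository (a pip VCS requirement) instead of whatever version PyPI serves, presumably so the AED decoding APIs used by calc_rtf.py resolve consistently across installs.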
