diff --git a/api/run_longform.sh b/api/run_longform.sh
new file mode 100644
index 0000000..10b103c
--- /dev/null
+++ b/api/run_longform.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+
+export PYTHONPATH="..":$PYTHONPATH
+
+export OPENAI_API_KEY="your_api_key"
+export ASSEMBLYAI_API_KEY="your_api_key"
+export ELEVENLABS_API_KEY="your_api_key"
+export REVAI_API_KEY="your_api_key"
+export AQUAVOICE_API_KEY="your_api_key"
+
+MODEL_IDs=(
+    "openai/gpt-4o-transcribe"
+    "openai/gpt-4o-mini-transcribe"
+    "openai/whisper-1"
+    "assembly/best"
+    "elevenlabs/scribe_v1"
+    "revai/machine" # please use --use_url=True
+    "revai/fusion" # please use --use_url=True
+    "speechmatics/enhanced"
+    "aquavoice/avalon-v1-en"
+)
+
+MAX_WORKERS=10
+
+num_models=${#MODEL_IDs[@]}
+
+for (( i=0; i<${num_models}; i++ ));
+do
+    MODEL_ID=${MODEL_IDs[$i]}
+    python run_eval.py \
+        --dataset_path="hf-audio/asr-leaderboard-longform" \
+        --dataset="earnings21" \
+        --split="test" \
+        --model_name ${MODEL_ID} \
+        --max_workers ${MAX_WORKERS}
+
+    python run_eval.py \
+        --dataset_path="hf-audio/asr-leaderboard-longform" \
+        --dataset="earnings22" \
+        --split="test" \
+        --model_name ${MODEL_ID} \
+        --max_workers ${MAX_WORKERS}
+
+    python run_eval.py \
+        --dataset_path="hf-audio/asr-leaderboard-longform" \
+        --dataset="tedlium" \
+        --split="test" \
+        --model_name ${MODEL_ID} \
+        --max_workers ${MAX_WORKERS}
+
+    # Evaluate results
+    RUNDIR=`pwd` && \
+    cd ../normalizer && \
+    python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \
+    cd $RUNDIR
+
+done
diff --git a/kyutai/run_kyutai_longform.sh b/kyutai/run_kyutai_longform.sh
new file mode 100644
index 0000000..e9ac90f
--- /dev/null
+++ b/kyutai/run_kyutai_longform.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+export PYTHONPATH="..":$PYTHONPATH
+
+MODEL_IDs=("kyutai/stt-2.6b-en")
+BATCH_SIZE=16
+
+num_models=${#MODEL_IDs[@]}
+for (( i=0; i<${num_models}; i++ ));
+do
+    MODEL_ID=${MODEL_IDs[$i]}
+
+    python run_eval.py \
+        --model_id=${MODEL_ID} \
+        --dataset_path="hf-audio/asr-leaderboard-longform" \
+        --dataset="earnings21" \
+        --split="test" \
+        --device=0 \
+        --batch_size=${BATCH_SIZE} \
+        --max_eval_samples=-1
+
+    python run_eval.py \
+        --model_id=${MODEL_ID} \
+        --dataset_path="hf-audio/asr-leaderboard-longform" \
+        --dataset="earnings22" \
+        --split="test" \
+        --device=0 \
+        --batch_size=${BATCH_SIZE} \
+        --max_eval_samples=-1
+
+    python run_eval.py \
+        --model_id=${MODEL_ID} \
+        --dataset_path="hf-audio/asr-leaderboard-longform" \
+        --dataset="tedlium" \
+        --split="test" \
+        --device=0 \
+        --batch_size=${BATCH_SIZE} \
+        --max_eval_samples=-1
+
+    # Evaluate results
+    RUNDIR=`pwd` && \
+    cd ../normalizer && \
+    python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \
+    cd $RUNDIR
+
+done
diff --git a/nemo_asr/run_eval_long.py b/nemo_asr/run_eval_long.py
new file mode 100644
index 0000000..9e88794
--- /dev/null
+++ b/nemo_asr/run_eval_long.py
@@ -0,0 +1,225 @@
+import argparse
+
+import io
+import os
+import torch
+import evaluate
+import soundfile
+
+from tqdm import tqdm
+from normalizer import data_utils
+import numpy as np
+
+from nemo.collections.asr.models import ASRModel
+import time
+
+
+wer_metric = evaluate.load("wer")
+
+
+def main(args):
+
+    DATA_CACHE_DIR = os.path.join(os.getcwd(), "audio_cache")
+    DATASET_NAME = args.dataset
+    SPLIT_NAME = args.split
+
+    CACHE_DIR = os.path.join(DATA_CACHE_DIR, DATASET_NAME, SPLIT_NAME)
+    if not os.path.exists(CACHE_DIR):
+        os.makedirs(CACHE_DIR)
+
+    if args.device >= 0:
+        device = torch.device(f"cuda:{args.device}")
+        compute_dtype=torch.bfloat16
+    else:
+        device = torch.device("cpu")
+        compute_dtype=torch.float32
+
+
+    if args.model_id.endswith(".nemo"):
+        asr_model = ASRModel.restore_from(args.model_id, map_location=device)
+    else:
+        asr_model = ASRModel.from_pretrained(args.model_id, map_location=device) # type: ASRModel
+
+    if args.longform:
+        asr_model.change_attention_model("rel_pos_local_attn", [128, 128]) # local attn
+    asr_model.to(compute_dtype)
+    asr_model.eval()
+
+    dataset = data_utils.load_data(args)
+
+    def download_audio_files(batch, indices):
+
+        # download audio files and write the paths, transcriptions and durations to a manifest file
+        audio_paths = []
+        durations = []
+
+        # Use global indices for unique filenames across all batches
+        for global_idx, sample in zip(indices, batch["audio"]):
+            # Use a unique filename based on global index
+            audio_path = os.path.join(CACHE_DIR, f"sample_{global_idx}.wav")
+
+            if "array" in sample:
+                audio_array = np.float32(sample["array"])
+                sample_rate = 16000
+
+            elif "bytes" in sample: # added to be compatible with latest datasets library (3.x.x) that produces byte stream
+                with io.BytesIO(sample["bytes"]) as audio_file:
+                    audio_array, sample_rate = soundfile.read(audio_file, dtype="float32")
+
+            else:
+                raise ValueError("Sample must have either 'array' or 'bytes' key")
+
+            if not os.path.exists(audio_path):
+                os.makedirs(os.path.dirname(audio_path), exist_ok=True)
+                soundfile.write(audio_path, audio_array, sample_rate)
+
+            audio_paths.append(audio_path)
+            durations.append(len(audio_array) / sample_rate)
+
+
+        batch["references"] = batch["norm_text"]
+        batch["audio_filepaths"] = audio_paths
+        batch["durations"] = durations
+
+        return batch
+
+
+    if args.max_eval_samples is not None and args.max_eval_samples > 0:
+        print(f"Subsampling dataset to first {args.max_eval_samples} samples !")
+        dataset = dataset.take(args.max_eval_samples)
+
+    dataset = data_utils.prepare_data(dataset)
+    if asr_model.cfg.decoding.strategy != "beam":
+        asr_model.cfg.decoding.strategy = "greedy_batch"
+        asr_model.change_decoding_strategy(asr_model.cfg.decoding)
+
+    # preparing the offline dataset
+    dataset = dataset.map(download_audio_files, batch_size=args.batch_size, batched=True, with_indices=True, remove_columns=["audio"])
+
+    # Write manifest from dataset batch using json and keys audio_filepath, duration, text
+
+    all_data = {
+        "audio_filepaths": [],
+        "durations": [],
+        "references": [],
+    }
+
+    data_itr = iter(dataset)
+    for data in tqdm(data_itr, desc="Downloading Samples"):
+        for key in all_data:
+            all_data[key].append(data[key])
+
+    # Sort audio_filepaths and references based on durations values
+    sorted_indices = sorted(range(len(all_data["durations"])), key=lambda k: all_data["durations"][k], reverse=True)
+    all_data["audio_filepaths"] = [all_data["audio_filepaths"][i] for i in sorted_indices]
+    all_data["references"] = [all_data["references"][i] for i in sorted_indices]
+    all_data["durations"] = [all_data["durations"][i] for i in sorted_indices]
+
+    total_time = 0
+    for _ in range(2): # warmup once and calculate rtf
+        if _ == 0:
+            audio_files = all_data["audio_filepaths"][:args.batch_size * 4] # warmup with 4 batches
+        else:
+            audio_files = all_data["audio_filepaths"]
+        start_time = time.time()
+        with torch.inference_mode(), torch.no_grad():
+
+            if 'canary' in args.model_id and 'v2' not in args.model_id:
+                pnc = 'nopnc'
+            else:
+                pnc = 'pnc'
+
+            if 'canary' in args.model_id:
+                transcriptions = asr_model.transcribe(audio_files, batch_size=args.batch_size, verbose=False, pnc=pnc, num_workers=1)
+            else:
+                transcriptions = asr_model.transcribe(audio_files, batch_size=args.batch_size, verbose=False, num_workers=1)
+        end_time = time.time()
+        if _ == 1:
+            total_time += end_time - start_time
+    total_time = total_time
+
+    # normalize transcriptions with English normalizer
+    if isinstance(transcriptions, tuple) and len(transcriptions) == 2:
+        transcriptions = transcriptions[0]
+    predictions = [data_utils.normalizer(pred.text) for pred in transcriptions]
+
+    avg_time = total_time / len(all_data["audio_filepaths"])
+
+    # Write manifest results (WER and RTFX)
+    manifest_path = data_utils.write_manifest(
+        all_data["references"],
+        predictions,
+        args.model_id,
+        args.dataset_path,
+        args.dataset,
+        args.split,
+        audio_length=all_data["durations"],
+        transcription_time=[avg_time] * len(all_data["audio_filepaths"]),
+    )
+
+    print("Results saved at path:", os.path.abspath(manifest_path))
+
+    wer = wer_metric.compute(references=all_data['references'], predictions=predictions)
+    wer = round(100 * wer, 2)
+
+    # transcription_time = sum(all_results["transcription_time"])
+    audio_length = sum(all_data["durations"])
+    rtfx = audio_length / total_time
+    rtfx = round(rtfx, 2)
+
+    print("RTFX:", rtfx)
+    print("WER:", wer, "%")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--model_id", type=str, required=True, help="Model identifier. Should be loadable with NVIDIA NeMo.",
+    )
+    parser.add_argument(
+        '--dataset_path', type=str, default='esb/datasets', help='Dataset path. By default, it is `esb/datasets`'
+    )
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        required=True,
+        help="Dataset name. *E.g.* `'librispeech_asr'` for the LibriSpeech ASR dataset, or `'common_voice'` for Common Voice. The full list of dataset names "
+        "can be found at `https://huggingface.co/datasets/esb/datasets`",
+    )
+    parser.add_argument(
+        "--split",
+        type=str,
+        default="test",
+        help="Split of the dataset. *E.g.* `'validation'` for the dev split, or `'test'` for the test split.",
+    )
+    parser.add_argument(
+        "--device",
+        type=int,
+        default=-1,
+        help="The device to run the pipeline on. -1 for CPU (default), 0 for the first GPU and so on.",
+    )
+    parser.add_argument(
+        "--batch_size", type=int, default=32, help="Number of samples to go through each streamed batch.",
+    )
+    parser.add_argument(
+        "--max_eval_samples",
+        type=int,
+        default=None,
+        help="Number of samples to be evaluated. Put a lower number e.g. 64 for testing this script.",
+    )
+    parser.add_argument(
+        "--no-streaming",
+        dest='streaming',
+        action="store_false",
+        help="Choose whether you'd like to download the entire dataset or stream it during the evaluation.",
+    )
+    parser.add_argument(
+        "--longform",
+        action="store_true",
+        help="Whether to use longform mode.",
+    )
+    args = parser.parse_args()
+    parser.set_defaults(streaming=True)
+
+    main(args)
diff --git a/nemo_asr/run_longform.sh b/nemo_asr/run_longform.sh
new file mode 100755
index 0000000..8423198
--- /dev/null
+++ b/nemo_asr/run_longform.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+
+export PYTHONPATH="..":$PYTHONPATH
+
+# Considering latest models
+MODEL_IDs=("nvidia/parakeet-tdt-1.1b" "nvidia/parakeet-rnnt-1.1b" "nvidia/parakeet-rnnt-0.6b" "nvidia/stt_en_fastconformer_transducer_large" "nvidia/stt_en_conformer_transducer_large" "nvidia/stt_en_conformer_transducer_small" "nvidia/parakeet-tdt-0.6b-v2")
+
+# For CTC models:
+# MODEL_IDs=("nvidia/parakeet-ctc-1.1b" "nvidia/parakeet-ctc-0.6b" "nvidia/stt_en_fastconformer_ctc_large" "nvidia/stt_en_conformer_ctc_large" "nvidia/stt_en_conformer_ctc_small")
+
+BATCH_SIZE=1
+DEVICE_ID=0
+
+num_models=${#MODEL_IDs[@]}
+
+for (( i=0; i<${num_models}; i++ ));
+do
+    MODEL_ID=${MODEL_IDs[$i]}
+
+    python run_eval_long.py \
+        --model_id=${MODEL_ID} \
+        --dataset_path="hf-audio/asr-leaderboard-longform" \
+        --dataset="earnings21" \
+        --split="test" \
+        --device=${DEVICE_ID} \
+        --batch_size=${BATCH_SIZE} \
+        --max_eval_samples=-1 \
+        --longform
+
+    python run_eval_long.py \
+        --model_id=${MODEL_ID} \
+        --dataset_path="hf-audio/asr-leaderboard-longform" \
+        --dataset="earnings22" \
+        --split="test" \
+        --device=${DEVICE_ID} \
+        --batch_size=${BATCH_SIZE} \
+        --max_eval_samples=-1 \
+        --longform
+
+    python run_eval_long.py \
+        --model_id=${MODEL_ID} \
+        --dataset_path="hf-audio/asr-leaderboard-longform" \
+        --dataset="tedlium" \
+        --split="test" \
+        --device=${DEVICE_ID} \
+        --batch_size=${BATCH_SIZE} \
+        --max_eval_samples=-1 \
+        --longform
+
+    # Evaluate results
+    RUNDIR=`pwd` && \
+    cd ../normalizer && \
+    python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \
+    cd $RUNDIR
+
+done
diff --git a/transformers/run_eval.py b/transformers/run_eval.py
index 89c4272..395402d 100644
--- a/transformers/run_eval.py
+++ b/transformers/run_eval.py
@@ -14,7 +14,7 @@ def main(args):
     config = AutoConfig.from_pretrained(args.model_id)
 
     cls_model = AutoModelForSpeechSeq2Seq if type(config) in MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING else AutoModelForCTC
-    model = cls_model.from_pretrained(args.model_id, torch_dtype=torch.bfloat16, attn_implementation="sdpa").to(args.device)
+    model = cls_model.from_pretrained(args.model_id, dtype=torch.bfloat16, attn_implementation="sdpa").to(args.device)
 
     processor = AutoProcessor.from_pretrained(args.model_id)
     model_input_name = processor.model_input_names[0]
@@ -37,6 +37,9 @@ def benchmark(batch, min_new_tokens=None):
         # Load audio inputs
         audios = [audio["array"] for audio in batch["audio"]]
         minibatch_size = len(audios)
+
+        # Compute audio length in seconds (16kHz sampling rate)
+        batch["audio_length_s"] = [len(audio) / 16_000 for audio in audios]
 
         # START TIMING
         start_time = time.time()
@@ -61,7 +64,17 @@ def benchmark(batch, min_new_tokens=None):
             )
         else:
             # 1.3 Standard Whisper processing: pad audios to 30-seconds and converted to log-mel
-            inputs = processor(audios, sampling_rate=16_000, return_tensors="pt", device=args.device)
+            if args.longform:
+                inputs = processor(
+                    audios,
+                    sampling_rate=16_000,
+                    return_tensors="pt",
+                    truncation=False,
+                    padding="longest",
+                    return_attention_mask=True,
+                )
+            else:
+                inputs = processor(audios, sampling_rate=16_000, return_tensors="pt", device=args.device)
 
         inputs = inputs.to(args.device)
         inputs[model_input_name] = inputs[model_input_name].to(torch.bfloat16)
@@ -70,7 +83,10 @@ def benchmark(batch, min_new_tokens=None):
         with sdpa_kernel(SDPBackend.MATH if args.torch_compile else SDPBackend.FLASH_ATTENTION):
             if model.can_generate():
                 # 2.1 Auto-regressive generation for encoder-decoder models
-                pred_ids = model.generate(**inputs, **gen_kwargs, min_new_tokens=min_new_tokens)
+                if args.longform:
+                    pred_ids = model.generate(**inputs, **gen_kwargs, return_timestamps=True)
+                else:
+                    pred_ids = model.generate(**inputs, **gen_kwargs, min_new_tokens=min_new_tokens)
             else:
                 # 2.2. Single forward pass for CTC
                 with torch.no_grad():
@@ -213,6 +229,11 @@ def benchmark(batch, min_new_tokens=None):
         default=None,
         help="Maximum number of tokens to generate (for auto-regressive models).",
     )
+    parser.add_argument(
+        "--longform",
+        action="store_true",
+        help="Whether to use longform mode.",
+    )
     parser.add_argument(
         "--torch_compile",
         action="store_true",
diff --git a/transformers/run_whisper_longform.sh b/transformers/run_whisper_longform.sh
new file mode 100755
index 0000000..d3e9939
--- /dev/null
+++ b/transformers/run_whisper_longform.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+
+export PYTHONPATH="..":$PYTHONPATH
+
+MODEL_IDs=("openai/whisper-large" "openai/whisper-large-v2" "openai/whisper-large-v3" "openai/whisper-large-v3-turbo" "distil-whisper/distil-medium.en" "distil-whisper/distil-large-v2" "distil-whisper/distil-large-v3" "distil-whisper/distil-large-v3.5")
+BATCH_SIZE=32
+
+num_models=${#MODEL_IDs[@]}
+
+for (( i=0; i<${num_models}; i++ ));
+do
+    MODEL_ID=${MODEL_IDs[$i]}
+
+    python run_eval.py \
+        --model_id=${MODEL_ID} \
+        --dataset_path="hf-audio/asr-leaderboard-longform" \
+        --dataset="earnings21" \
+        --split="test" \
+        --device=0 \
+        --batch_size=${BATCH_SIZE} \
+        --max_eval_samples=-1 \
+        --longform
+
+    python run_eval.py \
+        --model_id=${MODEL_ID} \
+        --dataset_path="hf-audio/asr-leaderboard-longform" \
+        --dataset="earnings22" \
+        --split="test" \
+        --device=0 \
+        --batch_size=${BATCH_SIZE} \
+        --max_eval_samples=-1 \
+        --longform
+
+    python run_eval.py \
+        --model_id=${MODEL_ID} \
+        --dataset_path="hf-audio/asr-leaderboard-longform" \
+        --dataset="tedlium" \
+        --split="test" \
+        --device=0 \
+        --batch_size=${BATCH_SIZE} \
+        --max_eval_samples=-1 \
+        --longform
+
+    # Evaluate results
+    RUNDIR=`pwd` && \
+    cd ../normalizer && \
+    python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \
+    cd $RUNDIR
+
+done