57 changes: 57 additions & 0 deletions api/run_longform.sh
@@ -0,0 +1,57 @@
#!/bin/bash

export PYTHONPATH="..":$PYTHONPATH

export OPENAI_API_KEY="your_api_key"
export ASSEMBLYAI_API_KEY="your_api_key"
export ELEVENLABS_API_KEY="your_api_key"
export REVAI_API_KEY="your_api_key"
export AQUAVOICE_API_KEY="your_api_key"

MODEL_IDs=(
"openai/gpt-4o-transcribe"
"openai/gpt-4o-mini-transcribe"
"openai/whisper-1"
"assembly/best"
"elevenlabs/scribe_v1"
"revai/machine" # please use --use_url=True
"revai/fusion" # please use --use_url=True
"speechmatics/enhanced"
"aquavoice/avalon-v1-en"
)

MAX_WORKERS=10
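# Forwarded to run_eval.py; bounds how many transcription requests are sent to each provider's API concurrently.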

num_models=${#MODEL_IDs[@]}

for (( i=0; i<${num_models}; i++ ));
do
MODEL_ID=${MODEL_IDs[$i]}
python run_eval.py \
--dataset_path="hf-audio/asr-leaderboard-longform" \
--dataset="earnings21" \
--split="test" \
--model_name ${MODEL_ID} \
--max_workers ${MAX_WORKERS}

python run_eval.py \
--dataset_path="hf-audio/asr-leaderboard-longform" \
--dataset="earnings22" \
--split="test" \
--model_name ${MODEL_ID} \
--max_workers ${MAX_WORKERS}

python run_eval.py \
--dataset_path="hf-audio/asr-leaderboard-longform" \
--dataset="tedlium" \
--split="test" \
--model_name ${MODEL_ID} \
--max_workers ${MAX_WORKERS}

# Evaluate results
RUNDIR=`pwd` && \
cd ../normalizer && \
python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \
cd $RUNDIR

done
46 changes: 46 additions & 0 deletions kyutai/run_kyutai_fongform.sh
@@ -0,0 +1,46 @@
#!/bin/bash

export PYTHONPATH="..":$PYTHONPATH

MODEL_IDs=("kyutai/stt-2.6b-en")
BATCH_SIZE=16
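# --max_eval_samples=-1 below keeps the full split; a small positive value (e.g. 64) subsamples for a quick test.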

num_models=${#MODEL_IDs[@]}
for (( i=0; i<${num_models}; i++ ));
do
MODEL_ID=${MODEL_IDs[$i]}

python run_eval.py \
--model_id=${MODEL_ID} \
--dataset_path="hf-audio/asr-leaderboard-longform" \
--dataset="earnings21" \
--split="test" \
--device=0 \
--batch_size=${BATCH_SIZE} \
--max_eval_samples=-1

python run_eval.py \
--model_id=${MODEL_ID} \
--dataset_path="hf-audio/asr-leaderboard-longform" \
--dataset="earnings22" \
--split="test" \
--device=0 \
--batch_size=${BATCH_SIZE} \
--max_eval_samples=-1

python run_eval.py \
--model_id=${MODEL_ID} \
--dataset_path="hf-audio/asr-leaderboard-longform" \
--dataset="tedlium" \
--split="test" \
--device=0 \
--batch_size=${BATCH_SIZE} \
--max_eval_samples=-1

# Evaluate results
RUNDIR=`pwd` && \
cd ../normalizer && \
python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \
cd $RUNDIR

done
225 changes: 225 additions & 0 deletions nemo_asr/run_eval_long.py
@@ -0,0 +1,225 @@
import argparse

import io
import os
import torch
import evaluate
import soundfile

from tqdm import tqdm
from normalizer import data_utils
import numpy as np

from nemo.collections.asr.models import ASRModel
import time


wer_metric = evaluate.load("wer")


def main(args):

DATA_CACHE_DIR = os.path.join(os.getcwd(), "audio_cache")
DATASET_NAME = args.dataset
SPLIT_NAME = args.split

CACHE_DIR = os.path.join(DATA_CACHE_DIR, DATASET_NAME, SPLIT_NAME)
if not os.path.exists(CACHE_DIR):
os.makedirs(CACHE_DIR)

if args.device >= 0:
device = torch.device(f"cuda:{args.device}")
compute_dtype = torch.bfloat16
else:
device = torch.device("cpu")
compute_dtype = torch.float32


if args.model_id.endswith(".nemo"):
asr_model = ASRModel.restore_from(args.model_id, map_location=device)
else:
asr_model = ASRModel.from_pretrained(args.model_id, map_location=device) # type: ASRModel

if args.longform:
asr_model.change_attention_model("rel_pos_local_attn", [128, 128]) # local attn
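# Limited-context (local) attention keeps the encoder's memory use bounded so hour-long recordings can be transcribed.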
asr_model.to(compute_dtype)
asr_model.eval()

dataset = data_utils.load_data(args)

def download_audio_files(batch, indices):

# write each audio sample to a WAV file in the cache and record its path, duration and reference text on the batch
audio_paths = []
durations = []

# Use global indices for unique filenames across all batches
for global_idx, sample in zip(indices, batch["audio"]):
# Use a unique filename based on global index
audio_path = os.path.join(CACHE_DIR, f"sample_{global_idx}.wav")

if "array" in sample:
audio_array = np.float32(sample["array"])
sample_rate = 16000

elif "bytes" in sample: # added to be compatible with latest datasets library (3.x.x) that produces byte stream
with io.BytesIO(sample["bytes"]) as audio_file:
audio_array, sample_rate = soundfile.read(audio_file, dtype="float32")

else:
raise ValueError("Sample must have either 'array' or 'bytes' key")

if not os.path.exists(audio_path):
os.makedirs(os.path.dirname(audio_path), exist_ok=True)
soundfile.write(audio_path, audio_array, sample_rate)

audio_paths.append(audio_path)
durations.append(len(audio_array) / sample_rate)


batch["references"] = batch["norm_text"]
batch["audio_filepaths"] = audio_paths
batch["durations"] = durations

return batch


if args.max_eval_samples is not None and args.max_eval_samples > 0:
print(f"Subsampling dataset to first {args.max_eval_samples} samples !")
dataset = dataset.take(args.max_eval_samples)

dataset = data_utils.prepare_data(dataset)
if asr_model.cfg.decoding.strategy != "beam":
asr_model.cfg.decoding.strategy = "greedy_batch"
asr_model.change_decoding_strategy(asr_model.cfg.decoding)
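# Fall back to greedy_batch decoding unless the checkpoint's config already requests beam search.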

# preparing the offline (file-based) dataset
dataset = dataset.map(download_audio_files, batch_size=args.batch_size, batched=True, with_indices=True, remove_columns=["audio"])
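# Samples are cached as WAV files on disk; transcribe() below is given these file paths rather than in-memory arrays.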

# Collect audio filepaths, durations and reference texts from the mapped dataset

all_data = {
"audio_filepaths": [],
"durations": [],
"references": [],
}

data_itr = iter(dataset)
for data in tqdm(data_itr, desc="Downloading Samples"):
for key in all_data:
all_data[key].append(data[key])

# Sort audio_filepaths and references based on durations values
sorted_indices = sorted(range(len(all_data["durations"])), key=lambda k: all_data["durations"][k], reverse=True)
all_data["audio_filepaths"] = [all_data["audio_filepaths"][i] for i in sorted_indices]
all_data["references"] = [all_data["references"][i] for i in sorted_indices]
all_data["durations"] = [all_data["durations"][i] for i in sorted_indices]
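# Longest-first ordering groups files of similar length into each batch, which should reduce padding overhead.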

total_time = 0
for _ in range(2): # warmup once and calculate rtf
if _ == 0:
audio_files = all_data["audio_filepaths"][:args.batch_size * 4] # warmup with 4 batches
else:
audio_files = all_data["audio_filepaths"]
start_time = time.time()
with torch.inference_mode(), torch.no_grad():

if 'canary' in args.model_id and 'v2' not in args.model_id:
pnc = 'nopnc'
else:
pnc = 'pnc'
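# Canary v1 is evaluated without punctuation and capitalization; Canary v2 keeps PnC on. Non-Canary models never receive this flag.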

if 'canary' in args.model_id:
transcriptions = asr_model.transcribe(audio_files, batch_size=args.batch_size, verbose=False, pnc=pnc, num_workers=1)
else:
transcriptions = asr_model.transcribe(audio_files, batch_size=args.batch_size, verbose=False, num_workers=1)
end_time = time.time()
if _ == 1:
total_time += end_time - start_time

# normalize transcriptions with English normalizer
if isinstance(transcriptions, tuple) and len(transcriptions) == 2:
transcriptions = transcriptions[0]
predictions = [data_utils.normalizer(pred.text) for pred in transcriptions]

avg_time = total_time / len(all_data["audio_filepaths"])
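# The manifest records this uniform per-file average rather than individually timed transcriptions.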

# Write manifest results (WER and RTFX)
manifest_path = data_utils.write_manifest(
all_data["references"],
predictions,
args.model_id,
args.dataset_path,
args.dataset,
args.split,
audio_length=all_data["durations"],
transcription_time=[avg_time] * len(all_data["audio_filepaths"]),
)

print("Results saved at path:", os.path.abspath(manifest_path))

wer = wer_metric.compute(references=all_data['references'], predictions=predictions)
wer = round(100 * wer, 2)

# transcription_time = sum(all_results["transcription_time"])
audio_length = sum(all_data["durations"])
rtfx = audio_length / total_time
rtfx = round(rtfx, 2)
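# RTFx = total audio duration / wall-clock transcription time; higher means faster than real time.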

print("RTFX:", rtfx)
print("WER:", wer, "%")


if __name__ == "__main__":
parser = argparse.ArgumentParser()

parser.add_argument(
"--model_id", type=str, required=True, help="Model identifier. Should be loadable with NVIDIA NeMo.",
)
parser.add_argument(
'--dataset_path', type=str, default='esb/datasets', help='Dataset path. By default, it is `esb/datasets`'
)
parser.add_argument(
"--dataset",
type=str,
required=True,
help="Dataset name. *E.g.* `'librispeech_asr'` for the LibriSpeech ASR dataset, or `'common_voice'` for Common Voice. The full list of dataset names "
"can be found at `https://huggingface.co/datasets/esb/datasets`",
)
parser.add_argument(
"--split",
type=str,
default="test",
help="Split of the dataset. *E.g.* `'validation'` for the dev split, or `'test'` for the test split.",
)
parser.add_argument(
"--device",
type=int,
default=-1,
help="The device to run the pipeline on. -1 for CPU (default), 0 for the first GPU and so on.",
)
parser.add_argument(
"--batch_size", type=int, default=32, help="Number of samples to go through each streamed batch.",
)
parser.add_argument(
"--max_eval_samples",
type=int,
default=None,
help="Number of samples to evaluate. Use a small value, e.g. 64, to test this script quickly.",
)
parser.add_argument(
"--no-streaming",
dest='streaming',
action="store_false",
help="Choose whether you'd like to download the entire dataset or stream it during the evaluation.",
)
parser.add_argument(
"--longform",
action="store_true",
help="Whether to use longform mode.",
)
parser.set_defaults(streaming=True)
args = parser.parse_args()

main(args)
56 changes: 56 additions & 0 deletions nemo_asr/run_longform.sh
@@ -0,0 +1,56 @@
#!/bin/bash

export PYTHONPATH="..":$PYTHONPATH

# Latest transducer and TDT checkpoints
MODEL_IDs=(
"nvidia/parakeet-tdt-1.1b"
"nvidia/parakeet-rnnt-1.1b"
"nvidia/parakeet-rnnt-0.6b"
"nvidia/stt_en_fastconformer_transducer_large"
"nvidia/stt_en_conformer_transducer_large"
"nvidia/stt_en_conformer_transducer_small"
"nvidia/parakeet-tdt-0.6b-v2"
)

# For CTC models:
# MODEL_IDs=("nvidia/parakeet-ctc-1.1b" "nvidia/parakeet-ctc-0.6b" "nvidia/stt_en_fastconformer_ctc_large" "nvidia/stt_en_conformer_ctc_large" "nvidia/stt_en_conformer_ctc_small")

BATCH_SIZE=1
DEVICE_ID=0
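# Long-form runs use batch size 1; the --longform flag makes run_eval_long.py swap the encoder to limited-context (local) attention.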

num_models=${#MODEL_IDs[@]}

for (( i=0; i<${num_models}; i++ ));
do
MODEL_ID=${MODEL_IDs[$i]}

python run_eval_long.py \
--model_id=${MODEL_ID} \
--dataset_path="hf-audio/asr-leaderboard-longform" \
--dataset="earnings21" \
--split="test" \
--device=${DEVICE_ID} \
--batch_size=${BATCH_SIZE} \
--max_eval_samples=-1 \
--longform

python run_eval_long.py \
--model_id=${MODEL_ID} \
--dataset_path="hf-audio/asr-leaderboard-longform" \
--dataset="earnings22" \
--split="test" \
--device=${DEVICE_ID} \
--batch_size=${BATCH_SIZE} \
--max_eval_samples=-1 \
--longform

python run_eval_long.py \
--model_id=${MODEL_ID} \
--dataset_path="hf-audio/asr-leaderboard-longform" \
--dataset="tedlium" \
--split="test" \
--device=${DEVICE_ID} \
--batch_size=${BATCH_SIZE} \
--max_eval_samples=-1 \
--longform

# Evaluate results
RUNDIR=`pwd` && \
cd ../normalizer && \
python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \
cd $RUNDIR

done