import argparse
import io
import os
import time

import evaluate
import lhotse
import numpy as np
import soundfile
import torch
from tqdm import tqdm
from transformers import GenerationConfig

from nemo.collections.speechlm2.models.salm import SALM
from normalizer import data_utils

wer_metric = evaluate.load("wer")


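# Map-style lhotse dataset: the DynamicCutSampler yields whole CutSets as "indices",
# and __getitem__ loads and collates the corresponding audio into padded tensors.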
class ToAudio(torch.utils.data.Dataset):
    def __getitem__(self, cuts):
        cuts = lhotse.CutSet([c.to_mono(mono_downmix=True) if isinstance(c, lhotse.MultiCut) else c for c in cuts])
        audios, audio_lens = cuts.load_audio(collate=True)
        return {"cuts": cuts, "audios": audios, "audio_lens": audio_lens}


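# Build a DataLoader that groups the cached WAV files into batches of at most
# `batch_size` cuts and resamples them to 16 kHz on load.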
def setup_dloader(audio_files, batch_size, num_workers):
    cuts = lhotse.CutSet([lhotse.Recording.from_file(p).to_cut() for p in audio_files])
    cuts = cuts.resample(16000)
    return torch.utils.data.DataLoader(
        dataset=ToAudio(),
        sampler=lhotse.dataset.DynamicCutSampler(cuts, max_cuts=batch_size),
        num_workers=num_workers,
        batch_size=None,
    )


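# Transcribe all batches with SALM.generate using a fixed text prompt; the audio is
# injected at the position of the model's audio locator tag in each prompt.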
def transcribe(model, dloader) -> list[str]:
    hyps = []
    eos_tokens = torch.tensor([model.text_eos_id])
    for batch in dloader:
        answer_ids = model.generate(
            prompts=[
                [
                    {"role": "user", "slots": {"message": f"Transcribe the following: {model.audio_locator_tag}"}}
                ]
            ] * len(batch["cuts"]),
            audios=batch["audios"].to(model.device, non_blocking=True),
            audio_lens=batch["audio_lens"].to(model.device, non_blocking=True),
            generation_config=GenerationConfig(
                max_new_tokens=128,
                bos_token_id=model.text_bos_id,
                eos_token_id=eos_tokens,
                pad_token_id=model.text_pad_id,
            ),
        )
        answer_ids = [parse_hyp(ans, eos_tokens) for ans in answer_ids.cpu()]
        hyps.extend(model.tokenizer.ids_to_text(ans).strip() for ans in answer_ids)
    return hyps


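# Truncate a generated token sequence at the first EOS token, if one is present.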
def parse_hyp(answer: torch.Tensor, eos_tokens):
    end = torch.isin(answer, eos_tokens).nonzero(as_tuple=True)[0]
    if end.numel() == 0:
        return answer
    return answer[: end[0]]


def main(args):
    DATA_CACHE_DIR = os.path.join(os.getcwd(), "audio_cache")
    DATASET_NAME = args.dataset
    SPLIT_NAME = args.split

    CACHE_DIR = os.path.join(DATA_CACHE_DIR, DATASET_NAME, SPLIT_NAME)
    os.makedirs(CACHE_DIR, exist_ok=True)

    torch.set_float32_matmul_precision("medium")

    # Honor the documented `--device -1` option by falling back to CPU.
    device = torch.device("cpu") if args.device < 0 else torch.device(f"cuda:{args.device}")
    model = SALM.from_pretrained(args.model_id).eval().to(torch.bfloat16).to(device)

    dataset = data_utils.load_data(args)

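    # Materialize each HuggingFace audio sample as a WAV file in the local cache so
    # lhotse can later build Recording/Cut objects from plain file paths.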
    def download_audio_files(batch):
        # Decode each audio sample, cache it as a WAV file, and record its path and duration.
        audio_paths = []
        durations = []

        for sample_id, sample in zip(batch["id"], batch["audio"]):
            # Make the ID (and hence the WAV filename) unique.
            # Several datasets, e.g. earnings22, have a hierarchical structure such as
            # earnings22/test/4432298/281.wav and earnings22/test/4450488/281.wav,
            # and lhotse uses the bare filename (281.wav) as the unique ID when creating and naming cuts.
            # ref: https://github.com/lhotse-speech/lhotse/blob/master/lhotse/dataset/collation.py#L186
            sample_id = sample_id.replace('/', '_').removesuffix('.wav')

            audio_path = os.path.join(CACHE_DIR, f"{sample_id}.wav")

            if "array" in sample:
                audio_array = np.float32(sample["array"])
                sample_rate = 16000

            elif "bytes" in sample:  # compatible with datasets 3.x, which yields a raw byte stream
                with io.BytesIO(sample["bytes"]) as audio_file:
                    audio_array, sample_rate = soundfile.read(audio_file, dtype="float32")

            else:
                raise ValueError("Sample must have either 'array' or 'bytes' key")

            if not os.path.exists(audio_path):
                os.makedirs(os.path.dirname(audio_path), exist_ok=True)
                soundfile.write(audio_path, audio_array, sample_rate)

            audio_paths.append(audio_path)
            durations.append(len(audio_array) / sample_rate)

        batch["references"] = batch["norm_text"]
        batch["audio_filepaths"] = audio_paths
        batch["durations"] = durations

        return batch

    if args.max_eval_samples is not None and args.max_eval_samples > 0:
        print(f"Subsampling dataset to first {args.max_eval_samples} samples!")
        dataset = dataset.take(args.max_eval_samples)

    dataset = data_utils.prepare_data(dataset)

    # Prepare the offline dataset by caching the audio files locally.
    dataset = dataset.map(download_audio_files, batch_size=args.batch_size, batched=True, remove_columns=["audio"])

    # Collect audio filepaths, durations, and reference texts from the mapped dataset.
    all_data = {
        "audio_filepaths": [],
        "durations": [],
        "references": [],
    }

    data_itr = iter(dataset)
    for data in tqdm(data_itr, desc="Downloading Samples"):
        for key in all_data:
            all_data[key].append(data[key])

    # Sort all fields by duration, longest first.
    sorted_indices = sorted(range(len(all_data["durations"])), key=lambda k: all_data["durations"][k], reverse=True)
    all_data["audio_filepaths"] = [all_data["audio_filepaths"][i] for i in sorted_indices]
    all_data["references"] = [all_data["references"][i] for i in sorted_indices]
    all_data["durations"] = [all_data["durations"][i] for i in sorted_indices]

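    # The first pass warms up the model on a few batches; only the second, full pass is timed for RTFX.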
    total_time = 0
    for pass_idx in range(2):  # warmup pass, then the timed pass
        if pass_idx == 0:
            audio_files = all_data["audio_filepaths"][:args.batch_size * 4]  # warm up with 4 batches
        else:
            audio_files = all_data["audio_filepaths"]
        dloader = setup_dloader(audio_files=audio_files, batch_size=args.batch_size, num_workers=1)
        with torch.inference_mode():
            start_time = time.time()
            transcriptions = transcribe(model, dloader)
            end_time = time.time()
        if pass_idx == 1:
            total_time += end_time - start_time

    # Normalize transcriptions with the English text normalizer.
    predictions = [data_utils.normalizer(pred) for pred in transcriptions]

    # Average transcription time per sample, used to populate the manifest.
    avg_time = total_time / len(all_data["audio_filepaths"])

    # Write the results manifest (used downstream for WER and RTFX reporting).
    manifest_path = data_utils.write_manifest(
        all_data["references"],
        predictions,
        args.model_id,
        args.dataset_path,
        args.dataset,
        args.split,
        audio_length=all_data["durations"],
        transcription_time=[avg_time] * len(all_data["audio_filepaths"]),
    )

    print("Results saved at path:", os.path.abspath(manifest_path))

    wer = wer_metric.compute(references=all_data["references"], predictions=predictions)
    wer = round(100 * wer, 2)

    audio_length = sum(all_data["durations"])
    rtfx = round(audio_length / total_time, 2)

    print("RTFX:", rtfx)
    print("WER:", wer, "%")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--model_id", type=str, required=True, help="Model identifier. Should be loadable with NVIDIA NeMo.",
    )
    parser.add_argument(
        "--dataset_path", type=str, default="esb/datasets", help="Dataset path. By default, it is `esb/datasets`.",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        required=True,
        help="Dataset name. *E.g.* `'librispeech_asr'` for the LibriSpeech ASR dataset, or `'common_voice'` for Common Voice. The full list of dataset names "
        "can be found at `https://huggingface.co/datasets/esb/datasets`",
    )
    parser.add_argument(
        "--split",
        type=str,
        default="test",
        help="Split of the dataset. *E.g.* `'validation'` for the dev split, or `'test'` for the test split.",
    )
    parser.add_argument(
        "--device",
        type=int,
        default=-1,
        help="The device to run the pipeline on. -1 for CPU (default), 0 for the first GPU and so on.",
    )
    parser.add_argument(
        "--batch_size", type=int, default=32, help="Number of samples to go through each streamed batch.",
    )
    parser.add_argument(
        "--max_eval_samples",
        type=int,
        default=None,
        help="Number of samples to be evaluated. Put a lower number e.g. 64 for testing this script.",
    )
    parser.add_argument(
        "--no-streaming",
        dest="streaming",
        action="store_false",
        help="Choose whether you'd like to download the entire dataset or stream it during the evaluation.",
    )
    parser.set_defaults(streaming=True)
    args = parser.parse_args()

    main(args)
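# Example invocation (script and model names are illustrative; any SALM checkpoint
# loadable via SALM.from_pretrained should work):
#   python run_eval.py --model_id nvidia/canary-qwen-2.5b --dataset librispeech --split test.other --device 0 --batch_size 16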