
Commit 64a4f34

Merge pull request #81 from pzelasko/salm2
Canary-Qwen-2.5B
2 parents 76e5444 + c9ecf86

File tree

3 files changed: +357 -1 lines changed

nemo_asr/run_eval_salm.py

Lines changed: 260 additions & 0 deletions
@@ -0,0 +1,260 @@
import argparse
import io
import os
import time

import evaluate
import lhotse
import numpy as np
import soundfile
import torch
from tqdm import tqdm
from transformers import GenerationConfig

from nemo.collections.speechlm2.models.salm import SALM
from normalizer import data_utils

wer_metric = evaluate.load("wer")

class ToAudio(torch.utils.data.Dataset):
    def __getitem__(self, cuts):
        # Downmix any multi-channel recordings to mono before collating.
        cuts = lhotse.CutSet([c.to_mono(mono_downmix=True) if isinstance(c, lhotse.MultiCut) else c for c in cuts])
        audios, audio_lens = cuts.load_audio(collate=True)
        return {"cuts": cuts, "audios": audios, "audio_lens": audio_lens}


def setup_dloader(audio_files, batch_size, num_workers):
    # Wrap the audio files in lhotse cuts, resample to 16 kHz, and batch them dynamically.
    cuts = lhotse.CutSet([lhotse.Recording.from_file(p).to_cut() for p in audio_files])
    cuts = cuts.resample(16000)
    return torch.utils.data.DataLoader(
        dataset=ToAudio(),
        sampler=lhotse.dataset.DynamicCutSampler(cuts, max_cuts=batch_size),
        num_workers=num_workers,
        batch_size=None,
    )

def transcribe(model, dloader) -> list[str]:
    hyps = []
    eos_tokens = torch.tensor([model.text_eos_id])
    for batch_idx, batch in enumerate(dloader):
        answer_ids = model.generate(
            prompts=[
                [
                    {"role": "user", "slots": {"message": f"Transcribe the following: {model.audio_locator_tag}"}}
                ]
            ] * len(batch["cuts"]),
            audios=batch["audios"].to(model.device, non_blocking=True),
            audio_lens=batch["audio_lens"].to(model.device, non_blocking=True),
            generation_config=GenerationConfig(
                max_new_tokens=128,
                bos_token_id=model.text_bos_id,
                eos_token_id=eos_tokens,
                pad_token_id=model.text_pad_id,
            ),
        )
        answer_ids = [parse_hyp(ans, eos_tokens) for ans in answer_ids.cpu()]
        hyps.extend(model.tokenizer.ids_to_text(ans).strip() for ans in answer_ids)
    return hyps


def parse_hyp(answer: torch.Tensor, eos_tokens):
    # Truncate the hypothesis at the first EOS token, if one is present.
    end = torch.isin(answer, eos_tokens).nonzero(as_tuple=True)[0]
    if end.numel() == 0:
        return answer
    end = end[0]
    return answer[:end]

def main(args):

    DATA_CACHE_DIR = os.path.join(os.getcwd(), "audio_cache")
    DATASET_NAME = args.dataset
    SPLIT_NAME = args.split

    CACHE_DIR = os.path.join(DATA_CACHE_DIR, DATASET_NAME, SPLIT_NAME)
    if not os.path.exists(CACHE_DIR):
        os.makedirs(CACHE_DIR)

    torch.set_float32_matmul_precision("medium")

    # -1 selects the CPU (as documented in the --device help); any non-negative value selects that GPU.
    device = torch.device("cpu") if args.device < 0 else torch.device(f"cuda:{args.device}")
    model = SALM.from_pretrained(args.model_id).eval().to(torch.bfloat16).to(device)

    dataset = data_utils.load_data(args)

    def download_audio_files(batch):

        # Download audio files and record their paths, transcriptions, and durations.
        audio_paths = []
        durations = []

        for id, sample in zip(batch["id"], batch["audio"]):

            # This first step makes IDs and wav filenames unique.
            # Several datasets, e.g. earnings22, have a hierarchical structure such as
            # earnings22/test/4432298/281.wav and earnings22/test/4450488/281.wav;
            # lhotse uses the bare filename (281.wav) as the unique ID when creating and naming cuts.
            # Ref: https://github.com/lhotse-speech/lhotse/blob/master/lhotse/dataset/collation.py#L186
            id = id.replace('/', '_').removesuffix('.wav')

            audio_path = os.path.join(CACHE_DIR, f"{id}.wav")

            if "array" in sample:
                audio_array = np.float32(sample["array"])
                sample_rate = 16000

            elif "bytes" in sample:  # compatible with the latest datasets library (3.x.x), which produces byte streams
                with io.BytesIO(sample["bytes"]) as audio_file:
                    audio_array, sample_rate = soundfile.read(audio_file, dtype="float32")

            else:
                raise ValueError("Sample must have either 'array' or 'bytes' key")

            if not os.path.exists(audio_path):
                os.makedirs(os.path.dirname(audio_path), exist_ok=True)
                soundfile.write(audio_path, audio_array, sample_rate)

            audio_paths.append(audio_path)
            durations.append(len(audio_array) / sample_rate)

        batch["references"] = batch["norm_text"]
        batch["audio_filepaths"] = audio_paths
        batch["durations"] = durations

        return batch

    if args.max_eval_samples is not None and args.max_eval_samples > 0:
        print(f"Subsampling dataset to first {args.max_eval_samples} samples!")
        dataset = dataset.take(args.max_eval_samples)

    dataset = data_utils.prepare_data(dataset)

    # Prepare the offline dataset.
    dataset = dataset.map(download_audio_files, batch_size=args.batch_size, batched=True, remove_columns=["audio"])

    # Collect audio filepaths, durations, and references from the dataset batches.
    all_data = {
        "audio_filepaths": [],
        "durations": [],
        "references": [],
    }

    data_itr = iter(dataset)
    for data in tqdm(data_itr, desc="Downloading Samples"):
        for key in all_data:
            all_data[key].append(data[key])

    # Sort audio_filepaths and references by duration (longest first), so that
    # dynamically sampled batches contain utterances of similar length.
    sorted_indices = sorted(range(len(all_data["durations"])), key=lambda k: all_data["durations"][k], reverse=True)
    all_data["audio_filepaths"] = [all_data["audio_filepaths"][i] for i in sorted_indices]
    all_data["references"] = [all_data["references"][i] for i in sorted_indices]
    all_data["durations"] = [all_data["durations"][i] for i in sorted_indices]

    total_time = 0
    for pass_idx in range(2):  # first pass warms the model up; second pass is timed for RTFX
        if pass_idx == 0:
            audio_files = all_data["audio_filepaths"][:args.batch_size * 4]  # warm up with 4 batches
        else:
            audio_files = all_data["audio_filepaths"]
        dloader = setup_dloader(audio_files=audio_files, batch_size=args.batch_size, num_workers=1)
        with torch.inference_mode():
            start_time = time.time()
            transcriptions = transcribe(model, dloader)
            end_time = time.time()
        if pass_idx == 1:
            total_time += end_time - start_time

    # Normalize transcriptions with the English normalizer.
    if isinstance(transcriptions, tuple) and len(transcriptions) == 2:
        transcriptions = transcriptions[0]
    predictions = [data_utils.normalizer(pred) for pred in transcriptions]

    avg_time = total_time / len(all_data["audio_filepaths"])

    # Write manifest results (WER and RTFX).
    manifest_path = data_utils.write_manifest(
        all_data["references"],
        predictions,
        args.model_id,
        args.dataset_path,
        args.dataset,
        args.split,
        audio_length=all_data["durations"],
        transcription_time=[avg_time] * len(all_data["audio_filepaths"]),
    )

    print("Results saved at path:", os.path.abspath(manifest_path))

    wer = wer_metric.compute(references=all_data['references'], predictions=predictions)
    wer = round(100 * wer, 2)

    # RTFX = total audio duration / total transcription time (higher is faster).
    audio_length = sum(all_data["durations"])
    rtfx = audio_length / total_time
    rtfx = round(rtfx, 2)

    print("RTFX:", rtfx)
    print("WER:", wer, "%")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--model_id", type=str, required=True, help="Model identifier. Should be loadable with NVIDIA NeMo.",
    )
    parser.add_argument(
        '--dataset_path', type=str, default='esb/datasets', help='Dataset path. By default, it is `esb/datasets`.'
    )
    parser.add_argument(
        "--dataset",
        type=str,
        required=True,
        help="Dataset name. *E.g.* `'librispeech_asr'` for the LibriSpeech ASR dataset, or `'common_voice'` for Common Voice. The full list of dataset names "
        "can be found at `https://huggingface.co/datasets/esb/datasets`",
    )
    parser.add_argument(
        "--split",
        type=str,
        default="test",
        help="Split of the dataset. *E.g.* `'validation'` for the dev split, or `'test'` for the test split.",
    )
    parser.add_argument(
        "--device",
        type=int,
        default=-1,
        help="The device to run the pipeline on. -1 for CPU (default), 0 for the first GPU, and so on.",
    )
    parser.add_argument(
        "--batch_size", type=int, default=32, help="Number of samples to go through each streamed batch.",
    )
    parser.add_argument(
        "--max_eval_samples",
        type=int,
        default=None,
        help="Number of samples to be evaluated. Put a lower number, e.g. 64, for testing this script.",
    )
    parser.add_argument(
        "--no-streaming",
        dest='streaming',
        action="store_false",
        help="Choose whether you'd like to download the entire dataset or stream it during the evaluation.",
    )
    # Defaults must be set before parsing for them to take effect.
    parser.set_defaults(streaming=True)
    args = parser.parse_args()

    main(args)
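
Note: as a quick sanity check of the EOS truncation in parse_hyp above, here is a minimal standalone sketch. The parse_hyp body mirrors the script; the token IDs and the EOS id 2 are illustrative placeholders, not values from the model's tokenizer.

import torch

def parse_hyp(answer: torch.Tensor, eos_tokens: torch.Tensor) -> torch.Tensor:
    # Same logic as in run_eval_salm.py: truncate at the first EOS token.
    end = torch.isin(answer, eos_tokens).nonzero(as_tuple=True)[0]
    if end.numel() == 0:
        return answer
    return answer[:end[0]]

eos = torch.tensor([2])                                    # hypothetical EOS id
print(parse_hyp(torch.tensor([11, 7, 42, 2, 0, 0]), eos))  # tensor([11,  7, 42])
print(parse_hyp(torch.tensor([11, 7, 42]), eos))           # no EOS -> returned unchanged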
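
The RTFX number printed by the script is the ratio of total audio duration to wall-clock transcription time, so higher is faster. A toy calculation with made-up values:

audio_length = 2 * 3600.0   # assumed: 2 hours of audio, in seconds
total_time = 90.0           # assumed: wall-clock transcription time, in seconds
rtfx = round(audio_length / total_time, 2)
print(rtfx)                 # 80.0 -> transcribes 80x faster than real time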

nemo_asr/run_salm.sh

Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
#!/bin/bash

export PYTHONPATH="..":$PYTHONPATH
export TOKENIZERS_PARALLELISM=false

MODEL_IDs=(
    nvidia/canary-qwen-2.5b
)
BATCH_SIZE=192
DEVICE_ID=0

num_models=${#MODEL_IDs[@]}

for (( i=0; i<${num_models}; i++ ));
do
    MODEL_ID=${MODEL_IDs[$i]}

    python run_eval_salm.py \
        --model_id=${MODEL_ID} \
        --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
        --dataset="ami" \
        --split="test" \
        --device=${DEVICE_ID} \
        --batch_size=${BATCH_SIZE} \
        --max_eval_samples=-1

    python run_eval_salm.py \
        --model_id=${MODEL_ID} \
        --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
        --dataset="earnings22" \
        --split="test" \
        --device=${DEVICE_ID} \
        --batch_size=${BATCH_SIZE} \
        --max_eval_samples=-1

    python run_eval_salm.py \
        --model_id=${MODEL_ID} \
        --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
        --dataset="gigaspeech" \
        --split="test" \
        --device=${DEVICE_ID} \
        --batch_size=${BATCH_SIZE} \
        --max_eval_samples=-1

    python run_eval_salm.py \
        --model_id=${MODEL_ID} \
        --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
        --dataset="librispeech" \
        --split="test.clean" \
        --device=${DEVICE_ID} \
        --batch_size=${BATCH_SIZE} \
        --max_eval_samples=-1

    python run_eval_salm.py \
        --model_id=${MODEL_ID} \
        --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
        --dataset="librispeech" \
        --split="test.other" \
        --device=${DEVICE_ID} \
        --batch_size=${BATCH_SIZE} \
        --max_eval_samples=-1

    python run_eval_salm.py \
        --model_id=${MODEL_ID} \
        --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
        --dataset="spgispeech" \
        --split="test" \
        --device=${DEVICE_ID} \
        --batch_size=${BATCH_SIZE} \
        --max_eval_samples=-1

    python run_eval_salm.py \
        --model_id=${MODEL_ID} \
        --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
        --dataset="tedlium" \
        --split="test" \
        --device=${DEVICE_ID} \
        --batch_size=${BATCH_SIZE} \
        --max_eval_samples=-1

    python run_eval_salm.py \
        --model_id=${MODEL_ID} \
        --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
        --dataset="voxpopuli" \
        --split="test" \
        --device=${DEVICE_ID} \
        --batch_size=${BATCH_SIZE} \
        --max_eval_samples=-1

    # Evaluate results
    RUNDIR=`pwd` && \
    cd ../normalizer && \
    python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \
    cd $RUNDIR

done

requirements/requirements_nemo.txt

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-git+https://github.com/NVIDIA/NeMo.git@r2.3.0#egg=nemo_toolkit[asr]
+git+https://github.com/NVIDIA/NeMo.git@6294bc7708ce67522b92f5e9b6917ea0b2e23429#egg=nemo_toolkit[asr]
 tqdm
 soundfile
 librosa
