Skip to content

Commit ae71445

Browse files
committed
🔬 test(benchmarks): add LibriSpeech performance comparison for werx, werpy, and jiwer
1 parent 19222cc commit ae71445

File tree

1 file changed

+81
-0
lines changed

1 file changed

+81
-0
lines changed
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
# SPDX-FileCopyrightText: 2023 Analytics in Motion <https://www.analyticsinmotion.com>
2+
# SPDX-License-Identifier: BSD-3-Clause
3+
4+
"""
5+
Speed comparison benchmark for WER calculation packages.
6+
7+
This script compares the performance of werx, werpy, and jiwer on the
8+
LibriSpeech evaluation dataset using timeit for accurate timing measurements.
9+
"""
10+
11+
from datasets import load_dataset
12+
import werpy
13+
import werx
14+
import jiwer
15+
import timeit
16+
17+
# Pull the consolidated evaluation CSV down from the Hugging Face Hub.
# All splits/models live in one table; rows are selected later by column.
dataset = load_dataset(
    "analyticsinmotion/librispeech-eval",
    split="train",
    data_files="all_splits.csv",
)

# Evaluation configuration: which dataset splits to benchmark, and which
# model / version rows to select out of the consolidated table.
splits = ["test-clean", "test-other"]
model_name = "whisper-base"
model_version = "v20240930"
29+
for split in splits:
    print(f"\n{'='*70}")
    print(f"Benchmarking: {split}")
    print(f"{'='*70}\n")

    # Filter references and hypotheses for the chosen split/model/version.
    filtered = dataset.filter(
        lambda x: x["split"] == split and
        x["model_name"] == model_name and
        x["model_version"] == model_version
    )

    # Materialize once so both comprehensions iterate plain dict rows.
    filtered = list(filtered)
    references = [werpy.normalize(row["reference"]) for row in filtered]
    hypotheses = [werpy.normalize(row["hypothesis"]) for row in filtered]

    print(f"Loaded {len(references):,} utterances\n")

    # Guard: an empty selection would make every timing meaningless (and
    # some WER tools raise on empty input), so skip this split outright.
    if not references:
        print("No matching rows for this split/model/version -- skipping.\n")
        continue

    # --- WER tools ---
    tools = {
        "WERX": werx.wer,
        "WERPY": werpy.wer,
        "JIWER": jiwer.wer,
    }

    # --- Run + time each tool using timeit ---
    results = []
    n_repeats = 10  # Number of repeats for timeit

    for name, func in tools.items():
        # Compute the WER once, up front. This also serves as a warm-up
        # run, and avoids the original's extra untimed call after timing.
        wer = func(references, hypotheses)
        # Bind `func` as a default argument so the timed callable does not
        # depend on late-binding closure semantics over the loop variable.
        total_time = timeit.timeit(
            lambda f=func: f(references, hypotheses), number=n_repeats
        )
        avg_time = total_time / n_repeats
        results.append((name, wer, avg_time))

    # --- Normalize by fastest average time ---
    # NOTE(review): the table labels WERX as "baseline" but normalizes by
    # the fastest tool overall -- these coincide only when WERX is fastest.
    min_time = min(r[2] for r in results)
    normalized_results = [
        (name, wer, t, t / min_time) for name, wer, t in results
    ]

    # --- Print CLI-friendly table ---
    print("\n Word Error Rate Benchmark:\n")
    print(f"{'Tool':<15} {'WER':<8} {'WER (%)':<10} {'Time (s)':<12} {'Norm Time':<18}")
    print("-" * 70)
    for name, wer, t, norm in normalized_results:
        if name == "WERX":
            norm_str = "1.00× (baseline)"
        else:
            norm_str = f"{norm:.2f}× slower"
        print(f"{name:<15} {wer:.4f} {wer*100:6.2f}% {t:.6f} {norm_str:<18}")

0 commit comments

Comments
 (0)