# SPDX-FileCopyrightText: 2023 Analytics in Motion <https://www.analyticsinmotion.com>
# SPDX-License-Identifier: BSD-3-Clause

"""
Speed comparison benchmark for WER calculation packages.

This script compares the performance of werx, werpy, and jiwer on the
LibriSpeech evaluation dataset using timeit for accurate timing measurements.
"""

import timeit

import jiwer
import werpy
import werx
from datasets import load_dataset

# Load the consolidated CSV from the Hugging Face Hub
dataset = load_dataset(
    "analyticsinmotion/librispeech-eval",
    data_files="all_splits.csv",
    split="train"
)
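# Each row exposes the columns used below: split, model_name, model_version,
# reference, and hypothesis.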

# Specify which splits and model/version to evaluate
splits = ["test-clean", "test-other"]
model_name = "whisper-base"
model_version = "v20240930"

for split in splits:
    print(f"\n{'='*70}")
    print(f"Benchmarking: {split}")
    print(f"{'='*70}\n")

    # Filter references and hypotheses for the chosen split/model/version
    filtered = dataset.filter(
        lambda x: x["split"] == split and
        x["model_name"] == model_name and
        x["model_version"] == model_version
    )

    filtered = list(filtered)
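    # werpy.normalize lowercases the text and strips punctuation and extra
    # whitespace, so all three tools score exactly the same strings.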
    references = [werpy.normalize(row["reference"]) for row in filtered]
    hypotheses = [werpy.normalize(row["hypothesis"]) for row in filtered]

    print(f"Loaded {len(references):,} utterances\n")

    # --- WER tools ---
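    # All three packages expose a top-level wer() that takes parallel lists of
    # references and hypotheses, which is what lets one loop drive them all.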
    tools = {
        "WERX": werx.wer,
        "WERPY": werpy.wer,
        "JIWER": jiwer.wer,
    }

    # --- Run + time each tool using timeit ---
    results = []
    n_repeats = 10  # Number of repeats for timeit

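    # timeit disables garbage collection while timing by default, and
    # averaging over n_repeats smooths out scheduler jitter.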
    for name, func in tools.items():
        def stmt():
            return func(references, hypotheses)

        total_time = timeit.timeit(stmt, number=n_repeats)
        avg_time = total_time / n_repeats
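        # Score once more outside the timed region to record the WER itself.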
        wer = func(references, hypotheses)
        results.append((name, wer, avg_time))

    # --- Normalize by fastest average time ---
    min_time = min(r[2] for r in results)
    normalized_results = [
        (name, wer, t, t / min_time) for name, wer, t in results
    ]

    # --- Print CLI-friendly table ---
    print("\nWord Error Rate Benchmark:\n")
    print(f"{'Tool':<15} {'WER':<8} {'WER (%)':<10} {'Time (s)':<12} {'Norm Time':<18}")
    print("-" * 70)
    for name, wer, t, norm in normalized_results:
        # Label whichever tool was actually fastest as the baseline, rather
        # than assuming in advance which one wins.
        if t == min_time:
            norm_str = "1.00× (baseline)"
        else:
            norm_str = f"{norm:.2f}× slower"
        print(f"{name:<15} {wer:<8.4f} {wer * 100:<10.2f} {t:<12.6f} {norm_str:<18}")