diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0dc01c1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,134 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# speechbrain +*/pretrained_models/* +*/model_checkpoints/* +*/results/* \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..4559bd5 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [2024] [The HuggingFace Inc. team] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index f3cee30..dd0fa89 100644 --- a/README.md +++ b/README.md @@ -1 +1,243 @@ # Open ASR Leaderboard + +This repository contains the code for the Open ASR Leaderboard. The leaderboard is a Gradio Space that allows users to compare the accuracy of ASR models on a variety of datasets. The leaderboard is hosted at [hf-audio/open_asr_leaderboard](https://huggingface.co/spaces/hf-audio/open_asr_leaderboard). + +# Requirements + +Each library has its own set of requirements. We recommend using a clean conda environment, with Python 3.10 or above. + +1) Clone this repository. +2) Install PyTorch by following the instructions here: https://pytorch.org/get-started/locally/ +3) Install the common requirements for all library by running `pip install -r requirements/requirements.txt`. +4) Install the requirements for each library you wish to evaluate by running `pip install -r requirements/requirements_.txt`. 
+5) Connect your Hugging Face account by running `huggingface-cli login`. + +**Note:** If you wish to run NeMo, the benchmark currently needs CUDA 12.6 to fix a problem in previous drivers for RNN-T inference with cooperative kernels inside conditional nodes (see here: https://github.com/NVIDIA/NeMo/pull/9869). Running `nvidia-smi` should output "CUDA Version: 12.6" or higher. + +# Evaluate a model + +Each library has a script `run_eval.py` that acts as the entry point for evaluating a model. The script is run by the corresponding bash script for each model that is being evaluated. The script then outputs a JSONL file containing the predictions of the model on each dataset, and summarizes the Word Error Rate (WER) and Inverse Real-Time Factor (RTFx) of the model on each dataset after completion. + +To reproduce existing results: + +1) Change directory into the library you wish to evaluate. For example, `cd transformers`. +2) Run the bash script for the model you wish to evaluate. For example, `bash run_wav2vec2.sh`. + +**Note**: All evaluations were run using an NVIDIA A100-SXM4-80GB GPU, with NVIDIA driver 560.28.03, CUDA 12.6, and PyTorch 2.4.0. You should ensure you use the same configuration when submitting results. If you are unable to create an equivalent machine, please request one of the maintainers to run your scripts for evaluation! + +# Add a new library + +To add a new library for evaluation in this benchmark, please follow the steps below: + +1) Fork this repository and create a new branch +2) Create a new directory for your library. For example, `mkdir transformers`. +3) Copy the template `run_eval.py` script below into your new directory. The script should be updated for the new library by making two modifications. Otherwise, please try to keep the structure of the script the same as in the template. In particular, the data loading, evaluation and manifest writing must be done in the same way as other libraries for consistency. + 1) Update the model loading logic in the `main` function + 2) Update the inference logic in the `benchmark` function + +
+ + Template script for Transformers: + +```python +import argparse +import os +import torch +from transformers import WhisperForConditionalGeneration, WhisperProcessor +import evaluate +from normalizer import data_utils +import time +from tqdm import tqdm + +wer_metric = evaluate.load("wer") + +def main(args): + # Load model (FILL ME!) + model = WhisperForConditionalGeneration.from_pretrained(args.model_id, torch_dtype=torch.bfloat16).to(args.device) + processor = WhisperProcessor.from_pretrained(args.model_id) + + def benchmark(batch): + # Load audio inputs + audios = [audio["array"] for audio in batch["audio"]] + batch["audio_length_s"] = [len(audio) / batch["audio"][0]["sampling_rate"] for audio in audios] + minibatch_size = len(audios) + + # Start timing + start_time = time.time() + + # INFERENCE (FILL ME! Replacing 1-3 with steps from your library) + # 1. Pre-processing + inputs = processor(audios, sampling_rate=16_000, return_tensors="pt").to(args.device) + inputs["input_features"] = inputs["input_features"].to(torch.bfloat16) + # 2. Generation + pred_ids = model.generate(**inputs) + # 3. Post-processing + pred_text = processor.batch_decode(pred_ids, skip_special_tokens=True) + + # End timing + runtime = time.time() - start_time + + # normalize by minibatch size since we want the per-sample time + batch["transcription_time_s"] = minibatch_size * [runtime / minibatch_size] + + # normalize transcriptions with English normalizer + batch["predictions"] = [data_utils.normalizer(pred) for pred in pred_text] + batch["references"] = batch["norm_text"] + return batch + + if args.warmup_steps is not None: + warmup_dataset = data_utils.load_data(args) + warmup_dataset = data_utils.prepare_data(warmup_dataset) + + num_warmup_samples = args.warmup_steps * args.batch_size + if args.streaming: + warmup_dataset = warmup_dataset.take(num_warmup_samples) + else: + warmup_dataset = warmup_dataset.select(range(min(num_warmup_samples, len(warmup_dataset)))) + warmup_dataset = iter(warmup_dataset.map(benchmark, batch_size=args.batch_size, batched=True)) + + for _ in tqdm(warmup_dataset, desc="Warming up..."): + continue + + dataset = data_utils.load_data(args) + dataset = data_utils.prepare_data(dataset) + + if args.max_eval_samples is not None and args.max_eval_samples > 0: + print(f"Subsampling dataset to first {args.max_eval_samples} samples!") + if args.streaming: + dataset = dataset.take(args.max_eval_samples) + else: + dataset = dataset.select(range(min(args.max_eval_samples, len(dataset)))) + + dataset = dataset.map( + benchmark, batch_size=args.batch_size, batched=True, remove_columns=["audio"], + ) + + all_results = { + "audio_length_s": [], + "transcription_time_s": [], + "predictions": [], + "references": [], + } + result_iter = iter(dataset) + for result in tqdm(result_iter, desc="Samples..."): + for key in all_results: + all_results[key].append(result[key]) + + # Write manifest results (WER and RTFX) + manifest_path = data_utils.write_manifest( + all_results["references"], + all_results["predictions"], + args.model_id, + args.dataset_path, + args.dataset, + args.split, + audio_length=all_results["audio_length_s"], + transcription_time=all_results["transcription_time_s"], + ) + print("Results saved at path:", os.path.abspath(manifest_path)) + + wer = wer_metric.compute( + references=all_results["references"], predictions=all_results["predictions"] + ) + wer = round(100 * wer, 2) + rtfx = round(sum(all_results["audio_length_s"]) / sum(all_results["transcription_time_s"]), 2) + print("WER:", 
wer, "%", "RTFx:", rtfx) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--model_id", + type=str, + required=True, + help="Model identifier. Should be loadable with 🤗 Transformers", + ) + parser.add_argument( + "--dataset_path", + type=str, + default="esb/datasets", + help="Dataset path. By default, it is `esb/datasets`", + ) + parser.add_argument( + "--dataset", + type=str, + required=True, + help="Dataset name. *E.g.* `'librispeech_asr` for the LibriSpeech ASR dataset, or `'common_voice'` for Common Voice. The full list of dataset names " + "can be found at `https://huggingface.co/datasets/esb/datasets`", + ) + parser.add_argument( + "--split", + type=str, + default="test", + help="Split of the dataset. *E.g.* `'validation`' for the dev split, or `'test'` for the test split.", + ) + parser.add_argument( + "--device", + type=int, + default=-1, + help="The device to run the pipeline on. -1 for CPU (default), 0 for the first GPU and so on.", + ) + parser.add_argument( + "--batch_size", + type=int, + default=1, + help="Number of samples to go through each streamed batch.", + ) + parser.add_argument( + "--max_eval_samples", + type=int, + default=None, + help="Number of samples to be evaluated. Put a lower number e.g. 64 for testing this script.", + ) + parser.add_argument( + "--no-streaming", + dest="streaming", + action="store_false", + help="Choose whether you'd like to download the entire dataset or stream it during the evaluation.", + ) + parser.add_argument( + "--warmup_steps", + type=int, + default=10, + help="Number of warm-up steps to run before launching the timed runs.", + ) + args = parser.parse_args() + parser.set_defaults(streaming=False) + + main(args) + +``` + +
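+The two numbers printed at the end of the template are the metrics the leaderboard reports: WER, computed with the `evaluate` library, and RTFx, the total audio duration divided by the total transcription time (higher is faster). The short sketch below reproduces that summarization in isolation; the references, predictions, and timings are purely illustrative stand-ins for the `all_results` dictionary built by the template:
+
+```python
+import evaluate
+
+# Illustrative stand-in for the `all_results` dictionary collected by the template.
+all_results = {
+    "references": ["the cat sat on the mat", "hello world"],
+    "predictions": ["the cat sat on a mat", "hello world"],
+    "audio_length_s": [4.2, 1.5],
+    "transcription_time_s": [0.21, 0.08],
+}
+
+wer_metric = evaluate.load("wer")
+wer = 100 * wer_metric.compute(
+    references=all_results["references"], predictions=all_results["predictions"]
+)
+
+# RTFx: seconds of audio transcribed per second of compute time.
+rtfx = sum(all_results["audio_length_s"]) / sum(all_results["transcription_time_s"])
+
+print("WER:", round(wer, 2), "%", "RTFx:", round(rtfx, 2))
+```
+
+Because the template stores a per-sample time (the batch runtime divided by the minibatch size), this ratio can be recomputed from the collected results after the run without re-timing inference.
+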
+ +4) Create one bash file per model type following the conversion `run_.sh`. + - The bash script should follow the same steps as other libraries. You can copy the example for [run_whisper.sh](./transformers/run_whisper.sh) and update it to your library + - Different model sizes of the same type should share the script. For example `Wav2Vec` and `Wav2Vec2` would be two separate scripts, but different size of `Wav2Vec2` would be part of the same script. + - **Important:** for a given model, you can tune decoding hyper-parameters to maximize benchmark performance (e.g. batch size, beam size, etc.). However, you must use the **same decoding hyper-parameters** for each dataset in the benchmark. For more details, refer to the [ESB paper](https://arxiv.org/abs/2210.13352). +5) Submit a PR for your changes. + +# Add a new model + +To add a model from a new library for evaluation in this benchmark, you can follow the steps noted above. + +To add a model from an existing library, we can simplify the steps to: + +1) If the model is already supported, but of a different size, simply add the new model size to the list of models run by the corresponding bash script. +2) If the model is entirely new, create a new bash script based on others of that library and add the new model and its sizes to that script. +3) Run the evaluation script to obtain a list of predictions for the new model on each of the datasets. +4) Submit a PR for your changes. + +# Citation + + +```bibtex +@misc{open-asr-leaderboard, + title = {Open Automatic Speech Recognition Leaderboard}, + author = {Srivastav, Vaibhav and Majumdar, Somshubra and Koluguri, Nithin and Moumen, Adel and Gandhi, Sanchit and Hugging Face Team and Nvidia NeMo Team and SpeechBrain Team}, + year = 2023, + publisher = {Hugging Face}, + howpublished = "\\url{https://huggingface.co/spaces/huggingface.co/spaces/open-asr-leaderboard/leaderboard}" +} +``` diff --git a/api/run_api.sh b/api/run_api.sh new file mode 100755 index 0000000..871a140 --- /dev/null +++ b/api/run_api.sh @@ -0,0 +1,79 @@ +#!/bin/bash + +export PYTHONPATH="..":$PYTHONPATH + +export OPENAI_API_KEY="your_api_key" +export ASSEMBLYAI_API_KEY="your_api_key" +export ELEVENLABS_API_KEY="your_api_key" +export REVAI_API_KEY="your_api_key" + +MODEL_IDs=( + "openai/gpt-4o-transcribe" + "openai/gpt-4o-mini-transcribe" + "openai/whisper-1" + "assembly/best" + "elevenlabs/scribe_v1" + "revai/machine" # please use --use_url=True + "revai/fusion" # please use --use_url=True +) + +num_models=${#MODEL_IDs[@]} + +for (( i=0; i<${num_models}; i++ )); +do + MODEL_ID=${MODEL_IDs[$i]} + python run_eval.py \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="ami" \ + --split="test" \ + --model_name ${MODEL_ID} + + python run_eval.py \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="earnings22" \ + --split="test" \ + --model_name ${MODEL_ID} + + python run_eval.py \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="gigaspeech" \ + --split="test" \ + --model_name ${MODEL_ID} + + python run_eval.py \ + --dataset_path "hf-audio/esb-datasets-test-only-sorted" \ + --dataset "librispeech" \ + --split "test.clean" \ + --model_name ${MODEL_ID} + + python run_eval.py \ + --dataset_path "hf-audio/esb-datasets-test-only-sorted" \ + --dataset "librispeech" \ + --split "test.other" \ + --model_name ${MODEL_ID} + + python run_eval.py \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="spgispeech" \ + --split="test" \ + --model_name 
${MODEL_ID} + + python run_eval.py \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="tedlium" \ + --split="test" \ + --model_name ${MODEL_ID} + + python run_eval.py \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="voxpopuli" \ + --split="test" \ + --model_name ${MODEL_ID} + + # Evaluate results + RUNDIR=`pwd` && \ + cd ../normalizer && \ + python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \ + cd $RUNDIR + +done diff --git a/api/run_eval.py b/api/run_eval.py new file mode 100644 index 0000000..6c826c8 --- /dev/null +++ b/api/run_eval.py @@ -0,0 +1,289 @@ +import argparse +import datasets +import evaluate +import soundfile as sf +import tempfile +import time +import os +import requests +from tqdm import tqdm +from dotenv import load_dotenv +from io import BytesIO +import assemblyai as aai +import openai +from elevenlabs.client import ElevenLabs +from rev_ai import apiclient +from rev_ai.models import CustomVocabulary, CustomerUrlData +from normalizer import data_utils +import concurrent.futures + +load_dotenv() + +def fetch_audio_urls(dataset_path, dataset, split, batch_size=100, max_retries=20): + API_URL = "https://datasets-server.huggingface.co/rows" + + size_url = f"https://datasets-server.huggingface.co/size?dataset={dataset_path}&config={dataset}&split={split}" + size_response = requests.get(size_url).json() + total_rows = size_response['size']['config']['num_rows'] + audio_urls = [] + for offset in tqdm(range(0, total_rows, batch_size), desc="Fetching audio URLs"): + params = { + "dataset": dataset_path, + "config": dataset, + "split": split, + "offset": offset, + "length": min(batch_size, total_rows - offset) + } + + retries = 0 + while retries <= max_retries: + try: + response = requests.get(API_URL, params=params) + response.raise_for_status() + data = response.json() + audio_urls.extend(data['rows']) + break + except (requests.exceptions.RequestException, ValueError) as e: + retries += 1 + print(f"Error fetching data: {e}, retrying ({retries}/{max_retries})...") + time.sleep(10) + if retries >= max_retries: + raise Exception("Max retries exceeded while fetching data.") + time.sleep(1) + return audio_urls + +def transcribe_with_retry(model_name, audio_file_path, sample, max_retries=10, use_url=False): + retries = 0 + while retries <= max_retries: + try: + if model_name.startswith("assembly/"): + aai.settings.api_key = os.getenv("ASSEMBLYAI_API_KEY") + transcriber = aai.Transcriber() + config = aai.TranscriptionConfig( + speech_model=model_name.split("/")[1], + language_code="en", + ) + if use_url: + audio_url = sample['row']['audio'][0]['src'] + audio_duration = sample['row']['audio_length_s'] + if audio_duration < 0.160: + print(f"Skipping audio duration {audio_duration}s") + return "." + transcript = transcriber.transcribe(audio_url, config=config) + else: + audio_duration = len(sample["audio"]["array"]) / sample["audio"]["sampling_rate"] + if audio_duration < 0.160: + print(f"Skipping audio duration {audio_duration}s") + return "." 
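+                    # Local-file path: transcribe the temporary WAV file written by the caller.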
+ transcript = transcriber.transcribe(audio_file_path, config=config) + + if transcript.status == aai.TranscriptStatus.error: + raise Exception(f"AssemblyAI transcription error: {transcript.error}") + return transcript.text + + elif model_name.startswith("openai/"): + if use_url: + response = requests.get(sample['row']['audio'][0]['src']) + audio_data = BytesIO(response.content) + response = openai.Audio.transcribe( + model=model_name.split("/")[1], + file=audio_data, + response_format="text", + language="en", + temperature=0.0, + ) + else: + with open(audio_file_path, "rb") as audio_file: + response = openai.Audio.transcribe( + model=model_name.split("/")[1], + file=audio_file, + response_format="text", + language="en", + temperature=0.0, + ) + return response.strip() + + elif model_name.startswith("elevenlabs/"): + client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY")) + if use_url: + response = requests.get(sample['row']['audio'][0]['src']) + audio_data = BytesIO(response.content) + transcription = client.speech_to_text.convert( + file=audio_data, + model_id=model_name.split("/")[1], + language_code="eng", + tag_audio_events=True, + + ) + else: + with open(audio_file_path, "rb") as audio_file: + transcription = client.speech_to_text.convert( + file=audio_file, + model_id=model_name.split("/")[1], + language_code="eng", + tag_audio_events=True, + ) + return transcription.text + + elif model_name.startswith("revai/"): + access_token = os.getenv("REVAI_API_KEY") + client = apiclient.RevAiAPIClient(access_token) + + if use_url: + # Submit job with URL for Rev.ai + job = client.submit_job_url( + transcriber=model_name.split("/")[1], + source_config=CustomerUrlData(sample['row']['audio'][0]['src']), + metadata="benchmarking_job", + ) + else: + # Submit job with local file + job = client.submit_job_local_file( + transcriber=model_name.split("/")[1], + filename=audio_file_path, + metadata="benchmarking_job", + ) + + # Polling until job is done + while True: + job_details = client.get_job_details(job.id) + if job_details.status.name in ["IN_PROGRESS", "TRANSCRIBING"]: + time.sleep(0.1) + continue + elif job_details.status.name == "FAILED": + raise Exception("RevAI transcription failed.") + elif job_details.status.name == "TRANSCRIBED": + break + + transcript_object = client.get_transcript_object(job.id) + + # Combine all words from all monologues + transcript_text = [] + for monologue in transcript_object.monologues: + for element in monologue.elements: + transcript_text.append(element.value) + + return "".join(transcript_text) if transcript_text else "" + + else: + raise ValueError("Invalid model prefix, must start with 'assembly/', 'openai/', 'elevenlabs/' or 'revai/'") + + except Exception as e: + retries += 1 + if retries > max_retries: + return "." + + if not use_url: + sf.write(audio_file_path, sample["audio"]["array"], sample["audio"]["sampling_rate"], format="WAV") + delay = 1 + print(f"API Error: {str(e)}. Retrying in {delay}s... 
(Attempt {retries}/{max_retries})") + time.sleep(delay) + + +def transcribe_dataset(dataset_path, dataset, split, model_name, use_url=False, max_samples=None, max_workers=4): + if use_url: + audio_rows = fetch_audio_urls(dataset_path, dataset, split) + if max_samples: + audio_rows = audio_rows[:max_samples] + ds = audio_rows + else: + ds = datasets.load_dataset(dataset_path, dataset, split=split, streaming=False) + ds = data_utils.prepare_data(ds) + if max_samples: + ds = ds.take(max_samples) + + results = {"references": [], "predictions": [], "audio_length_s": [], "transcription_time_s": []} + + print(f"Transcribing with model: {model_name}") + + def process_sample(sample): + if use_url: + reference = sample['row']['text'].strip() or " " + audio_duration = sample['row']['audio_length_s'] + start = time.time() + try: + transcription = transcribe_with_retry(model_name, None, sample, use_url=True) + except Exception as e: + print(f"Failed to transcribe after retries: {e}") + return None + + else: + reference = sample.get("norm_text", "").strip() or " " + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile: + sf.write(tmpfile.name, sample["audio"]["array"], sample["audio"]["sampling_rate"], format="WAV") + tmp_path = tmpfile.name + audio_duration = len(sample["audio"]["array"]) / sample["audio"]["sampling_rate"] + + start = time.time() + try: + transcription = transcribe_with_retry(model_name, tmp_path, sample, use_url=False) + except Exception as e: + print(f"Failed to transcribe after retries: {e}") + os.unlink(tmp_path) + return None + finally: + if os.path.exists(tmp_path): + os.unlink(tmp_path) + else: + print(f"File {tmp_path} does not exist") + + transcription_time = time.time() - start + return reference, transcription, audio_duration, transcription_time + + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + future_to_sample = {executor.submit(process_sample, sample): sample for sample in ds} + for future in tqdm(concurrent.futures.as_completed(future_to_sample), total=len(future_to_sample), desc="Transcribing"): + result = future.result() + if result: + reference, transcription, audio_duration, transcription_time = result + results["predictions"].append(transcription) + results["references"].append(reference) + results["audio_length_s"].append(audio_duration) + results["transcription_time_s"].append(transcription_time) + + results["predictions"] = [data_utils.normalizer(transcription) or " " for transcription in results["predictions"]] + results["references"] = [data_utils.normalizer(reference) or " " for reference in results["references"]] + + manifest_path = data_utils.write_manifest( + results["references"], + results["predictions"], + model_name.replace("/", "-"), + dataset_path, + dataset, + split, + audio_length=results["audio_length_s"], + transcription_time=results["transcription_time_s"], + ) + + print("Results saved at path:", manifest_path) + + wer_metric = evaluate.load("wer") + wer = wer_metric.compute(references=results["references"], predictions=results["predictions"]) + wer_percent = round(100 * wer, 2) + rtfx = round(sum(results["audio_length_s"]) / sum(results["transcription_time_s"]), 2) + + print("WER:", wer_percent, "%") + print("RTFx:", rtfx) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Unified Transcription Script with Concurrency") + parser.add_argument("--dataset_path", required=True) + parser.add_argument("--dataset", required=True) + parser.add_argument("--split", 
default="test") + parser.add_argument("--model_name", required=True, help="Prefix model name with 'assembly/', 'openai/', or 'elevenlabs/'") + parser.add_argument("--max_samples", type=int, default=None) + parser.add_argument("--max_workers", type=int, default=300, help="Number of concurrent threads") + parser.add_argument("--use_url", action="store_true", help="Use URL-based audio fetching instead of datasets") + + args = parser.parse_args() + + transcribe_dataset( + dataset_path=args.dataset_path, + dataset=args.dataset, + split=args.split, + model_name=args.model_name, + use_url=args.use_url, + max_samples=args.max_samples, + max_workers=args.max_workers, + ) diff --git a/ctranslate2/run_eval.py b/ctranslate2/run_eval.py new file mode 100644 index 0000000..cdd0d3f --- /dev/null +++ b/ctranslate2/run_eval.py @@ -0,0 +1,141 @@ +"""Run evaluation for ctranslate2 whisper models.""""" +import argparse +import os +import time + +import evaluate +from faster_whisper import WhisperModel +from tqdm import tqdm + +from normalizer import data_utils + +wer_metric = evaluate.load("wer") + + +def main(args) -> None: + """Main function to run evaluation on a dataset.""" + asr_model = WhisperModel( + model_size_or_path=args.model_id, + compute_type="float16", + device="cuda", + device_index=args.device + ) + + def benchmark(batch): + start_time = time.time() + segments, _ = asr_model.transcribe(batch["audio"]["array"], language="en") + outputs = [segment._asdict() for segment in segments] + batch["transcription_time_s"] = time.time() - start_time + batch["predictions"] = data_utils.normalizer("".join([segment["text"] for segment in outputs])).strip() + batch["references"] = batch["norm_text"] + return batch + + if args.warmup_steps is not None: + dataset = data_utils.load_data(args) + dataset = data_utils.prepare_data(dataset) + + if args.streaming: + warmup_dataset = dataset.take(args.warmup_steps) + else: + warmup_dataset = dataset.select(range(min(args.warmup_steps, len(dataset)))) + warmup_dataset = iter(warmup_dataset.map(benchmark, remove_columns=["audio"])) + + for _ in tqdm(warmup_dataset, desc="Warming up..."): + continue + + dataset = data_utils.load_data(args) + if args.max_eval_samples is not None and args.max_eval_samples > 0: + print(f"Subsampling dataset to first {args.max_eval_samples} samples!") + if args.streaming: + dataset = dataset.take(args.max_eval_samples) + else: + dataset = dataset.select(range(min(args.max_eval_samples, len(dataset)))) + dataset = data_utils.prepare_data(dataset) + + dataset = dataset.map(benchmark, remove_columns=["audio"]) + + all_results = { + "audio_length_s": [], + "transcription_time_s": [], + "predictions": [], + "references": [], + } + result_iter = iter(dataset) + for result in tqdm(result_iter, desc="Samples..."): + for key in all_results: + all_results[key].append(result[key]) + + # Write manifest results (WER and RTFX) + manifest_path = data_utils.write_manifest( + all_results["references"], + all_results["predictions"], + args.model_id, + args.dataset_path, + args.dataset, + args.split, + audio_length=all_results["audio_length_s"], + transcription_time=all_results["transcription_time_s"], + ) + print("Results saved at path:", os.path.abspath(manifest_path)) + + wer = wer_metric.compute( + references=all_results["references"], predictions=all_results["predictions"] + ) + wer = round(100 * wer, 2) + rtfx = round(sum(all_results["audio_length_s"]) / sum(all_results["transcription_time_s"]), 2) + print("WER:", wer, "%", "RTFx:", rtfx) + + +if 
__name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--model_id", + type=str, + required=True, + help="Model identifier. Should be loadable with faster-whisper", + ) + parser.add_argument( + '--dataset_path', type=str, default='esb/datasets', help='Dataset path. By default, it is `esb/datasets`' + ) + parser.add_argument( + "--dataset", + type=str, + required=True, + help="Dataset name. *E.g.* `'librispeech_asr` for the LibriSpeech ASR dataset, or `'common_voice'` for Common Voice. The full list of dataset names " + "can be found at `https://huggingface.co/datasets/esb/datasets`" + ) + parser.add_argument( + "--split", + type=str, + default="test", + help="Split of the dataset. *E.g.* `'validation`' for the dev split, or `'test'` for the test split.", + ) + parser.add_argument( + "--device", + type=int, + default=-1, + help="The device to run the pipeline on. -1 for CPU (default), 0 for the first GPU and so on.", + ) + parser.add_argument( + "--max_eval_samples", + type=int, + default=None, + help="Number of samples to be evaluated. Put a lower number e.g. 64 for testing this script.", + ) + parser.add_argument( + "--no-streaming", + dest='streaming', + action="store_false", + help="Choose whether you'd like to download the entire dataset or stream it during the evaluation.", + ) + parser.add_argument( + "--warmup_steps", + type=int, + default=5, + help="Number of warm-up steps to run before launching the timed runs.", + ) + args = parser.parse_args() + parser.set_defaults(streaming=False) + + main(args) diff --git a/ctranslate2/run_whisper.sh b/ctranslate2/run_whisper.sh new file mode 100755 index 0000000..e294e3c --- /dev/null +++ b/ctranslate2/run_whisper.sh @@ -0,0 +1,84 @@ +#!/bin/bash + +export PYTHONPATH="..":$PYTHONPATH + +MODEL_IDs=("tiny.en" "small.en" "base.en" "medium.en" "large-v1" "large-v2" "large-v3") +DEVICE_INDEX=0 + +num_models=${#MODEL_IDs[@]} + +for (( i=0; i<${num_models}; i++ )); +do + MODEL_ID=${MODEL_IDs[$i]} + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="ami" \ + --split="test" \ + --device=${DEVICE_INDEX} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="earnings22" \ + --split="test" \ + --device=${DEVICE_INDEX} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="gigaspeech" \ + --split="test" \ + --device=${DEVICE_INDEX} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.clean" \ + --device=${DEVICE_INDEX} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.other" \ + --device=${DEVICE_INDEX} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="spgispeech" \ + --split="test" \ + --device=${DEVICE_INDEX} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="tedlium" \ + --split="test" \ + --device=${DEVICE_INDEX} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + 
--dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="voxpopuli" \ + --split="test" \ + --device=${DEVICE_INDEX} \ + --max_eval_samples=-1 + + # Evaluate results + RUNDIR=`pwd` && \ + cd ../normalizer && \ + python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \ + cd $RUNDIR + +done diff --git a/data/sample_4469669.wav b/data/sample_4469669.wav new file mode 100644 index 0000000..e9708e6 Binary files /dev/null and b/data/sample_4469669.wav differ diff --git a/data/sample_ami-es2015b.wav b/data/sample_ami-es2015b.wav new file mode 100644 index 0000000..74aaf02 Binary files /dev/null and b/data/sample_ami-es2015b.wav differ diff --git a/granite/run_eval.py b/granite/run_eval.py new file mode 100644 index 0000000..da84947 --- /dev/null +++ b/granite/run_eval.py @@ -0,0 +1,219 @@ +import argparse +import os +import torch +from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, models +import evaluate +from normalizer import data_utils +import time +from tqdm import tqdm + +# ensure installed transformers supports granite_speech +assert hasattr(models, "granite_speech") + +wer_metric = evaluate.load("wer") +torch.set_float32_matmul_precision('high') + +def main(args): + processor = AutoProcessor.from_pretrained(args.model_id) + tokenizer = processor.tokenizer + model = AutoModelForSpeechSeq2Seq.from_pretrained(args.model_id).to(args.device) + + # create text prompt + chat = [ + { + "role": "system", + "content": "Knowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant", + }, + { + "role": "user", + "content": "<|audio|>can you transcribe the speech into a written format?", + } + ] + + text = tokenizer.apply_chat_template( + chat, tokenize=False, add_generation_prompt=True + ) + + gen_kwargs = {"max_new_tokens": args.max_new_tokens, "num_beams": args.num_beams} + + def benchmark(batch, min_new_tokens=None): + # Load audio inputs + audios = [audio["array"] for audio in batch["audio"]] + minibatch_size = len(audios) + texts=[text] * minibatch_size + + # START TIMING + start_time = time.time() + + with torch.autocast(model.device.type, enabled=True): + model_inputs = processor( + texts, + audios, + device=args.device, # Computation device; returned tensors are put on CPU + return_tensors="pt", + ).to(args.device) + + # Model Inference + model_outputs = model.generate( + **model_inputs, + bos_token_id=tokenizer.bos_token_id, + pad_token_id=tokenizer.pad_token_id, + eos_token_id=tokenizer.eos_token_id, + repetition_penalty=1.0, + **gen_kwargs, + min_new_tokens=min_new_tokens, + ) + + # Transformers includes the input IDs in the response. 
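+            # Strip them off below so that only the newly generated transcription tokens are decoded.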
+ num_input_tokens = model_inputs["input_ids"].shape[-1] + new_tokens = model_outputs[:, num_input_tokens:] + + output_text = tokenizer.batch_decode( + new_tokens, add_special_tokens=False, skip_special_tokens=True + ) + + # END TIMING + runtime = time.time() - start_time + + # normalize by minibatch size since we want the per-sample time + batch["transcription_time_s"] = minibatch_size * [runtime / minibatch_size] + + # normalize transcriptions with English normalizer + batch["predictions"] = [data_utils.normalizer(pred) for pred in output_text] + batch["references"] = batch["norm_text"] + return batch + + if args.warmup_steps is not None: + dataset = data_utils.load_data(args) + dataset = data_utils.prepare_data(dataset) + + num_warmup_samples = args.warmup_steps * args.batch_size + if args.streaming: + warmup_dataset = dataset.take(num_warmup_samples) + else: + warmup_dataset = dataset.select(range(min(num_warmup_samples, len(dataset)))) + warmup_dataset = iter(warmup_dataset.map(benchmark, batch_size=args.batch_size, batched=True, fn_kwargs={"min_new_tokens": args.max_new_tokens})) + + for _ in tqdm(warmup_dataset, desc="Warming up..."): + continue + + dataset = data_utils.load_data(args) + if args.max_eval_samples is not None and args.max_eval_samples > 0: + print(f"Subsampling dataset to first {args.max_eval_samples} samples!") + if args.streaming: + dataset = dataset.take(args.max_eval_samples) + else: + dataset = dataset.select(range(min(args.max_eval_samples, len(dataset)))) + dataset = data_utils.prepare_data(dataset) + + dataset = dataset.map( + benchmark, batch_size=args.batch_size, batched=True, remove_columns=["audio"], + ) + + all_results = { + "audio_length_s": [], + "transcription_time_s": [], + "predictions": [], + "references": [], + } + result_iter = iter(dataset) + for result in tqdm(result_iter, desc="Samples..."): + for key in all_results: + all_results[key].append(result[key]) + + # Write manifest results (WER and RTFX) + manifest_path = data_utils.write_manifest( + all_results["references"], + all_results["predictions"], + args.model_id, + args.dataset_path, + args.dataset, + args.split, + audio_length=all_results["audio_length_s"], + transcription_time=all_results["transcription_time_s"], + ) + print("Results saved at path:", os.path.abspath(manifest_path)) + + wer = wer_metric.compute( + references=all_results["references"], predictions=all_results["predictions"] + ) + wer = round(100 * wer, 2) + rtfx = round(sum(all_results["audio_length_s"]) / sum(all_results["transcription_time_s"]), 2) + print("WER:", wer, "%", "RTFx:", rtfx) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--model_id", + type=str, + required=True, + help="Model identifier. Should be loadable with 🤗 Transformers", + ) + parser.add_argument( + "--dataset_path", + type=str, + default="esb/datasets", + help="Dataset path. By default, it is `esb/datasets`", + ) + parser.add_argument( + "--dataset", + type=str, + required=True, + help="Dataset name. *E.g.* `'librispeech_asr` for the LibriSpeech ASR dataset, or `'common_voice'` for Common Voice. The full list of dataset names " + "can be found at `https://huggingface.co/datasets/esb/datasets`", + ) + parser.add_argument( + "--split", + type=str, + default="test", + help="Split of the dataset. *E.g.* `'validation`' for the dev split, or `'test'` for the test split.", + ) + parser.add_argument( + "--device", + type=int, + default=-1, + help="The device to run the pipeline on. 
-1 for CPU (default), 0 for the first GPU and so on.", + ) + parser.add_argument( + "--batch_size", + type=int, + default=16, + help="Number of samples to go through each streamed batch.", + ) + parser.add_argument( + "--num_beams", + type=int, + default=1, + help="Number of beams for beam search.", + ) + parser.add_argument( + "--max_eval_samples", + type=int, + default=None, + help="Number of samples to be evaluated. Put a lower number e.g. 64 for testing this script.", + ) + parser.add_argument( + "--no-streaming", + dest="streaming", + action="store_false", + help="Choose whether you'd like to download the entire dataset or stream it during the evaluation.", + ) + parser.add_argument( + "--max_new_tokens", + type=int, + default=None, + help="Maximum number of tokens to generate (for auto-regressive models).", + ) + parser.add_argument( + "--warmup_steps", + type=int, + default=2, + help="Number of warm-up steps to run before launching the timed runs.", + ) + + args = parser.parse_args() + parser.set_defaults(streaming=False) + + main(args) diff --git a/granite/run_granite.sh b/granite/run_granite.sh new file mode 100755 index 0000000..6c83d42 --- /dev/null +++ b/granite/run_granite.sh @@ -0,0 +1,119 @@ +#!/bin/bash + +export PYTHONPATH="..":$PYTHONPATH + +MODEL_IDs=( + "ibm-granite/granite-speech-3.3-2b" + "ibm-granite/granite-speech-3.3-8b" +) + +BATCH_SIZEs=( + 20 + 12 +) + +NUM_BEAMS=1 +MAX_NEW_TOKENS=200 + +num_models=${#MODEL_IDs[@]} + +for (( i=0; i<${num_models}; i++ )); +do + MODEL_ID=${MODEL_IDs[$i]} + BATCH_SIZE=${BATCH_SIZEs[$i]} + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="voxpopuli" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --num_beams=${NUM_BEAMS} \ + --max_eval_samples=-1 \ + --max_new_tokens=${MAX_NEW_TOKENS} + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="ami" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --num_beams=${NUM_BEAMS} \ + --max_eval_samples=-1 \ + --max_new_tokens=${MAX_NEW_TOKENS} + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="earnings22" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --num_beams=${NUM_BEAMS} \ + --max_eval_samples=-1 \ + --max_new_tokens=${MAX_NEW_TOKENS} + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="gigaspeech" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --num_beams=${NUM_BEAMS} \ + --max_eval_samples=-1 \ + --max_new_tokens=${MAX_NEW_TOKENS} + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.clean" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --num_beams=${NUM_BEAMS} \ + --max_eval_samples=-1 \ + --max_new_tokens=${MAX_NEW_TOKENS} + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.other" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --num_beams=${NUM_BEAMS} \ + --max_eval_samples=-1 \ + --max_new_tokens=${MAX_NEW_TOKENS} + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="spgispeech" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} 
\ + --num_beams=${NUM_BEAMS} \ + --max_eval_samples=-1 \ + --max_new_tokens=${MAX_NEW_TOKENS} + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="tedlium" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --num_beams=${NUM_BEAMS} \ + --max_eval_samples=-1 \ + --max_new_tokens=${MAX_NEW_TOKENS} + + # Evaluate results + RUNDIR=`pwd` && \ + cd ../normalizer && \ + python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \ + cd $RUNDIR + +done diff --git a/liteASR/run_eval.py b/liteASR/run_eval.py new file mode 100644 index 0000000..99a7d6e --- /dev/null +++ b/liteASR/run_eval.py @@ -0,0 +1,231 @@ +import argparse +import os +import torch +from torch.nn.attention import sdpa_kernel, SDPBackend +from transformers import AutoConfig, AutoModel, AutoModelForCTC, AutoProcessor, MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING +import evaluate +from normalizer import data_utils +import time +from tqdm import tqdm + +wer_metric = evaluate.load("wer") +torch.set_float32_matmul_precision('high') + +def main(args): + model = AutoModel.from_pretrained(args.model_id, torch_dtype=torch.float16, trust_remote_code=True, force_download=True).to(args.device) + processor = AutoProcessor.from_pretrained("openai/whisper-large-v3-turbo", force_download=True) + model_input_name = processor.model_input_names[0] + + if model.can_generate(): + gen_kwargs = {"max_new_tokens": 224} + elif args.max_new_tokens: + raise ValueError("`max_new_tokens` should only be set for auto-regressive models, but got a CTC model.") + + if args.torch_compile: + model.forward = torch.compile(model.forward, mode=args.compile_mode, fullgraph=True) + if model.can_generate(): + # enable static k/v cache for autoregressive models + model.generation_config.cache_implementation = "static" + + def benchmark(batch, min_new_tokens=None): + # Load audio inputs + audios = [audio["array"] for audio in batch["audio"]] + minibatch_size = len(audios) + + # START TIMING + start_time = time.time() + + # 1. Pre-Processing + # 1.1 Pad audios to max batch size if using torch compile to prevent re-compilations + padding_size = None + if minibatch_size != args.batch_size and args.torch_compile: + padding_size = args.batch_size - minibatch_size + padding_audios = [audios[-1] for _ in range(padding_size)] + audios.extend(padding_audios) + + if not model.can_generate(): #or len(audios[0]) > processor.feature_extractor.n_samples: + # 1.2 Either CTC pre-processing (normalize to mean 0, std 1), or long-form Whisper processing + inputs = processor( + audios, + sampling_rate=16_000, + truncation=False, + padding="longest", + return_tensors="pt", + return_attention_mask=True, + ) + else: + # 1.3 Standard Whisper processing: pad audios to 30-seconds and converted to log-mel + inputs = processor(audios, sampling_rate=16_000, return_tensors="pt", device=args.device) + + inputs = inputs.to(args.device) + inputs[model_input_name] = inputs[model_input_name].to(torch.float16) + + # 2. Model Inference + with sdpa_kernel(SDPBackend.MATH if args.torch_compile else SDPBackend.FLASH_ATTENTION): + forced_decoder_ids = processor.get_decoder_prompt_ids(language="english", task="transcribe") + if model.can_generate(): + # 2.1 Auto-regressive generation for encoder-decoder models + pred_ids = model.generate(**inputs, **gen_kwargs, min_new_tokens=min_new_tokens, forced_decoder_ids=forced_decoder_ids) + else: + # 2.2. 
Single forward pass for CTC + with torch.no_grad(): + logits = model(**inputs).logits + pred_ids = logits.argmax(-1) + + # 3. Post-processing + # 3.1 Strip padded ids from predictions + if padding_size is not None: + pred_ids = pred_ids[:-padding_size, ...] + + # 3.2 Convert token ids to text transcription + pred_text = processor.batch_decode(pred_ids, skip_special_tokens=True) + + # END TIMING + runtime = time.time() - start_time + + # normalize by minibatch size since we want the per-sample time + batch["transcription_time_s"] = minibatch_size * [runtime / minibatch_size] + + # normalize transcriptions with English normalizer + batch["predictions"] = [data_utils.normalizer(pred) for pred in pred_text] + batch["references"] = batch["norm_text"] + return batch + + if args.warmup_steps is not None: + dataset = data_utils.load_data(args) + dataset = data_utils.prepare_data(dataset) + + num_warmup_samples = args.warmup_steps * args.batch_size + if args.streaming: + warmup_dataset = dataset.take(num_warmup_samples) + else: + warmup_dataset = dataset.select(range(min(num_warmup_samples, len(dataset)))) + warmup_dataset = iter(warmup_dataset.map(benchmark, batch_size=args.batch_size, batched=True, fn_kwargs={"min_new_tokens": args.max_new_tokens})) + + for _ in tqdm(warmup_dataset, desc="Warming up..."): + continue + + dataset = data_utils.load_data(args) + if args.max_eval_samples is not None and args.max_eval_samples > 0: + print(f"Subsampling dataset to first {args.max_eval_samples} samples!") + if args.streaming: + dataset = dataset.take(args.max_eval_samples) + else: + dataset = dataset.select(range(min(args.max_eval_samples, len(dataset)))) + dataset = data_utils.prepare_data(dataset) + + dataset = dataset.map( + benchmark, batch_size=args.batch_size, batched=True, remove_columns=["audio"], + ) + + all_results = { + "audio_length_s": [], + "transcription_time_s": [], + "predictions": [], + "references": [], + } + result_iter = iter(dataset) + for result in tqdm(result_iter, desc="Samples..."): + for key in all_results: + all_results[key].append(result[key]) + + # Write manifest results (WER and RTFX) + manifest_path = data_utils.write_manifest( + all_results["references"], + all_results["predictions"], + args.model_id, + args.dataset_path, + args.dataset, + args.split, + audio_length=all_results["audio_length_s"], + transcription_time=all_results["transcription_time_s"], + ) + print("Results saved at path:", os.path.abspath(manifest_path)) + + wer = wer_metric.compute( + references=all_results["references"], predictions=all_results["predictions"] + ) + wer = round(100 * wer, 2) + rtfx = round(sum(all_results["audio_length_s"]) / sum(all_results["transcription_time_s"]), 2) + print("WER:", wer, "%", "RTFx:", rtfx) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--model_id", + type=str, + required=True, + help="Model identifier. Should be loadable with 🤗 Transformers", + ) + parser.add_argument( + "--dataset_path", + type=str, + default="esb/datasets", + help="Dataset path. By default, it is `esb/datasets`", + ) + parser.add_argument( + "--dataset", + type=str, + required=True, + help="Dataset name. *E.g.* `'librispeech_asr` for the LibriSpeech ASR dataset, or `'common_voice'` for Common Voice. The full list of dataset names " + "can be found at `https://huggingface.co/datasets/esb/datasets`", + ) + parser.add_argument( + "--split", + type=str, + default="test", + help="Split of the dataset. 
*E.g.* `'validation'` for the dev split, or `'test'` for the test split.", + ) + parser.add_argument( + "--device", + type=int, + default=-1, + help="The device to run the pipeline on. -1 for CPU (default), 0 for the first GPU and so on.", + ) + parser.add_argument( + "--batch_size", + type=int, + default=16, + help="Number of samples to go through each streamed batch.", + ) + parser.add_argument( + "--max_eval_samples", + type=int, + default=None, + help="Number of samples to be evaluated. Put a lower number e.g. 64 for testing this script.", + ) + parser.add_argument( + "--no-streaming", + dest="streaming", + action="store_false", + help="Choose whether you'd like to download the entire dataset or stream it during the evaluation.", + ) + parser.add_argument( + "--max_new_tokens", + type=int, + default=None, + help="Maximum number of tokens to generate (for auto-regressive models).", + ) + parser.add_argument( + "--torch_compile", + action="store_true", + help="Whether to JIT compile the forward pass of the model.", + ) + parser.add_argument( + "--compile_mode", + type=str, + default="max-autotune", + help="Mode for torch compiling model forward pass. Can be either 'default', 'reduce-overhead', 'max-autotune' or 'max-autotune-no-cudagraphs'.", + ) + parser.add_argument( + "--warmup_steps", + type=int, + default=10, + help="Number of warm-up steps to run before launching the timed runs.", + ) + args = parser.parse_args() + parser.set_defaults(streaming=False) + + main(args) diff --git a/liteASR/run_liteasr.sh b/liteASR/run_liteasr.sh new file mode 100755 index 0000000..600cec9 --- /dev/null +++ b/liteASR/run_liteasr.sh @@ -0,0 +1,99 @@ +#!/bin/bash + +export PYTHONPATH="..":$PYTHONPATH + +MODEL_IDs=( + "efficient-speech/lite-whisper-large-v3-acc" + "efficient-speech/lite-whisper-large-v3" + "efficient-speech/lite-whisper-large-v3-fast" + "efficient-speech/lite-whisper-large-v3-turbo-acc" + "efficient-speech/lite-whisper-large-v3-turbo" + "efficient-speech/lite-whisper-large-v3-turbo-fast" +) +BATCH_SIZE=64 + +num_models=${#MODEL_IDs[@]} + +for (( i=0; i<${num_models}; i++ )); +do + MODEL_ID=${MODEL_IDs[$i]} + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="voxpopuli" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="ami" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="earnings22" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="gigaspeech" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.clean" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.other" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + 
--model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="spgispeech" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="tedlium" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + # Evaluate results + RUNDIR=`pwd` && \ + cd ../normalizer && \ + python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \ + cd $RUNDIR + +done diff --git a/moonshine/run_eval.py b/moonshine/run_eval.py new file mode 100644 index 0000000..c575ab0 --- /dev/null +++ b/moonshine/run_eval.py @@ -0,0 +1,208 @@ +import argparse +import os +import torch +from transformers import MoonshineForConditionalGeneration, AutoProcessor + +import evaluate +from normalizer import data_utils +import time +from tqdm import tqdm +import numpy as np + +wer_metric = evaluate.load("wer") +torch.set_float32_matmul_precision('high') + +def main(args): +
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32 + model = MoonshineForConditionalGeneration.from_pretrained(args.model_id).to(args.device).to(torch_dtype) + processor = AutoProcessor.from_pretrained(args.model_id) + + if args.torch_compile: + model.forward = torch.compile(model.forward, mode=args.compile_mode, fullgraph=True) + if model.can_generate(): + # enable static k/v cache for autoregressive models + model.generation_config.cache_implementation = "static" + + def benchmark(batch, min_new_tokens=None): +
+ # Load audio inputs + audios = [audio["array"] for audio in batch["audio"]] + minibatch_size = len(audios) + + # START TIMING + start_time = time.time() + + # 1. Pre-Processing + # 1.1 Pad audios to max batch size if using torch compile to prevent re-compilations + padding_size = 0 + if minibatch_size != args.batch_size and args.torch_compile: + padding_size = args.batch_size - minibatch_size + padding_audios = [audios[-1] for _ in range(padding_size)] + audios.extend(padding_audios) + + inputs = processor(audios, return_tensors="pt", padding=True, sampling_rate=16000).to(args.device).to(torch_dtype) + + # Create a mask for output tokens to limit length based on input audio clip length. + # Add 2 to token limits to account for the start-of-sequence and end-of-sequence tokens.
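+ # Heuristic cap: allow roughly 6.5 generated tokens per second of 16 kHz input audio
+ # (len(clip) // 16000 gives the clip duration in seconds).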
+ token_generation_limits = [len(clip) * 6.5 // 16000 + 2 for clip in audios] + max_new_tokens = torch.tensor(token_generation_limits).reshape((-1, 1)).to(args.device) + + pred_ids = model.generate(**inputs, max_new_tokens=max_new_tokens.max()) + output_mask = torch.arange(pred_ids.shape[-1]).repeat((pred_ids.shape[0], 1)).to(args.device) + output_mask = output_mask > max_new_tokens + + eot_token = model.config.eos_token_id + pred_ids.masked_fill(output_mask, eot_token) + + # 3.2 Convert token ids to text transcription + pred_text = processor.batch_decode(pred_ids, skip_special_tokens=True) + + # END TIMING + runtime = time.time() - start_time + + # normalize by minibatch size since we want the per-sample time + batch["transcription_time_s"] = minibatch_size * [runtime / minibatch_size] + + # normalize transcriptions with English normalizer + pred_text = pred_text if padding_size == 0 else pred_text[:-padding_size] + batch["predictions"] = [data_utils.normalizer(pred) for pred in pred_text] + batch["references"] = batch["norm_text"] + return batch + + if args.warmup_steps is not None: + dataset = data_utils.load_data(args) + dataset = data_utils.prepare_data(dataset) + + num_warmup_samples = args.warmup_steps * args.batch_size + if args.streaming: + warmup_dataset = dataset.take(num_warmup_samples) + else: + warmup_dataset = dataset.select(range(min(num_warmup_samples, len(dataset)))) + warmup_dataset = iter(warmup_dataset.map(benchmark, batch_size=args.batch_size, batched=True, fn_kwargs={"min_new_tokens": args.max_new_tokens})) + + for _ in tqdm(warmup_dataset, desc="Warming up..."): + continue + + dataset = data_utils.load_data(args) + if args.max_eval_samples is not None and args.max_eval_samples > 0: + print(f"Subsampling dataset to first {args.max_eval_samples} samples!") + if args.streaming: + dataset = dataset.take(args.max_eval_samples) + else: + dataset = dataset.select(range(min(args.max_eval_samples, len(dataset)))) + dataset = data_utils.prepare_data(dataset) + + dataset = dataset.map( + benchmark, batch_size=args.batch_size, batched=True, remove_columns=["audio"], + ) + + all_results = { + "audio_length_s": [], + "transcription_time_s": [], + "predictions": [], + "references": [], + } + result_iter = iter(dataset) + for result in tqdm(result_iter, desc="Samples..."): + for key in all_results: + all_results[key].append(result[key]) + + # Write manifest results (WER and RTFX) + manifest_path = data_utils.write_manifest( + all_results["references"], + all_results["predictions"], + args.model_id, + args.dataset_path, + args.dataset, + args.split, + audio_length=all_results["audio_length_s"], + transcription_time=all_results["transcription_time_s"], + ) + print("Results saved at path:", os.path.abspath(manifest_path)) + + wer = wer_metric.compute( + references=all_results["references"], predictions=all_results["predictions"] + ) + wer = round(100 * wer, 2) + rtfx = round(sum(all_results["audio_length_s"]) / sum(all_results["transcription_time_s"]), 2) + print("WER:", wer, "%", "RTFx:", rtfx) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--model_id", + type=str, + required=True, + help="Model identifier. Should be loadable with 🤗 Transformers", + ) + parser.add_argument( + "--dataset_path", + type=str, + default="esb/datasets", + help="Dataset path. By default, it is `esb/datasets`", + ) + parser.add_argument( + "--dataset", + type=str, + required=True, + help="Dataset name. 
*E.g.* `'librispeech_asr` for the LibriSpeech ASR dataset, or `'common_voice'` for Common Voice. The full list of dataset names " + "can be found at `https://huggingface.co/datasets/esb/datasets`", + ) + parser.add_argument( + "--split", + type=str, + default="test", + help="Split of the dataset. *E.g.* `'validation`' for the dev split, or `'test'` for the test split.", + ) + parser.add_argument( + "--device", + type=int, + default=-1, + help="The device to run the pipeline on. -1 for CPU (default), 0 for the first GPU and so on.", + ) + parser.add_argument( + "--batch_size", + type=int, + default=16, + help="Number of samples to go through each streamed batch.", + ) + parser.add_argument( + "--max_eval_samples", + type=int, + default=None, + help="Number of samples to be evaluated. Put a lower number e.g. 64 for testing this script.", + ) + parser.add_argument( + "--no-streaming", + dest="streaming", + action="store_false", + help="Choose whether you'd like to download the entire dataset or stream it during the evaluation.", + ) + parser.add_argument( + "--max_new_tokens", + type=int, + default=None, + help="Maximum number of tokens to generate (for auto-regressive models).", + ) + parser.add_argument( + "--torch_compile", + action="store_true", + help="Whether to JIT compile the forward pass of the model.", + ) + parser.add_argument( + "--compile_mode", + type=str, + default="max-autotune", + help="Mode for torch compiling model forward pass. Can be either 'default', 'reduce-overhead', 'max-autotune' or 'max-autotune-no-cudagraphs'.", + ) + parser.add_argument( + "--warmup_steps", + type=int, + default=10, + help="Number of warm-up steps to run before launching the timed runs.", + ) + args = parser.parse_args() + parser.set_defaults(streaming=False) + + main(args) diff --git a/moonshine/run_moonshine.sh b/moonshine/run_moonshine.sh new file mode 100755 index 0000000..49844be --- /dev/null +++ b/moonshine/run_moonshine.sh @@ -0,0 +1,92 @@ +#!/bin/bash + +export PYTHONPATH="..":$PYTHONPATH + +MODEL_IDs=("usefulsensors/moonshine-base" "usefulsensors/moonshine-tiny") +BATCH_SIZE=64 + +num_models=${#MODEL_IDs[@]} + +for (( i=0; i<${num_models}; i++ )); +do + MODEL_ID=${MODEL_IDs[$i]} + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="voxpopuli" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="ami" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="earnings22" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="gigaspeech" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.clean" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.other" \ + --device=0 \ + 
--batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="spgispeech" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="tedlium" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + # Evaluate results + RUNDIR=`pwd` && \ + cd ../normalizer && \ + python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \ + cd $RUNDIR + +done diff --git a/nemo_asr/run_canary.sh b/nemo_asr/run_canary.sh new file mode 100755 index 0000000..bf26959 --- /dev/null +++ b/nemo_asr/run_canary.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +export PYTHONPATH="..":$PYTHONPATH + +MODEL_IDs=("nvidia/canary-1b-flash") # options: "nvidia/canary-1b" "nvidia/canary-1b-flash" "nvidia/canary-180m-flash" +BATCH_SIZE=128 +DEVICE_ID=0 + +num_models=${#MODEL_IDs[@]} + +for (( i=0; i<${num_models}; i++ )); +do + MODEL_ID=${MODEL_IDs[$i]} + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="ami" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="earnings22" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="gigaspeech" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.clean" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.other" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="spgispeech" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="tedlium" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="voxpopuli" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + # Evaluate results + RUNDIR=`pwd` && \ + cd ../normalizer && \ + python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \ + cd $RUNDIR + +done diff --git a/nemo_asr/run_eval.py b/nemo_asr/run_eval.py new file mode 100644 index 0000000..651cf16 --- /dev/null +++ b/nemo_asr/run_eval.py @@ -0,0 +1,219 @@ +import argparse + +import io +import os +import torch +import evaluate +import soundfile + +from tqdm import tqdm +from 
normalizer import data_utils +import numpy as np + +from nemo.collections.asr.models import ASRModel +import time + + +wer_metric = evaluate.load("wer") + + +def main(args): + + DATA_CACHE_DIR = os.path.join(os.getcwd(), "audio_cache") + DATASET_NAME = args.dataset + SPLIT_NAME = args.split + + CACHE_DIR = os.path.join(DATA_CACHE_DIR, DATASET_NAME, SPLIT_NAME) + if not os.path.exists(CACHE_DIR): + os.makedirs(CACHE_DIR) + + if args.device >= 0: + device = torch.device(f"cuda:{args.device}") + compute_dtype=torch.bfloat16 + else: + device = torch.device("cpu") + compute_dtype=torch.float32 + + + if args.model_id.endswith(".nemo"): + asr_model = ASRModel.restore_from(args.model_id, map_location=device) + else: + asr_model = ASRModel.from_pretrained(args.model_id, map_location=device) # type: ASRModel + + asr_model.to(compute_dtype) + asr_model.eval() + + dataset = data_utils.load_data(args) + + def download_audio_files(batch): + + # download audio files and write the paths, transcriptions and durations to a manifest file + audio_paths = [] + durations = [] + + for id, sample in zip(batch["id"], batch["audio"]): + + # first step added here to make ID and wav filenames unique + # several datasets like earnings22 have a hierarchical structure + # for eg. earnings22/test/4432298/281.wav, earnings22/test/4450488/281.wav + # lhotse uses the filename (281.wav) here as unique ID to create and name cuts + # ref: https://github.com/lhotse-speech/lhotse/blob/master/lhotse/dataset/collation.py#L186 + id = id.replace('/', '_').removesuffix('.wav') + + audio_path = os.path.join(CACHE_DIR, f"{id}.wav") + + if "array" in sample: + audio_array = np.float32(sample["array"]) + sample_rate = 16000 + + elif "bytes" in sample: # added to be compatible with latest datasets library (3.x.x) that produces byte stream + with io.BytesIO(sample["bytes"]) as audio_file: + audio_array, sample_rate = soundfile.read(audio_file, dtype="float32") + + else: + raise ValueError("Sample must have either 'array' or 'bytes' key") + + if not os.path.exists(audio_path): + os.makedirs(os.path.dirname(audio_path), exist_ok=True) + soundfile.write(audio_path, audio_array, sample_rate) + + audio_paths.append(audio_path) + durations.append(len(audio_array) / sample_rate) + + + batch["references"] = batch["norm_text"] + batch["audio_filepaths"] = audio_paths + batch["durations"] = durations + + return batch + + + if args.max_eval_samples is not None and args.max_eval_samples > 0: + print(f"Subsampling dataset to first {args.max_eval_samples} samples !") + dataset = dataset.take(args.max_eval_samples) + + dataset = data_utils.prepare_data(dataset) + if asr_model.cfg.decoding.strategy != "beam": + asr_model.cfg.decoding.strategy = "greedy_batch" + asr_model.change_decoding_strategy(asr_model.cfg.decoding) + + # prepraing the offline dataset + dataset = dataset.map(download_audio_files, batch_size=args.batch_size, batched=True, remove_columns=["audio"]) + + # Write manifest from daraset batch using json and keys audio_filepath, duration, text + + all_data = { + "audio_filepaths": [], + "durations": [], + "references": [], + } + + data_itr = iter(dataset) + for data in tqdm(data_itr, desc="Downloading Samples"): + for key in all_data: + all_data[key].append(data[key]) + + # Sort audio_filepaths and references based on durations values + sorted_indices = sorted(range(len(all_data["durations"])), key=lambda k: all_data["durations"][k], reverse=True) + all_data["audio_filepaths"] = [all_data["audio_filepaths"][i] for i in sorted_indices] + 
all_data["references"] = [all_data["references"][i] for i in sorted_indices] + all_data["durations"] = [all_data["durations"][i] for i in sorted_indices] + + + total_time = 0 + for _ in range(2): # warmup once and calculate rtf + if _ == 0: + audio_files = all_data["audio_filepaths"][:args.batch_size * 4] # warmup with 4 batches + else: + audio_files = all_data["audio_filepaths"] + start_time = time.time() + with torch.cuda.amp.autocast(enabled=False, dtype=compute_dtype), torch.inference_mode(), torch.no_grad(): + if 'canary' in args.model_id: + transcriptions = asr_model.transcribe(audio_files, batch_size=args.batch_size, verbose=False, pnc='no', num_workers=1) + else: + transcriptions = asr_model.transcribe(audio_files, batch_size=args.batch_size, verbose=False, num_workers=1) + end_time = time.time() + if _ == 1: + total_time += end_time - start_time + total_time = total_time + + # normalize transcriptions with English normalizer + if isinstance(transcriptions, tuple) and len(transcriptions) == 2: + transcriptions = transcriptions[0] + predictions = [data_utils.normalizer(pred.text) for pred in transcriptions] + + avg_time = total_time / len(all_data["audio_filepaths"]) + + # Write manifest results (WER and RTFX) + manifest_path = data_utils.write_manifest( + all_data["references"], + predictions, + args.model_id, + args.dataset_path, + args.dataset, + args.split, + audio_length=all_data["durations"], + transcription_time=[avg_time] * len(all_data["audio_filepaths"]), + ) + + print("Results saved at path:", os.path.abspath(manifest_path)) + + wer = wer_metric.compute(references=all_data['references'], predictions=predictions) + wer = round(100 * wer, 2) + + # transcription_time = sum(all_results["transcription_time"]) + audio_length = sum(all_data["durations"]) + rtfx = audio_length / total_time + rtfx = round(rtfx, 2) + + print("RTFX:", rtfx) + print("WER:", wer, "%") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--model_id", type=str, required=True, help="Model identifier. Should be loadable with NVIDIA NeMo.", + ) + parser.add_argument( + '--dataset_path', type=str, default='esb/datasets', help='Dataset path. By default, it is `esb/datasets`' + ) + parser.add_argument( + "--dataset", + type=str, + required=True, + help="Dataset name. *E.g.* `'librispeech_asr` for the LibriSpeech ASR dataset, or `'common_voice'` for Common Voice. The full list of dataset names " + "can be found at `https://huggingface.co/datasets/esb/datasets`", + ) + parser.add_argument( + "--split", + type=str, + default="test", + help="Split of the dataset. *E.g.* `'validation`' for the dev split, or `'test'` for the test split.", + ) + parser.add_argument( + "--device", + type=int, + default=-1, + help="The device to run the pipeline on. -1 for CPU (default), 0 for the first GPU and so on.", + ) + parser.add_argument( + "--batch_size", type=int, default=32, help="Number of samples to go through each streamed batch.", + ) + parser.add_argument( + "--max_eval_samples", + type=int, + default=None, + help="Number of samples to be evaluated. Put a lower number e.g. 
64 for testing this script.", + ) + parser.add_argument( + "--no-streaming", + dest='streaming', + action="store_false", + help="Choose whether you'd like to download the entire dataset or stream it during the evaluation.", + ) + args = parser.parse_args() + parser.set_defaults(streaming=True) + + main(args) diff --git a/nemo_asr/run_fast_conformer_ctc.sh b/nemo_asr/run_fast_conformer_ctc.sh new file mode 100644 index 0000000..3ce5ce9 --- /dev/null +++ b/nemo_asr/run_fast_conformer_ctc.sh @@ -0,0 +1,95 @@ +#!/bin/bash + +export PYTHONPATH="..":$PYTHONPATH + +#considering FC-XL, FC-XXL, FC-L, C-L, C-S CTC models +MODEL_IDs=("nvidia/parakeet-ctc-1.1b" "nvidia/parakeet-ctc-0.6b" "nvidia/stt_en_fastconformer_ctc_large" "nvidia/stt_en_conformer_ctc_large" "nvidia/stt_en_conformer_ctc_small") +BATCH_SIZE=64 +DEVICE_ID=0 + +num_models=${#MODEL_IDs[@]} + +for (( i=0; i<${num_models}; i++ )); +do + MODEL_ID=${MODEL_IDs[$i]} + + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="ami" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="earnings22" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="gigaspeech" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.clean" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.other" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="spgispeech" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="tedlium" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="voxpopuli" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + # Evaluate results + RUNDIR=`pwd` && \ + cd ../normalizer && \ + python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \ + cd $RUNDIR + +done diff --git a/nemo_asr/run_fast_conformer_rnnt.sh b/nemo_asr/run_fast_conformer_rnnt.sh new file mode 100644 index 0000000..3f5a0ff --- /dev/null +++ b/nemo_asr/run_fast_conformer_rnnt.sh @@ -0,0 +1,97 @@ +#!/bin/bash + +export PYTHONPATH="..":$PYTHONPATH + +#considering latest model +MODEL_IDs=("nvidia/parakeet-tdt-0.6b-v2") +# For previous parakeet models: FC-L, FC-XL, FC-XXL, C-L and C-S RNNT models +# ("nvidia/parakeet-tdt-1.1b" "nvidia/parakeet-rnnt-1.1b" "nvidia/parakeet-rnnt-0.6b" 
"nvidia/stt_en_fastconformer_transducer_large" "nvidia/stt_en_conformer_transducer_large" "stt_en_conformer_transducer_small") +BATCH_SIZE=128 +DEVICE_ID=0 + +num_models=${#MODEL_IDs[@]} + +for (( i=0; i<${num_models}; i++ )); +do + MODEL_ID=${MODEL_IDs[$i]} + + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="ami" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="earnings22" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="gigaspeech" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.clean" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.other" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="spgispeech" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="tedlium" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="voxpopuli" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + # Evaluate results + RUNDIR=`pwd` && \ + cd ../normalizer && \ + python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \ + cd $RUNDIR + +done diff --git a/normalizer/data_utils.py b/normalizer/data_utils.py new file mode 100644 index 0000000..0f8ad71 --- /dev/null +++ b/normalizer/data_utils.py @@ -0,0 +1,59 @@ +from datasets import load_dataset, Audio +from normalizer import EnglishTextNormalizer + +from .eval_utils import read_manifest, write_manifest + + +def is_target_text_in_range(ref): + if ref.strip() == "ignore time segment in scoring": + return False + else: + return ref.strip() != "" + + +def get_text(sample): + if "text" in sample: + return sample["text"] + elif "sentence" in sample: + return sample["sentence"] + elif "normalized_text" in sample: + return sample["normalized_text"] + elif "transcript" in sample: + return sample["transcript"] + elif "transcription" in sample: + return sample["transcription"] + else: + raise ValueError( + f"Expected transcript column of either 'text', 'sentence', 'normalized_text' or 'transcript'. Got sample of " + ".join{sample.keys()}. Ensure a text column name is present in the dataset." 
+ ) + +normalizer = EnglishTextNormalizer() + + +def normalize(batch): + batch["original_text"] = get_text(batch) + batch["norm_text"] = normalizer(batch["original_text"]) + return batch + + +def load_data(args): + dataset = load_dataset( + args.dataset_path, + args.dataset, + split=args.split, + streaming=args.streaming, + token=True, + ) + + return dataset + +def prepare_data(dataset): + # Re-sample to 16kHz and normalise transcriptions + dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) + dataset = dataset.map(normalize) + dataset = dataset.filter(is_target_text_in_range, input_columns=["norm_text"]) + + return dataset + + diff --git a/normalizer/eval_utils.py b/normalizer/eval_utils.py new file mode 100644 index 0000000..2efde2a --- /dev/null +++ b/normalizer/eval_utils.py @@ -0,0 +1,210 @@ +import os +import glob +import json + +import evaluate +from collections import defaultdict + + +def read_manifest(manifest_path: str): + """ + Reads a manifest file (jsonl format) and returns a list of dictionaries containing samples. + """ + data = [] + with open(manifest_path, "r", encoding="utf-8") as f: + for line in f: + if len(line) > 0: + datum = json.loads(line) + data.append(datum) + return data + + +def write_manifest( + references: list, + transcriptions: list, + model_id: str, + dataset_path: str, + dataset_name: str, + split: str, + audio_length: list = None, + transcription_time: list = None, +): + """ + Writes a manifest file (jsonl format) and returns the path to the file. + + Args: + references: Ground truth reference texts. + transcriptions: Model predicted transcriptions. + model_id: String identifier for the model. + dataset_path: Path to the dataset. + dataset_name: Name of the dataset. + split: Dataset split name. + audio_length: Length of each audio sample in seconds. + transcription_time: Transcription time of each sample in seconds. + + Returns: + Path to the manifest file. + """ + model_id = model_id.replace("/", "-") + dataset_path = dataset_path.replace("/", "-") + dataset_name = dataset_name.replace("/", "-") + + if len(references) != len(transcriptions): + raise ValueError( + f"The number of samples in `references` ({len(references)}) " + f"must match `transcriptions` ({len(transcriptions)})." + ) + + if audio_length is not None and len(audio_length) != len(references): + raise ValueError( + f"The number of samples in `audio_length` ({len(audio_length)}) " + f"must match `references` ({len(references)})." + ) + if transcription_time is not None and len(transcription_time) != len(references): + raise ValueError( + f"The number of samples in `transcription_time` ({len(transcription_time)}) " + f"must match `references` ({len(references)})." 
+ ) + + audio_length = ( + audio_length if audio_length is not None else len(references) * [None] + ) + transcription_time = ( + transcription_time + if transcription_time is not None + else len(references) * [None] + ) + + basedir = "./results/" + if not os.path.exists(basedir): + os.makedirs(basedir) + + manifest_path = os.path.join( + basedir, f"MODEL_{model_id}_DATASET_{dataset_path}_{dataset_name}_{split}.jsonl" + ) + + with open(manifest_path, "w", encoding="utf-8") as f: + for idx, (text, transcript, audio_length, transcription_time) in enumerate( + zip(references, transcriptions, audio_length, transcription_time) + ): + datum = { + "audio_filepath": f"sample_{idx}", # dummy value for Speech Data Processor + "duration": audio_length, + "time": transcription_time, + "text": text, + "pred_text": transcript, + } + f.write(f"{json.dumps(datum, ensure_ascii=False)}\n") + return manifest_path + + +def score_results(directory: str, model_id: str = None): + """ + Scores all result files in a directory and returns a composite score over all evaluated datasets. + + Args: + directory: Path to the result directory, containing one or more jsonl files. + model_id: Optional, model name to filter out result files based on model name. + + Returns: + Composite score over all evaluated datasets and a dictionary of all results. + """ + + # Strip trailing slash + if directory.endswith(os.pathsep): + directory = directory[:-1] + + # Find all result files in the directory + result_files = list(glob.glob(f"{directory}/**/*.jsonl", recursive=True)) + result_files = list(sorted(result_files)) + + # Filter files belonging to a specific model id + if model_id is not None and model_id != "": + print("Filtering models by id:", model_id) + model_id = model_id.replace("/", "-") + result_files = [fp for fp in result_files if model_id in fp] + + # Check if any result files were found + if len(result_files) == 0: + raise ValueError(f"No result files found in {directory}") + + # Utility function to parse the file path and extract model id, dataset path, dataset name and split + def parse_filepath(fp: str): + model_index = fp.find("MODEL_") + fp = fp[model_index:] + ds_index = fp.find("DATASET_") + model_id = fp[:ds_index].replace("MODEL_", "").rstrip("_") + author_index = model_id.find("-") + model_id = model_id[:author_index] + "/" + model_id[author_index + 1 :] + + ds_fp = fp[ds_index:] + dataset_id = ds_fp.replace("DATASET_", "").rstrip(".jsonl") + return model_id, dataset_id + + # Compute WER results per dataset, and RTFx over all datasets + results = {} + wer_metric = evaluate.load("wer") + + for result_file in result_files: + manifest = read_manifest(result_file) + model_id_of_file, dataset_id = parse_filepath(result_file) + + references = [datum["text"] for datum in manifest] + predictions = [datum["pred_text"] for datum in manifest] + + time = [datum["time"] for datum in manifest] + duration = [datum["duration"] for datum in manifest] + compute_rtfx = all(time) and all(duration) + + wer = wer_metric.compute(references=references, predictions=predictions) + wer = round(100 * wer, 2) + + if compute_rtfx: + audio_length = sum(duration) + inference_time = sum(time) + rtfx = round(sum(duration) / sum(time), 4) + else: + audio_length = inference_time = rtfx = None + + result_key = f"{model_id_of_file} | {dataset_id}" + results[result_key] = {"wer": wer, "audio_length": audio_length, "inference_time": inference_time, "rtfx": rtfx} + + print("*" * 80) + print("Results per dataset:") + print("*" * 80) + + for k, v in 
results.items(): + metrics = f"{k}: WER = {v['wer']:0.2f} %" + if v["rtfx"] is not None: + metrics += f", RTFx = {v['rtfx']:0.2f}" + print(metrics) + + # composite WER should be computed over all datasets and with the same key + composite_wer = defaultdict(float) + composite_audio_length = defaultdict(float) + composite_inference_time = defaultdict(float) + count_entries = defaultdict(int) + for k, v in results.items(): + key = k.split("|")[0].strip() + composite_wer[key] += v["wer"] + if v["rtfx"] is not None: + composite_audio_length[key] += v["audio_length"] + composite_inference_time[key] += v["inference_time"] + else: + composite_audio_length[key] = composite_inference_time[key] = None + count_entries[key] += 1 + + # normalize scores & print + print() + print("*" * 80) + print("Composite Results:") + print("*" * 80) + for k, v in composite_wer.items(): + wer = v / count_entries[k] + print(f"{k}: WER = {wer:0.2f} %") + for k in composite_audio_length: + if composite_audio_length[k] is not None: + rtfx = composite_audio_length[k] / composite_inference_time[k] + print(f"{k}: RTFx = {rtfx:0.2f}") + print("*" * 80) + return composite_wer, results diff --git a/normalizer/normalizer.py b/normalizer/normalizer.py index 3fbcf96..6fc418b 100644 --- a/normalizer/normalizer.py +++ b/normalizer/normalizer.py @@ -17,7 +17,7 @@ import unicodedata from fractions import Fraction from typing import Iterator, List, Match, Optional, Union -from english_abbreviations import english_spelling_normalizer +from .english_abbreviations import english_spelling_normalizer import regex diff --git a/phi/run_eval.py b/phi/run_eval.py new file mode 100644 index 0000000..b3d9207 --- /dev/null +++ b/phi/run_eval.py @@ -0,0 +1,256 @@ +import argparse +import os +import torch +from transformers import AutoModelForCausalLM, AutoProcessor, StoppingCriteria, StoppingCriteriaList +import evaluate +from normalizer import data_utils +import time +from tqdm import tqdm + +wer_metric = evaluate.load("wer") +torch.set_float32_matmul_precision('high') + +class MultipleTokenBatchStoppingCriteria(StoppingCriteria): + """Stopping criteria capable of receiving multiple stop-tokens and handling batched inputs.""" + + def __init__(self, stop_tokens: torch.LongTensor, batch_size: int = 1) -> None: + """Initialize the multiple token batch stopping criteria. + + Args: + stop_tokens: Stop-tokens. + batch_size: Batch size. 
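+
+        Note:
+            Generation only stops once every sequence in the batch has produced one of
+            the stop token sequences; `stop_tokens_idx` records the step at which this
+            first happened for each sequence.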
+ + """ + + self.stop_tokens = stop_tokens + self.max_stop_tokens = stop_tokens.shape[-1] + self.stop_tokens_idx = torch.zeros(batch_size, dtype=torch.long, device=stop_tokens.device) + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: + # Only gather the maximum number of inputs compatible with stop tokens + # and checks whether generated inputs are equal to `stop_tokens` + generated_inputs = torch.eq(input_ids[:, -self.max_stop_tokens :].unsqueeze(1), self.stop_tokens) + equal_generated_inputs = torch.all(generated_inputs, dim=2) + + # Mark the position where a stop token has been produced for each input in the batch, + # but only if the corresponding entry is not already set + sequence_idx = torch.any(equal_generated_inputs, dim=1) + sequence_set_mask = self.stop_tokens_idx == 0 + self.stop_tokens_idx[sequence_idx & sequence_set_mask] = input_ids.shape[-1] + + return torch.all(self.stop_tokens_idx) + + +def main(args): + model = AutoModelForCausalLM.from_pretrained( + args.model_id, + trust_remote_code=True, + torch_dtype="auto", + _attn_implementation="flash_attention_2", + ).to(args.device) + model.eval() + processor = AutoProcessor.from_pretrained(args.model_id, trust_remote_code=True) + + user = "<|user|>" + assistant = "<|assistant|>" + prompt_suffix = "<|end|>" + + prompt = f"{user}<|audio_1|>{args.user_prompt}{prompt_suffix}{assistant}" + + gen_kwargs = {"max_new_tokens": args.max_new_tokens, "num_beams": args.num_beams} + + stop_tokens = [prompt_suffix, processor.tokenizer.eos_token] + stop_tokens_ids = processor.tokenizer(stop_tokens, add_special_tokens=False, padding="longest", return_tensors="pt")["input_ids"] + stop_tokens_ids = stop_tokens_ids.to(model.device) + + def benchmark(batch, min_new_tokens=None): + # Load audio inputs + audios = [(audio["array"], audio["sampling_rate"]) for audio in batch["audio"]] + minibatch_size = len(audios) + gen_kwargs["stopping_criteria"] = StoppingCriteriaList( + [MultipleTokenBatchStoppingCriteria(stop_tokens_ids, batch_size=args.num_beams * minibatch_size)] + ) + + # START TIMING + start_time = time.time() + + with torch.autocast(model.device.type, enabled=True): + inputs = processor(text=[prompt] * minibatch_size, audios=audios, return_tensors="pt").to(args.device) + + # Model Inference + pred_ids = model.generate( + **inputs, + pad_token_id=processor.tokenizer.pad_token_id, + eos_token_id=processor.tokenizer.eos_token_id, + **gen_kwargs, + min_new_tokens=min_new_tokens, + ) + + # Gather the sequence index of the stop token + stop_tokens_idx = gen_kwargs["stopping_criteria"][0].stop_tokens_idx.reshape(minibatch_size, -1)[:, 0] + + # If a stop token was produced, we need to remove its length from the found index, + # however there might be a chance that the stop token was not produced and the index + # returned is the length of the generated sequence + stop_tokens_idx = torch.where( + stop_tokens_idx > 0, + stop_tokens_idx - stop_tokens_ids.shape[-1], + pred_ids.shape[-1], + ) + + # Convert token ids to text transcription + pred_text = [ + processor.decode(_pred_ids[inputs["input_ids"].shape[1] : _stop_tokens_idx], skip_special_tokens=True, clean_up_tokenization_spaces=False) + for _pred_ids, _stop_tokens_idx in zip(pred_ids, stop_tokens_idx) + ] + + # END TIMING + runtime = time.time() - start_time + + # normalize by minibatch size since we want the per-sample time + batch["transcription_time_s"] = minibatch_size * [runtime / minibatch_size] + + # normalize transcriptions with English 
normalizer + batch["predictions"] = [data_utils.normalizer(pred) for pred in pred_text] + batch["references"] = batch["norm_text"] + return batch + + if args.warmup_steps is not None: + dataset = data_utils.load_data(args) + dataset = data_utils.prepare_data(dataset) + + num_warmup_samples = args.warmup_steps * args.batch_size + if args.streaming: + warmup_dataset = dataset.take(num_warmup_samples) + else: + warmup_dataset = dataset.select(range(min(num_warmup_samples, len(dataset)))) + warmup_dataset = iter(warmup_dataset.map(benchmark, batch_size=args.batch_size, batched=True, fn_kwargs={"min_new_tokens": args.max_new_tokens})) + + for _ in tqdm(warmup_dataset, desc="Warming up..."): + continue + + dataset = data_utils.load_data(args) + if args.max_eval_samples is not None and args.max_eval_samples > 0: + print(f"Subsampling dataset to first {args.max_eval_samples} samples!") + if args.streaming: + dataset = dataset.take(args.max_eval_samples) + else: + dataset = dataset.select(range(min(args.max_eval_samples, len(dataset)))) + dataset = data_utils.prepare_data(dataset) + + dataset = dataset.map( + benchmark, batch_size=args.batch_size, batched=True, remove_columns=["audio"], + ) + + all_results = { + "audio_length_s": [], + "transcription_time_s": [], + "predictions": [], + "references": [], + } + result_iter = iter(dataset) + for result in tqdm(result_iter, desc="Samples..."): + for key in all_results: + all_results[key].append(result[key]) + + # Write manifest results (WER and RTFX) + manifest_path = data_utils.write_manifest( + all_results["references"], + all_results["predictions"], + args.model_id, + args.dataset_path, + args.dataset, + args.split, + audio_length=all_results["audio_length_s"], + transcription_time=all_results["transcription_time_s"], + ) + print("Results saved at path:", os.path.abspath(manifest_path)) + + wer = wer_metric.compute( + references=all_results["references"], predictions=all_results["predictions"] + ) + wer = round(100 * wer, 2) + rtfx = round(sum(all_results["audio_length_s"]) / sum(all_results["transcription_time_s"]), 2) + print("WER:", wer, "%", "RTFx:", rtfx) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--model_id", + type=str, + required=True, + help="Model identifier. Should be loadable with 🤗 Transformers", + ) + parser.add_argument( + "--dataset_path", + type=str, + default="esb/datasets", + help="Dataset path. By default, it is `esb/datasets`", + ) + parser.add_argument( + "--dataset", + type=str, + required=True, + help="Dataset name. *E.g.* `'librispeech_asr` for the LibriSpeech ASR dataset, or `'common_voice'` for Common Voice. The full list of dataset names " + "can be found at `https://huggingface.co/datasets/esb/datasets`", + ) + parser.add_argument( + "--split", + type=str, + default="test", + help="Split of the dataset. *E.g.* `'validation`' for the dev split, or `'test'` for the test split.", + ) + parser.add_argument( + "--device", + type=int, + default=-1, + help="The device to run the pipeline on. -1 for CPU (default), 0 for the first GPU and so on.", + ) + parser.add_argument( + "--batch_size", + type=int, + default=16, + help="Number of samples to go through each streamed batch.", + ) + parser.add_argument( + "--num_beams", + type=int, + default=1, + help="Number of beams for beam search.", + ) + parser.add_argument( + "--max_eval_samples", + type=int, + default=None, + help="Number of samples to be evaluated. Put a lower number e.g. 
64 for testing this script.", + ) + parser.add_argument( + "--no-streaming", + dest="streaming", + action="store_false", + help="Choose whether you'd like to download the entire dataset or stream it during the evaluation.", + ) + parser.add_argument( + "--max_new_tokens", + type=int, + default=None, + help="Maximum number of tokens to generate (for auto-regressive models).", + ) + parser.add_argument( + "--warmup_steps", + type=int, + default=2, + help="Number of warm-up steps to run before launching the timed runs.", + ) + parser.add_argument( + "--user_prompt", + type=str, + default="Transcribe the audio clip into text.", + help="User prompt string.", + ) + args = parser.parse_args() + parser.set_defaults(streaming=False) + + main(args) diff --git a/phi/run_phi4_multimodal.sh b/phi/run_phi4_multimodal.sh new file mode 100755 index 0000000..31ceab0 --- /dev/null +++ b/phi/run_phi4_multimodal.sh @@ -0,0 +1,119 @@ +#!/bin/bash + +export PYTHONPATH="..":$PYTHONPATH + +MODEL_IDs=("microsoft/Phi-4-multimodal-instruct") +BATCH_SIZE=32 +NUM_BEAMS=1 +MAX_NEW_TOKENS=512 + +num_models=${#MODEL_IDs[@]} +default_user_prompt="Transcribe the audio clip into text." + +for (( i=0; i<${num_models}; i++ )); +do + MODEL_ID=${MODEL_IDs[$i]} + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="voxpopuli" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --num_beams=${NUM_BEAMS} \ + --max_eval_samples=-1 \ + --max_new_tokens=${MAX_NEW_TOKENS} \ + --user_prompt="${default_user_prompt}" + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="ami" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --num_beams=${NUM_BEAMS} \ + --max_eval_samples=-1 \ + --max_new_tokens=${MAX_NEW_TOKENS} \ + --user_prompt="${default_user_prompt}" + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="earnings22" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --num_beams=${NUM_BEAMS} \ + --max_eval_samples=-1 \ + --max_new_tokens=${MAX_NEW_TOKENS} \ + --user_prompt="Transcribe the audio clip to English text." 
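+    # Note: the earnings22 run above overrides ${default_user_prompt} with an explicit
+    # English-transcription prompt; the remaining datasets use the default prompt.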
+ + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="gigaspeech" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --num_beams=${NUM_BEAMS} \ + --max_eval_samples=-1 \ + --max_new_tokens=${MAX_NEW_TOKENS} \ + --user_prompt="${default_user_prompt}" + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.clean" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --num_beams=${NUM_BEAMS} \ + --max_eval_samples=-1 \ + --max_new_tokens=${MAX_NEW_TOKENS} \ + --user_prompt="${default_user_prompt}" + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.other" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --num_beams=${NUM_BEAMS} \ + --max_eval_samples=-1 \ + --max_new_tokens=${MAX_NEW_TOKENS} \ + --user_prompt="${default_user_prompt}" + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="spgispeech" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --num_beams=${NUM_BEAMS} \ + --max_eval_samples=-1 \ + --max_new_tokens=${MAX_NEW_TOKENS} \ + --user_prompt="${default_user_prompt}" + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="tedlium" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --num_beams=${NUM_BEAMS} \ + --max_eval_samples=-1 \ + --max_new_tokens=${MAX_NEW_TOKENS} \ + --user_prompt="${default_user_prompt}" + + # Evaluate results + RUNDIR=`pwd` && \ + cd ../normalizer && \ + python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \ + cd $RUNDIR + +done diff --git a/requirements/requirements.txt b/requirements/requirements.txt new file mode 100644 index 0000000..c1d3d44 --- /dev/null +++ b/requirements/requirements.txt @@ -0,0 +1,6 @@ +torch +transformers +evaluate +datasets +librosa +jiwer diff --git a/requirements/requirements_ctranslate2.txt b/requirements/requirements_ctranslate2.txt new file mode 100644 index 0000000..9fe6175 --- /dev/null +++ b/requirements/requirements_ctranslate2.txt @@ -0,0 +1,5 @@ +datasets +evaluate +faster-whisper>=0.8.0 +jiwer +librosa diff --git a/requirements/requirements_granite.txt b/requirements/requirements_granite.txt new file mode 100644 index 0000000..4856d27 --- /dev/null +++ b/requirements/requirements_granite.txt @@ -0,0 +1,7 @@ +evaluate +datasets==3.4.1 +peft==0.13.1 +torch==2.5.1 +torchaudio==2.5.1 +transformers @ https://github.com/huggingface/transformers/archive/main.zip +soundfile diff --git a/requirements/requirements_moonshine.txt b/requirements/requirements_moonshine.txt new file mode 100644 index 0000000..4dede0f --- /dev/null +++ b/requirements/requirements_moonshine.txt @@ -0,0 +1,9 @@ +torch +evaluate +librosa +jiwer +einops +datasets==3.2.0 +numba==0.60.0 +numpy==2.0.2 +git+https://github.com/huggingface/transformers.git#egg=transformers diff --git a/requirements/requirements_nemo.txt b/requirements/requirements_nemo.txt new file mode 100644 index 0000000..e5fca82 --- /dev/null +++ b/requirements/requirements_nemo.txt @@ -0,0 +1,6 @@ +git+https://github.com/NVIDIA/NeMo.git@r2.3.0#egg=nemo_toolkit[asr] +tqdm +soundfile +librosa +IPython # Workaround for https://github.com/NVIDIA/NeMo/pull/9890#discussion_r1701028427 
+cuda-python>=12.4 # Used for fast TDT and RNN-T inference diff --git a/requirements/requirements_phi.txt b/requirements/requirements_phi.txt new file mode 100644 index 0000000..914186d --- /dev/null +++ b/requirements/requirements_phi.txt @@ -0,0 +1,18 @@ +wheel +evaluate +datasets +librosa +jiwer +peft +backoff +torchaudio +flash_attn==2.7.4.post1 +torch==2.6.0 +transformers==4.48.2 +accelerate==1.3.0 +soundfile +pillow +scipy +torchvision==0.21.0 +backoff==2.2.1 +peft==0.13.2 diff --git a/requirements/requirements_speechbrain.txt b/requirements/requirements_speechbrain.txt new file mode 100644 index 0000000..1f354c8 --- /dev/null +++ b/requirements/requirements_speechbrain.txt @@ -0,0 +1,4 @@ +torch +speechbrain +evaluate +datasets \ No newline at end of file diff --git a/requirements/requirements_trtllm.txt b/requirements/requirements_trtllm.txt new file mode 100755 index 0000000..00dbdd2 --- /dev/null +++ b/requirements/requirements_trtllm.txt @@ -0,0 +1,5 @@ +tiktoken +jiwer +tensorrt-llm==0.15.0.dev2024101500 +soundfile +librosa diff --git a/requirements/requirements_vllm.txt b/requirements/requirements_vllm.txt new file mode 100644 index 0000000..ca86666 --- /dev/null +++ b/requirements/requirements_vllm.txt @@ -0,0 +1,7 @@ +torch +transformers +evaluate +datasets +librosa +jiwer +vllm \ No newline at end of file diff --git a/speechbrain/run_conformer.sh b/speechbrain/run_conformer.sh new file mode 100644 index 0000000..553ff4e --- /dev/null +++ b/speechbrain/run_conformer.sh @@ -0,0 +1,105 @@ +#!/bin/bash + +export PYTHONPATH="..":$PYTHONPATH + +SOURCE="speechbrain/asr-conformer-largescaleasr" +BATCH_SIZE=32 +DEVICE_ID=0 + +# Run with CTC+Attn +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.clean" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 \ + --no-streaming \ + --beam_size=10 \ + --ctc_weight_decode=0 + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.other" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 \ + --beam_size=10 \ + --ctc_weight_decode=0 + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="ami" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 \ + --beam_size=10 \ + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="spgispeech" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 \ + --beam_size=10 \ + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="tedlium" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 \ + --beam_size=10 \ + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="earnings22" \ + --split="test" \ + --device=${DEVICE_ID} \ + 
--batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 \ + --beam_size=10 \ + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="gigaspeech" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 \ + --beam_size=10 \ + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="voxpopuli" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 \ + --beam_size=10 \ + +# Evaluate results +RUNDIR=`pwd` && \ +cd ../normalizer && \ +python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \ +cd $RUNDIR diff --git a/speechbrain/run_conformersmall.sh b/speechbrain/run_conformersmall.sh new file mode 100644 index 0000000..49be35d --- /dev/null +++ b/speechbrain/run_conformersmall.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +export PYTHONPATH="..":$PYTHONPATH + +SOURCE="speechbrain/asr-conformersmall-transformerlm-librispeech" + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.clean" \ + --device=0 \ + --batch_size=4 \ + --max_eval_samples=-1 + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.other" \ + --device=0 \ + --batch_size=4 \ + --max_eval_samples=-1 + +# Evaluate results +RUNDIR=`pwd` && \ +cd ../normalizer && \ +python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \ +cd $RUNDIR \ No newline at end of file diff --git a/speechbrain/run_crdnn_rnnlm.sh b/speechbrain/run_crdnn_rnnlm.sh new file mode 100644 index 0000000..e90a7ff --- /dev/null +++ b/speechbrain/run_crdnn_rnnlm.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +export PYTHONPATH="..":$PYTHONPATH + +SOURCE="speechbrain/asr-crdnn-rnnlm-librispeech" + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.clean" \ + --device=0 \ + --batch_size=4 \ + --max_eval_samples=-1 + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.other" \ + --device=0 \ + --batch_size=4 \ + --max_eval_samples=-1 + +# Evaluate results +RUNDIR=`pwd` && \ +cd ../normalizer && \ +python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \ +cd $RUNDIR \ No newline at end of file diff --git a/speechbrain/run_crdnn_transformerlm.sh b/speechbrain/run_crdnn_transformerlm.sh new file mode 100644 index 0000000..4be7b37 --- /dev/null +++ b/speechbrain/run_crdnn_transformerlm.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +export PYTHONPATH="..":$PYTHONPATH + +SOURCE="speechbrain/asr-crdnn-transformerlm-librispeech" + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.clean" \ + --device=0 \ + 
--batch_size=4 \ + --max_eval_samples=-1 + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.other" \ + --device=0 \ + --batch_size=4 \ + --max_eval_samples=-1 + +# Evaluate results +RUNDIR=`pwd` && \ +cd ../normalizer && \ +python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \ +cd $RUNDIR \ No newline at end of file diff --git a/speechbrain/run_eval.py b/speechbrain/run_eval.py new file mode 100644 index 0000000..9e840cd --- /dev/null +++ b/speechbrain/run_eval.py @@ -0,0 +1,267 @@ +"""Script to evaluate a pretrained SpeechBrain model from the 🤗 Hub. + +Authors +* Adel Moumen 2023 +* Sanchit Gandhi 2024 +""" +import argparse +import time + +import evaluate +from normalizer import data_utils +from tqdm import tqdm +import torch +import speechbrain.inference.ASR as ASR +from speechbrain.utils.data_utils import batch_pad_right +import os + +def get_model( + speechbrain_repository: str, + speechbrain_pretrained_class_name: str, + beam_size: int, + ctc_weight_decode: float, + **kwargs, +): + """Fetch a pretrained SpeechBrain model from the SpeechBrain 🤗 Hub. + + Arguments + --------- + speechbrain_repository : str + The name of the SpeechBrain repository to fetch the pretrained model from. E.g. `asr-crdnn-rnnlm-librispeech`. + speechbrain_pretrained_class_name : str + The name of the SpeechBrain pretrained class to fetch. E.g. `EncoderASR`. + See: https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/pretrained/interfaces.py + beam_size : int + Size of the beam for decoding. + ctc_weight_decode : float + Weight of the CTC prob for decoding with joint CTC/Attn. + **kwargs + Additional keyword arguments to pass to override the default run options of the pretrained model. + + Returns + ------- + SpeechBrain pretrained model + The Pretrained model. 
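+
+    Note
+    ----
+    When ``ctc_weight_decode`` is ``0.0``, the hyperparameter overrides built
+    below set ``scorer`` to ``None``, so decoding falls back to attention-only
+    beam search with no joint CTC rescoring.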
+ + Example + ------- + >>> from open_asr_leaderboard.speechbrain.run_eval import get_model + >>> model = get_model("asr-crdnn-rnnlm-librispeech", "EncoderASR", device="cuda:0") + """ + + run_opt_defaults = { + "device": "cuda", + "data_parallel_count": -1, + "data_parallel_backend": False, + "distributed_launch": False, + "distributed_backend": "nccl", + "jit_module_keys": None, + "precision": "fp16", + } + + run_opts = {**run_opt_defaults} + + overrides = {} + if beam_size: + overrides["test_beam_size"] = beam_size + + if ctc_weight_decode == 0.0: + overrides["scorer"] = None + overrides["ctc_weight_decode"] = ctc_weight_decode + + kwargs = { + "source": f"{speechbrain_repository}", + "savedir": f"pretrained_models/{speechbrain_repository}", + "run_opts": run_opts, + "overrides": overrides, + } + + try: + model_class = getattr(ASR, speechbrain_pretrained_class_name) + except AttributeError: + raise AttributeError( + f"SpeechBrain Pretrained class: {speechbrain_pretrained_class_name} not found in pretrained.py" + ) + + return model_class.from_hparams(**kwargs) + + +def main(args): + """Run the evaluation script.""" + if args.device == -1: + device = "cpu" + else: + device = f"cuda:{args.device}" + + model = get_model( + args.source, + args.speechbrain_pretrained_class_name, + args.beam_size, + args.ctc_weight_decode, + device=device + ) + + def benchmark(batch): + # Load audio inputs + audios = [torch.from_numpy(sample["array"]) for sample in batch["audio"]] + minibatch_size = len(audios) + + audios, audio_lens = batch_pad_right(audios) + audios = audios.to(device) + audio_lens = audio_lens.to(device) + + start_time = time.time() + with torch.autocast(device_type="cuda"): + predictions, _ = model.transcribe_batch(audios, audio_lens) + runtime = time.time() - start_time + + batch["transcription_time_s"] = minibatch_size * [runtime / minibatch_size] + + # normalize transcriptions with English normalizer + batch["predictions"] = [data_utils.normalizer(pred) for pred in predictions] + batch["references"] = batch["norm_text"] + return batch + + + if args.warmup_steps is not None: + dataset = data_utils.load_data(args) + dataset = data_utils.prepare_data(dataset) + + num_warmup_samples = args.warmup_steps * args.batch_size + if args.streaming: + warmup_dataset = dataset.take(num_warmup_samples) + else: + warmup_dataset = dataset.select(range(min(num_warmup_samples, len(dataset)))) + warmup_dataset = iter(warmup_dataset.map(benchmark, batch_size=args.batch_size, batched=True)) + + for _ in tqdm(warmup_dataset, desc="Warming up..."): + continue + + dataset = data_utils.load_data(args) + if args.max_eval_samples is not None and args.max_eval_samples > 0: + print(f"Subsampling dataset to first {args.max_eval_samples} samples!") + if args.streaming: + dataset = dataset.take(args.max_eval_samples) + else: + dataset = dataset.select(range(min(args.max_eval_samples, len(dataset)))) + dataset = data_utils.prepare_data(dataset) + + dataset = dataset.map( + benchmark, batch_size=args.batch_size, batched=True, remove_columns=["audio"], + ) + + all_results = { + "audio_length_s": [], + "transcription_time_s": [], + "predictions": [], + "references": [], + } + result_iter = iter(dataset) + for result in tqdm(result_iter, desc="Samples..."): + for key in all_results: + all_results[key].append(result[key]) + + # Write manifest results (WER and RTFX) + manifest_path = data_utils.write_manifest( + all_results["references"], + all_results["predictions"], + args.source, + args.dataset_path, + args.dataset, + 
args.split, + audio_length=all_results["audio_length_s"], + transcription_time=all_results["transcription_time_s"], + ) + print("Results saved at path:", os.path.abspath(manifest_path)) + + wer_metric = evaluate.load("wer") + wer = wer_metric.compute( + references=all_results["references"], predictions=all_results["predictions"] + ) + wer = round(100 * wer, 2) + rtfx = round(sum(all_results["audio_length_s"]) / sum(all_results["transcription_time_s"]), 2) + print("WER:", wer, "%", "RTFx:", rtfx) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--source", + type=str, + required=True, + help="SpeechBrain model repository. E.g. `asr-crdnn-rnnlm-librispeech`", + ) + + parser.add_argument( + "--speechbrain_pretrained_class_name", + type=str, + required=True, + help="SpeechBrain pretrained class name. E.g. `EncoderASR`", + ) + + parser.add_argument( + "--dataset_path", + type=str, + default="hf-audio/esb-datasets-test-only-sorted", + help="Dataset path. By default, it is `esb/datasets`", + ) + parser.add_argument( + "--dataset", + type=str, + required=True, + help="Dataset name. *E.g.* `'librispeech_asr` for the LibriSpeech ASR dataset, or `'common_voice'` for Common Voice. The full list of dataset names " + "can be found at `https://huggingface.co/datasets/esb/datasets`", + ) + parser.add_argument( + "--split", + type=str, + default="test", + help="Split of the dataset. *E.g.* `'validation`' for the dev split, or `'test'` for the test split.", + ) + parser.add_argument( + "--device", + type=int, + default=-1, + help="The device to run the pipeline on. -1 for CPU (default), 0 for the first GPU and so on.", + ) + parser.add_argument( + "--batch_size", + type=int, + default=16, + help="Number of samples to go through each streamed batch.", + ) + parser.add_argument( + "--max_eval_samples", + type=int, + default=None, + help="Number of samples to be evaluated. Put a lower number e.g. 64 for testing this script.", + ) + parser.add_argument( + "--no-streaming", + dest="streaming", + action="store_false", + help="Choose whether you'd like to download the entire dataset or stream it during the evaluation.", + ) + parser.add_argument( + "--warmup_steps", + type=int, + default=2, + help="Number of warm-up steps to run before launching the timed runs.", + ) + parser.add_argument( + "--beam_size", + type=int, + default=None, + help="Beam size for decoding" + ) + parser.add_argument( + "--ctc_weight_decode", + type=float, + default=0.3, + help="Weight of CTC for joint CTC/Att. 
decoding" + ) + args = parser.parse_args() + parser.set_defaults(streaming=True) + + main(args) diff --git a/speechbrain/run_transformer_transformerlm.sh b/speechbrain/run_transformer_transformerlm.sh new file mode 100644 index 0000000..7d47de7 --- /dev/null +++ b/speechbrain/run_transformer_transformerlm.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +export PYTHONPATH="..":$PYTHONPATH + +SOURCE="speechbrain/asr-transformer-transformerlm-librispeech" + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.clean" \ + --device=0 \ + --batch_size=4 \ + --max_eval_samples=-1 + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.other" \ + --device=0 \ + --batch_size=4 \ + --max_eval_samples=-1 + +# Evaluate results +RUNDIR=`pwd` && \ +cd ../normalizer && \ +python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \ +cd $RUNDIR \ No newline at end of file diff --git a/speechbrain/run_wav2vec2_commonvoice.sh b/speechbrain/run_wav2vec2_commonvoice.sh new file mode 100644 index 0000000..0fcdae8 --- /dev/null +++ b/speechbrain/run_wav2vec2_commonvoice.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +export PYTHONPATH="..":$PYTHONPATH + +SOURCE="speechbrain/asr-wav2vec2-commonvoice-en" + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.clean" \ + --device=0 \ + --batch_size=16 \ + --max_eval_samples=-1 + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.other" \ + --device=0 \ + --batch_size=16 \ + --max_eval_samples=-1 + +# Evaluate results +RUNDIR=`pwd` && \ +cd ../normalizer && \ +python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \ +cd $RUNDIR \ No newline at end of file diff --git a/speechbrain/run_wav2vec2_librispeech.sh b/speechbrain/run_wav2vec2_librispeech.sh new file mode 100644 index 0000000..2c68bb0 --- /dev/null +++ b/speechbrain/run_wav2vec2_librispeech.sh @@ -0,0 +1,92 @@ +#!/bin/bash + +export PYTHONPATH="..":$PYTHONPATH + +SOURCE="speechbrain/asr-wav2vec2-librispeech" +BATCH_SIZE=32 + +python run_eval.py \ + --source=${SOURCE} \ + --speechbrain_pretrained_class_name="EncoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="ami" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + +python run_eval.py \ + --source=${SOURCE} \ + --speechbrain_pretrained_class_name="EncoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="earnings22" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + +python run_eval.py \ + --source=${SOURCE} \ + --speechbrain_pretrained_class_name="EncoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="gigaspeech" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + +python run_eval.py \ + --source=${SOURCE} \ + --speechbrain_pretrained_class_name="EncoderASR" \ + 
--dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.clean" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + +python run_eval.py \ + --source=${SOURCE} \ + --speechbrain_pretrained_class_name="EncoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.other" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + +python run_eval.py \ + --source=${SOURCE} \ + --speechbrain_pretrained_class_name="EncoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="spgispeech" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + +python run_eval.py \ + --source=${SOURCE} \ + --speechbrain_pretrained_class_name="EncoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="tedlium" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + +python run_eval.py \ + --source=${SOURCE} \ + --speechbrain_pretrained_class_name="EncoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="voxpopuli" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + +# Evaluate results +RUNDIR=`pwd` && \ +cd ../normalizer && \ +python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \ +cd $RUNDIR \ No newline at end of file diff --git a/tensorrtllm/run_eval.py b/tensorrtllm/run_eval.py new file mode 100755 index 0000000..8384ce8 --- /dev/null +++ b/tensorrtllm/run_eval.py @@ -0,0 +1,353 @@ +import argparse +import os +import torch +import json +from tensorrt_llm.runtime import ModelRunnerCpp +from tensorrt_llm.bindings import GptJsonConfig +import numpy as np +from collections import OrderedDict +from pathlib import Path +from whisper_utils import log_mel_spectrogram, get_tokenizer +import evaluate +from normalizer import data_utils +import time +from tqdm import tqdm +from pathlib import Path +import re +from concurrent.futures import ThreadPoolExecutor + +wer_metric = evaluate.load("wer") + +def read_config(component, engine_dir): + engine_dir = Path(engine_dir) + config_path = engine_dir / component / 'config.json' + with open(config_path, 'r') as f: + config = json.load(f) + model_config = OrderedDict() + model_config.update(config['pretrained_config']) + model_config.update(config['build_config']) + return model_config + +class WhisperTRTLLM(object): + + def __init__(self, + engine_dir, + assets_dir="assets", + batch_size=64): + encoder_config = read_config('encoder', engine_dir) + decoder_config = read_config('decoder', engine_dir) + self.n_mels = encoder_config['n_mels'] + self.num_languages = encoder_config['num_languages'] + is_multilingual = (decoder_config['vocab_size'] >= 51865) + if is_multilingual: + tokenizer_name = "multilingual" + assert (Path(assets_dir) / "multilingual.tiktoken").exists( + ), "multilingual.tiktoken file is not existed in assets_dir" + else: + tokenizer_name = "gpt2" + assert (Path(assets_dir) / "gpt2.tiktoken").exists( + ), "gpt2.tiktoken file is not existed in assets_dir" + self.text_prefix="<|startoftranscript|><|en|><|transcribe|><|notimestamps|>" if is_multilingual else "<|startoftranscript|><|notimestamps|>" + self.tokenizer = get_tokenizer(name=tokenizer_name, + num_languages=self.num_languages, + tokenizer_dir=assets_dir) + self.eot_id = self.tokenizer.encode( + "<|endoftext|>", + 
allowed_special=self.tokenizer.special_tokens_set)[0] + json_config = GptJsonConfig.parse_file(Path(engine_dir) / 'decoder' / 'config.json') + assert json_config.model_config.supports_inflight_batching + runner_kwargs = dict(engine_dir=engine_dir, + is_enc_dec=True, + max_batch_size=batch_size, + max_input_len=3000, + max_output_len=96, + max_beam_width=1, + debug_mode=False, + kv_cache_free_gpu_memory_fraction=0.9) + self.model_runner_cpp = ModelRunnerCpp.from_dir(**runner_kwargs) + + def process_single_batch(self, mel_batch, decoder_input_ids, mel_input_lengths, max_new_tokens): + outputs = self.model_runner_cpp.generate( + batch_input_ids=decoder_input_ids, + encoder_input_features=mel_batch, + encoder_output_lengths=mel_input_lengths // 2, + max_new_tokens=max_new_tokens, + end_id=self.eot_id, + pad_id=self.eot_id, + num_beams=1, + output_sequence_lengths=True, + return_dict=True + ) + + output_ids = outputs['output_ids'].cpu().numpy().tolist() + texts = [] + for i in range(len(output_ids)): + text = self.tokenizer.decode(output_ids[i][0]).strip() + text = re.sub(r'<\|.*?\|>', '', text) + texts.append(text) + return texts + + def process_batch(self, mel, mel_input_lengths, num_threads=4, max_new_tokens=96): + prompt_id = self.tokenizer.encode( + self.text_prefix, allowed_special=self.tokenizer.special_tokens_set) + prompt_id = torch.tensor(prompt_id) + batch_size = len(mel) + decoder_input_ids = prompt_id.repeat(batch_size, 1) + + with torch.no_grad(): + if isinstance(mel, list): + mel = torch.stack([m.transpose(1, 2).type(torch.float16).squeeze(0) for m in mel]) + else: + mel = mel.transpose(1, 2) + + num_threads = min(num_threads, batch_size) + mel_batches = torch.split(mel, batch_size // num_threads) + mel_input_lengths_batches = torch.split(mel_input_lengths, batch_size // num_threads) + + texts_list = [] + with ThreadPoolExecutor(max_workers=num_threads) as executor: + futures = [] + for i, mel_batch in enumerate(mel_batches): + current_length = mel_batch.size(0) + futures.append(executor.submit( + self.process_single_batch, + mel_batch, + decoder_input_ids[:current_length], + mel_input_lengths_batches[i], + max_new_tokens + )) + + for future in futures: + texts_list.extend(future.result()) + + return texts_list + +def longest_common_substring(s1, s2): + len1, len2 = len(s1), len(s2) + dp = [[0] * (len2 + 1) for _ in range(len1 + 1)] + + longest_length = 0 + end_index_s1 = 0 + + for i in range(1, len1 + 1): + for j in range(1, len2 + 1): + if s1[i - 1] == s2[j - 1]: + dp[i][j] = dp[i - 1][j - 1] + 1 + if dp[i][j] > longest_length: + longest_length = dp[i][j] + end_index_s1 = i + else: + dp[i][j] = 0 + + return s1[end_index_s1 - longest_length:end_index_s1] + +def chunk_audio(audio, chunk_length, overlap_length, sample_rate): + chunk_size = int(chunk_length * sample_rate) + overlap_size = int(overlap_length * sample_rate) + + chunks = [] + start = 0 + + while start < len(audio): + end = min(start + chunk_size, len(audio)) + chunks.append(audio[start:end]) + start += chunk_size - overlap_size + + return chunks + +def main(args): + asr_model = WhisperTRTLLM(engine_dir=args.model_id) + + def benchmark(batch, min_new_tokens=None): + # Load audio inputs + max_duration, sample_rate = 30, 16000 + audios_origin = [audio["array"].astype(np.float32) for audio in batch["audio"]] + minibatch_size = len(audios_origin) + audios, audio_index = [], [] + + chunk_length = 25 + overlap_length = 5 + for i, audio in enumerate(audios_origin): + if len(audio) > max_duration * sample_rate: + audio_chunks 
= chunk_audio(audio, chunk_length, overlap_length, sample_rate) + for chunk in audio_chunks: + audios.append(chunk) + audio_index.append(i) + else: + audios.append(audio) + audio_index.append(i) + audios = [torch.from_numpy(audio) for audio in audios] + + # START TIMING + start_time = time.time() + longest_duration = int(sample_rate * max_duration) + + features = [ + log_mel_spectrogram(wave, + asr_model.n_mels, + padding=longest_duration - wave.shape[-1], + device='cuda').unsqueeze(0) + for wave in audios + ] + + features_input_lengths = torch.tensor([f.shape[2] for f in features], + dtype=torch.int32, + device='cuda') + + texts_origin = asr_model.process_batch(features, features_input_lengths, num_threads=4) + + texts = [] + for i in range(minibatch_size): + text_chunks = [] + for j in range(len(texts_origin)): + if audio_index[j] == i: + text_chunks.append(texts_origin[j]) + + if len(text_chunks) > 1: + merged_text = text_chunks[0] + for t in text_chunks[1:]: + lcs = longest_common_substring(merged_text, t) + merged_text += t[len(lcs):] + + texts.append(merged_text) + else: + texts.append(text_chunks[0]) + # END TIMING + runtime = time.time() - start_time + + print(f"Batch size: {minibatch_size}, Time taken: {runtime:.2f} s, texts_origin_len: {len(texts_origin)}, texts_len: {len(texts)}") + # normalize by minibatch size since we want the per-sample time + batch["transcription_time_s"] = minibatch_size * [runtime / minibatch_size] + + # normalize transcriptions with English normalizer + batch["predictions"] = [data_utils.normalizer(pred) for pred in texts] + batch["references"] = batch["norm_text"] + return batch + + if args.warmup_steps is not None: + dataset = data_utils.load_data(args) + dataset = data_utils.prepare_data(dataset) + + num_warmup_samples = args.warmup_steps * args.batch_size + if args.streaming: + warmup_dataset = dataset.take(num_warmup_samples) + else: + warmup_dataset = dataset.select(range(min(num_warmup_samples, len(dataset)))) + warmup_dataset = iter(warmup_dataset.map(benchmark, batch_size=args.batch_size, batched=True, fn_kwargs={"min_new_tokens": args.max_new_tokens})) + + for _ in tqdm(warmup_dataset, desc="Warming up..."): + continue + + dataset = data_utils.load_data(args) + if args.max_eval_samples is not None and args.max_eval_samples > 0: + print(f"Subsampling dataset to first {args.max_eval_samples} samples!") + if args.streaming: + dataset = dataset.take(args.max_eval_samples) + else: + dataset = dataset.select(range(min(args.max_eval_samples, len(dataset)))) + dataset = data_utils.prepare_data(dataset) + + dataset = dataset.map( + benchmark, batch_size=args.batch_size, batched=True, remove_columns=["audio"], + ) + + all_results = { + "audio_length_s": [], + "transcription_time_s": [], + "predictions": [], + "references": [], + } + result_iter = iter(dataset) + for result in tqdm(result_iter, desc="Samples..."): + for key in all_results: + all_results[key].append(result[key]) + + # Write manifest results (WER and RTFX) + manifest_path = data_utils.write_manifest( + all_results["references"], + all_results["predictions"], + args.model_id, + args.dataset_path, + args.dataset, + args.split, + audio_length=all_results["audio_length_s"], + transcription_time=all_results["transcription_time_s"], + ) + print("Results saved at path:", os.path.abspath(manifest_path)) + + wer = wer_metric.compute( + references=all_results["references"], predictions=all_results["predictions"] + ) + wer = round(100 * wer, 2) + rtfx = round(sum(all_results["audio_length_s"]) / 
sum(all_results["transcription_time_s"]), 2) + print("WER:", wer, "%", "RTFx:", rtfx) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--model_id", + type=str, + required=True, + help="Model identifier. Should be loadable with 🤗 Transformers", + ) + parser.add_argument( + "--dataset_path", + type=str, + default="esb/datasets", + help="Dataset path. By default, it is `esb/datasets`", + ) + parser.add_argument( + "--dataset", + type=str, + required=True, + help="Dataset name. *E.g.* `'librispeech_asr` for the LibriSpeech ASR dataset, or `'common_voice'` for Common Voice. The full list of dataset names " + "can be found at `https://huggingface.co/datasets/esb/datasets`", + ) + parser.add_argument( + "--split", + type=str, + default="test", + help="Split of the dataset. *E.g.* `'validation`' for the dev split, or `'test'` for the test split.", + ) + parser.add_argument( + "--device", + type=int, + default=-1, + help="The device to run the pipeline on. -1 for CPU (default), 0 for the first GPU and so on.", + ) + parser.add_argument( + "--batch_size", + type=int, + default=16, + help="Number of samples to go through each streamed batch.", + ) + parser.add_argument( + "--max_eval_samples", + type=int, + default=None, + help="Number of samples to be evaluated. Put a lower number e.g. 64 for testing this script.", + ) + parser.add_argument( + "--no-streaming", + dest="streaming", + action="store_false", + help="Choose whether you'd like to download the entire dataset or stream it during the evaluation.", + ) + parser.add_argument( + "--max_new_tokens", + type=int, + default=None, + help="Maximum number of tokens to generate (for auto-regressive models).", + ) + parser.add_argument( + "--warmup_steps", + type=int, + default=10, + help="Number of warm-up steps to run before launching the timed runs.", + ) + args = parser.parse_args() + parser.set_defaults(streaming=False) + + main(args) diff --git a/tensorrtllm/run_whisper.sh b/tensorrtllm/run_whisper.sh new file mode 100755 index 0000000..2b80072 --- /dev/null +++ b/tensorrtllm/run_whisper.sh @@ -0,0 +1,144 @@ +#!/bin/bash + +export PYTHONPATH="..":$PYTHONPATH + +download_model() { + local MODEL_ID=$1 + local MODEL_TRT_LLM=${MODEL_ID}_tllm_checkpoint + echo "Downloading $MODEL_ID from Hugging Face" + mkdir -p $MODEL_TRT_LLM + huggingface-cli download --local-dir whisper-${MODEL_ID}-trt-llm-checkpoint yuekai/whisper-${MODEL_ID}-trt-llm-checkpoint + wget -nc --directory-prefix=assets "$URL" + wget -nc --directory-prefix=assets assets/mel_filters.npz https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/mel_filters.npz + wget -nc --directory-prefix=assets https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/multilingual.tiktoken + wget -nc --directory-prefix=assets https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/gpt2.tiktoken + +} + +build_model() { + local model_id=$1 + local checkpoint_dir="whisper-${model_id}-trt-llm-checkpoint" + local output_dir="whisper_${model_id}" + + local INFERENCE_PRECISION=float16 + local MAX_BEAM_WIDTH=4 + local MAX_BATCH_SIZE=256 + + echo "Building encoder for model: $model_id" + trtllm-build --checkpoint_dir "${checkpoint_dir}/encoder" \ + --output_dir "${output_dir}/encoder" \ + --moe_plugin disable \ + --enable_xqa disable \ + --max_batch_size "$MAX_BATCH_SIZE" \ + --gemm_plugin disable \ + --bert_attention_plugin "$INFERENCE_PRECISION" \ + --max_input_len 3000 --max_seq_len 3000 + + echo "Building decoder for model: 
$model_id" + trtllm-build --checkpoint_dir "${checkpoint_dir}/decoder" \ + --output_dir "${output_dir}/decoder" \ + --moe_plugin disable \ + --enable_xqa disable \ + --max_beam_width "$MAX_BEAM_WIDTH" \ + --max_batch_size "$MAX_BATCH_SIZE" \ + --max_seq_len 114 \ + --max_input_len 14 \ + --max_encoder_input_len 3000 \ + --gemm_plugin "$INFERENCE_PRECISION" \ + --bert_attention_plugin "$INFERENCE_PRECISION" \ + --gpt_attention_plugin "$INFERENCE_PRECISION" +} + +MODEL_IDs=("large-v3-turbo" "large-v3" "large-v2" "large-v1" "medium" "base" "small" "tiny" "medium.en" "base.en" "small.en" "tiny.en") +DEVICE_INDEX=0 +BATCH_SIZE=64 + +num_models=${#MODEL_IDs[@]} + +pip install -r ../requirements/requirements_trtllm.txt + +for (( i=0; i<${num_models}; i++ )); +do + MODEL_ID=${MODEL_IDs[$i]} + download_model $MODEL_ID + build_model $MODEL_ID + + python3 run_eval.py \ + --model_id=whisper_${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="ami" \ + --split="test" \ + --device=${DEVICE_INDEX} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python3 run_eval.py \ + --model_id=whisper_${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="earnings22" \ + --split="test" \ + --device=${DEVICE_INDEX} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python3 run_eval.py \ + --model_id=whisper_${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="gigaspeech" \ + --split="test" \ + --device=${DEVICE_INDEX} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python3 run_eval.py \ + --model_id=whisper_${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.clean" \ + --device=${DEVICE_INDEX} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python3 run_eval.py \ + --model_id=whisper_${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.other" \ + --device=${DEVICE_INDEX} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python3 run_eval.py \ + --model_id=whisper_${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="spgispeech" \ + --split="test" \ + --device=${DEVICE_INDEX} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python3 run_eval.py \ + --model_id=whisper_${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="tedlium" \ + --split="test" \ + --device=${DEVICE_INDEX} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python3 run_eval.py \ + --model_id=whisper_${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="voxpopuli" \ + --split="test" \ + --device=${DEVICE_INDEX} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + # Evaluate results + RUNDIR=`pwd` && \ + cd ../normalizer && \ + python3 -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" > log_${MODEL_ID}.txt && \ + cd $RUNDIR + +done diff --git a/tensorrtllm/whisper_utils.py b/tensorrtllm/whisper_utils.py new file mode 100755 index 0000000..81164db --- /dev/null +++ b/tensorrtllm/whisper_utils.py @@ -0,0 +1,254 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from https://github.com/openai/whisper + +import os +import base64 +from functools import lru_cache +from pathlib import Path + +from typing import Optional, Union + +import numpy as np + +import torch +import torch.nn.functional as F +import tiktoken + +Pathlike = Union[str, Path] +N_FFT = 400 +HOP_LENGTH = 160 + +LANGUAGES = { + "en": "english", + "zh": "chinese", + "de": "german", + "es": "spanish", + "ru": "russian", + "ko": "korean", + "fr": "french", + "ja": "japanese", + "pt": "portuguese", + "tr": "turkish", + "pl": "polish", + "ca": "catalan", + "nl": "dutch", + "ar": "arabic", + "sv": "swedish", + "it": "italian", + "id": "indonesian", + "hi": "hindi", + "fi": "finnish", + "vi": "vietnamese", + "he": "hebrew", + "uk": "ukrainian", + "el": "greek", + "ms": "malay", + "cs": "czech", + "ro": "romanian", + "da": "danish", + "hu": "hungarian", + "ta": "tamil", + "no": "norwegian", + "th": "thai", + "ur": "urdu", + "hr": "croatian", + "bg": "bulgarian", + "lt": "lithuanian", + "la": "latin", + "mi": "maori", + "ml": "malayalam", + "cy": "welsh", + "sk": "slovak", + "te": "telugu", + "fa": "persian", + "lv": "latvian", + "bn": "bengali", + "sr": "serbian", + "az": "azerbaijani", + "sl": "slovenian", + "kn": "kannada", + "et": "estonian", + "mk": "macedonian", + "br": "breton", + "eu": "basque", + "is": "icelandic", + "hy": "armenian", + "ne": "nepali", + "mn": "mongolian", + "bs": "bosnian", + "kk": "kazakh", + "sq": "albanian", + "sw": "swahili", + "gl": "galician", + "mr": "marathi", + "pa": "punjabi", + "si": "sinhala", + "km": "khmer", + "sn": "shona", + "yo": "yoruba", + "so": "somali", + "af": "afrikaans", + "oc": "occitan", + "ka": "georgian", + "be": "belarusian", + "tg": "tajik", + "sd": "sindhi", + "gu": "gujarati", + "am": "amharic", + "yi": "yiddish", + "lo": "lao", + "uz": "uzbek", + "fo": "faroese", + "ht": "haitian creole", + "ps": "pashto", + "tk": "turkmen", + "nn": "nynorsk", + "mt": "maltese", + "sa": "sanskrit", + "lb": "luxembourgish", + "my": "myanmar", + "bo": "tibetan", + "tl": "tagalog", + "mg": "malagasy", + "as": "assamese", + "tt": "tatar", + "haw": "hawaiian", + "ln": "lingala", + "ha": "hausa", + "ba": "bashkir", + "jw": "javanese", + "su": "sundanese", + "yue": "cantonese", +} + +def get_tokenizer(name: str = "multilingual", + num_languages: int = 99, + tokenizer_dir: str = None): + if tokenizer_dir is None: + vocab_path = os.path.join(os.path.dirname(__file__), + f"assets/{name}.tiktoken") + else: + vocab_path = os.path.join(tokenizer_dir, f"{name}.tiktoken") + ranks = { + base64.b64decode(token): int(rank) + for token, rank in (line.split() for line in open(vocab_path) if line) + } + n_vocab = len(ranks) + special_tokens = {} + + specials = [ + "<|endoftext|>", + "<|startoftranscript|>", + *[f"<|{lang}|>" for lang in list(LANGUAGES.keys())[:num_languages]], + "<|translate|>", + "<|transcribe|>", + "<|startoflm|>", + "<|startofprev|>", + "<|nospeech|>", + "<|notimestamps|>", + *[f"<|{i * 0.02:.2f}|>" for i in range(1501)], + ] + + for token in specials: + special_tokens[token] = n_vocab + n_vocab += 1 + + return tiktoken.Encoding( + 
name=os.path.basename(vocab_path), + explicit_n_vocab=n_vocab, + pat_str= + r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""", + mergeable_ranks=ranks, + special_tokens=special_tokens, + ) + +@lru_cache(maxsize=None) +def mel_filters(device, + n_mels: int, + mel_filters_dir: str = None) -> torch.Tensor: + """ + load the mel filterbank matrix for projecting STFT into a Mel spectrogram. + Allows decoupling librosa dependency; saved using: + + np.savez_compressed( + "mel_filters.npz", + mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80), + ) + """ + assert n_mels in {80, 128}, f"Unsupported n_mels: {n_mels}" + if mel_filters_dir is None: + mel_filters_path = os.path.join(os.path.dirname(__file__), "assets", + "mel_filters.npz") + else: + mel_filters_path = os.path.join(mel_filters_dir, "mel_filters.npz") + with np.load(mel_filters_path) as f: + return torch.from_numpy(f[f"mel_{n_mels}"]).to(device) + +def log_mel_spectrogram( + audio: Union[str, np.ndarray, torch.Tensor], + n_mels: int, + padding: int = 0, + device: Optional[Union[str, torch.device]] = None, + return_duration: bool = False, + mel_filters_dir: str = None, +): + """ + Compute the log-Mel spectrogram of + + Parameters + ---------- + audio: Union[str, np.ndarray, torch.Tensor], shape = (*) + The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz + + n_mels: int + The number of Mel-frequency filters, only 80 and 128 are supported + + padding: int + Number of zero samples to pad to the right + + device: Optional[Union[str, torch.device]] + If given, the audio tensor is moved to this device before STFT + + Returns + ------- + torch.Tensor, shape = (80 or 128, n_frames) + A Tensor that contains the Mel spectrogram + """ + assert torch.is_tensor(audio), f"Unsupported audio type: {type(audio)}" + + if device is not None: + audio = audio.to(device) + if padding > 0: + # pad to N_SAMPLES + audio = F.pad(audio, (0, padding)) + window = torch.hann_window(N_FFT).to(audio.device) + stft = torch.stft(audio, + N_FFT, + HOP_LENGTH, + window=window, + return_complex=True) + magnitudes = stft[..., :-1].abs()**2 + + filters = mel_filters(audio.device, n_mels, mel_filters_dir) + mel_spec = filters @ magnitudes + + log_spec = torch.clamp(mel_spec, min=1e-10).log10() + log_spec = torch.maximum(log_spec, log_spec.max() - 8.0) + log_spec = (log_spec + 4.0) / 4.0 + if return_duration: + return log_spec, duration + else: + return log_spec diff --git a/transformers/run_data2vec.sh b/transformers/run_data2vec.sh new file mode 100644 index 0000000..b82e61d --- /dev/null +++ b/transformers/run_data2vec.sh @@ -0,0 +1,92 @@ +#!/bin/bash + +export PYTHONPATH="..":$PYTHONPATH + +MODEL_IDs=("facebook/data2vec-audio-large-960h" "facebook/data2vec-audio-base-960h") +BATCH_SIZE=8 + +num_models=${#MODEL_IDs[@]} + +for (( i=0; i<${num_models}; i++ )); +do + MODEL_ID=${MODEL_IDs[$i]} + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="voxpopuli" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="ami" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="earnings22" \ + --split="test" \ + --device=0 \ 
+ --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="gigaspeech" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.clean" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.other" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="spgispeech" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="tedlium" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + # Evaluate results + RUNDIR=`pwd` && \ + cd ../normalizer && \ + python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \ + cd $RUNDIR + +done \ No newline at end of file diff --git a/transformers/run_eval.py b/transformers/run_eval.py index 4dcffe0..89c4272 100644 --- a/transformers/run_eval.py +++ b/transformers/run_eval.py @@ -1,84 +1,159 @@ import argparse - -from transformers import pipeline -from normalizer import EnglishTextNormalizer -from datasets import load_dataset, Audio +import os +import torch +from torch.nn.attention import sdpa_kernel, SDPBackend +from transformers import AutoConfig, AutoModelForSpeechSeq2Seq, AutoModelForCTC, AutoProcessor, MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING import evaluate +from normalizer import data_utils +import time +from tqdm import tqdm wer_metric = evaluate.load("wer") - - -def is_target_text_in_range(ref): - if ref.strip() == "ignore time segment in scoring": - return False - else: - return ref.strip() != "" - - -def get_text(sample): - if "text" in sample: - return sample["text"] - elif "sentence" in sample: - return sample["sentence"] - elif "normalized_text" in sample: - return sample["normalized_text"] - elif "transcript" in sample: - return sample["transcript"] - elif "transcription" in sample: - return sample["transcription"] - else: - raise ValueError( - f"Expected transcript column of either 'text', 'sentence', 'normalized_text' or 'transcript'. Got sample of " - ".join{sample.keys()}. Ensure a text column name is present in the dataset." 
- ) - - -normalizer = EnglishTextNormalizer() - - -def normalize(batch): - batch["norm_text"] = normalizer(get_text(batch)) - return batch - - -def data(dataset): - for i, item in enumerate(dataset): - yield {**item["audio"], "reference": item["norm_text"]} - +torch.set_float32_matmul_precision('high') def main(args): - asr_pipe = pipeline( - "automatic-speech-recognition", model=args.model_id, device=args.device + config = AutoConfig.from_pretrained(args.model_id) + cls_model = AutoModelForSpeechSeq2Seq if type(config) in MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING else AutoModelForCTC + model = cls_model.from_pretrained(args.model_id, torch_dtype=torch.bfloat16, attn_implementation="sdpa").to(args.device) + processor = AutoProcessor.from_pretrained(args.model_id) + model_input_name = processor.model_input_names[0] + + if model.can_generate(): + gen_kwargs = {"max_new_tokens": args.max_new_tokens} + # for multilingual Whisper-checkpoints we see a definitive WER boost by setting the language and task args + if getattr(model.generation_config, "is_multilingual"): + gen_kwargs["language"] = "en" + gen_kwargs["task"] = "transcribe" + elif args.max_new_tokens: + raise ValueError("`max_new_tokens` should only be set for auto-regressive models, but got a CTC model.") + + if args.torch_compile: + model.forward = torch.compile(model.forward, mode=args.compile_mode, fullgraph=True) + if model.can_generate(): + # enable static k/v cache for autoregressive models + model.generation_config.cache_implementation = "static" + + def benchmark(batch, min_new_tokens=None): + # Load audio inputs + audios = [audio["array"] for audio in batch["audio"]] + minibatch_size = len(audios) + + # START TIMING + start_time = time.time() + + # 1. Pre-Processing + # 1.1 Pad audios to max batch size if using torch compile to prevent re-compilations + padding_size = None + if minibatch_size != args.batch_size and args.torch_compile: + padding_size = args.batch_size - minibatch_size + padding_audios = [audios[-1] for _ in range(padding_size)] + audios.extend(padding_audios) + + if not model.can_generate(): #or len(audios[0]) > processor.feature_extractor.n_samples: + # 1.2 Either CTC pre-processing (normalize to mean 0, std 1), or long-form Whisper processing + inputs = processor( + audios, + sampling_rate=16_000, + truncation=False, + padding="longest", + return_tensors="pt", + return_attention_mask=True, + ) + else: + # 1.3 Standard Whisper processing: pad audios to 30-seconds and converted to log-mel + inputs = processor(audios, sampling_rate=16_000, return_tensors="pt", device=args.device) + + inputs = inputs.to(args.device) + inputs[model_input_name] = inputs[model_input_name].to(torch.bfloat16) + + # 2. Model Inference + with sdpa_kernel(SDPBackend.MATH if args.torch_compile else SDPBackend.FLASH_ATTENTION): + if model.can_generate(): + # 2.1 Auto-regressive generation for encoder-decoder models + pred_ids = model.generate(**inputs, **gen_kwargs, min_new_tokens=min_new_tokens) + else: + # 2.2. Single forward pass for CTC + with torch.no_grad(): + logits = model(**inputs).logits + pred_ids = logits.argmax(-1) + + # 3. Post-processing + # 3.1 Strip padded ids from predictions + if padding_size is not None: + pred_ids = pred_ids[:-padding_size, ...] 
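+                # The rows stripped here are the duplicate audios appended in step 1.1,
+                # which exist only to keep batch shapes static under torch.compile;
+                # removing them restricts decoding and WER computation to the real samples.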
+ + # 3.2 Convert token ids to text transcription + pred_text = processor.batch_decode(pred_ids, skip_special_tokens=True) + + # END TIMING + runtime = time.time() - start_time + + # normalize by minibatch size since we want the per-sample time + batch["transcription_time_s"] = minibatch_size * [runtime / minibatch_size] + + # normalize transcriptions with English normalizer + batch["predictions"] = [data_utils.normalizer(pred) for pred in pred_text] + batch["references"] = batch["norm_text"] + return batch + + if args.warmup_steps is not None: + dataset = data_utils.load_data(args) + dataset = data_utils.prepare_data(dataset) + + num_warmup_samples = args.warmup_steps * args.batch_size + if args.streaming: + warmup_dataset = dataset.take(num_warmup_samples) + else: + warmup_dataset = dataset.select(range(min(num_warmup_samples, len(dataset)))) + warmup_dataset = iter(warmup_dataset.map(benchmark, batch_size=args.batch_size, batched=True, fn_kwargs={"min_new_tokens": args.max_new_tokens})) + + for _ in tqdm(warmup_dataset, desc="Warming up..."): + continue + + dataset = data_utils.load_data(args) + if args.max_eval_samples is not None and args.max_eval_samples > 0: + print(f"Subsampling dataset to first {args.max_eval_samples} samples!") + if args.streaming: + dataset = dataset.take(args.max_eval_samples) + else: + dataset = dataset.select(range(min(args.max_eval_samples, len(dataset)))) + dataset = data_utils.prepare_data(dataset) + + dataset = dataset.map( + benchmark, batch_size=args.batch_size, batched=True, remove_columns=["audio"], ) - dataset = load_dataset( - "esb/datasets", - args.dataset_name, - split=args.split, - streaming=args.streaming, - use_auth_token=True, + all_results = { + "audio_length_s": [], + "transcription_time_s": [], + "predictions": [], + "references": [], + } + result_iter = iter(dataset) + for result in tqdm(result_iter, desc="Samples..."): + for key in all_results: + all_results[key].append(result[key]) + + # Write manifest results (WER and RTFX) + manifest_path = data_utils.write_manifest( + all_results["references"], + all_results["predictions"], + args.model_id, + args.dataset_path, + args.dataset, + args.split, + audio_length=all_results["audio_length_s"], + transcription_time=all_results["transcription_time_s"], ) + print("Results saved at path:", os.path.abspath(manifest_path)) - # Only uncomment for debugging - dataset = dataset.take(args.max_eval_samples) - - # Re-sample to 16kHz and normalise transcriptions - dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) - dataset = dataset.map(normalize) - dataset = dataset.filter(is_target_text_in_range, input_columns=["norm_text"]) - - predictions = [] - references = [] - - # run streamed inference - for out in asr_pipe(data(dataset), batch_size=args.batch_size): - predictions.append(normalizer(out["text"])) - references.append(out["reference"][0]) - - wer = wer_metric.compute(references=references, predictions=predictions) + wer = wer_metric.compute( + references=all_results["references"], predictions=all_results["predictions"] + ) wer = round(100 * wer, 2) + rtfx = round(sum(all_results["audio_length_s"]) / sum(all_results["transcription_time_s"]), 2) + print("WER:", wer, "%", "RTFx:", rtfx) - print("WER:", wer) if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -90,11 +165,17 @@ def main(args): help="Model identifier. 
Should be loadable with 🤗 Transformers", ) parser.add_argument( - "--dataset_name", + "--dataset_path", + type=str, + default="esb/datasets", + help="Dataset path. By default, it is `esb/datasets`", + ) + parser.add_argument( + "--dataset", type=str, required=True, help="Dataset name. *E.g.* `'librispeech_asr` for the LibriSpeech ASR dataset, or `'common_voice'` for Common Voice. The full list of dataset names " - "can be found at `https://huggingface.co/datasets/esb/datasets`" + "can be found at `https://huggingface.co/datasets/esb/datasets`", ) parser.add_argument( "--split", @@ -121,12 +202,35 @@ def main(args): help="Number of samples to be evaluated. Put a lower number e.g. 64 for testing this script.", ) parser.add_argument( - "--streaming", - type=bool, - default=True, + "--no-streaming", + dest="streaming", + action="store_false", help="Choose whether you'd like to download the entire dataset or stream it during the evaluation.", ) + parser.add_argument( + "--max_new_tokens", + type=int, + default=None, + help="Maximum number of tokens to generate (for auto-regressive models).", + ) + parser.add_argument( + "--torch_compile", + action="store_true", + help="Whether to JIT compile the forward pass of the model.", + ) + parser.add_argument( + "--compile_mode", + type=str, + default="max-autotune", + help="Mode for torch compiling model forward pass. Can be either 'default', 'reduce-overhead', 'max-autotune' or 'max-autotune-no-cudagraphs'.", + ) + parser.add_argument( + "--warmup_steps", + type=int, + default=10, + help="Number of warm-up steps to run before launching the timed runs.", + ) args = parser.parse_args() + parser.set_defaults(streaming=False) main(args) - diff --git a/transformers/run_hubert.sh b/transformers/run_hubert.sh new file mode 100755 index 0000000..aa0d111 --- /dev/null +++ b/transformers/run_hubert.sh @@ -0,0 +1,92 @@ +#!/bin/bash + +export PYTHONPATH="..":$PYTHONPATH + +MODEL_IDs=( "facebook/hubert-large-ls960-ft" "facebook/hubert-xlarge-ls960-ft" "patrickvonplaten/hubert-xlarge-ls960-ft-4-gram") +BATCH_SIZE=64 + +num_models=${#MODEL_IDs[@]} + +for (( i=0; i<${num_models}; i++ )); +do + MODEL_ID=${MODEL_IDs[$i]} + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="voxpopuli" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="ami" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="earnings22" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="gigaspeech" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.clean" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.other" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + 
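+    # The remaining datasets below follow the same pattern: each run writes a
+    # per-dataset results manifest, and eval_utils.score_results at the end of
+    # the loop scores those manifests for ${MODEL_ID}.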
+ python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="spgispeech" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="tedlium" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + # Evaluate results + RUNDIR=`pwd` && \ + cd ../normalizer && \ + python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \ + cd $RUNDIR + +done diff --git a/transformers/run_mms.sh b/transformers/run_mms.sh new file mode 100755 index 0000000..6337314 --- /dev/null +++ b/transformers/run_mms.sh @@ -0,0 +1,92 @@ +#!/bin/bash + +export PYTHONPATH="..":$PYTHONPATH + +MODEL_IDs=("facebook/mms-1b-all" "facebook/mms-1b-fl102") +BATCH_SIZE=48 + +num_models=${#MODEL_IDs[@]} + +for (( i=0; i<${num_models}; i++ )); +do + MODEL_ID=${MODEL_IDs[$i]} + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="voxpopuli" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="ami" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="earnings22" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="gigaspeech" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.clean" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.other" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="spgispeech" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="tedlium" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + # Evaluate results + RUNDIR=`pwd` && \ + cd ../normalizer && \ + python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \ + cd $RUNDIR + +done diff --git a/transformers/run_wav2vec2.sh b/transformers/run_wav2vec2.sh old mode 100644 new mode 100755 index 7cf1c57..6ac2e66 --- a/transformers/run_wav2vec2.sh +++ b/transformers/run_wav2vec2.sh @@ -1,5 +1,92 @@ -python run_eval.py \ - --model_id="facebook/wav2vec2-base-960h" \ - --dataset="librispeech_asr" \ - --split="test" \ - --device=0 +#!/bin/bash + +export PYTHONPATH="..":$PYTHONPATH + +MODEL_IDs=("facebook/wav2vec2-base-960h" "facebook/wav2vec2-large-960h" 
"facebook/wav2vec2-large-960h-lv60-self" "facebook/wav2vec2-large-robust-ft-libri-960h") +BATCH_SIZE=64 + +num_models=${#MODEL_IDs[@]} + +for (( i=0; i<${num_models}; i++ )); +do + MODEL_ID=${MODEL_IDs[$i]} + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="voxpopuli" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="ami" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="earnings22" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="gigaspeech" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.clean" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.other" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="spgispeech" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="tedlium" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + # Evaluate results + RUNDIR=`pwd` && \ + cd ../normalizer && \ + python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \ + cd $RUNDIR + +done diff --git a/transformers/run_wav2vec2_conformer.sh b/transformers/run_wav2vec2_conformer.sh new file mode 100755 index 0000000..b2cdccf --- /dev/null +++ b/transformers/run_wav2vec2_conformer.sh @@ -0,0 +1,92 @@ +#!/bin/bash + +export PYTHONPATH="..":$PYTHONPATH + +MODEL_IDs=("facebook/wav2vec2-conformer-rel-pos-large-960h-ft" "facebook/wav2vec2-conformer-rope-large-960h-ft") +BATCH_SIZE=32 + +num_models=${#MODEL_IDs[@]} + +for (( i=0; i<${num_models}; i++ )); +do + MODEL_ID=${MODEL_IDs[$i]} + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="voxpopuli" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="ami" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="earnings22" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="gigaspeech" \ + --split="test" \ + 
--device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.clean" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.other" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="spgispeech" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="tedlium" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + # Evaluate results + RUNDIR=`pwd` && \ + cd ../normalizer && \ + python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \ + cd $RUNDIR + +done diff --git a/transformers/run_whisper.sh b/transformers/run_whisper.sh old mode 100644 new mode 100755 index 75b9b6e..b3813ff --- a/transformers/run_whisper.sh +++ b/transformers/run_whisper.sh @@ -1,5 +1,92 @@ -python run_eval.py \ - --model_id="openai/whisper-tiny" \ - --dataset="librispeech_asr" \ - --split="test" \ - --device=0 +#!/bin/bash + +export PYTHONPATH="..":$PYTHONPATH + +MODEL_IDs=("openai/whisper-tiny.en" "openai/whisper-small.en" "openai/whisper-base.en" "openai/whisper-medium.en" "openai/whisper-large" "openai/whisper-large-v2" "openai/whisper-large-v3" "distil-whisper/distil-medium.en" "distil-whisper/distil-large-v2" "distil-whisper/distil-large-v3" "nyrahealth/CrisperWhisper") +BATCH_SIZE=64 + +num_models=${#MODEL_IDs[@]} + +for (( i=0; i<${num_models}; i++ )); +do + MODEL_ID=${MODEL_IDs[$i]} + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="voxpopuli" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="ami" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="earnings22" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="gigaspeech" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.clean" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.other" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="spgispeech" \ + --split="test" \ + --device=0 \ + 
--batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="tedlium" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + # Evaluate results + RUNDIR=`pwd` && \ + cd ../normalizer && \ + python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \ + cd $RUNDIR + +done diff --git a/vllm/run_eval.py b/vllm/run_eval.py new file mode 100644 index 0000000..127dd7d --- /dev/null +++ b/vllm/run_eval.py @@ -0,0 +1,163 @@ +"""Run evaluation for vllm whisper models.""" "" +import argparse +import os +import time + +import evaluate +from tqdm import tqdm +from normalizer import data_utils + +from vllm import LLM +from vllm.sampling_params import SamplingParams + +wer_metric = evaluate.load("wer") + + +def main(args) -> None: + """Main function to run evaluation on a dataset.""" + + device_id = "auto" + if (args.device > 0): + device_id = f"cuda:{args.device}" + + llm = LLM( + model=args.model_id, + max_model_len=448, + max_num_seqs=args.batch_size, + limit_mm_per_prompt={"audio": 1}, + kv_cache_dtype="fp8", + device=device_id + ) + + def make_prompt(chunk, sr): + return { + "prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>", + "multi_modal_data": { + "audio": (chunk, sr), + }, + } + + def process_vllm(batch): + start_time = time.time() + batch_size = len(batch["audio"]) + sampling_params = SamplingParams( + temperature=0, + top_p=1.0, + max_tokens=200, + ) + # dataset resamples to 16kHz + prompts = [ make_prompt(sample["array"], 16000.0) for sample in batch["audio"] ] + outputs = llm.generate(prompts, sampling_params) + # average transcription time over the whole batch + batch["transcription_time_s"] = [ (time.time() - start_time) / batch_size ] * batch_size + batch["predictions"] = [ + data_utils.normalizer("".join([output.outputs[0].text])).strip() + for output in outputs + ] + batch["references"] = batch["norm_text"] + return batch + + dataset = data_utils.load_data(args) + + if args.max_eval_samples is not None and args.max_eval_samples > 0: + print(f"Subsampling dataset to first {args.max_eval_samples} samples!") + if args.streaming: + dataset = dataset.take(args.max_eval_samples) + else: + dataset = dataset.select(range(min(args.max_eval_samples, len(dataset)))) + dataset = data_utils.prepare_data(dataset) + + dataset = dataset.map( + process_vllm, batch_size=args.batch_size * 2, batched=True, remove_columns=["audio"] + ) + + all_results = { + "audio_length_s": [], + "transcription_time_s": [], + "predictions": [], + "references": [], + } + result_iter = iter(dataset) + for result in tqdm(result_iter, desc="Samples..."): + for key in all_results: + all_results[key].append(result[key]) + + # Write manifest results (WER and RTFX) + manifest_path = data_utils.write_manifest( + all_results["references"], + all_results["predictions"], + args.model_id, + args.dataset_path, + args.dataset, + args.split, + audio_length=all_results["audio_length_s"], + transcription_time=all_results["transcription_time_s"], + ) + print("Results saved at path:", os.path.abspath(manifest_path)) + + wer = wer_metric.compute( + references=all_results["references"], predictions=all_results["predictions"] + ) + wer = round(100 * wer, 2) + rtfx = round( + sum(all_results["audio_length_s"]) / sum(all_results["transcription_time_s"]), 2 + ) + print("WER:", wer, "%", "RTFx:", rtfx) + + +if __name__ == "__main__": + 
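    # Entry point: parse the command-line arguments below and run the evaluation.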
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--model_id",
+        type=str,
+        required=True,
+        help="Model identifier. Should be loadable with vLLM",
+    )
+    parser.add_argument(
+        "--dataset_path",
+        type=str,
+        default="esb/datasets",
+        help="Dataset path. By default, it is `esb/datasets`",
+    )
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        required=True,
+        help="Dataset name. *E.g.* `'librispeech_asr'` for the LibriSpeech ASR dataset, or `'common_voice'` for Common Voice. The full list of dataset names "
+        "can be found at `https://huggingface.co/datasets/esb/datasets`",
+    )
+    parser.add_argument(
+        "--split",
+        type=str,
+        default="test",
+        help="Split of the dataset. *E.g.* `'validation'` for the dev split, or `'test'` for the test split.",
+    )
+    parser.add_argument(
+        "--device",
+        type=int,
+        default=-1,
+        help="The device to run the pipeline on. -1 for CPU (default), 0 for the first GPU and so on.",
+    )
+    parser.add_argument(
+        "--batch_size",
+        type=int,
+        default=128,
+        help="Number of samples to go through each streamed batch.",
+    )
+    parser.add_argument(
+        "--max_eval_samples",
+        type=int,
+        default=None,
+        help="Number of samples to be evaluated. Put a lower number e.g. 64 for testing this script.",
+    )
+    parser.add_argument(
+        "--no-streaming",
+        dest="streaming",
+        action="store_false",
+        help="Choose whether you'd like to download the entire dataset or stream it during the evaluation.",
+    )
+    # Defaults must be set before parse_args() for them to take effect.
+    parser.set_defaults(streaming=False)
+    args = parser.parse_args()
+
+    main(args)
diff --git a/vllm/run_whisper.sh b/vllm/run_whisper.sh
new file mode 100755
index 0000000..b3813ff
--- /dev/null
+++ b/vllm/run_whisper.sh
@@ -0,0 +1,92 @@
+#!/bin/bash
+
+export PYTHONPATH="..":$PYTHONPATH
+
+MODEL_IDs=("openai/whisper-tiny.en" "openai/whisper-small.en" "openai/whisper-base.en" "openai/whisper-medium.en" "openai/whisper-large" "openai/whisper-large-v2" "openai/whisper-large-v3" "distil-whisper/distil-medium.en" "distil-whisper/distil-large-v2" "distil-whisper/distil-large-v3" "nyrahealth/CrisperWhisper")
+BATCH_SIZE=64
+
+num_models=${#MODEL_IDs[@]}
+
+for (( i=0; i<${num_models}; i++ ));
+do
+    MODEL_ID=${MODEL_IDs[$i]}
+
+    python run_eval.py \
+        --model_id=${MODEL_ID} \
+        --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
+        --dataset="voxpopuli" \
+        --split="test" \
+        --device=0 \
+        --batch_size=${BATCH_SIZE} \
+        --max_eval_samples=-1
+
+    python run_eval.py \
+        --model_id=${MODEL_ID} \
+        --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
+        --dataset="ami" \
+        --split="test" \
+        --device=0 \
+        --batch_size=${BATCH_SIZE} \
+        --max_eval_samples=-1
+
+    python run_eval.py \
+        --model_id=${MODEL_ID} \
+        --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
+        --dataset="earnings22" \
+        --split="test" \
+        --device=0 \
+        --batch_size=${BATCH_SIZE} \
+        --max_eval_samples=-1
+
+    python run_eval.py \
+        --model_id=${MODEL_ID} \
+        --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
+        --dataset="gigaspeech" \
+        --split="test" \
+        --device=0 \
+        --batch_size=${BATCH_SIZE} \
+        --max_eval_samples=-1
+
+    python run_eval.py \
+        --model_id=${MODEL_ID} \
+        --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
+        --dataset="librispeech" \
+        --split="test.clean" \
+        --device=0 \
+        --batch_size=${BATCH_SIZE} \
+        --max_eval_samples=-1
+
+    python run_eval.py \
+        --model_id=${MODEL_ID} \
+        --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
+        --dataset="librispeech" \
+        --split="test.other" \
+        --device=0 \
+        --batch_size=${BATCH_SIZE} \
--max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="spgispeech" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + python run_eval.py \ + --model_id=${MODEL_ID} \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="tedlium" \ + --split="test" \ + --device=0 \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 + + # Evaluate results + RUNDIR=`pwd` && \ + cd ../normalizer && \ + python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \ + cd $RUNDIR + +done
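Each run_*.sh script above evaluates its models on the same eight ESB test sets (voxpopuli, ami, earnings22, gigaspeech, librispeech test.clean and test.other, spgispeech, tedlium) and then calls eval_utils.score_results to aggregate the per-dataset results written by run_eval.py. If the repeated run_eval.py blocks become hard to maintain, they could instead be driven from a dataset/split list; the loop below is a sketch of one possible equivalent (it is not part of this diff and assumes run_eval.py keeps the flags used above):

#!/bin/bash
# Sketch only: same behaviour as the per-dataset blocks above, driven from a list.
export PYTHONPATH="..":$PYTHONPATH

MODEL_IDs=("openai/whisper-large-v3")   # trim or extend as in the scripts above
BATCH_SIZE=64
# librispeech is evaluated on two splits, so each entry carries its own split.
DATASETS=("voxpopuli:test" "ami:test" "earnings22:test" "gigaspeech:test"
          "librispeech:test.clean" "librispeech:test.other" "spgispeech:test" "tedlium:test")

for MODEL_ID in "${MODEL_IDs[@]}"; do
    for PAIR in "${DATASETS[@]}"; do
        DATASET="${PAIR%%:*}"
        SPLIT="${PAIR#*:}"
        python run_eval.py \
            --model_id="${MODEL_ID}" \
            --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
            --dataset="${DATASET}" \
            --split="${SPLIT}" \
            --device=0 \
            --batch_size=${BATCH_SIZE} \
            --max_eval_samples=-1
    done

    # Aggregate WER and RTFx for this model across all datasets.
    RUNDIR=$(pwd) && \
    cd ../normalizer && \
    python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \
    cd "$RUNDIR"
done

RTFx here is the inverse real-time factor computed by run_eval.py (total audio duration divided by total transcription time), so higher means faster; score_results presumably reads the per-dataset manifests that run_eval.py reports as saved under the results directory.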