diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py index 0412c5f379..a378bc6baa 100644 --- a/.buildkite/check-wheel-size.py +++ b/.buildkite/check-wheel-size.py @@ -1,9 +1,14 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import sys import zipfile -# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 250 MB -VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 250)) +# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB +# Note that we have 400 MiB quota, please use it wisely. +# See https://github.com/pypi/support/issues/3792 . +# Please also sync the value with the one in Dockerfile. +VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400)) def print_top_10_largest_files(zip_file): diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py new file mode 100644 index 0000000000..36e1b6c013 --- /dev/null +++ b/.buildkite/generate_index.py @@ -0,0 +1,26 @@ +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import os + +template = """ + +
Thank you for your contribution to vLLM! Before submitting the pull request, please ensure the PR meets the following criteria. This helps vLLM maintain code quality and improves the efficiency of the review process.

### PR Title and Classification

Only specific types of PRs will be reviewed. The PR title should be prefixed appropriately to indicate the type of change. Please use one of the following:

- `[Bugfix]` for bug fixes.
- `[CI/Build]` for build or continuous integration improvements.
- `[Doc]` for documentation fixes and improvements.
- `[Model]` for adding a new model or improving an existing model. The model name should appear in the title.
- `[Frontend]` for changes to the vLLM frontend (e.g., the OpenAI API server, the `LLM` class, etc.).
- `[Kernel]` for changes affecting CUDA kernels or other compute kernels.
- `[Core]` for changes in the core vLLM logic (e.g., `LLMEngine`, `AsyncLLMEngine`, `Scheduler`, etc.).
- `[Hardware][Vendor]` for hardware-specific changes. The vendor name should appear in the prefix (e.g., `[Hardware][AMD]`).
- `[Misc]` for PRs that do not fit the above categories. Please use this sparingly.

**Note:** If the PR spans more than one category, please include all relevant prefixes.
### Code Quality

The PR needs to meet the following code quality standards:

- Pass all linter checks. Please use `format.sh` to format your code.
- Add documentation to `docs/source/` if the PR modifies user-facing behaviors of vLLM. This helps vLLM users understand and utilize the new features or changes.

### Adding or changing kernels

Each custom kernel needs a schema and one or more implementations to be registered with PyTorch.

- Custom operations that return `Tensors` require meta-functions. Meta-functions should be implemented and registered in Python so that dynamic dims can be handled automatically; see the PyTorch custom-operator documentation for a description of meta-functions.
- Use `torch.library.opcheck()` to test the function registration and meta-function for any registered ops. See `tests/kernels` for examples, and the minimal sketch at the end of this checklist.

### Notes for Large Changes

Please keep the changes as concise as possible. For major architectural changes (>500 LOC, excluding kernel/data/config/test code), we expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag the PR with `rfc-required` and might not review it.
### What to Expect for the Reviews

The goal of the vLLM team is to be a transparent reviewing machine. We would like to make the review process transparent and efficient, and to make sure no contributor feels confused or frustrated. However, the vLLM team is small, so we need to prioritize some PRs over others. Here is what you can expect from the review process:

- After the review, the reviewer will put an `action-required` label on the PR if there are changes required. The contributor should address the comments and ping the reviewer to re-review the PR.

### Thank You

Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. Your contributions make vLLM a great tool for everyone!
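As a concrete illustration of the kernel checklist items above, the following is a minimal sketch of how a custom op, its Python meta/fake function, and a `torch.library.opcheck()` test fit together. The op `mylib::scale` and its implementation are hypothetical (not taken from this diff), and the sketch assumes PyTorch 2.4+ where `torch.library.custom_op` is available:

```python
# Minimal sketch with a hypothetical op; assumes PyTorch >= 2.4.
import torch
from torch.library import custom_op, opcheck


@custom_op("mylib::scale", mutates_args=())
def scale(x: torch.Tensor, factor: float) -> torch.Tensor:
    # Eager reference implementation; a real kernel would dispatch to a
    # CUDA/HIP implementation here instead.
    return x * factor


@scale.register_fake
def _(x: torch.Tensor, factor: float) -> torch.Tensor:
    # Meta/fake function: describes only the output's shape and dtype, so the
    # op can be traced with dynamic dims without running the real kernel.
    return torch.empty_like(x)


# opcheck exercises the schema, the fake implementation, and dispatcher
# behavior on a sample input, similar to the tests under tests/kernels.
opcheck(scale, (torch.randn(4, 8), 2.0))
```

Real vLLM kernels register C++/CUDA implementations behind the schema; the sketch only shows the registration, meta-function, and opcheck pattern that the checklist items refer to.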
diff --git a/README.md b/README.md
-| Documentation | Blog | Paper | Discord | Twitter/X | Developer Slack |
+| Documentation | Blog | Paper | Twitter/X | Developer Slack |
+--- + +We are excited to invite you to our Menlo Park meetup with Meta, evening of Thursday, February 27! Meta engineers will discuss the improvements on top of vLLM, and vLLM contributors will share updates from the v0.7.x series of releases. [Register Now](https://lu.ma/h7g3kuj9) + +--- *Latest News* 🔥 -- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there! -- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/sessioncatalog?tab.day=20241001&search.sessiontracks=1719251906298001uzJ2) from other vLLM contributors and users! + +- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html). +- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing). +- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone! +- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing). +- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there! +- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users! - [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing). - [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing). - [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html). @@ -28,13 +38,17 @@ Easy, fast, and cheap LLM serving for everyone - [2023/06] We officially released vLLM! 
FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai). --- + ## About + vLLM is a fast and easy-to-use library for LLM inference and serving. +Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry. + vLLM is fast with: - State-of-the-art serving throughput -- Efficient management of attention key and value memory with **PagedAttention** +- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html) - Continuous batching of incoming requests - Fast model execution with CUDA/HIP graph - Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8. @@ -42,7 +56,7 @@ vLLM is fast with: - Speculative decoding - Chunked prefill -**Performance benchmark**: We include a performance benchmark at the end of [our blog post](https://blog.vllm.ai/2024/09/05/perf-update.html). It compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [SGLang](https://github.com/sgl-project/sglang) and [LMDeploy](https://github.com/InternLM/lmdeploy)). The implementation is under [nightly-benchmarks folder](.buildkite/nightly-benchmarks/) and you can [reproduce](https://github.com/vllm-project/vllm/issues/8176) this benchmark using our one-click runnable script. +**Performance benchmark**: We include a performance benchmark at the end of [our blog post](https://blog.vllm.ai/2024/09/05/perf-update.html). It compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [SGLang](https://github.com/sgl-project/sglang) and [LMDeploy](https://github.com/InternLM/lmdeploy)). The implementation is under [nightly-benchmarks folder](.buildkite/nightly-benchmarks/) and you can [reproduce](https://github.com/vllm-project/vllm/issues/8176) this benchmark using our one-click runnable script. vLLM is flexible and easy to use with: @@ -57,7 +71,7 @@ vLLM is flexible and easy to use with: vLLM seamlessly supports most popular open-source models on HuggingFace, including: - Transformer-like LLMs (e.g., Llama) -- Mixture-of-Expert LLMs (e.g., Mixtral) +- Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3) - Embedding Models (e.g. E5-Mistral) - Multi-modal LLMs (e.g., LLaVA) @@ -65,16 +79,16 @@ Find the full list of supported models [here](https://docs.vllm.ai/en/latest/mod ## Getting Started -Install vLLM with `pip` or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source): +Install vLLM with `pip` or [from source](https://docs.vllm.ai/en/latest/getting_started/installation/gpu/index.html#build-wheel-from-source): ```bash pip install vllm ``` -Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to learn more. -- [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html) -- [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html) -- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html) +Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more. 
+- [Installation](https://docs.vllm.ai/en/latest/getting_started/installation/index.html) +- [Quickstart](https://docs.vllm.ai/en/latest/getting_started/quickstart.html) +- [List of Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html) ## Contributing @@ -87,33 +101,40 @@ vLLM is a community project. Our compute resources for development and testing a - +Cash Donations: - a16z +- Dropbox +- Sequoia Capital +- Skywork AI +- ZhenFund + +Compute Resources: - AMD - Anyscale - AWS - Crusoe Cloud - Databricks - DeepInfra -- Dropbox - Google Cloud - Lambda Lab +- Nebius +- Novita AI - NVIDIA - Replicate - Roblox - RunPod -- Sequoia Capital -- Skywork AI - Trainy - UC Berkeley - UC San Diego -- ZhenFund + +Slack Sponsor: Anyscale We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM. ## Citation If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180): + ```bibtex @inproceedings{kwon2023efficient, title={Efficient Memory Management for Large Language Model Serving with PagedAttention}, @@ -125,8 +146,11 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs ## Contact Us -* For technical questions and feature requests, please use Github issues or discussions. -* For discussing with fellow users, please use Discord. -* For coordinating contributions and development, please use Slack. -* For security disclosures, please use Github's security advisory feature. -* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu. +- For technical questions and feature requests, please use Github issues or discussions. +- For discussing with fellow users and coordinating contributions and development, please use Slack. +- For security disclosures, please use Github's security advisory feature. +- For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu. + +## Media Kit + +- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit). diff --git a/SECURITY.md b/SECURITY.md index ad3f1f16ab..47196a1f12 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -4,7 +4,7 @@ If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem. -Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). +Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). --- diff --git a/benchmarks/README.md b/benchmarks/README.md index 192d6c4022..367ef93457 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -3,6 +3,27 @@ ## Downloading the ShareGPT dataset You can download the dataset by running: + ```bash wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json ``` + +## Downloading the ShareGPT4V dataset + +The json file refers to several image datasets (coco, llava, etc.). The benchmark scripts +will ignore a datapoint if the referred image is missing. 
+ +```bash +wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json +mkdir coco -p +wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip +unzip coco/train2017.zip -d coco/ +``` + +# Downloading the BurstGPT dataset + +You can download the BurstGPT v1.1 dataset by running: + +```bash +wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv +``` diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 4813fde27f..364b087b84 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json import os import sys @@ -22,8 +24,10 @@ class RequestFuncInput: prompt_len: int output_len: int model: str + model_name: Optional[str] = None best_of: int = 1 logprobs: Optional[int] = None + extra_body: Optional[dict] = None multi_modal_content: Optional[dict] = None ignore_eos: bool = False @@ -33,9 +37,11 @@ class RequestFuncOutput: generated_text: str = "" success: bool = False latency: float = 0.0 + output_tokens: int = 0 ttft: float = 0.0 # Time to first token itl: List[float] = field( default_factory=list) # List of inter-token latencies + tpot: float = 0.0 # avg next-token latencies prompt_len: int = 0 error: str = "" @@ -47,13 +53,15 @@ async def async_request_tgi( api_url = request_func_input.api_url assert api_url.endswith("generate_stream") - async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: params = { "best_of": request_func_input.best_of, "max_new_tokens": request_func_input.output_len, "do_sample": True, "temperature": 0.01, # TGI does not accept 0.0 temperature. "top_p": 0.99, # TGI does not accept 1.0 top_p. + "truncate": request_func_input.prompt_len, # TGI does not accept ignore_eos flag. } payload = { @@ -75,11 +83,11 @@ async def async_request_tgi( continue chunk_bytes = chunk_bytes.decode("utf-8") - #NOTE: Sometimes TGI returns a ping response without + # NOTE: Sometimes TGI returns a ping response without # any data, we should skip it. 
if chunk_bytes.startswith(":"): continue - chunk = remove_prefix(chunk_bytes, "data:") + chunk = chunk_bytes.removeprefix("data:") data = json.loads(chunk) timestamp = time.perf_counter() @@ -118,7 +126,8 @@ async def async_request_trt_llm( api_url = request_func_input.api_url assert api_url.endswith("generate_stream") - async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: assert request_func_input.best_of == 1 payload = { "accumulate_tokens": True, @@ -144,15 +153,15 @@ async def async_request_trt_llm( if not chunk_bytes: continue - chunk = remove_prefix(chunk_bytes.decode("utf-8"), - "data:") + chunk = chunk_bytes.decode("utf-8").removeprefix( + "data:") data = json.loads(chunk) output.generated_text += data["text_output"] timestamp = time.perf_counter() # First token if ttft == 0.0: - ttft = time.perf_counter() - st + ttft = timestamp - st output.ttft = ttft # Decoding phase @@ -182,7 +191,8 @@ async def async_request_deepspeed_mii( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, ) -> RequestFuncOutput: - async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: assert request_func_input.best_of == 1 payload = { @@ -230,17 +240,25 @@ async def async_request_openai_completions( ("completions", "profile") ), "OpenAI Completions API URL must end with 'completions' or 'profile'." - async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: payload = { - "model": request_func_input.model, + "model": request_func_input.model_name \ + if request_func_input.model_name else request_func_input.model, "prompt": request_func_input.prompt, "temperature": 0.0, "best_of": request_func_input.best_of, "max_tokens": request_func_input.output_len, "logprobs": request_func_input.logprobs, "stream": True, - "ignore_eos": request_func_input.ignore_eos, + "stream_options": { + "include_usage": True, + }, } + if request_func_input.ignore_eos: + payload["ignore_eos"] = request_func_input.ignore_eos + if request_func_input.extra_body: + payload.update(request_func_input.extra_body) headers = { "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" } @@ -249,32 +267,34 @@ async def async_request_openai_completions( output.prompt_len = request_func_input.prompt_len generated_text = "" - ttft = 0.0 st = time.perf_counter() most_recent_timestamp = st try: async with session.post(url=api_url, json=payload, headers=headers) as response: if response.status == 200: + first_chunk_received = False async for chunk_bytes in response.content: chunk_bytes = chunk_bytes.strip() if not chunk_bytes: continue - chunk = remove_prefix(chunk_bytes.decode("utf-8"), - "data: ") - if chunk == "[DONE]": - latency = time.perf_counter() - st - else: + chunk = chunk_bytes.decode("utf-8").removeprefix( + "data: ") + if chunk != "[DONE]": data = json.loads(chunk) # NOTE: Some completion API might have a last # usage summary response without a token so we # want to check a token was generated - if data["choices"][0]["text"]: + if choices := data.get("choices"): + # Note that text could be empty here + # e.g. 
for special tokens + text = choices[0].get("text") timestamp = time.perf_counter() # First token - if ttft == 0.0: + if not first_chunk_received: + first_chunk_received = True ttft = time.perf_counter() - st output.ttft = ttft @@ -284,11 +304,19 @@ async def async_request_openai_completions( most_recent_timestamp) most_recent_timestamp = timestamp - generated_text += data["choices"][0]["text"] - + generated_text += text or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens") + if first_chunk_received: + output.success = True + else: + output.success = False + output.error = ( + "Never received a valid chunk to calculate TTFT." + "This response will be marked as failed!") output.generated_text = generated_text - output.success = True - output.latency = latency + output.latency = most_recent_timestamp - st else: output.error = response.reason or "" output.success = False @@ -311,12 +339,14 @@ async def async_request_openai_chat_completions( "chat/completions" ), "OpenAI Chat Completions API URL must end with 'chat/completions'." - async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: content = [{"type": "text", "text": request_func_input.prompt}] if request_func_input.multi_modal_content: content.append(request_func_input.multi_modal_content) payload = { - "model": request_func_input.model, + "model": request_func_input.model_name \ + if request_func_input.model_name else request_func_input.model, "messages": [ { "role": "user", @@ -324,10 +354,16 @@ async def async_request_openai_chat_completions( }, ], "temperature": 0.0, - "max_tokens": request_func_input.output_len, + "max_completion_tokens": request_func_input.output_len, "stream": True, - "ignore_eos": request_func_input.ignore_eos, + "stream_options": { + "include_usage": True, + }, } + if request_func_input.ignore_eos: + payload["ignore_eos"] = request_func_input.ignore_eos + if request_func_input.extra_body: + payload.update(request_func_input.extra_body) headers = { "Content-Type": "application/json", "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", @@ -349,19 +385,17 @@ async def async_request_openai_chat_completions( if not chunk_bytes: continue - chunk = remove_prefix(chunk_bytes.decode("utf-8"), - "data: ") - if chunk == "[DONE]": - latency = time.perf_counter() - st - else: + chunk = chunk_bytes.decode("utf-8").removeprefix( + "data: ") + if chunk != "[DONE]": timestamp = time.perf_counter() data = json.loads(chunk) - delta = data["choices"][0]["delta"] - if delta.get("content", None): + if choices := data.get("choices"): + content = choices[0]["delta"].get("content") # First token if ttft == 0.0: - ttft = time.perf_counter() - st + ttft = timestamp - st output.ttft = ttft # Decoding phase @@ -369,13 +403,16 @@ async def async_request_openai_chat_completions( output.itl.append(timestamp - most_recent_timestamp) - generated_text += delta["content"] + generated_text += content or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens") most_recent_timestamp = timestamp output.generated_text = generated_text output.success = True - output.latency = latency + output.latency = most_recent_timestamp - st else: output.error = response.reason or "" output.success = False @@ -389,14 +426,6 @@ async def async_request_openai_chat_completions( return output -# Since vllm must support Python 3.8, we can't use str.removeprefix(prefix) -# 
introduced in Python 3.9 -def remove_prefix(text: str, prefix: str) -> str: - if text.startswith(prefix): - return text[len(prefix):] - return text - - def get_model(pretrained_model_name_or_path: str) -> str: if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true': from modelscope import snapshot_download @@ -411,14 +440,35 @@ def get_model(pretrained_model_name_or_path: str) -> str: def get_tokenizer( - pretrained_model_name_or_path: str, trust_remote_code: bool + pretrained_model_name_or_path: str, + tokenizer_mode: str = "auto", + trust_remote_code: bool = False, + **kwargs, ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: if pretrained_model_name_or_path is not None and not os.path.exists( pretrained_model_name_or_path): pretrained_model_name_or_path = get_model( pretrained_model_name_or_path) - return AutoTokenizer.from_pretrained(pretrained_model_name_or_path, - trust_remote_code=trust_remote_code) + if tokenizer_mode == "slow": + if kwargs.get("use_fast", False): + raise ValueError( + "Cannot use the fast tokenizer in slow tokenizer mode.") + kwargs["use_fast"] = False + if tokenizer_mode == "mistral": + try: + from vllm.transformers_utils.tokenizer import MistralTokenizer + except ImportError as e: + raise ImportError("MistralTokenizer requires vllm package.\n" + "Please install it with `pip install vllm` " + "to use mistral tokenizer mode.") from e + return MistralTokenizer.from_pretrained( + str(pretrained_model_name_or_path)) + else: + return AutoTokenizer.from_pretrained( + pretrained_model_name_or_path, + trust_remote_code=trust_remote_code, + **kwargs, + ) ASYNC_REQUEST_FUNCS = { diff --git a/benchmarks/benchmark_guided.py b/benchmarks/benchmark_guided.py new file mode 100644 index 0000000000..2b41834baf --- /dev/null +++ b/benchmarks/benchmark_guided.py @@ -0,0 +1,495 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Benchmark guided decoding throughput.""" +import argparse +import dataclasses +import json +import os +import random +import time +from typing import List + +import datasets +import pandas as pd +import uvloop +from transformers import AutoTokenizer, PreTrainedTokenizerBase + +from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs +from vllm.entrypoints.openai.api_server import ( + build_async_engine_client_from_engine_args) +from vllm.sampling_params import GuidedDecodingParams +from vllm.utils import FlexibleArgumentParser, merge_async_iterators + + +@dataclasses.dataclass +class SampleRequest: + """A class representing a single inference request for benchmarking. + + Attributes: + prompt: The input text prompt for the model. + multi_modal_data: Optional dictionary containing multi-modal data (e.g. + images). + prompt_len: The length of the prompt in tokens. + expected_output_len: The expected length of the output in tokens. + """ + prompt: str + prompt_len: int + expected_output_len: int + schema: dict + structure_type: str = 'json' + completion: str = None + + +def run_vllm(requests: List[SampleRequest], + engine_args: EngineArgs, + n: int, + guided_decoding_rate: float = 1.0, + warmup: bool = False) -> float: + from vllm import LLM, SamplingParams + llm = LLM(**vars(engine_args)) + + # Add the requests to the engine. 
+ prompts: List[str] = [] + sampling_params: List[SamplingParams] = [] + # create a list containing random selected true or false + guided_decoding_req_idx = random.sample( + range(len(requests)), int(len(requests) * guided_decoding_rate)) + + if warmup: + print(">>>>> Running warmup prompt, for the first 5") + # We setup the first 5 requests to warmup FSM + # if using xgrammar dataset, we will skip warmup + warmup_requests = requests[:5] + for i, request in enumerate(warmup_requests): + prompts.append(request.prompt) + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=request.expected_output_len, + guided_decoding=GuidedDecodingParams(json=request.schema) + if guided_decoding_rate > 0 else None, + )) + llm.generate(prompts, sampling_params, use_tqdm=False) + + print(">>>>> Benchmark started...") + prompts = [] + sampling_params = [] + for i, request in enumerate(requests): + prompts.append(request.prompt) + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=request.expected_output_len, + guided_decoding=GuidedDecodingParams( + **{request.structure_type: request.schema}) + if i in guided_decoding_req_idx else None, + )) + + start = time.perf_counter() + outputs = llm.generate(prompts, sampling_params, use_tqdm=False) + ret = [] + for output, request in zip(outputs, requests): + generated_text = output.outputs[0].text + ret.append({ + "generated": generated_text, + "expected": request.completion + }) + end = time.perf_counter() + return end - start, ret + + +async def run_vllm_async( + requests: List[SampleRequest], + engine_args: AsyncEngineArgs, + n: int, + guided_decoding_rate: float = 1.0, + warmup: bool = False, + disable_frontend_multiprocessing: bool = False) -> float: + from vllm import SamplingParams + + async with build_async_engine_client_from_engine_args( + engine_args, disable_frontend_multiprocessing) as llm: + + # Add the requests to the engine. 
+ prompts: List[str] = [] + sampling_params: List[SamplingParams] = [] + guided_decoding_req_idx = random.sample( + range(len(requests)), int(len(requests) * guided_decoding_rate)) + + if warmup: + print(">>>>>> Running warmup prompt, for the first 5") + # We setup the first 5 requests to warmup FSM + # if using xgrammar dataset, we will skip warmup + warmup_requests = requests[:5] + for i, request in enumerate(warmup_requests): + prompts.append(request.prompt) + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=request.expected_output_len, + guided_decoding=GuidedDecodingParams( + json=request.schema) + if guided_decoding_rate > 0 else None, + )) + generators = [] + for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)): + generator = llm.generate(prompt, sp, request_id=f"test{i}") + generators.append(generator) + all_gens = merge_async_iterators(*generators) + async for i, res in all_gens: + pass + + print(">>>>> Benchmark started...") + prompts = [] + sampling_params = [] + for i, request in enumerate(requests): + prompts.append(request.prompt) + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=request.expected_output_len, + guided_decoding=GuidedDecodingParams(json=request.schema) + if i in guided_decoding_req_idx else None, + )) + + generators = [] + start_time = [] + latencies = [] + start = time.perf_counter() + for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)): + generator = llm.generate(prompt, sp, request_id=f"test{i}") + generators.append(generator) + start_time.append(time.perf_counter()) + latencies.append([]) + all_gens = merge_async_iterators(*generators) + generated_texts = [''] * len(requests) + async for i, res in all_gens: + generated_texts[i] = res.outputs[0].text + lat = time.perf_counter() - start_time[i] + latencies[i].append(lat) + ret = [{ + 'generated': gt, + 'expected': req.completion + } for gt, req in zip(generated_texts, requests)] + end = time.perf_counter() + first_latency = pd.Series([lat[0] * 1000 for lat in latencies]) + next_latency = pd.Series([(lat[-1] - lat[0]) / len(lat[1:]) * 1000 + for lat in latencies]) + return end - start, ret, (first_latency, next_latency) + + +def sample_requests(tokenizer: PreTrainedTokenizerBase, + args: argparse.Namespace) -> List[SampleRequest]: + if args.dataset == 'json': + if args.json_schema_path is None: + dir_path = os.path.dirname(os.path.realpath(__file__)) + args.json_schema_path = os.path.join(dir_path, + "structured_schemas", + "structured_schema_1.json") + with open(args.json_schema_path) as f: + schema = json.load(f) + prompt = f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501 + input_len = len(tokenizer(prompt).input_ids) + print(f"Input length of the prompt: {input_len} tokens") + requests = [ + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=schema, + structure_type=args.structure_type) + for _ in range(args.num_prompts) + ] + + elif args.dataset == "grammar": + schema = """ + ?start: select_statement + + ?select_statement: "SELECT " column_list " FROM " table_name + + ?column_list: column_name ("," column_name)* + + ?table_name: identifier + + ?column_name: identifier + + ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/ + """ + prompt = "Generate an SQL query to show the 'username' \ + and 'email' from the 'users' table." 
+ + input_len = len(tokenizer(prompt).input_ids) + print(f"Input length of the prompt: {input_len} tokens") + requests = [ + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=schema, + structure_type=args.structure_type) + for _ in range(args.num_prompts) + ] + + elif args.dataset == "regex": + regex = r"\w+@\w+\.com\n" + args.regex = regex + prompt = "Generate an email address for Alan Turing, \ + who works in Enigma. End in .com and new line. \ + Example result: alan.turing@enigma.com\n" + + input_len = len(tokenizer(prompt).input_ids) + print(f"Input length of the prompt: {input_len} tokens") + requests = [ + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=regex, + structure_type=args.structure_type) + for _ in range(args.num_prompts) + ] + + elif args.dataset == "choice": + choice = ["Positive", "Negative"] + args.choice = choice + prompt = "Classify this sentiment: vLLM is wonderful!" + input_len = len(tokenizer(prompt).input_ids) + print(f"Input length of the prompt: {input_len} tokens") + requests = [ + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=choice, + structure_type=args.structure_type) + for _ in range(args.num_prompts) + ] + + elif args.dataset == "xgrammar_bench": + args.warmup = False + requests: List[SampleRequest] = [] + dataset = datasets.load_dataset("NousResearch/json-mode-eval", + split="train") + print(f"dataset has {len(dataset)} entries") + len_dataset = len(dataset) + for data_point_idx in range(args.num_prompts): + idx = data_point_idx + while idx >= len_dataset: + idx -= len_dataset + schema = dataset["schema"][idx] + prompt = tokenizer.apply_chat_template(dataset["prompt"][idx], + tokenize=False) + input_len = len(tokenizer(prompt).input_ids) + completion = dataset["completion"][idx] + + requests.append( + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=schema, + completion=completion)) + + return requests + + +def evaluate(ret, args): + + def _eval_correctness_json(expected, actual): + # extract json string from string using regex + import re + actual = actual.replace('\n', '').replace(' ', '').strip() + try: + actual = re.search(r'\{.*\}', actual).group() + actual = json.loads(actual) + except Exception: + return False + + return True + + def _eval_correctness_choice(expected, actual): + return actual in args.choice + + def _eval_correctness_regex(expected, actual): + import re + return re.match(args.regex, actual) is not None + + def _eval_correctness(expected, actual): + if args.structure_type == 'json': + return _eval_correctness_json(expected, actual) + elif args.structure_type == 'regex': + return _eval_correctness_regex(expected, actual) + elif args.structure_type == 'choice': + return _eval_correctness_choice(expected, actual) + else: + return None + + scores = [] + for res in ret: + score = _eval_correctness(res['expected'], res['generated']) + res['correctness'] = score + scores.append(score) + + not_none_scores = [score for score in scores if score is not None] + + return (sum(not_none_scores) / len(not_none_scores) * + 100) if len(not_none_scores) > 0 else None + + +def main(args: argparse.Namespace): + print(args) + random.seed(args.seed) + + # async engine is working for 'regex', 'choice' and 'grammar' + if args.dataset == 'grammar': + args.structure_type = 'grammar' + args.async_engine = False + elif args.dataset == 'regex': + 
args.structure_type = 'regex' + args.async_engine = False + elif args.dataset == 'choice': + args.structure_type = 'choice' + args.async_engine = False + else: + args.structure_type = 'json' + + if args.no_guided_decoding: + args.guided_decoding_ratio = 0 + if args.save_results: + result_file_name = f'{args.guided_decoding_ratio}guided' + result_file_name += f"_{args.model.split('/')[-1]}" + result_file_name += f"_{args.dataset}" + result_file_name += f"_{args.num_prompts}" + result_file_name += f"_out{args.output_len}" + result_file_name += f"_async{args.async_engine}" + result_file_name += f"_warmup{args.warmup}" + result_file_name += f"_chunkedprefill{args.enable_chunked_prefill}" + result_file_name += ".txt" + else: + result_file_name = None + + # Synthesize a prompt with the given input length. + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer, trust_remote_code=args.trust_remote_code) + requests = sample_requests(tokenizer, args) + + if args.async_engine: + engine_args = AsyncEngineArgs.from_cli_args(args) + elapsed_time, ret, (first_latency, next_latency) = uvloop.run( + run_vllm_async(requests, engine_args, args.n, + args.guided_decoding_ratio, args.warmup, + args.disable_frontend_multiprocessing)) + else: + engine_args = EngineArgs.from_cli_args(args) + elapsed_time, ret = run_vllm(requests, engine_args, args.n, + args.guided_decoding_ratio, args.warmup) + first_latency, next_latency = None, None + + score = evaluate(ret, args) + total_num_tokens = sum(request.prompt_len + request.expected_output_len + for request in requests) + total_output_tokens = sum(request.expected_output_len + for request in requests) + if first_latency is not None: + latency_breakdown = "\nFirst token latency(msecs):\n" + latency_breakdown += f"{first_latency.describe()}" + latency_breakdown += "\nNext token latency(msecs):\n" + latency_breakdown += f"{next_latency.describe()}" + print( + f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " + f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " + f"{total_output_tokens / elapsed_time:.2f} output tokens/s", + f"Correct rate is {score} %", + f"{latency_breakdown if first_latency is not None else ''}") + + # Output JSON results if specified + if args.output_json or result_file_name: + results = { + "elapsed_time": elapsed_time, + "num_requests": len(requests), + "total_num_tokens": total_num_tokens, + "total_output_tokens": total_output_tokens, + "requests_per_second": len(requests) / elapsed_time, + "tokens_per_second": f"{total_num_tokens / elapsed_time:.2f}", + "output_tokens_per_second": + f"{total_output_tokens / elapsed_time:.2f}", + "correct_rate(%)": score + } + results = {"outputs": ret, **results} + if first_latency is not None: + results["first_token_latency(msecs)"] = first_latency.describe( + ).to_dict() + results["next_token_latency(msecs)"] = next_latency.describe( + ).to_dict() + if args.output_json: + with open(args.output_json, "w") as f: + json.dump(results, f, indent=4) + elif result_file_name: + with open(result_file_name, "w") as f: + json.dump(results, f, indent=4) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser(description="Benchmark guided decoding.") + parser = AsyncEngineArgs.add_cli_args(parser) + + parser.add_argument("--output-len", + type=int, + default=512, + help="Output length for each request. 
Overrides the " + "output length from the dataset.") + parser.add_argument( + "--dataset", + default='json', + choices=['json', 'grammar', 'regex', 'choice', 'xgrammar_bench']) + parser.add_argument("--json_schema_path", + type=str, + default=None, + help="Path to json schema.") + parser.add_argument("--n", + type=int, + default=1, + help="Number of generated sequences per prompt.") + parser.add_argument("--num-prompts", + type=int, + default=10, + help="Number of prompts to process.") + parser.add_argument( + '--output-json', + type=str, + default=None, + help='Path to save the throughput results in JSON format.') + parser.add_argument("--async-engine", + action='store_true', + default=False, + help="Use vLLM async engine rather than LLM class.") + parser.add_argument("--no-guided-decoding", + action='store_true', + default=False, + help="Whether to disable JSON decoding or not.") + parser.add_argument("--guided-decoding-ratio", + type=float, + default=1.0, + help="Ratio of Guided Decoding requests") + parser.add_argument("--disable-frontend-multiprocessing", + action='store_true', + default=False, + help="Disable decoupled async engine frontend.") + parser.add_argument("--warmup", + action="store_true", + default=False, + help="Run warmup prompts before benchmark.") + parser.add_argument("--save-results", + action="store_true", + default=False, + help="save output results.") + args = parser.parse_args() + if args.tokenizer is None: + args.tokenizer = args.model + main(args) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index ea1a7788f6..b041626550 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -1,52 +1,47 @@ +# SPDX-License-Identifier: Apache-2.0 """Benchmark the latency of processing a single batch of requests.""" + import argparse +import dataclasses import json +import os import time from pathlib import Path -from typing import List, Optional +from typing import Any, Dict, List, Optional import numpy as np import torch +from benchmark_utils import convert_to_pytorch_benchmark_format from tqdm import tqdm from vllm import LLM, SamplingParams -from vllm.engine.arg_utils import DEVICE_OPTIONS, EngineArgs +from vllm.engine.arg_utils import EngineArgs from vllm.inputs import PromptType -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.sampling_params import BeamSearchParams from vllm.utils import FlexibleArgumentParser +def save_to_pytorch_benchmark_format(args: argparse.Namespace, + results: Dict[str, Any]) -> None: + pt_records = convert_to_pytorch_benchmark_format( + args=args, + metrics={"latency": results["latencies"]}, + extra_info={k: results[k] + for k in ["avg_latency", "percentiles"]}) + if pt_records: + pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json" + with open(pt_file, "w") as f: + json.dump(pt_records, f) + + def main(args: argparse.Namespace): print(args) + engine_args = EngineArgs.from_cli_args(args) + # NOTE(woosuk): If the request cannot be processed in a single batch, # the engine will automatically process the request in multiple batches. 
- llm = LLM( - model=args.model, - speculative_model=args.speculative_model, - num_speculative_tokens=args.num_speculative_tokens, - speculative_draft_tensor_parallel_size=\ - args.speculative_draft_tensor_parallel_size, - tokenizer=args.tokenizer, - quantization=args.quantization, - tensor_parallel_size=args.tensor_parallel_size, - trust_remote_code=args.trust_remote_code, - dtype=args.dtype, - max_model_len=args.max_model_len, - enforce_eager=args.enforce_eager, - kv_cache_dtype=args.kv_cache_dtype, - quantization_param_path=args.quantization_param_path, - device=args.device, - ray_workers_use_nsight=args.ray_workers_use_nsight, - enable_chunked_prefill=args.enable_chunked_prefill, - download_dir=args.download_dir, - block_size=args.block_size, - gpu_memory_utilization=args.gpu_memory_utilization, - load_format=args.load_format, - distributed_executor_backend=args.distributed_executor_backend, - otlp_traces_endpoint=args.otlp_traces_endpoint, - enable_prefix_caching=args.enable_prefix_caching, - ) + llm = LLM(**dataclasses.asdict(engine_args)) sampling_params = SamplingParams( n=args.n, @@ -63,6 +58,21 @@ def main(args: argparse.Namespace): "prompt_token_ids": batch } for batch in dummy_prompt_token_ids.tolist()] + def llm_generate(): + if not args.use_beam_search: + llm.generate(dummy_prompts, + sampling_params=sampling_params, + use_tqdm=False) + else: + llm.beam_search( + dummy_prompts, + BeamSearchParams( + beam_width=args.n, + max_tokens=args.output_len, + ignore_eos=True, + ), + ) + def run_to_completion(profile_dir: Optional[str] = None): if profile_dir: with torch.profiler.profile( @@ -71,16 +81,13 @@ def run_to_completion(profile_dir: Optional[str] = None): torch.profiler.ProfilerActivity.CUDA, ], on_trace_ready=torch.profiler.tensorboard_trace_handler( - str(profile_dir))) as p: - llm.generate(dummy_prompts, - sampling_params=sampling_params, - use_tqdm=False) - print(p.key_averages()) + str(profile_dir)), + ) as p: + llm_generate() + print(p.key_averages().table(sort_by="self_cuda_time_total")) else: start_time = time.perf_counter() - llm.generate(dummy_prompts, - sampling_params=sampling_params, - use_tqdm=False) + llm_generate() end_time = time.perf_counter() latency = end_time - start_time return latency @@ -92,9 +99,8 @@ def run_to_completion(profile_dir: Optional[str] = None): if args.profile: profile_dir = args.profile_result_dir if not profile_dir: - profile_dir = Path( - "." 
- ) / "vllm_benchmark_result" / f"latency_result_{time.time()}" + profile_dir = (Path(".") / "vllm_benchmark_result" / + f"latency_result_{time.time()}") print(f"Profiling (results will be saved to '{profile_dir}')...") run_to_completion(profile_dir=profile_dir) return @@ -106,9 +112,9 @@ def run_to_completion(profile_dir: Optional[str] = None): latencies = np.array(latencies) percentages = [10, 25, 50, 75, 90, 99] percentiles = np.percentile(latencies, percentages) - print(f'Avg latency: {np.mean(latencies)} seconds') + print(f"Avg latency: {np.mean(latencies)} seconds") for percentage, percentile in zip(percentages, percentiles): - print(f'{percentage}% percentile latency: {percentile} seconds') + print(f"{percentage}% percentile latency: {percentile} seconds") # Output JSON results if specified if args.output_json: @@ -119,162 +125,52 @@ def run_to_completion(profile_dir: Optional[str] = None): } with open(args.output_json, "w") as f: json.dump(results, f, indent=4) + save_to_pytorch_benchmark_format(args, results) -if __name__ == '__main__': +if __name__ == "__main__": parser = FlexibleArgumentParser( - description='Benchmark the latency of processing a single batch of ' - 'requests till completion.') - parser.add_argument('--model', type=str, default='facebook/opt-125m') - parser.add_argument('--speculative-model', type=str, default=None) - parser.add_argument('--num-speculative-tokens', type=int, default=None) - parser.add_argument('--speculative-draft-tensor-parallel-size', - '-spec-draft-tp', - type=int, - default=None) - parser.add_argument('--tokenizer', type=str, default=None) - parser.add_argument('--quantization', - '-q', - choices=[*QUANTIZATION_METHODS, None], - default=None) - parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) - parser.add_argument('--input-len', type=int, default=32) - parser.add_argument('--output-len', type=int, default=128) - parser.add_argument('--batch-size', type=int, default=8) - parser.add_argument('--n', - type=int, - default=1, - help='Number of generated sequences per prompt.') - parser.add_argument('--use-beam-search', action='store_true') - parser.add_argument('--num-iters-warmup', - type=int, - default=10, - help='Number of iterations to run for warmup.') - parser.add_argument('--num-iters', - type=int, - default=30, - help='Number of iterations to run.') - parser.add_argument('--trust-remote-code', - action='store_true', - help='trust remote code from huggingface') + description="Benchmark the latency of processing a single batch of " + "requests till completion.") + parser.add_argument("--input-len", type=int, default=32) + parser.add_argument("--output-len", type=int, default=128) + parser.add_argument("--batch-size", type=int, default=8) parser.add_argument( - '--max-model-len', + "--n", type=int, - default=None, - help='Maximum length of a sequence (including prompt and output). ' - 'If None, will be derived from the model.') - parser.add_argument( - '--dtype', - type=str, - default='auto', - choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], - help='data type for model weights and activations. ' - 'The "auto" option will use FP16 precision ' - 'for FP32 and FP16 models, and BF16 precision ' - 'for BF16 models.') - parser.add_argument('--enforce-eager', - action='store_true', - help='enforce eager mode and disable CUDA graph') - parser.add_argument( - '--kv-cache-dtype', - type=str, - choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'], - default="auto", - help='Data type for kv cache storage. 
If "auto", will use model ' - 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ' - 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)') - parser.add_argument( - '--quantization-param-path', - type=str, - default=None, - help='Path to the JSON file containing the KV cache scaling factors. ' - 'This should generally be supplied, when KV cache dtype is FP8. ' - 'Otherwise, KV cache scaling factors default to 1.0, which may cause ' - 'accuracy issues. FP8_E5M2 (without scaling) is only supported on ' - 'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is ' - 'instead supported for common inference criteria.') - parser.add_argument( - '--profile', - action='store_true', - help='profile the generation process of a single batch') + default=1, + help="Number of generated sequences per prompt.", + ) + parser.add_argument("--use-beam-search", action="store_true") parser.add_argument( - '--profile-result-dir', - type=str, - default=None, - help=('path to save the pytorch profiler output. Can be visualized ' - 'with ui.perfetto.dev or Tensorboard.')) - parser.add_argument("--device", - type=str, - default="auto", - choices=DEVICE_OPTIONS, - help='device type for vLLM execution') - parser.add_argument('--block-size', + "--num-iters-warmup", + type=int, + default=10, + help="Number of iterations to run for warmup.", + ) + parser.add_argument("--num-iters", type=int, - default=16, - help='block size of key/value cache') - parser.add_argument( - '--enable-chunked-prefill', - action='store_true', - help='If True, the prefill requests can be chunked based on the ' - 'max_num_batched_tokens') - parser.add_argument("--enable-prefix-caching", - action='store_true', - help="Enable automatic prefix caching") + default=30, + help="Number of iterations to run.") parser.add_argument( - "--ray-workers-use-nsight", - action='store_true', - help="If specified, use nsight to profile ray workers", + "--profile", + action="store_true", + help="profile the generation process of a single batch", ) - parser.add_argument('--download-dir', - type=str, - default=None, - help='directory to download and load the weights, ' - 'default to the default cache dir of huggingface') - parser.add_argument( - '--output-json', - type=str, - default=None, - help='Path to save the latency results in JSON format.') - parser.add_argument('--gpu-memory-utilization', - type=float, - default=0.9, - help='the fraction of GPU memory to be used for ' - 'the model executor, which can range from 0 to 1.' - 'If unspecified, will use the default value of 0.9.') parser.add_argument( - '--load-format', + "--profile-result-dir", type=str, - default=EngineArgs.load_format, - choices=[ - 'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer', - 'bitsandbytes' - ], - help='The format of the model weights to load.\n\n' - '* "auto" will try to load the weights in the safetensors format ' - 'and fall back to the pytorch bin format if safetensors format ' - 'is not available.\n' - '* "pt" will load the weights in the pytorch bin format.\n' - '* "safetensors" will load the weights in the safetensors format.\n' - '* "npcache" will load the weights in pytorch format and store ' - 'a numpy cache to speed up the loading.\n' - '* "dummy" will initialize the weights with random values, ' - 'which is mainly for profiling.\n' - '* "tensorizer" will load the weights using tensorizer from ' - 'CoreWeave. 
See the Tensorize vLLM Model script in the Examples' - 'section for more information.\n' - '* "bitsandbytes" will load the weights using bitsandbytes ' - 'quantization.\n') - parser.add_argument( - '--distributed-executor-backend', - choices=['ray', 'mp'], default=None, - help='Backend to use for distributed serving. When more than 1 GPU ' - 'is used, will be automatically set to "ray" if installed ' - 'or "mp" (multiprocessing) otherwise.') + help=("path to save the pytorch profiler output. Can be visualized " + "with ui.perfetto.dev or Tensorboard."), + ) parser.add_argument( - '--otlp-traces-endpoint', + "--output-json", type=str, default=None, - help='Target URL to which OpenTelemetry traces will be sent.') + help="Path to save the latency results in JSON format.", + ) + + parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py new file mode 100644 index 0000000000..21480578ed --- /dev/null +++ b/benchmarks/benchmark_long_document_qa_throughput.py @@ -0,0 +1,184 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +Offline benchmark to test the long document QA throughput. + +Example usage: + # This workload samples 8 different prompts with a default input + # length of 20000 tokens, then replicates each prompt 2 times + # in random order. + python benchmark_long_document_qa_throughput.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --enable-prefix-caching \ + --num-documents 8 \ + --repeat-count 2 + +Commandline arguments: + --num-documents: The number of documents to sample prompts from. + + --document-length: The length of each document in tokens. + (Optional, default: 20000) + + --output-len: The number of tokens to generate for each prompt. + (Optional, default: 10) + + --repeat-count: The number of times to repeat each prompt. + (Optional, default: 2) + + --repeat-mode: The mode to repeat prompts. The supported modes are: + - 'random': shuffle the prompts randomly. (Default) + - 'tile': the entire prompt list is repeated in sequence. (Potentially + lowest cache hit) + - 'interleave': each prompt is repeated consecutively before + moving to the next element. (Highest cache hit) + + --shuffle-seed: Random seed when the repeat mode is "random". + (Optional, default: 0) + +In the meantime, it also supports all the vLLM engine args to initialize the +LLM engine. You can refer to the `vllm.engine.arg_utils.EngineArgs` for more +details. +""" + +import dataclasses +import random +import time + +from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import EngineArgs +from vllm.utils import FlexibleArgumentParser + + +def test_long_document_qa(llm=None, sampling_params=None, prompts=None): + """ + Test long document QA with the given prompts and sampling parameters. + Print the time spent in processing all the prompts. + + Args: + llm: The language model used for generating responses. + sampling_params: Sampling parameter used to generate the response. + prompts: A list of prompt strings to be processed by the LLM. + """ + start_time = time.time() + llm.generate(prompts, sampling_params=sampling_params) + end_time = time.time() + print(f"Time to execute all requests: {end_time - start_time:.4f} secs") + + +def repeat_prompts(prompts, repeat_count, mode: str): + """ + Repeat each prompt in the list for a specified number of times. + The order of prompts in the output list depends on the mode. 
+ + Args: + prompts: A list of prompts to be repeated. + repeat_count: The number of times each prompt is repeated. + mode: The mode of repetition. Supported modes are: + - 'random': Shuffle the prompts randomly after repetition. + - 'tile': Repeat the entire prompt list in sequence. + Example: [1, 2, 3] -> [1, 2, 3, 1, 2, 3]. + - 'interleave': Repeat each prompt consecutively before moving to + the next. Example: [1, 2, 3] -> [1, 1, 2, 2, 3, 3]. + + Returns: + A list of repeated prompts in the specified order. + + Raises: + ValueError: If an invalid mode is provided. + """ + print("Repeat mode: ", mode) + if mode == 'random': + repeated_prompts = prompts * repeat_count + random.shuffle(repeated_prompts) + return repeated_prompts + elif mode == 'tile': + return prompts * repeat_count + elif mode == 'interleave': + repeated_prompts = [] + for prompt in prompts: + repeated_prompts.extend([prompt] * repeat_count) + return repeated_prompts + else: + raise ValueError(f"Invalid mode: {mode}, only support " + "'random', 'tile', 'interleave'") + + +def main(args): + random.seed(args.shuffle_seed) + + # Prepare the prompts: + # we append the document id at the beginning to avoid any of the document + # being the prefix of other documents + prompts = [ + str(i) + ' '.join(['hi'] * args.document_length) + for i in range(args.num_documents) + ] + + prompts = repeat_prompts(prompts, args.repeat_count, mode=args.repeat_mode) + + warmup_prompts = [ + "This is warm up request " + str(i) + \ + ' '.join(['hi'] * args.document_length) + for i in range(args.num_documents)] + + # Create the LLM engine + engine_args = EngineArgs.from_cli_args(args) + llm = LLM(**dataclasses.asdict(engine_args)) + sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len) + + print("------warm up------") + test_long_document_qa( + llm=llm, + prompts=warmup_prompts, + sampling_params=sampling_params, + ) + + print("------start generating------") + test_long_document_qa( + llm=llm, + prompts=prompts, + sampling_params=sampling_params, + ) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description= + 'Benchmark the performance with or without automatic prefix caching.') + + parser.add_argument( + '--document-length', + type=int, + # Roughly the number of tokens for a system paper, + # excluding images + default=20000, + help='Range of input lengths for sampling prompts,' + 'specified as "min:max" (e.g., "128:256").') + + parser.add_argument('--num-documents', + type=int, + default=8, + help='Range of input lengths for sampling prompts,' + 'specified as "min:max" (e.g., "128:256").') + + parser.add_argument('--output-len', type=int, default=10) + + parser.add_argument('--repeat-count', + type=int, + default=2, + help='Number of times to repeat each prompt') + + parser.add_argument("--repeat-mode", + type=str, + default='random', + help='The mode to repeat prompts. The supported ' + 'modes are "random", "tile", and "interleave". 
+                        'See repeat_prompts() in the source code for details.')
+
+    parser.add_argument("--shuffle-seed",
+                        type=int,
+                        default=0,
+                        help='Random seed when the repeat mode is "random"')
+
+    parser = EngineArgs.add_cli_args(parser)
+    args = parser.parse_args()
+    main(args)
diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
index a354358e43..23822856b8 100644
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 Benchmark the efficiency of prefix caching.

@@ -10,7 +11,8 @@
     --model meta-llama/Llama-2-7b-chat-hf \
     --enable-prefix-caching \
     --num-prompts 1 \
-    --repeat-count 100
+    --repeat-count 100 \
+    --input-length-range 128:256

 ShareGPT example usage:
     # This command samples 20 prompts with input lengths
@@ -25,6 +27,7 @@
     --input-length-range 128:256
 """

+import dataclasses
 import json
 import random
 import time
@@ -33,6 +36,7 @@
 from transformers import PreTrainedTokenizerBase

 from vllm import LLM, SamplingParams
+from vllm.engine.arg_utils import EngineArgs
 from vllm.utils import FlexibleArgumentParser

 try:
@@ -52,13 +56,30 @@ def test_prefix(llm=None, sampling_params=None, prompts=None):
     print(f"cost time {end_time - start_time}")


-def sample_requests(
+@dataclasses.dataclass
+class Request:
+    prompt: str
+    prompt_len: int
+    output_len: int
+
+
+def sample_tokens(tokenizer: PreTrainedTokenizerBase,
+                  length: int) -> List[int]:
+    vocab = tokenizer.get_vocab()
+    # Remove the special tokens: compare token ids (the dict values),
+    # not token strings, against all_special_ids.
+    vocab = {
+        k: v
+        for k, v in vocab.items() if v not in tokenizer.all_special_ids
+    }
+    return random.choices(list(vocab.values()), k=length)
+
+
+def sample_requests_from_dataset(
     dataset_path: str,
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
     input_length_range: Tuple[int, int],
     fixed_output_len: Optional[int],
-) -> List[Tuple[str, int, int]]:
+) -> List[Request]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")

@@ -75,31 +96,55 @@ def sample_requests(
     random.shuffle(dataset)

     min_len, max_len = input_length_range
+    assert min_len >= 0 and max_len >= min_len, "invalid input_length_range"

     # Filter out sequences that are too long or too short
-    filtered_dataset: List[Tuple[str, int, int]] = []
+    filtered_requests: List[Request] = []
+
     for i in range(len(dataset)):
-        if len(filtered_dataset) == num_requests:
+        if len(filtered_requests) == num_requests:
             break

         # Tokenize the prompts and completions.
-        prompt = dataset[i][0]
-        prompt_token_ids = tokenizer(prompt).input_ids
+        prompt_token_ids = tokenizer(dataset[i][0]).input_ids
+        prompt = tokenizer.decode(prompt_token_ids)
         completion = dataset[i][1]
         completion_token_ids = tokenizer(completion).input_ids
         prompt_len = len(prompt_token_ids)
-        output_len = len(completion_token_ids
-                         ) if fixed_output_len is None else fixed_output_len
-        if prompt_len < 4 or output_len < 4:
-            # Prune too short sequences.
-            continue
+        output_len = (len(completion_token_ids)
+                      if fixed_output_len is None else fixed_output_len)
         if min_len <= prompt_len <= max_len:
-            filtered_dataset.append((prompt, prompt_len, output_len))
+            filtered_requests.append(Request(prompt, prompt_len, output_len))
+
+    return filtered_requests
+
+
+def sample_requests_from_random(
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    input_length_range: Tuple[int, int],
+    fixed_output_len: Optional[int],
+    prefix_len: int,
+) -> List[Request]:

-    return filtered_dataset
+    requests = []
+    prefix_token_ids = sample_tokens(tokenizer, prefix_len)
+    min_len, max_len = input_length_range
+
+    for i in range(num_requests):
+        unique_part_token_ids = sample_tokens(
+            tokenizer,
+            random.randint(min_len - prefix_len, max_len - prefix_len))
+        prompt_token_ids = prefix_token_ids + unique_part_token_ids
+        prompt = tokenizer.decode(prompt_token_ids)
+        prompt_len = len(prompt_token_ids)
+        assert (min_len <= prompt_len <= max_len
+                ), f"prompt_len {prompt_len} out of range {min_len}:{max_len}"
+        requests.append(Request(prompt, prompt_len, fixed_output_len))
+    return requests


-def repeat_and_sort_requests(requests: List[Tuple[str, int, int]],
+def repeat_and_sort_requests(requests: List[Request],
                              repeat_count: int,
                              sort: bool = False) -> List[str]:
     repeated_requests = requests * repeat_count
@@ -107,7 +152,7 @@ def repeat_and_sort_requests(requests: List[Tuple[str, int, int]],
-        repeated_requests.sort(key=lambda x: x[1])
+        repeated_requests.sort(key=lambda x: x.prompt_len)
     else:
         random.shuffle(repeated_requests)
-    return [req[0] for req in repeated_requests]
+    return [req.prompt for req in repeated_requests]


 def main(args):
@@ -115,9 +160,12 @@ def main(args):
     input_length_range = tuple(map(int, args.input_length_range.split(':')))
     random.seed(args.seed)
     if args.dataset_path is not None:
-        print(f"Start to sample {args.num_prompts} prompts"
-              "from {args.dataset_path}")
-        filtered_datasets = sample_requests(
+        if args.prefix_len > 0:
+            raise ValueError("prefix-len is not supported when "
+                             "dataset-path is provided.")
+        print(f"Start to sample {args.num_prompts} prompts "
+              f"from {args.dataset_path}")
+        filtered_requests = sample_requests_from_dataset(
             dataset_path=args.dataset_path,
             num_requests=args.num_prompts,
             tokenizer=tokenizer,
@@ -125,31 +173,34 @@ def main(args):
             fixed_output_len=args.output_len,
         )
     else:
-        prompt_len = len(tokenizer(PROMPT).input_ids)
-        filtered_datasets = [(PROMPT, prompt_len, args.output_len)
-                             ] * args.num_prompts
+        print(f"Start to sample {args.num_prompts} prompts from random")
+        filtered_requests = sample_requests_from_random(
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            input_length_range=input_length_range,
+            fixed_output_len=args.output_len,
+            prefix_len=args.prefix_len,
+        )
+
+    # Print some helpful stats of the requests.
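# Illustrative aside: the P50 printed below is the upper median,
# sorted(prompt_lens)[len(prompt_lens) // 2]. An interpolating alternative,
# assuming numpy were imported as np, would be:
#     p50 = float(np.percentile(prompt_lens, 50))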
+ print(f"Sampled {len(filtered_requests)} requests.") + prompt_lens = [req.prompt_len for req in filtered_requests] + print(f"Average input length: {sum(prompt_lens) / len(prompt_lens)}") + print(f"P50 input length: {sorted(prompt_lens)[len(prompt_lens) // 2]}") + print(f"Min Prompt Length: {min(prompt_lens)}") + print(f"Max Prompt Length: {max(prompt_lens)}") + + engine_args = EngineArgs.from_cli_args(args) - llm = LLM(model=args.model, - tokenizer_mode='auto', - trust_remote_code=True, - enforce_eager=True, - tensor_parallel_size=args.tensor_parallel_size, - enable_prefix_caching=args.enable_prefix_caching) + llm = LLM(**dataclasses.asdict(engine_args)) sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len) - print("Testing filtered datasets") - prompts = repeat_and_sort_requests(filtered_datasets, + print("Testing filtered requests") + prompts = repeat_and_sort_requests(filtered_requests, repeat_count=args.repeat_count, sort=args.sort) - print("------warm up------") - test_prefix( - llm=llm, - prompts=prompts, - sampling_params=sampling_params, - ) - print("------start generating------") test_prefix( llm=llm, @@ -162,37 +213,37 @@ def main(args): parser = FlexibleArgumentParser( description= 'Benchmark the performance with or without automatic prefix caching.') - parser.add_argument('--model', - type=str, - default='baichuan-inc/Baichuan2-13B-Chat') parser.add_argument("--dataset-path", type=str, default=None, help="Path to the dataset.") - parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) parser.add_argument('--output-len', type=int, default=10) - parser.add_argument('--enable-prefix-caching', - action='store_true', - help='enable prefix caching') parser.add_argument('--num-prompts', type=int, - default=1, + required=True, help="Number of the prompts sampled from dataset") parser.add_argument('--repeat-count', type=int, - default=100, + default=1, help='Number of times to repeat each prompt') parser.add_argument('--sort', action='store_true', help='Sort prompts by input length') parser.add_argument('--input-length-range', type=str, - default='128:256', + required=True, help='Range of input lengths for sampling prompts,' 'specified as "min:max" (e.g., "128:256").') - parser.add_argument("--seed", - type=int, - default=0, - help='Random seed for reproducibility') + parser.add_argument( + "--prefix-len", + type=int, + default=0, + help="Specifies the length of a common prefix to be " + "added to the input prompt. The input-length-range will " + "subtract this length when filtering prompts. 
Only used " + "when dataset-path is not provided.", + ) + + parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py index 8843e3a927..a32065e4e7 100644 --- a/benchmarks/benchmark_prioritization.py +++ b/benchmarks/benchmark_prioritization.py @@ -1,5 +1,7 @@ +# SPDX-License-Identifier: Apache-2.0 """Benchmark offline prioritization.""" import argparse +import dataclasses import json import random import time @@ -7,7 +9,8 @@ from transformers import AutoTokenizer, PreTrainedTokenizerBase -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.engine.arg_utils import EngineArgs +from vllm.utils import FlexibleArgumentParser def sample_requests( @@ -62,46 +65,11 @@ def sample_requests( def run_vllm( requests: List[Tuple[str, int, int]], - model: str, - tokenizer: str, - quantization: Optional[str], - tensor_parallel_size: int, - seed: int, n: int, - trust_remote_code: bool, - dtype: str, - max_model_len: Optional[int], - enforce_eager: bool, - kv_cache_dtype: str, - quantization_param_path: Optional[str], - device: str, - enable_prefix_caching: bool, - enable_chunked_prefill: bool, - max_num_batched_tokens: int, - gpu_memory_utilization: float = 0.9, - download_dir: Optional[str] = None, + engine_args: EngineArgs, ) -> float: from vllm import LLM, SamplingParams - llm = LLM( - model=model, - tokenizer=tokenizer, - quantization=quantization, - tensor_parallel_size=tensor_parallel_size, - seed=seed, - trust_remote_code=trust_remote_code, - dtype=dtype, - max_model_len=max_model_len, - gpu_memory_utilization=gpu_memory_utilization, - enforce_eager=enforce_eager, - kv_cache_dtype=kv_cache_dtype, - quantization_param_path=quantization_param_path, - device=device, - enable_prefix_caching=enable_prefix_caching, - download_dir=download_dir, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - disable_log_stats=False, - ) + llm = LLM(**dataclasses.asdict(engine_args)) # Add the requests to the engine. prompts = [] @@ -142,16 +110,8 @@ def main(args: argparse.Namespace): args.output_len) if args.backend == "vllm": - elapsed_time = run_vllm(requests, args.model, args.tokenizer, - args.quantization, args.tensor_parallel_size, - args.seed, args.n, args.trust_remote_code, - args.dtype, args.max_model_len, - args.enforce_eager, args.kv_cache_dtype, - args.quantization_param_path, args.device, - args.enable_prefix_caching, - args.enable_chunked_prefill, - args.max_num_batched_tokens, - args.gpu_memory_utilization, args.download_dir) + elapsed_time = run_vllm(requests, args.n, + EngineArgs.from_cli_args(args)) else: raise ValueError(f"Unknown backend: {args.backend}") total_num_tokens = sum(prompt_len + output_len @@ -173,7 +133,7 @@ def main(args: argparse.Namespace): if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Benchmark the throughput.") + parser = FlexibleArgumentParser(description="Benchmark the throughput.") parser.add_argument("--backend", type=str, choices=["vllm", "hf", "mii"], @@ -191,13 +151,6 @@ def main(args: argparse.Namespace): default=None, help="Output length for each request. 
Overrides the " "output length from the dataset.") - parser.add_argument("--model", type=str, default="facebook/opt-125m") - parser.add_argument("--tokenizer", type=str, default=None) - parser.add_argument('--quantization', - '-q', - choices=[*QUANTIZATION_METHODS, None], - default=None) - parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1) parser.add_argument("--n", type=int, default=1, @@ -206,81 +159,13 @@ def main(args: argparse.Namespace): type=int, default=200, help="Number of prompts to process.") - parser.add_argument("--seed", type=int, default=0) - parser.add_argument('--trust-remote-code', - action='store_true', - help='trust remote code from huggingface') - parser.add_argument( - '--max-model-len', - type=int, - default=None, - help='Maximum length of a sequence (including prompt and output). ' - 'If None, will be derived from the model.') - parser.add_argument( - '--dtype', - type=str, - default='auto', - choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], - help='data type for model weights and activations. ' - 'The "auto" option will use FP16 precision ' - 'for FP32 and FP16 models, and BF16 precision ' - 'for BF16 models.') - parser.add_argument('--gpu-memory-utilization', - type=float, - default=0.9, - help='the fraction of GPU memory to be used for ' - 'the model executor, which can range from 0 to 1.' - 'If unspecified, will use the default value of 0.9.') - parser.add_argument("--enforce-eager", - action="store_true", - help="enforce eager execution") - parser.add_argument( - '--kv-cache-dtype', - type=str, - choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'], - default="auto", - help='Data type for kv cache storage. If "auto", will use model ' - 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ' - 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)') - parser.add_argument( - '--quantization-param-path', - type=str, - default=None, - help='Path to the JSON file containing the KV cache scaling factors. ' - 'This should generally be supplied, when KV cache dtype is FP8. ' - 'Otherwise, KV cache scaling factors default to 1.0, which may cause ' - 'accuracy issues. FP8_E5M2 (without scaling) is only supported on ' - 'cuda version greater than 11.8. 
On ROCm (AMD GPU), FP8_E4M3 is ' - 'instead supported for common inference criteria.') - parser.add_argument( - "--device", - type=str, - default="cuda", - choices=["cuda", "cpu"], - help='device type for vLLM execution, supporting CUDA and CPU.') - parser.add_argument( - "--enable-prefix-caching", - action='store_true', - help="enable automatic prefix caching for vLLM backend.") - parser.add_argument("--enable-chunked-prefill", - action='store_true', - help="enable chunked prefill for vLLM backend.") - parser.add_argument('--max-num-batched-tokens', - type=int, - default=None, - help='maximum number of batched tokens per ' - 'iteration') - parser.add_argument('--download-dir', - type=str, - default=None, - help='directory to download and load the weights, ' - 'default to the default cache dir of huggingface') parser.add_argument( '--output-json', type=str, default=None, help='Path to save the throughput results in JSON format.') + parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 1381004c9f..9760737cce 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 r"""Benchmark online serving throughput. On the server side, run one of the following commands: @@ -25,6 +26,7 @@ import argparse import asyncio import base64 +import gc import io import json import os @@ -36,6 +38,7 @@ from typing import Any, AsyncGenerator, Collection, Dict, List, Optional, Tuple import numpy as np +import pandas as pd from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, RequestFuncOutput) from datasets import load_dataset @@ -53,6 +56,10 @@ except ImportError: from argparse import ArgumentParser as FlexibleArgumentParser +from benchmark_utils import convert_to_pytorch_benchmark_format + +MILLISECONDS_TO_SECONDS_CONVERSION = 1000 + @dataclass class BenchmarkMetrics: @@ -60,6 +67,7 @@ class BenchmarkMetrics: total_input: int total_output: int request_throughput: float + request_goodput: float output_throughput: float total_token_throughput: float mean_ttft_ms: float @@ -126,6 +134,35 @@ def sample_sharegpt_requests( return filtered_dataset +def sample_burstgpt_requests( + dataset_path: str, + num_requests: int, + random_seed: int, + tokenizer: PreTrainedTokenizerBase, +) -> List[Tuple[str, int, int, None]]: + df = pd.read_csv(dataset_path) + gpt4_df = df[df["Model"] == "GPT-4"] + # Remove the failed requests (i.e., response length is 0) + gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0] + # Randomly sample num_requests from the dataset + if num_requests <= len(gpt4_df): + gpt4_df = gpt4_df.sample(n=num_requests, random_state=random_seed) + else: + gpt4_df = gpt4_df.sample(n=num_requests, + random_state=random_seed, + replace=True) + # Convert the dataframe to a list of tuples + dataset = gpt4_df.values.tolist() + input_requests = [] + for i in range(num_requests): + input_len = int(dataset[i][2]) + output_len = int(dataset[i][3]) + prompt = tokenizer.decode([(i + j) % tokenizer.vocab_size + for j in range(input_len)]) + input_requests.append((prompt, input_len, output_len, None)) + return input_requests + + def sample_sonnet_requests( dataset_path: str, num_requests: int, @@ -196,15 +233,72 @@ def sample_sonnet_requests( return sampled_requests +def sample_vision_arena_requests( + dataset, + num_requests: int, + tokenizer: PreTrainedTokenizerBase, + 
fixed_output_len: Optional[int] = None, +) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]: + sampled_requests: List[Tuple[str, int, int, Dict[str, + Collection[str]]]] = [] + for data in dataset: + if len(sampled_requests) == num_requests: + break + + prompt = data["turns"][0][0]['content'] + + prompt_token_ids = tokenizer(prompt).input_ids + if fixed_output_len is None: + # Default max output len is set to 128 + print("--hf-output-len is not provided. Using default value 128.") + fixed_output_len = 128 + + prompt_len = len(prompt_token_ids) + output_len = fixed_output_len + + assert isinstance( + data["images"][0], + Image), ("Input image format must be `PIL.Image.Image`, " + f"given {type(data['image'])}.") + image: Image = data["images"][0] + image = image.convert("RGB") + image_data = io.BytesIO() + image.save(image_data, format='JPEG') + image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8") + mm_content = { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{image_base64}" + }, + } + + sampled_requests.append((prompt, prompt_len, output_len, mm_content)) + + return sampled_requests + + def sample_hf_requests( dataset_path: str, - dataset_subset: str, + dataset_subset: Optional[str], dataset_split: str, num_requests: int, tokenizer: PreTrainedTokenizerBase, random_seed: int, fixed_output_len: Optional[int] = None, ) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]: + + # Special case for vision_arena dataset + if dataset_path == 'lmarena-ai/vision-arena-bench-v0.1' \ + and dataset_subset is None: + assert dataset_split == "train" + dataset = load_dataset(dataset_path, + name=dataset_subset, + split=dataset_split, + streaming=True) + dataset = dataset.shuffle(seed=random_seed) + return sample_vision_arena_requests(dataset, num_requests, tokenizer, + fixed_output_len) + dataset = load_dataset(dataset_path, name=dataset_subset, split=dataset_split, @@ -248,6 +342,19 @@ def sample_hf_requests( "url": f"data:image/jpeg;base64,{image_base64}" }, } + elif "image" in data and isinstance(data["image"], str): + if (data["image"].startswith("http://") or \ + data["image"].startswith("file://")): + image_url = data["image"] + else: + image_url = f"file://{data['image']}" + + mm_content = { + "type": "image_url", + "image_url": { + "url": image_url + }, + } else: mm_content = None @@ -294,8 +401,33 @@ def sample_random_requests( async def get_request( input_requests: List[Tuple[str, int, int]], request_rate: float, + burstiness: float = 1.0, ) -> AsyncGenerator[Tuple[str, int, int], None]: + """ + Asynchronously generates requests at a specified rate + with OPTIONAL burstiness. + + Args: + input_requests: + A list of input requests, each represented as a tuple. + request_rate: + The rate at which requests are generated (requests/s). + burstiness (optional): + The burstiness factor of the request generation. + Only takes effect when request_rate is not inf. + Default value is 1, which follows a Poisson process. + Otherwise, the request intervals follow a gamma distribution. + A lower burstiness value (0 < burstiness < 1) results + in more bursty requests, while a higher burstiness value + (burstiness > 1) results in a more uniform arrival of requests. + """ input_requests = iter(input_requests) + + # Calculate scale parameter theta to maintain the desired request_rate. 
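# Aside: why this theta preserves the average rate. A gamma distribution
# with shape k and scale theta has mean k * theta; with k = burstiness and
# theta = 1 / (request_rate * burstiness), the mean interval is exactly
# 1 / request_rate for any positive burstiness, so burstiness reshapes the
# variance of the arrivals without changing their average rate. A minimal
# standalone sketch (hypothetical values, assumes numpy):
#
#     import numpy as np
#     rate, burstiness = 4.0, 0.5
#     theta = 1.0 / (rate * burstiness)
#     intervals = np.random.gamma(shape=burstiness, scale=theta, size=200_000)
#     print(intervals.mean())  # ~0.25, i.e. 1 / rate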
+ assert burstiness > 0, ( + f"A positive burstiness factor is expected, but given {burstiness}.") + theta = 1.0 / (request_rate * burstiness) + for request in input_requests: yield request @@ -303,8 +435,9 @@ async def get_request( # If the request rate is infinity, then we don't need to wait. continue - # Sample the request interval from the exponential distribution. - interval = np.random.exponential(1.0 / request_rate) + # Sample the request interval from the gamma distribution. + # If burstiness is 1, it follows exponential distribution. + interval = np.random.gamma(shape=burstiness, scale=theta) # The next request will be sent after the interval. await asyncio.sleep(interval) @@ -316,28 +449,39 @@ def calculate_metrics( tokenizer: PreTrainedTokenizerBase, selected_percentile_metrics: List[str], selected_percentiles: List[float], + goodput_config_dict: Dict[str, float], ) -> Tuple[BenchmarkMetrics, List[int]]: actual_output_lens: List[int] = [] total_input = 0 completed = 0 + good_completed = 0 itls: List[float] = [] tpots: List[float] = [] + all_tpots: List[float] = [] ttfts: List[float] = [] e2els: List[float] = [] for i in range(len(outputs)): if outputs[i].success: - # We use the tokenizer to count the number of output tokens for all - # serving backends instead of looking at len(outputs[i].itl) since - # multiple output tokens may be bundled together - # Note : this may inflate the output token count slightly - output_len = len( - tokenizer(outputs[i].generated_text, - add_special_tokens=False).input_ids) + output_len = outputs[i].output_tokens + + if output_len is None: + # We use the tokenizer to count the number of output tokens + # for some serving backends instead of looking at + # len(outputs[i].itl) since multiple output tokens may be + # bundled together + # Note : this may inflate the output token count slightly + output_len = len( + tokenizer(outputs[i].generated_text, + add_special_tokens=False).input_ids) actual_output_lens.append(output_len) total_input += input_requests[i][1] + tpot = 0 if output_len > 1: - tpots.append( - (outputs[i].latency - outputs[i].ttft) / (output_len - 1)) + latency_minus_ttft = outputs[i].latency - outputs[i].ttft + tpot = latency_minus_ttft / (output_len - 1) + tpots.append(tpot) + # Note: if output_len <= 1, we regard tpot as 0 for goodput + all_tpots.append(tpot) itls += outputs[i].itl ttfts.append(outputs[i].ttft) e2els.append(outputs[i].latency) @@ -345,6 +489,28 @@ def calculate_metrics( else: actual_output_lens.append(0) + if goodput_config_dict: + valid_metrics = [] + slo_values = [] + + if "ttft" in goodput_config_dict: + valid_metrics.append(ttfts) + slo_values.append(goodput_config_dict["ttft"] / + MILLISECONDS_TO_SECONDS_CONVERSION) + if "tpot" in goodput_config_dict: + valid_metrics.append(all_tpots) + slo_values.append(goodput_config_dict["tpot"] / + MILLISECONDS_TO_SECONDS_CONVERSION) + if "e2el" in goodput_config_dict: + valid_metrics.append(e2els) + slo_values.append(goodput_config_dict["e2el"] / + MILLISECONDS_TO_SECONDS_CONVERSION) + + for req_metric in zip(*valid_metrics): + is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)]) + if is_good_req: + good_completed += 1 + if completed == 0: warnings.warn( "All requests failed. 
This is likely due to a misconfiguration " @@ -355,6 +521,7 @@ def calculate_metrics( total_input=total_input, total_output=sum(actual_output_lens), request_throughput=completed / dur_s, + request_goodput=good_completed / dur_s, output_throughput=sum(actual_output_lens) / dur_s, total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s, mean_ttft_ms=np.mean(ttfts or 0) * @@ -373,9 +540,9 @@ def calculate_metrics( median_itl_ms=np.median(itls or 0) * 1000, percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles], - mean_e2el_ms=np.median(e2els or 0) * 1000, + mean_e2el_ms=np.mean(e2els or 0) * 1000, std_e2el_ms=np.std(e2els or 0) * 1000, - median_e2el_ms=np.mean(e2els or 0) * 1000, + median_e2el_ms=np.median(e2els or 0) * 1000, percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles], ) @@ -388,16 +555,21 @@ async def benchmark( api_url: str, base_url: str, model_id: str, + model_name: str, tokenizer: PreTrainedTokenizerBase, input_requests: List[Tuple[str, int, int]], logprobs: Optional[int], best_of: int, request_rate: float, + burstiness: float, disable_tqdm: bool, profile: bool, selected_percentile_metrics: List[str], selected_percentiles: List[str], ignore_eos: bool, + goodput_config_dict: Dict[str, float], + max_concurrency: Optional[int], + lora_modules: Optional[List[str]], ): if backend in ASYNC_REQUEST_FUNCS: request_func = ASYNC_REQUEST_FUNCS[backend] @@ -413,6 +585,7 @@ async def benchmark( "Multi-modal content is only supported on 'openai-chat' backend.") test_input = RequestFuncInput( model=model_id, + model_name=model_name, prompt=test_prompt, api_url=api_url, prompt_len=test_prompt_len, @@ -422,6 +595,7 @@ async def benchmark( multi_modal_content=test_mm_content, ignore_eos=ignore_eos, ) + test_output = await request_func(request_func_input=test_input) if not test_output.success: raise ValueError( @@ -430,9 +604,15 @@ async def benchmark( else: print("Initial test run completed. Starting main benchmark run...") + if lora_modules: + # For each input request, choose a LoRA module at random. + lora_modules = iter( + [random.choice(lora_modules) for _ in range(len(input_requests))]) + if profile: print("Starting profiler...") profile_input = RequestFuncInput(model=model_id, + model_name=model_name, prompt=test_prompt, api_url=base_url + "/start_profile", prompt_len=test_prompt_len, @@ -445,15 +625,43 @@ async def benchmark( if profile_output.success: print("Profiler started") + if burstiness == 1.0: + distribution = "Poisson process" + else: + distribution = "Gamma distribution" + print(f"Traffic request rate: {request_rate}") + print(f"Burstiness factor: {burstiness} ({distribution})") + print(f"Maximum request concurrency: {max_concurrency}") pbar = None if disable_tqdm else tqdm(total=len(input_requests)) + # This can be used once the minimum Python version is 3.10 or higher, + # and it will simplify the code in limited_request_func. 
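# Aside: a minimal, self-contained sketch of the semaphore pattern used
# below (plain asyncio; `work`, `run`, and the timings are hypothetical).
# At most `max_concurrency` coroutines hold the semaphore at once; the
# rest queue on `async with`:
#
#     import asyncio
#
#     async def work(sem: asyncio.Semaphore, i: int) -> None:
#         async with sem:
#             await asyncio.sleep(0.1)  # stands in for one HTTP request
#
#     async def run(n_jobs: int = 8, max_concurrency: int = 2) -> None:
#         sem = asyncio.Semaphore(max_concurrency)
#         await asyncio.gather(*(work(sem, i) for i in range(n_jobs)))
#
#     asyncio.run(run())  # ~0.4 s total: 8 jobs, 2 in flight at a time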
+ # semaphore = (asyncio.Semaphore(max_concurrency) + # if max_concurrency else contextlib.nullcontext()) + semaphore = (asyncio.Semaphore(max_concurrency) + if max_concurrency else None) + + async def limited_request_func(request_func_input, pbar): + if semaphore is None: + return await request_func(request_func_input=request_func_input, + pbar=pbar) + async with semaphore: + return await request_func(request_func_input=request_func_input, + pbar=pbar) + benchmark_start_time = time.perf_counter() tasks: List[asyncio.Task] = [] - async for request in get_request(input_requests, request_rate): + async for request in get_request(input_requests, request_rate, burstiness): prompt, prompt_len, output_len, mm_content = request - request_func_input = RequestFuncInput(model=model_id, + req_model_id, req_model_name = model_id, model_name + if lora_modules: + req_lora_module = next(lora_modules) + req_model_id, req_model_name = req_lora_module, req_lora_module + + request_func_input = RequestFuncInput(model=req_model_id, + model_name=req_model_name, prompt=prompt, api_url=api_url, prompt_len=prompt_len, @@ -464,8 +672,8 @@ async def benchmark( ignore_eos=ignore_eos) tasks.append( asyncio.create_task( - request_func(request_func_input=request_func_input, - pbar=pbar))) + limited_request_func(request_func_input=request_func_input, + pbar=pbar))) outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) if profile: @@ -495,6 +703,7 @@ async def benchmark( tokenizer=tokenizer, selected_percentile_metrics=selected_percentile_metrics, selected_percentiles=selected_percentiles, + goodput_config_dict=goodput_config_dict, ) print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) @@ -506,6 +715,9 @@ async def benchmark( metrics.total_output)) print("{:<40} {:<10.2f}".format("Request throughput (req/s):", metrics.request_throughput)) + if goodput_config_dict: + print("{:<40} {:<10.2f}".format("Request goodput (req/s):", + metrics.request_goodput)) print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", metrics.output_throughput)) print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", @@ -517,6 +729,8 @@ async def benchmark( "total_input_tokens": metrics.total_input, "total_output_tokens": metrics.total_output, "request_throughput": metrics.request_throughput, + "request_goodput:": + metrics.request_goodput if goodput_config_dict else None, "output_throughput": metrics.output_throughput, "total_token_throughput": metrics.total_token_throughput, "input_lens": [output.prompt_len for output in outputs], @@ -570,6 +784,67 @@ def process_one_metric( return result +def check_goodput_args(args): + # Check and parse goodput arguments + goodput_config_dict = {} + VALID_NAMES = ["ttft", "tpot", "e2el"] + if args.goodput: + goodput_config_dict = parse_goodput(args.goodput) + for slo_name, slo_val in goodput_config_dict.items(): + if slo_name not in VALID_NAMES: + raise ValueError( + f"Invalid metric name found, {slo_name}: {slo_val}. " + "The service level objective name should be one of " + f"{str(VALID_NAMES)}. ") + if slo_val < 0: + raise ValueError( + f"Invalid value found, {slo_name}: {slo_val}. 
" + "The service level objective value should be " + "non-negative.") + return goodput_config_dict + + +def parse_goodput(slo_pairs): + goodput_config_dict = {} + try: + for slo_pair in slo_pairs: + slo_name, slo_val = slo_pair.split(":") + goodput_config_dict[slo_name] = float(slo_val) + except ValueError as err: + raise argparse.ArgumentTypeError( + "Invalid format found for service level objectives. " + "Specify service level objectives for goodput as \"KEY:VALUE\" " + "pairs, where the key is a metric name, and the value is a " + "number in milliseconds.") from err + return goodput_config_dict + + +def save_to_pytorch_benchmark_format(args: argparse.Namespace, + results: Dict[str, Any], + file_name: str) -> None: + metrics = [ + "median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms", + "mean_tpot_ms", "median_tpot_ms", "std_tpot_ms", "p99_tpot_ms", + "median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms" + ] + # These raw data might be useful, but they are rather big. They can be added + # later if needed + ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"] + pt_records = convert_to_pytorch_benchmark_format( + args=args, + metrics={k: [results[k]] + for k in metrics}, + extra_info={ + k: results[k] + for k in results if k not in metrics and k not in ignored_metrics + }) + if pt_records: + # Don't use json suffix here as we don't want CI to pick it up + pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json" + with open(pt_file, "w") as f: + json.dump(pt_records, f) + + def main(args: argparse.Namespace): print(args) random.seed(args.seed) @@ -577,7 +852,9 @@ def main(args: argparse.Namespace): backend = args.backend model_id = args.model + model_name = args.served_model_name tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model + tokenizer_mode = args.tokenizer_mode if args.base_url is not None: api_url = f"{args.base_url}{args.endpoint}" @@ -587,6 +864,7 @@ def main(args: argparse.Namespace): base_url = f"http://{args.host}:{args.port}" tokenizer = get_tokenizer(tokenizer_id, + tokenizer_mode=tokenizer_mode, trust_remote_code=args.trust_remote_code) if args.dataset is not None: @@ -610,6 +888,14 @@ def main(args: argparse.Namespace): fixed_output_len=args.sharegpt_output_len, ) + elif args.dataset_name == "burstgpt": + input_requests = sample_burstgpt_requests( + dataset_path=args.dataset_path, + num_requests=args.num_prompts, + random_seed=args.seed, + tokenizer=tokenizer, + ) + elif args.dataset_name == "sonnet": # Do not format the prompt, pass to message directly if args.backend == "openai-chat": @@ -664,17 +950,25 @@ def main(args: argparse.Namespace): else: raise ValueError(f"Unknown dataset: {args.dataset_name}") + goodput_config_dict = check_goodput_args(args) + + # Avoid GC processing "static" data - reduce pause times. 
+ gc.collect() + gc.freeze() + benchmark_result = asyncio.run( benchmark( backend=backend, api_url=api_url, base_url=base_url, model_id=model_id, + model_name=model_name, tokenizer=tokenizer, input_requests=input_requests, logprobs=args.logprobs, best_of=args.best_of, request_rate=args.request_rate, + burstiness=args.burstiness, disable_tqdm=args.disable_tqdm, profile=args.profile, selected_percentile_metrics=args.percentile_metrics.split(","), @@ -682,6 +976,9 @@ def main(args: argparse.Namespace): float(p) for p in args.metric_percentiles.split(",") ], ignore_eos=args.ignore_eos, + goodput_config_dict=goodput_config_dict, + max_concurrency=args.max_concurrency, + lora_modules=args.lora_modules, )) # Save config and results to json @@ -709,21 +1006,26 @@ def main(args: argparse.Namespace): ) # Traffic - result_json["request_rate"] = ( - args.request_rate if args.request_rate < float("inf") else "inf") + result_json["request_rate"] = (args.request_rate if args.request_rate + < float("inf") else "inf") + result_json["burstiness"] = args.burstiness + result_json["max_concurrency"] = args.max_concurrency # Merge with benchmark result result_json = {**result_json, **benchmark_result} # Save to file base_model_id = model_id.split("/")[-1] - file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" #noqa + max_concurrency_str = (f"-concurrency{args.max_concurrency}" + if args.max_concurrency is not None else "") + file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" #noqa if args.result_filename: file_name = args.result_filename if args.result_dir: file_name = os.path.join(args.result_dir, file_name) with open(file_name, "w", encoding='utf-8') as outfile: json.dump(result_json, outfile) + save_to_pytorch_benchmark_format(args, result_json, file_name) if __name__ == "__main__": @@ -741,7 +1043,8 @@ def main(args: argparse.Namespace): default=None, help="Server or API base url if not using http host and port.", ) - parser.add_argument("--host", type=str, default="localhost") + # Use 127.0.0.1 here instead of localhost to force the use of ipv4 + parser.add_argument("--host", type=str, default="127.0.0.1") parser.add_argument("--port", type=int, default=8000) parser.add_argument( "--endpoint", @@ -760,7 +1063,7 @@ def main(args: argparse.Namespace): "--dataset-name", type=str, default="sharegpt", - choices=["sharegpt", "sonnet", "random", "hf"], + choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"], help="Name of the dataset to benchmark on.", ) parser.add_argument("--dataset-path", @@ -768,6 +1071,19 @@ def main(args: argparse.Namespace): default=None, help="Path to the sharegpt/sonnet dataset. " "Or the huggingface dataset ID if using HF dataset.") + parser.add_argument( + "--max-concurrency", + type=int, + default=None, + help="Maximum number of concurrent requests. This can be used " + "to help simulate an environment where a higher level component " + "is enforcing a maximum number of concurrent requests. While the " + "--request-rate argument controls the rate at which requests are " + "initiated, this argument will control how many are actually allowed " + "to execute at a time. 
This means that when used in combination, the " + "actual request rate may be lower than specified with --request-rate, " + "if the server is not processing requests fast enough to keep up.") + parser.add_argument( "--model", type=str, @@ -810,8 +1126,20 @@ def main(args: argparse.Namespace): default=float("inf"), help="Number of requests per second. If this is inf, " "then all the requests are sent at time 0. " - "Otherwise, we use Poisson process to synthesize " - "the request arrival times.", + "Otherwise, we use Poisson process or gamma distribution " + "to synthesize the request arrival times.", + ) + parser.add_argument( + "--burstiness", + type=float, + default=1.0, + help="Burstiness factor of the request generation. " + "Only take effect when request_rate is not inf. " + "Default value is 1, which follows Poisson process. " + "Otherwise, the request intervals follow a gamma distribution. " + "A lower burstiness value (0 < burstiness < 1) results in more " + "bursty requests. A higher burstiness value (burstiness > 1) " + "results in a more uniform arrival of requests.", ) parser.add_argument("--seed", type=int, default=0) parser.add_argument( @@ -881,6 +1209,17 @@ def main(args: argparse.Namespace): "Default value is \"99\". " "Use \"--percentile-metrics\" to select metrics.", ) + parser.add_argument( + "--goodput", + nargs="+", + required=False, + help="Specify service level objectives for goodput as \"KEY:VALUE\" " + "pairs, where the key is a metric name, and the value is in " + "milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, " + "separated by spaces. Allowed request level metric names are " + "\"ttft\", \"tpot\", \"e2el\". For more context on the definition of " + "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 " + "and the blog: https://hao-ai-lab.github.io/blogs/distserve") # group for dataset specific arguments sonnet_group = parser.add_argument_group("sonnet dataset options") @@ -962,5 +1301,30 @@ def main(args: argparse.Namespace): "from the sampled HF dataset.", ) + parser.add_argument( + '--tokenizer-mode', + type=str, + default="auto", + choices=['auto', 'slow', 'mistral', 'custom'], + help='The tokenizer mode.\n\n* "auto" will use the ' + 'fast tokenizer if available.\n* "slow" will ' + 'always use the slow tokenizer. \n* ' + '"mistral" will always use the `mistral_common` tokenizer. \n*' + '"custom" will use --tokenizer to select the preregistered tokenizer.') + + parser.add_argument("--served-model-name", + type=str, + default=None, + help="The model name used in the API. " + "If not specified, the model name will be the " + "same as the ``--model`` argument. ") + + parser.add_argument("--lora-modules", + nargs='+', + default=None, + help="A subset of LoRA module names passed in when " + "launching the server. For each request, the " + "script chooses a LoRA module at random.") + args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_serving_guided.py b/benchmarks/benchmark_serving_guided.py new file mode 100644 index 0000000000..04942b06ff --- /dev/null +++ b/benchmarks/benchmark_serving_guided.py @@ -0,0 +1,883 @@ +# SPDX-License-Identifier: Apache-2.0 +r"""Benchmark online serving throughput with guided decoding. + +On the server side, run one of the following commands: + (vLLM OpenAI API server) + vllm serve