| 1 | +"""Benchmark the latency of processing a single batch of requests.""" |
import argparse
import json
import os
import time
from contextlib import contextmanager, nullcontext
from pathlib import Path
from typing import List, Optional

import numpy as np
import torch
from rpdTracerControl import rpdTracerControl as rpd
from tqdm import tqdm

from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import DEVICE_OPTIONS, EngineArgs
from vllm.inputs import PromptType
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.utils import FlexibleArgumentParser


def main(args: argparse.Namespace):
    print(args)

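    # Wrap generation with the rpd (rocmProfileData) tracer via vLLM's
    # start_profile()/stop_profile() hooks; a summary of the top totals is
    # printed once profiling ends.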
    @contextmanager
    def rpd_profiler_context():
        llm.start_profile()
        yield
        llm.stop_profile()
        rpd.top_totals()

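    # Wrap generation with the PyTorch profiler. Traces are written through
    # the TensorBoard trace handler into `profile_dir`, and a table of ops
    # sorted by self CUDA time is printed when profiling stops.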
    @contextmanager
    def torch_profiler_context(profile_dir: Optional[str] = None,
                               trace_file_name=None):
        p = torch.profiler.profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA,
            ],
            on_trace_ready=torch.profiler.tensorboard_trace_handler(
                str(profile_dir)))
        p.start()
        try:
            with torch.no_grad():
                yield p
        finally:
            p.stop()
            print(p.key_averages().table(sort_by="self_cuda_time_total",
                                         row_limit=-1))

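    # Pick the profiler context based on the CLI flags; without
    # --profile-torch or --profile-rpd this is a no-op context.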
    def get_profiling_context(profile_dir: Optional[str] = None,
                              trace_file_name=None):
        if args.profile_torch:
            return torch_profiler_context(profile_dir, trace_file_name)
        elif args.profile_rpd:
            return rpd_profiler_context()
        else:
            return nullcontext()

    # NOTE(woosuk): If the request cannot be processed in a single batch,
    # the engine will automatically process the request in multiple batches.
    llm = LLM(
        model=args.model,
        speculative_model=args.speculative_model,
        num_speculative_tokens=args.num_speculative_tokens,
        speculative_draft_tensor_parallel_size=\
        args.speculative_draft_tensor_parallel_size,
        tokenizer=args.tokenizer,
        quantization=args.quantization,
        tensor_parallel_size=args.tensor_parallel_size,
        trust_remote_code=args.trust_remote_code,
        dtype=args.dtype,
        max_model_len=args.max_model_len,
        enforce_eager=args.enforce_eager,
        kv_cache_dtype=args.kv_cache_dtype,
        quantization_param_path=args.quantization_param_path,
        device=args.device,
        ray_workers_use_nsight=args.ray_workers_use_nsight,
        use_v2_block_manager=args.use_v2_block_manager,
        enable_chunked_prefill=args.enable_chunked_prefill,
        download_dir=args.download_dir,
        block_size=args.block_size,
        gpu_memory_utilization=args.gpu_memory_utilization,
        load_format=args.load_format,
        distributed_executor_backend=args.distributed_executor_backend,
        otlp_traces_endpoint=args.otlp_traces_endpoint,
        enable_prefix_caching=args.enable_prefix_caching,
        num_scheduler_steps=args.num_scheduler_steps,
    )

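    # EOS is ignored and max_tokens is fixed, so every request generates
    # exactly --output-len tokens and the measured work stays constant.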
    sampling_params = SamplingParams(
        n=args.n,
        temperature=0.0 if args.use_beam_search else 1.0,
        top_p=1.0,
        use_beam_search=args.use_beam_search,
        ignore_eos=True,
        max_tokens=args.output_len,
    )
    print(sampling_params)
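    # Build synthetic prompts: random token IDs of shape
    # (batch_size, input_len), passed to the engine as token-ID prompts.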
    dummy_prompt_token_ids = np.random.randint(10000,
                                               size=(args.batch_size,
                                                     args.input_len))
    dummy_prompts: List[PromptType] = [{
        "prompt_token_ids": batch
    } for batch in dummy_prompt_token_ids.tolist()]

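    # Run one full generate() call over the batch. Returns the wall-clock
    # latency in seconds when not profiling; when profiling, it only runs
    # the batch inside the selected profiler context.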
    def run_to_completion(profile_dir: Optional[str] = None):
        if profile_dir:
            with get_profiling_context(profile_dir):
                llm.generate(dummy_prompts,
                             sampling_params=sampling_params,
                             use_tqdm=False)
        else:
            start_time = time.perf_counter()
            llm.generate(dummy_prompts,
                         sampling_params=sampling_params,
                         use_tqdm=False)
            end_time = time.perf_counter()
            latency = end_time - start_time
            return latency

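    # Warmup iterations are excluded from the reported latencies.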
    print("Warming up...")
    for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
        run_to_completion(profile_dir=None)

    if args.profile_torch or args.profile_rpd:
        profile_dir = args.profile_dir
        if not profile_dir:
            profile_dir = Path(".") / "vllm_benchmark_latency_result"
        os.makedirs(profile_dir, exist_ok=True)
        print(f"Profiling (results will be saved to '{profile_dir}')...")
        run_to_completion(profile_dir=profile_dir)
        return

    # Benchmark.
    latencies = []
    for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
        latencies.append(run_to_completion(profile_dir=None))
    latencies = np.array(latencies)
    percentages = [10, 25, 50, 75, 90, 99]
    percentiles = np.percentile(latencies, percentages)
    print(f'Avg latency: {np.mean(latencies)} seconds')
    for percentage, percentile in zip(percentages, percentiles):
        print(f'{percentage}% percentile latency: {percentile} seconds')

    # Output JSON results if specified
    if args.output_json:
        results = {
            "avg_latency": np.mean(latencies),
            "latencies": latencies.tolist(),
            "percentiles": dict(zip(percentages, percentiles.tolist())),
        }
        with open(args.output_json, "w") as f:
            json.dump(results, f, indent=4)

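# The CLI mirrors a subset of vLLM's engine arguments plus benchmark- and
# profiler-specific flags.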
if __name__ == '__main__':
    parser = FlexibleArgumentParser(
        description='Benchmark the latency of processing a single batch of '
        'requests till completion.')
    parser.add_argument('--model', type=str, default='facebook/opt-125m')
    parser.add_argument('--speculative-model', type=str, default=None)
    parser.add_argument('--num-speculative-tokens', type=int, default=None)
    parser.add_argument('--speculative-draft-tensor-parallel-size',
                        '-spec-draft-tp',
                        type=int,
                        default=None)
    parser.add_argument('--tokenizer', type=str, default=None)
    parser.add_argument('--quantization',
                        '-q',
                        choices=[*QUANTIZATION_METHODS, None],
                        default=None)
    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
    parser.add_argument('--input-len', type=int, default=32)
    parser.add_argument('--output-len', type=int, default=128)
    parser.add_argument('--batch-size', type=int, default=8)
    parser.add_argument('--n',
                        type=int,
                        default=1,
                        help='Number of generated sequences per prompt.')
    parser.add_argument('--use-beam-search', action='store_true')
    parser.add_argument('--num-iters-warmup',
                        type=int,
                        default=10,
                        help='Number of iterations to run for warmup.')
    parser.add_argument('--num-iters',
                        type=int,
                        default=30,
                        help='Number of iterations to run.')
    parser.add_argument('--trust-remote-code',
                        action='store_true',
                        help='trust remote code from huggingface')
    parser.add_argument(
        '--max-model-len',
        type=int,
        default=None,
        help='Maximum length of a sequence (including prompt and output). '
        'If None, will be derived from the model.')
    parser.add_argument(
        '--dtype',
        type=str,
        default='auto',
        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
        help='data type for model weights and activations. '
        'The "auto" option will use FP16 precision '
        'for FP32 and FP16 models, and BF16 precision '
        'for BF16 models.')
    parser.add_argument('--enforce-eager',
                        action='store_true',
                        help='enforce eager mode and disable CUDA graph')
    parser.add_argument(
        '--kv-cache-dtype',
        type=str,
        choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
        default="auto",
        help='Data type for kv cache storage. If "auto", will use model '
        'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
        'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
    parser.add_argument(
        '--quantization-param-path',
        type=str,
        default=None,
        help='Path to the JSON file containing the KV cache scaling factors. '
        'This should generally be supplied when KV cache dtype is FP8. '
        'Otherwise, KV cache scaling factors default to 1.0, which may cause '
        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
        'CUDA versions greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
        'instead supported for common inference criteria.')
    parser.add_argument(
        '--quantized-weights-path',
        type=str,
        default=None,
        help='Path to the safetensor file containing the quantized weights '
        'and scaling factors. This should generally be supplied when '
        'quantization is FP8.')
    parser.add_argument(
        '--profile-torch',
        action='store_true',
        help='profile the generation process of a single batch with the '
        'PyTorch profiler')
    parser.add_argument(
        '--profile-rpd',
        action='store_true',
        help='profile the generation process of a single batch with the '
        'rpd (rocmProfileData) tracer')
    parser.add_argument(
        '--profile-dir',
        type=str,
        default=os.getenv('VLLM_RPD_PROFILER_DIR', default=None),
        help=('path to save the profiler output. Can be visualized '
              'with ui.perfetto.dev or Tensorboard.'))
    parser.add_argument("--device",
                        type=str,
                        default="auto",
                        choices=DEVICE_OPTIONS,
                        help='device type for vLLM execution')
    parser.add_argument('--block-size',
                        type=int,
                        default=16,
                        help='block size of key/value cache')
    parser.add_argument(
        '--enable-chunked-prefill',
        action='store_true',
        help='If True, the prefill requests can be chunked based on the '
        'max_num_batched_tokens')
    parser.add_argument("--enable-prefix-caching",
                        action='store_true',
                        help="Enable automatic prefix caching")
    parser.add_argument('--use-v2-block-manager', action='store_true')
    parser.add_argument(
        "--ray-workers-use-nsight",
        action='store_true',
        help="If specified, use nsight to profile ray workers",
    )
    parser.add_argument('--download-dir',
                        type=str,
                        default=None,
                        help='directory to download and load the weights, '
                        'defaults to the default cache dir of huggingface')
    parser.add_argument(
        '--output-json',
        type=str,
        default=None,
        help='Path to save the latency results in JSON format.')
    parser.add_argument('--gpu-memory-utilization',
                        type=float,
                        default=0.9,
                        help='the fraction of GPU memory to be used for '
                        'the model executor, which can range from 0 to 1. '
                        'If unspecified, will use the default value of 0.9.')
    parser.add_argument(
        '--load-format',
        type=str,
        default=EngineArgs.load_format,
        choices=[
            'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
            'bitsandbytes'
        ],
        help='The format of the model weights to load.\n\n'
        '* "auto" will try to load the weights in the safetensors format '
        'and fall back to the pytorch bin format if safetensors format '
        'is not available.\n'
        '* "pt" will load the weights in the pytorch bin format.\n'
        '* "safetensors" will load the weights in the safetensors format.\n'
        '* "npcache" will load the weights in pytorch format and store '
        'a numpy cache to speed up the loading.\n'
        '* "dummy" will initialize the weights with random values, '
        'which is mainly for profiling.\n'
        '* "tensorizer" will load the weights using tensorizer from '
        'CoreWeave. See the Tensorize vLLM Model script in the Examples '
        'section for more information.\n'
        '* "bitsandbytes" will load the weights using bitsandbytes '
        'quantization.\n')
    parser.add_argument(
        '--distributed-executor-backend',
        choices=['ray', 'mp'],
        default=None,
        help='Backend to use for distributed serving. When more than 1 GPU '
        'is used, will be automatically set to "ray" if installed '
        'or "mp" (multiprocessing) otherwise.')
    parser.add_argument(
        '--otlp-traces-endpoint',
        type=str,
        default=None,
        help='Target URL to which OpenTelemetry traces will be sent.')
    parser.add_argument(
        "--num-scheduler-steps",
        type=int,
        default=1,
        help="Maximum number of forward steps per scheduler call.")
    args = parser.parse_args()
    main(args)