1 | 1 | # SPDX-License-Identifier: Apache-2.0 |
2 | 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project |
3 | | -"""Benchmark the latency of processing a single batch of requests.""" |
| 3 | +import sys |
4 | 4 |
5 | | -import argparse |
6 | | -import dataclasses |
7 | | -import json |
8 | | -import os |
9 | | -import time |
10 | | -from typing import Any, Optional |
11 | | - |
12 | | -import numpy as np |
13 | | -from tqdm import tqdm |
14 | | -from typing_extensions import deprecated |
15 | | - |
16 | | -import vllm.envs as envs |
17 | | -from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json |
18 | | -from vllm import LLM, SamplingParams |
19 | | -from vllm.engine.arg_utils import EngineArgs |
20 | | -from vllm.inputs import PromptType |
21 | | -from vllm.sampling_params import BeamSearchParams |
22 | | -from vllm.utils import FlexibleArgumentParser |
23 | | - |
24 | | - |
25 | | -def save_to_pytorch_benchmark_format( |
26 | | - args: argparse.Namespace, results: dict[str, Any] |
27 | | -) -> None: |
28 | | - pt_records = convert_to_pytorch_benchmark_format( |
29 | | - args=args, |
30 | | - metrics={"latency": results["latencies"]}, |
31 | | - extra_info={k: results[k] for k in ["avg_latency", "percentiles"]}, |
32 | | - ) |
33 | | - if pt_records: |
34 | | - pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json" |
35 | | - write_to_json(pt_file, pt_records) |
36 | | - |
37 | | - |
38 | | -@deprecated( |
39 | | - "benchmark_latency.py is deprecated and will be removed in a " |
40 | | - "future version. Please use 'vllm bench latency' instead.", |
41 | | -) |
42 | | -def main(args: argparse.Namespace): |
43 | | - print(args) |
44 | | - |
45 | | - engine_args = EngineArgs.from_cli_args(args) |
46 | | - |
47 | | - # NOTE(woosuk): If the request cannot be processed in a single batch, |
48 | | - # the engine will automatically process the request in multiple batches. |
49 | | - llm = LLM(**dataclasses.asdict(engine_args)) |
50 | | - assert llm.llm_engine.model_config.max_model_len >= ( |
51 | | - args.input_len + args.output_len |
52 | | - ), ( |
53 | | - "Please ensure that max_model_len is greater than" |
54 | | - " the sum of input_len and output_len." |
55 | | - ) |
56 | | - |
57 | | - sampling_params = SamplingParams( |
58 | | - n=args.n, |
59 | | - temperature=1.0, |
60 | | - top_p=1.0, |
61 | | - ignore_eos=True, |
62 | | - max_tokens=args.output_len, |
63 | | - detokenize=not args.disable_detokenize, |
64 | | - ) |
65 | | - print(sampling_params) |
66 | | - dummy_prompt_token_ids = np.random.randint( |
67 | | - 10000, size=(args.batch_size, args.input_len) |
68 | | - ) |
69 | | - dummy_prompts: list[PromptType] = [ |
70 | | - {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist() |
71 | | - ] |
72 | | - |
73 | | - def llm_generate(): |
74 | | - if not args.use_beam_search: |
75 | | - llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False) |
76 | | - else: |
77 | | - llm.beam_search( |
78 | | - dummy_prompts, |
79 | | - BeamSearchParams( |
80 | | - beam_width=args.n, |
81 | | - max_tokens=args.output_len, |
82 | | - ignore_eos=True, |
83 | | - ), |
84 | | - ) |
85 | | - |
86 | | - def run_to_completion(profile_dir: Optional[str] = None): |
87 | | - if profile_dir: |
88 | | - llm.start_profile() |
89 | | - llm_generate() |
90 | | - llm.stop_profile() |
91 | | - else: |
92 | | - start_time = time.perf_counter() |
93 | | - llm_generate() |
94 | | - end_time = time.perf_counter() |
95 | | - latency = end_time - start_time |
96 | | - return latency |
97 | | - |
98 | | - print("Warming up...") |
99 | | - for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"): |
100 | | - run_to_completion(profile_dir=None) |
101 | | - |
102 | | - if args.profile: |
103 | | - profile_dir = envs.VLLM_TORCH_PROFILER_DIR |
104 | | - print(f"Profiling (results will be saved to '{profile_dir}')...") |
105 | | - run_to_completion(profile_dir=profile_dir) |
106 | | - return |
107 | | - |
108 | | - # Benchmark. |
109 | | - latencies = [] |
110 | | - for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): |
111 | | - latencies.append(run_to_completion(profile_dir=None)) |
112 | | - latencies = np.array(latencies) |
113 | | - percentages = [10, 25, 50, 75, 90, 99] |
114 | | - percentiles = np.percentile(latencies, percentages) |
115 | | - print(f"Avg latency: {np.mean(latencies)} seconds") |
116 | | - for percentage, percentile in zip(percentages, percentiles): |
117 | | - print(f"{percentage}% percentile latency: {percentile} seconds") |
118 | | - |
119 | | - # Output JSON results if specified |
120 | | - if args.output_json: |
121 | | - results = { |
122 | | - "avg_latency": np.mean(latencies), |
123 | | - "latencies": latencies.tolist(), |
124 | | - "percentiles": dict(zip(percentages, percentiles.tolist())), |
125 | | - } |
126 | | - with open(args.output_json, "w") as f: |
127 | | - json.dump(results, f, indent=4) |
128 | | - save_to_pytorch_benchmark_format(args, results) |
129 | | - |
130 | | - |
131 | | -def create_argument_parser(): |
132 | | - parser = FlexibleArgumentParser( |
133 | | - description="Benchmark the latency of processing a single batch of " |
134 | | - "requests till completion." |
135 | | - ) |
136 | | - parser.add_argument("--input-len", type=int, default=32) |
137 | | - parser.add_argument("--output-len", type=int, default=128) |
138 | | - parser.add_argument("--batch-size", type=int, default=8) |
139 | | - parser.add_argument( |
140 | | - "--n", |
141 | | - type=int, |
142 | | - default=1, |
143 | | - help="Number of generated sequences per prompt.", |
144 | | - ) |
145 | | - parser.add_argument("--use-beam-search", action="store_true") |
146 | | - parser.add_argument( |
147 | | - "--num-iters-warmup", |
148 | | - type=int, |
149 | | - default=10, |
150 | | - help="Number of iterations to run for warmup.", |
151 | | - ) |
152 | | - parser.add_argument( |
153 | | - "--num-iters", type=int, default=30, help="Number of iterations to run." |
154 | | - ) |
155 | | - parser.add_argument( |
156 | | - "--profile", |
157 | | - action="store_true", |
158 | | - help="profile the generation process of a single batch", |
159 | | - ) |
160 | | - parser.add_argument( |
161 | | - "--output-json", |
162 | | - type=str, |
163 | | - default=None, |
164 | | - help="Path to save the latency results in JSON format.", |
165 | | - ) |
166 | | - parser.add_argument( |
167 | | - "--disable-detokenize", |
168 | | - action="store_true", |
169 | | - help=( |
170 | | - "Do not detokenize responses (i.e. do not include " |
171 | | - "detokenization time in the latency measurement)" |
172 | | - ), |
173 | | - ) |
174 | | - |
175 | | - parser = EngineArgs.add_cli_args(parser) |
176 | | - # V1 enables prefix caching by default which skews the latency |
177 | | - # numbers. We need to disable prefix caching by default. |
178 | | - parser.set_defaults(enable_prefix_caching=False) |
| 5 | +if __name__ == "__main__": |
| 6 | + print("""DEPRECATED: This script has been moved to the vLLM CLI. |
179 | 7 |
180 | | - return parser |
| 8 | +Please use the following command instead: |
| 9 | + vllm bench latency |
181 | 10 |
| 11 | +For help with the new command, run: |
| 12 | + vllm bench latency --help |
182 | 13 |
183 | | -if __name__ == "__main__": |
184 | | - parser = create_argument_parser() |
185 | | - args = parser.parse_args() |
186 | | - if args.profile and not envs.VLLM_TORCH_PROFILER_DIR: |
187 | | - raise OSError( |
188 | | - "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. " |
189 | | - "Please set it to a valid path to use torch profiler." |
190 | | - ) |
191 | | - main(args) |
| 14 | +Alternatively, you can run the new command directly with: |
| 15 | + python -m vllm.entrypoints.cli.main bench latency --help |
| 16 | +""") |
| 17 | + sys.exit(1) |
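For anyone migrating an existing benchmark run, the new subcommand is expected to accept the same flags that the removed argument parser defined above (for example --input-len, --output-len, --batch-size, --num-iters). This is an assumption based on the flag names in the deleted script, not a guarantee; check `vllm bench latency --help` for the authoritative list.

    # Old invocation (removed script); flags taken from the parser deleted above
    python benchmark_latency.py --input-len 32 --output-len 128 --batch-size 8 --num-iters 30

    # Assumed equivalent with the new CLI (verify flag names with --help)
    vllm bench latency --input-len 32 --output-len 128 --batch-size 8 --num-iters 30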