@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 """Benchmark the latency of processing a single batch of requests."""
+
 import argparse
 import dataclasses
 import json
@@ -26,6 +27,7 @@ def main(args: argparse.Namespace):
     @contextmanager
     def rpd_profiler_context():
         from rpdTracerControl import rpdTracerControl as rpd
+
         llm.start_profile()
         yield
         llm.stop_profile()
@@ -39,15 +41,16 @@ def torch_profiler_context(profile_result_dir: Optional[str] = None):
                 torch.profiler.ProfilerActivity.CUDA,
             ],
             on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                str(profile_result_dir)))
+                str(profile_result_dir)
+            ),
+        )
         p.start()
         try:
             with torch.no_grad():
                 yield p
         finally:
             p.stop()
-            print(p.key_averages().table(sort_by="self_cuda_time_total",
-                                         row_limit=-1))
+            print(p.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1))
 
     def get_profiling_context(profile_result_dir: Optional[str] = None):
         if args.profile_torch:
@@ -58,15 +61,16 @@ def get_profiling_context(profile_result_dir: Optional[str] = None):
             return nullcontext()
 
     if args.profile_torch or args.profile_rpd:
-        profile_result_dir = Path(args.profile_result_dir
-                                  or "./vllm_benchmark_latency_result")
+        profile_result_dir = Path(
+            args.profile_result_dir or "./vllm_benchmark_latency_result"
+        )
         profile_result_dir.mkdir(parents=True, exist_ok=True)
         name = os.path.basename(os.path.normpath(args.model))
         model_trace_name = (
             f"{name}_in_{args.input_len}_out_{args.output_len}_"
-            f"batch_{args.batch_size}_tp_{args.tensor_parallel_size}")
-        print(
-            f"Profiling (results will be saved to '{profile_result_dir}')...")
+            f"batch_{args.batch_size}_tp_{args.tensor_parallel_size}"
+        )
+        print(f"Profiling (results will be saved to '{profile_result_dir}')...")
         if args.profile_rpd:
             profile_result_dir /= f"{model_trace_name}.rpd"
             os.environ["VLLM_RPD_PROFILER_DIR"] = str(profile_result_dir)
@@ -85,26 +89,25 @@ def get_profiling_context(profile_result_dir: Optional[str] = None):
         max_tokens=args.output_len,
     )
     print(sampling_params)
-    dummy_prompt_token_ids = np.random.randint(10000,
-                                               size=(args.batch_size,
-                                                     args.input_len))
-    dummy_prompts: list[PromptType] = [{
-        "prompt_token_ids": batch
-    } for batch in dummy_prompt_token_ids.tolist()]
+    dummy_prompt_token_ids = np.random.randint(
+        10000, size=(args.batch_size, args.input_len)
+    )
+    dummy_prompts: list[PromptType] = [
+        {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
+    ]
 
     def llm_generate():
         if not args.use_beam_search:
-            llm.generate(dummy_prompts,
-                         sampling_params=sampling_params,
-                         use_tqdm=False)
+            llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
         else:
             llm.beam_search(
                 dummy_prompts,
                 BeamSearchParams(
                     beam_width=args.n,
                     max_tokens=args.output_len,
                     ignore_eos=True,
-                ))
+                ),
+            )
 
     def run_to_completion(profile_dir: Optional[str] = None):
         if profile_dir:
@@ -132,9 +135,9 @@ def run_to_completion(profile_dir: Optional[str] = None):
     latencies = np.array(latencies)
     percentages = [10, 25, 50, 75, 90, 99]
     percentiles = np.percentile(latencies, percentages)
-    print(f'Avg latency: {np.mean(latencies)} seconds')
+    print(f"Avg latency: {np.mean(latencies)} seconds")
     for percentage, percentile in zip(percentages, percentiles):
-        print(f'{percentage}% percentile latency: {percentile} seconds')
+        print(f"{percentage}% percentile latency: {percentile} seconds")
 
     # Output JSON results if specified
     if args.output_json:
@@ -147,45 +150,52 @@ def run_to_completion(profile_dir: Optional[str] = None):
         json.dump(results, f, indent=4)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = FlexibleArgumentParser(
-        description='Benchmark the latency of processing a single batch of '
-        'requests till completion.')
-    parser.add_argument('--input-len', type=int, default=32)
-    parser.add_argument('--output-len', type=int, default=128)
-    parser.add_argument('--batch-size', type=int, default=8)
-    parser.add_argument('--n',
-                        type=int,
-                        default=1,
-                        help='Number of generated sequences per prompt.')
-    parser.add_argument('--use-beam-search', action='store_true')
-    parser.add_argument('--num-iters-warmup',
-                        type=int,
-                        default=10,
-                        help='Number of iterations to run for warmup.')
-    parser.add_argument('--num-iters',
-                        type=int,
-                        default=30,
-                        help='Number of iterations to run.')
+        description="Benchmark the latency of processing a single batch of "
+        "requests till completion."
+    )
+    parser.add_argument("--input-len", type=int, default=32)
+    parser.add_argument("--output-len", type=int, default=128)
+    parser.add_argument("--batch-size", type=int, default=8)
+    parser.add_argument(
+        "--n", type=int, default=1, help="Number of generated sequences per prompt."
+    )
+    parser.add_argument("--use-beam-search", action="store_true")
+    parser.add_argument(
+        "--num-iters-warmup",
+        type=int,
+        default=10,
+        help="Number of iterations to run for warmup.",
+    )
     parser.add_argument(
-        '--profile-torch',
-        action='store_true',
-        help='profile the generation process of a single batch')
+        "--num-iters", type=int, default=30, help="Number of iterations to run."
+    )
     parser.add_argument(
-        '--profile-rpd',
-        action='store_true',
-        help='profile the generation process of a single batch')
+        "--profile-torch",
+        action="store_true",
+        help="profile the generation process of a single batch",
+    )
     parser.add_argument(
-        '--profile-result-dir',
+        "--profile-rpd",
+        action="store_true",
+        help="profile the generation process of a single batch",
+    )
+    parser.add_argument(
+        "--profile-result-dir",
         type=str,
-        default=os.getenv('VLLM_RPD_PROFILER_DIR', default=None),
-        help=('path to save the profiler output. Can be visualized '
-              'with ui.perfetto.dev or Tensorboard.'))
+        default=os.getenv("VLLM_RPD_PROFILER_DIR", default=None),
+        help=(
+            "path to save the profiler output. Can be visualized "
+            "with ui.perfetto.dev or Tensorboard."
+        ),
+    )
     parser.add_argument(
-        '--output-json',
+        "--output-json",
         type=str,
         default=None,
-        help='Path to save the latency results in JSON format.')
+        help="Path to save the latency results in JSON format.",
+    )
 
     parser = EngineArgs.add_cli_args(parser)
     args = parser.parse_args()
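
Note: this change is formatting-only (quote style and line wrapping); the script's flags and behavior are unchanged. As a minimal sketch, assuming the file is vLLM's benchmark_latency.py and using an illustrative model name (the --model flag itself comes from EngineArgs.add_cli_args, not this diff), a typical run might look like:

    python benchmark_latency.py --model facebook/opt-125m \
        --input-len 32 --output-len 128 --batch-size 8 \
        --num-iters-warmup 10 --num-iters 30 \
        --output-json latency_results.json

Per the code above, this prints the average latency and the 10/25/50/75/90/99th percentile latencies in seconds, and writes the same results as JSON when --output-json is given.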