@@ -366,6 +366,17 @@ def execute_model(
366
366
# self._token_compiled_cudagraphs.add(scheduler_output.total_num_scheduled_tokens)
367
367
# self.model_runner.capture_model(scheduler_output.total_num_scheduled_tokens)
368
368
369
def compile_cuda_graph(input_size: int) -> None:
    """Capture a CUDA graph for ``input_size`` tokens via a dummy run.

    The garbage collector is frozen for the duration of the capture so
    that collection cannot run mid-capture, and the elapsed wall-clock
    time of the capture is logged.

    Args:
        input_size: Number of scheduled input tokens to size the
            dummy run with.
    """
    gc.freeze()
    try:
        start_time = time.perf_counter()
        # NOTE(review): assumes _dummy_run triggers the cudagraph
        # capture path for this size — confirm against the
        # model_runner implementation.
        self.model_runner._dummy_run(input_size,
                                     capture_attn_cudagraph=False,
                                     skip_eplb=True)
        elapsed_time = time.perf_counter() - start_time
    finally:
        # Always unfreeze, even if the dummy run raises; the original
        # code leaked the frozen GC state on exception.
        gc.unfreeze()
    logger.info("Graph capturing finished in %.3f secs", elapsed_time)
369
380
# ATTENTION: This code is duplicated in compile_or_warm_up_model method
370
381
# so we should clean this part before creating the vllm PR
371
382
# warm up sizes that are not in cudagraph capture sizes,
@@ -378,23 +389,21 @@ def execute_model(
378
389
self .vllm_config .compilation_config .cudagraph_capture_sizes
379
390
]
380
391
392
+ warmup_sizes_set = set (warmup_sizes )
381
393
# Just compilation with dummy run
382
- if scheduler_output .total_num_scheduled_tokens not in self ._token_compiled_cudagraphs and scheduler_output .total_num_scheduled_tokens in warmup_sizes and scheduler_output .total_num_scheduled_tokens != 0 :
394
+ if scheduler_output .total_num_scheduled_tokens not in self ._token_compiled_cudagraphs and scheduler_output .total_num_scheduled_tokens in warmup_sizes_set and scheduler_output .total_num_scheduled_tokens != 0 :
383
395
logger .info (
384
396
"DIEGO: CUDAgraph in execution time for %d input tokens" ,
385
397
scheduler_output .total_num_scheduled_tokens )
386
398
self ._token_compiled_cudagraphs .add (
387
399
scheduler_output .total_num_scheduled_tokens )
388
- gc .freeze ()
389
- start_time = time .perf_counter ()
390
- self .model_runner ._dummy_run (
391
- scheduler_output .total_num_scheduled_tokens ,
392
- capture_attn_cudagraph = False ,
393
- skip_eplb = True )
394
- end_time = time .perf_counter ()
395
- gc .unfreeze ()
396
- elapsed_time = end_time - start_time
397
- logger .info ("Graph capturing finished in %.3f secs" , elapsed_time )
400
+ compile_cuda_graph (scheduler_output .total_num_scheduled_tokens )
401
+ else :
402
+ next_comp = list (
403
+ warmup_sizes_set .difference (
404
+ self ._token_compiled_cudagraphs ))[0 ]
405
+ self ._token_compiled_cudagraphs .add (next_comp )
406
+ compile_cuda_graph (next_comp )
398
407
399
408
output = self .model_runner .execute_model (scheduler_output ,
400
409
intermediate_tensors )
0 commit comments