@@ -377,33 +377,39 @@ def compile_cuda_graph(input_size: int):
             elapsed_time = end_time - start_time
             logger.info("Graph capturing finished in %.3f secs", elapsed_time)
 
-        # ATTENTION: This code is duplicated in compile_or_warm_up_model method
-        # so we should clean this part before creating the vllm PR
-        # warm up sizes that are not in cudagraph capture sizes,
-        # but users still want to compile for better performance,
-        # e.g. for the max-num-batched token size in chunked prefill.
-        warmup_sizes = self.vllm_config.compilation_config.compile_sizes.copy()
-        logger.info("Warm up sizes %s", str(warmup_sizes))
-        if not self.model_config.enforce_eager:
-            warmup_sizes = [
-                x for x in warmup_sizes if x not in
-                self.vllm_config.compilation_config.cudagraph_capture_sizes
-            ]
-
-        warmup_sizes_set = set(warmup_sizes)
+        # # ATTENTION: This code is duplicated in compile_or_warm_up_model method
+        # # so we should clean this part before creating the vllm PR
+        # # warm up sizes that are not in cudagraph capture sizes,
+        # # but users still want to compile for better performance,
+        # # e.g. for the max-num-batched token size in chunked prefill.
+        # warmup_sizes = self.vllm_config.compilation_config.compile_sizes.copy()
+        # logger.info("Warm up sizes %s", str(warmup_sizes))
+        # if not self.model_config.enforce_eager:
+        #     warmup_sizes = [
+        #         x for x in warmup_sizes if x not in
+        #         self.vllm_config.compilation_config.cudagraph_capture_sizes
+        #     ]
+
+        # warmup_sizes_set = set(warmup_sizes)
+
+        self.cudagraph_batch_sizes_set = set(
+            reversed(self.compilation_config.cudagraph_capture_sizes))
 
         # Just compilation with dummy run
-        if scheduler_output.total_num_scheduled_tokens not in self._token_compiled_cudagraphs and scheduler_output.total_num_scheduled_tokens in warmup_sizes_set and scheduler_output.total_num_scheduled_tokens != 0:
+        if scheduler_output.total_num_scheduled_tokens not in self._token_compiled_cudagraphs and scheduler_output.total_num_scheduled_tokens in self.cudagraph_batch_sizes_set and scheduler_output.total_num_scheduled_tokens != 0:
             logger.info(
-                "DIEGO: CUDAgraph in execution time for %d input tokens",
+                "LAZY DIEGO: CUDAgraph in execution time for %d input tokens",
                 scheduler_output.total_num_scheduled_tokens)
             self._token_compiled_cudagraphs.add(
                 scheduler_output.total_num_scheduled_tokens)
             compile_cuda_graph(scheduler_output.total_num_scheduled_tokens)
         else:
-            next_comp_set = warmup_sizes_set.difference(self._token_compiled_cudagraphs)
+            next_comp_set = self.cudagraph_batch_sizes_set.difference(self._token_compiled_cudagraphs)
             if len(next_comp_set) != 0:
                 next_comp = list(next_comp_set)
                 self._token_compiled_cudagraphs.add(next_comp[0])
+                logger.info(
+                    "DELAYED DIEGO: CUDAgraph in execution time for %d input tokens",
+                    next_comp[0])
                 compile_cuda_graph(next_comp[0])
 
         output = self.model_runner.execute_model(scheduler_output,
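The net effect of this hunk is a lazy capture policy driven by the cudagraph capture sizes instead of the (now commented-out) warmup sizes: if the current step's token count is a capture size whose graph has not been built yet, it is captured on the spot ("LAZY"); otherwise at most one still-pending size is captured per step ("DELAYED"), amortizing capture cost across iterations. Below is a minimal standalone sketch of that policy; all names in it (LazyGraphCompiler, step, compile_fn) are hypothetical illustrations, not vLLM APIs.

# Minimal sketch of the lazy/delayed capture policy above, assuming only
# a set of target capture sizes and a callable that captures one graph.
# Hypothetical names; not part of vLLM.
class LazyGraphCompiler:
    def __init__(self, capture_sizes):
        # Sizes whose CUDA graphs have not been captured yet.
        self.pending = set(capture_sizes)
        # Sizes already captured, mirroring _token_compiled_cudagraphs.
        self.compiled = set()

    def step(self, num_tokens, compile_fn):
        if num_tokens in self.pending and num_tokens != 0:
            # Lazy path: the current batch size needs a graph, capture now.
            self.pending.discard(num_tokens)
            self.compiled.add(num_tokens)
            compile_fn(num_tokens)
        elif self.pending:
            # Delayed path: make progress on one arbitrary pending size per
            # step so all graphs eventually get captured in the background.
            size = self.pending.pop()
            self.compiled.add(size)
            compile_fn(size)


compiler = LazyGraphCompiler([1, 2, 4, 8])
compiler.step(4, lambda n: print("capturing graph for %d tokens" % n))  # lazy
compiler.step(3, lambda n: print("capturing graph for %d tokens" % n))  # delayed

Storing the capture sizes in a set matches the diff's usage: membership tests on the hot path are O(1), and the remaining work is recoverable via a set difference against the already-compiled sizes.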