Skip to content

Commit b3fe509

Browse files
Lazy and delayed with self.cudagraph_batch_sizes_set
Signed-off-by: Diego-Castan <[email protected]>
1 parent 04dc491 commit b3fe509

File tree

1 file changed

+23
-17
lines changed

1 file changed

+23
-17
lines changed

vllm/v1/worker/gpu_worker.py

Lines changed: 23 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -377,33 +377,39 @@ def compile_cuda_graph(input_size: int):
377377
elapsed_time = end_time - start_time
378378
logger.info("Graph capturing finished in %.3f secs", elapsed_time)
379379

380-
# ATTENTION: This code is duplicated in compile_or_warm_up_model method
381-
# so we should clean this part before creating the vllm PR
382-
# warm up sizes that are not in cudagraph capture sizes,
383-
# but users still want to compile for better performance,
384-
# e.g. for the max-num-batched token size in chunked prefill.
385-
warmup_sizes = self.vllm_config.compilation_config.compile_sizes.copy()
386-
logger.info("Warm up sizes %s", str(warmup_sizes))
387-
if not self.model_config.enforce_eager:
388-
warmup_sizes = [
389-
x for x in warmup_sizes if x not in
390-
self.vllm_config.compilation_config.cudagraph_capture_sizes
391-
]
392-
393-
warmup_sizes_set = set(warmup_sizes)
380+
# # ATTENTION: This code is duplicated in compile_or_warm_up_model method
381+
# # so we should clean this part before creating the vllm PR
382+
# # warm up sizes that are not in cudagraph capture sizes,
383+
# # but users still want to compile for better performance,
384+
# # e.g. for the max-num-batched token size in chunked prefill.
385+
# warmup_sizes = self.vllm_config.compilation_config.compile_sizes.copy()
386+
# logger.info("Warm up sizes %s", str(warmup_sizes))
387+
# if not self.model_config.enforce_eager:
388+
# warmup_sizes = [
389+
# x for x in warmup_sizes if x not in
390+
# self.vllm_config.compilation_config.cudagraph_capture_sizes
391+
# ]
392+
393+
# warmup_sizes_set = set(warmup_sizes)
394+
395+
self.cudagraph_batch_sizes_set = set(
396+
reversed(self.compilation_config.cudagraph_capture_sizes))
394397
# Just compilation with dummy run
395-
if scheduler_output.total_num_scheduled_tokens not in self._token_compiled_cudagraphs and scheduler_output.total_num_scheduled_tokens in warmup_sizes_set and scheduler_output.total_num_scheduled_tokens != 0:
398+
if scheduler_output.total_num_scheduled_tokens not in self._token_compiled_cudagraphs and scheduler_output.total_num_scheduled_tokens in self.cudagraph_batch_sizes_set and scheduler_output.total_num_scheduled_tokens != 0:
396399
logger.info(
397-
"DIEGO: CUDAgraph in execution time for %d input tokens",
400+
"LAZY DIEGO: CUDAgraph in execution time for %d input tokens",
398401
scheduler_output.total_num_scheduled_tokens)
399402
self._token_compiled_cudagraphs.add(
400403
scheduler_output.total_num_scheduled_tokens)
401404
compile_cuda_graph(scheduler_output.total_num_scheduled_tokens)
402405
else:
403-
next_comp_set = warmup_sizes_set.difference(self._token_compiled_cudagraphs)
406+
next_comp_set = self.cudagraph_batch_sizes_set.difference(self._token_compiled_cudagraphs)
404407
if len(next_comp_set) != 0:
405408
next_comp = list(next_comp_set)
406409
self._token_compiled_cudagraphs.add(next_comp[0])
410+
logger.info(
411+
"DELAYED DIEGO: CUDAgraph in execution time for %d input tokens",
412+
next_comp[0])
407413
compile_cuda_graph(next_comp[0])
408414

409415
output = self.model_runner.execute_model(scheduler_output,

0 commit comments

Comments
 (0)