
Commit 19cf84f

Delayed approach v0
Signed-off-by: Diego-Castan <[email protected]>
1 parent 9a44436 commit 19cf84f

File tree: 1 file changed, +20 −11 lines changed


vllm/v1/worker/gpu_worker.py

Lines changed: 20 additions & 11 deletions
@@ -366,6 +366,17 @@ def execute_model(
         # self._token_compiled_cudagraphs.add(scheduler_output.total_num_scheduled_tokens)
         # self.model_runner.capture_model(scheduler_output.total_num_scheduled_tokens)
 
+        def compile_cuda_graph(input_size: int):
+            gc.freeze()
+            start_time = time.perf_counter()
+            self.model_runner._dummy_run(input_size,
+                                         capture_attn_cudagraph=False,
+                                         skip_eplb=True)
+            end_time = time.perf_counter()
+            gc.unfreeze()
+            elapsed_time = end_time - start_time
+            logger.info("Graph capturing finished in %.3f secs", elapsed_time)
+
         # ATTENTION: This code is duplicated in compile_or_warm_up_model method
         # so we should clean this part before creating the vllm PR
         # warm up sizes that are not in cudagraph capture sizes,
@@ -378,23 +389,21 @@
             self.vllm_config.compilation_config.cudagraph_capture_sizes
         ]
 
+        warmup_sizes_set = set(warmup_sizes)
         # Just compilation with dummy run
-        if scheduler_output.total_num_scheduled_tokens not in self._token_compiled_cudagraphs and scheduler_output.total_num_scheduled_tokens in warmup_sizes and scheduler_output.total_num_scheduled_tokens != 0:
+        if scheduler_output.total_num_scheduled_tokens not in self._token_compiled_cudagraphs and scheduler_output.total_num_scheduled_tokens in warmup_sizes_set and scheduler_output.total_num_scheduled_tokens != 0:
             logger.info(
                 "DIEGO: CUDAgraph in execution time for %d input tokens",
                 scheduler_output.total_num_scheduled_tokens)
             self._token_compiled_cudagraphs.add(
                 scheduler_output.total_num_scheduled_tokens)
-            gc.freeze()
-            start_time = time.perf_counter()
-            self.model_runner._dummy_run(
-                scheduler_output.total_num_scheduled_tokens,
-                capture_attn_cudagraph=False,
-                skip_eplb=True)
-            end_time = time.perf_counter()
-            gc.unfreeze()
-            elapsed_time = end_time - start_time
-            logger.info("Graph capturing finished in %.3f secs", elapsed_time)
+            compile_cuda_graph(scheduler_output.total_num_scheduled_tokens)
+        else:
+            next_comp = list(
+                warmup_sizes_set.difference(
+                    self._token_compiled_cudagraphs))[0]
+            self._token_compiled_cudagraphs.add(next_comp)
+            compile_cuda_graph(next_comp)
 
         output = self.model_runner.execute_model(scheduler_output,
                                                  intermediate_tensors)
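
For readers skimming the change, the standalone sketch below shows the same "delayed" bookkeeping pattern the diff introduces: keep a set of already-captured sizes, capture the current batch size when it is a not-yet-captured warmup size, and otherwise spend the step capturing some other pending size. This is a minimal sketch, not vLLM code: the names DelayedGraphCompiler, compile_fn, and fake_compile are hypothetical, and the empty-set guard in the else branch is an extra safety check that the diff itself does not have (the diff indexes [0] unconditionally).

    import time
    from typing import Callable, Iterable, Set


    class DelayedGraphCompiler:
        """Minimal sketch of the commit's 'delayed' capture bookkeeping.

        Instead of capturing every CUDA graph size up front, each warmup size
        is captured lazily, at most one per execution step. The names in this
        sketch are illustrative, not part of vLLM's API.
        """

        def __init__(self, warmup_sizes: Iterable[int],
                     compile_fn: Callable[[int], None]) -> None:
            self._warmup_sizes: Set[int] = set(warmup_sizes)
            self._compiled: Set[int] = set()
            self._compile_fn = compile_fn

        def maybe_compile(self, num_scheduled_tokens: int) -> None:
            """Capture at most one graph per call, mirroring the diff's branching."""
            if (num_scheduled_tokens != 0
                    and num_scheduled_tokens in self._warmup_sizes
                    and num_scheduled_tokens not in self._compiled):
                # The current batch size is a warmup size we have not captured
                # yet: capture it now so later steps of the same size reuse it.
                self._compiled.add(num_scheduled_tokens)
                self._compile_fn(num_scheduled_tokens)
            else:
                # Otherwise spend this step on some other pending size, so the
                # full warmup set is eventually covered without a long startup.
                pending = self._warmup_sizes - self._compiled
                if pending:  # guard added here; the diff indexes [0] directly
                    next_size = next(iter(pending))
                    self._compiled.add(next_size)
                    self._compile_fn(next_size)


    if __name__ == "__main__":
        def fake_compile(size: int) -> None:
            # Stand-in for the commit's compile_cuda_graph / _dummy_run call.
            time.sleep(0.01)
            print(f"captured graph for batch size {size}")

        compiler = DelayedGraphCompiler(warmup_sizes=[1, 2, 4, 8],
                                        compile_fn=fake_compile)
        for step_tokens in (3, 4, 5, 1):
            compiler.maybe_compile(step_tokens)

Run as a script, it prints which size gets captured on each step; in the commit, compile_fn corresponds to the nested compile_cuda_graph helper that wraps model_runner._dummy_run between gc.freeze()/gc.unfreeze() and logs the capture time.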
