
Commit 1f7091e

First lazy CUDA graph:

* Needs cleanup: remove all the prints, including the stack-of-calls prints (inspect)
* Capture is done from the execute_model method using _dummy_run

Signed-off-by: Diego-Castan <[email protected]>
1 parent 2d1b7c7 commit 1f7091e
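For context, a minimal sketch of what "lazy" means here: instead of eagerly capturing a CUDA graph for every configured batch size at startup, capture for a token count only the first time a request with that size arrives. This toy is illustrative only; `warmup`, `eager_capture`, and `LazyCapturer` are hypothetical names, not vLLM APIs.

from typing import Callable


def eager_capture(sizes: list[int], warmup: Callable[[int], None]) -> None:
    # Startup-time strategy: every configured size is captured up front,
    # whether or not it is ever requested.
    for size in sorted(sizes, reverse=True):
        warmup(size)


class LazyCapturer:
    """Execution-time strategy: capture each size at most once, on demand."""

    def __init__(self, warmup: Callable[[int], None]) -> None:
        self.warmup = warmup
        self.seen: set[int] = set()

    def maybe_capture(self, num_tokens: int) -> None:
        # Skip empty batches and sizes that were already captured.
        if num_tokens and num_tokens not in self.seen:
            self.seen.add(num_tokens)
            self.warmup(num_tokens)

The worker-side change in this commit follows the same shape: a set of already-captured token counts guards a one-time dummy run.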

File tree

3 files changed: +38 additions, -8 deletions

vllm/compilation/cuda_piecewise_backend.py

Lines changed: 9 additions & 1 deletion
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-import dataclasses
+import dataclasses, inspect
 from contextlib import ExitStack
 from typing import Any, Callable, Optional
 from unittest.mock import patch
@@ -97,6 +97,9 @@ def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig,
                 need_to_compile=shape in self.compile_sizes,
                 use_cudagraph=shape in self.cudagraph_capture_sizes,
             )
+
+        # DIEGO: to avoid printing all the stack of calls every single time
+        self.print_stack_calls = True

     def check_for_ending_compilation(self):
         if self.is_last_graph and not self.to_be_compiled_sizes:
@@ -161,6 +164,11 @@ def __call__(self, *args) -> Any:
                 # We only log it in the debug mode.
                 logger.debug("Capturing a cudagraph for shape %s",
                              runtime_shape)
+                # if self.print_stack_calls:
+                #     self.print_stack_calls = False
+                #     for frame_info in inspect.stack():
+                #         logger.debug(f"DIEGO: File: {frame_info.filename}, Line: {frame_info.lineno}, Function: {frame_info.function}")
+

             input_addresses = [
                 x.data_ptr() for x in args if isinstance(x, torch.Tensor)
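The commented-out block above is a one-shot stack dump for debugging. A self-contained sketch of the same idea using only the standard library follows; the module-level flag and function name are illustrative, not part of vLLM.

import inspect
import logging

logger = logging.getLogger(__name__)

_printed_stack = False  # one-shot guard, mirrors self.print_stack_calls


def log_call_stack_once() -> None:
    """Log the current call chain, but only on the first invocation."""
    global _printed_stack
    if _printed_stack:
        return
    _printed_stack = True
    for frame_info in inspect.stack():
        logger.debug("File: %s, Line: %d, Function: %s",
                     frame_info.filename, frame_info.lineno,
                     frame_info.function)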

vllm/v1/worker/gpu_model_runner.py

Lines changed: 14 additions & 4 deletions
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-import dataclasses
+import dataclasses, inspect
 import gc
 import time
 from contextlib import contextmanager
@@ -328,6 +328,9 @@ def __init__(
         if self.cache_config.kv_sharing_fast_prefill:
             self.kv_sharing_fast_prefill_logits_indices = torch.zeros(
                 self.max_num_tokens, dtype=torch.int32, device=self.device)
+
+        # DIEGO: to avoid printing all the stack of calls every single time
+        self.print_stack_calls = True

     def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None:
         """
@@ -2516,7 +2519,7 @@ def profile_run(self) -> None:
         self.encoder_cache.clear()
         gc.collect()

-    def capture_model(self, specific_token_num: Optional[int]) -> None:
+    def capture_model(self, specific_token_num: Optional[int] = None) -> None:
         if not self.use_cuda_graph:
             logger.warning(
                 "Skipping CUDA graph capture. To turn on CUDA graph capture, "
@@ -2534,7 +2537,7 @@ def freeze_gc():
             # Optimize garbage collection during CUDA graph capture.
             # Clean up, then freeze all remaining objects from being included
             # in future collections.
-            gc.collect()
+            # gc.collect()
             should_freeze = not envs.VLLM_ENABLE_CUDAGRAPH_GC
             if should_freeze:
                 gc.freeze()
@@ -2573,8 +2576,15 @@ def freeze_gc():
         elapsed_time = end_time - start_time
         cuda_graph_size = start_free_gpu_memory - end_free_gpu_memory
         # This usually takes 5~20 seconds.
-        logger.info("Graph capturing finished in %.0f secs, took %.2f GiB",
+        logger.info("Graph capturing finished in %.3f secs, took %.2f GiB",
                     elapsed_time, cuda_graph_size / (1 << 30))
+
+        # DIEGO: print all the stack of calls just the first time
+        # if self.print_stack_calls == True:
+        #     self.print_stack_calls = False
+        #     for frame_info in inspect.stack():
+        #         logger.info(f"DIEGO: File: {frame_info.filename}, Line: {frame_info.lineno}, Function: {frame_info.function}")
+

     def _initialize_single_attn_backend(
         self, kv_cache_spec: KVCacheSpec, layer_names: list[str]
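The `freeze_gc` hunk above toggles whether `gc.collect()` runs before `gc.freeze()`. Here is a hedged, standalone sketch of the underlying pattern; vLLM additionally gates freezing on `envs.VLLM_ENABLE_CUDAGRAPH_GC`, which this toy omits.

import gc
from contextlib import contextmanager


@contextmanager
def freeze_gc(collect_first: bool = True):
    """Keep long-lived objects out of GC passes while capturing graphs."""
    if collect_first:
        gc.collect()   # drop existing garbage first (this commit skips it)
    gc.freeze()        # move tracked objects to the permanent generation
    try:
        yield
    finally:
        gc.unfreeze()  # make frozen objects collectable again


# Usage: wrap the capture loop so allocations made during capture are the
# only work the collector has to scan.
with freeze_gc():
    pass  # capture CUDA graphs here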

vllm/v1/worker/gpu_worker.py

Lines changed: 15 additions & 3 deletions
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A GPU worker class."""
-import copy
+import copy, time
 import gc
 import os
 from contextlib import AbstractContextManager, nullcontext
@@ -358,11 +358,23 @@ def execute_model(
                 get_pp_group().recv_tensor_dict(
                     all_gather_group=get_tp_group()))

+        # logger.info("DIEGO: Executing the model")
         # Adding capture model in execution time
-        if scheduler_output.total_num_scheduled_tokens not in self._token_compiled_cudagraphs:
+        # if scheduler_output.total_num_scheduled_tokens not in self._token_compiled_cudagraphs:
+        #     logger.info("DIEGO: CUDAgraph in execution time for %d input tokens", scheduler_output.total_num_scheduled_tokens)
+        #     self._token_compiled_cudagraphs.add(scheduler_output.total_num_scheduled_tokens)
+        #     self.model_runner.capture_model(scheduler_output.total_num_scheduled_tokens)
+
+        # Just compilation with dummy run
+        if scheduler_output.total_num_scheduled_tokens not in self._token_compiled_cudagraphs and scheduler_output.total_num_scheduled_tokens != 0:
             logger.info("DIEGO: CUDAgraph in execution time for %d input tokens", scheduler_output.total_num_scheduled_tokens)
             self._token_compiled_cudagraphs.add(scheduler_output.total_num_scheduled_tokens)
-            self.model_runner.capture_model(scheduler_output.total_num_scheduled_tokens)
+            start_time = time.perf_counter()
+            self.model_runner._dummy_run(scheduler_output.total_num_scheduled_tokens, capture_attn_cudagraph=False, skip_eplb=True)
+            end_time = time.perf_counter()
+            elapsed_time = end_time - start_time
+            logger.info("Graph capturing finished in %.3f secs", elapsed_time)
+

         output = self.model_runner.execute_model(scheduler_output,
                                                  intermediate_tensors)
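To make the new control flow concrete, here is a toy driver showing that only the first request at each nonzero token count pays the capture cost; `fake_dummy_run` is a stand-in for `model_runner._dummy_run` and is purely illustrative.

import time

_token_compiled_cudagraphs: set[int] = set()


def fake_dummy_run(num_tokens: int) -> None:
    time.sleep(0.001 * num_tokens)  # stand-in for real graph capture work


def execute(num_tokens: int) -> None:
    if (num_tokens not in _token_compiled_cudagraphs
            and num_tokens != 0):
        _token_compiled_cudagraphs.add(num_tokens)
        start_time = time.perf_counter()
        fake_dummy_run(num_tokens)
        elapsed_time = time.perf_counter() - start_time
        print(f"Graph capturing finished in {elapsed_time:.3f} secs")
    # ... the real forward pass would run here ...


for batch in (8, 8, 16, 8, 0, 16):
    execute(batch)  # only the first 8 and the first 16 trigger capture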
