
Commit 1f7091e

First lazy CUDA graph:

* Needs cleanup: remove all the prints, including the stack-of-calls prints (inspect)
* Capture is done from the execute_model method using _dummy_run

Signed-off-by: Diego-Castan <[email protected]>
1 parent 2d1b7c7 commit 1f7091e
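For context, a minimal sketch of what "lazy" means here: instead of eagerly capturing a CUDA graph for every configured batch size at startup, capture for a token count only the first time a request with that size arrives. This toy is illustrative only; `warmup`, `eager_capture`, and `LazyCapturer` are hypothetical names, not vLLM APIs.

from typing import Callable


def eager_capture(sizes: list[int], warmup: Callable[[int], None]) -> None:
    # Startup-time strategy: every configured size is captured up front,
    # whether or not it is ever requested.
    for size in sorted(sizes, reverse=True):
        warmup(size)


class LazyCapturer:
    """Execution-time strategy: capture each size at most once, on demand."""

    def __init__(self, warmup: Callable[[int], None]) -> None:
        self.warmup = warmup
        self.seen: set[int] = set()

    def maybe_capture(self, num_tokens: int) -> None:
        # Skip empty batches and sizes that were already captured.
        if num_tokens and num_tokens not in self.seen:
            self.seen.add(num_tokens)
            self.warmup(num_tokens)

The worker-side change in this commit follows the same shape: a set of already-captured token counts guards a one-time dummy run.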

File tree

3 files changed: +38 additions, -8 deletions

vllm/compilation/cuda_piecewise_backend.py

Lines changed: 9 additions & 1 deletion
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-import dataclasses
+import dataclasses, inspect
 from contextlib import ExitStack
 from typing import Any, Callable, Optional
 from unittest.mock import patch
@@ -97,6 +97,9 @@ def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig,
                 need_to_compile=shape in self.compile_sizes,
                 use_cudagraph=shape in self.cudagraph_capture_sizes,
             )
+
+        # DIEGO: to avoid printing all the stack of calls every single time
+        self.print_stack_calls = True

     def check_for_ending_compilation(self):
         if self.is_last_graph and not self.to_be_compiled_sizes:
@@ -161,6 +164,11 @@ def __call__(self, *args) -> Any:
                 # We only log it in the debug mode.
                 logger.debug("Capturing a cudagraph for shape %s",
                              runtime_shape)
+                # if self.print_stack_calls:
+                #     self.print_stack_calls = False
+                #     for frame_info in inspect.stack():
+                #         logger.debug(f"DIEGO: File: {frame_info.filename}, Line: {frame_info.lineno}, Function: {frame_info.function}")
+

             input_addresses = [
                 x.data_ptr() for x in args if isinstance(x, torch.Tensor)
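The commented-out block above is a one-shot stack dump for debugging. A self-contained sketch of the same idea using only the standard library follows; the module-level flag and function name are illustrative, not part of vLLM.

import inspect
import logging

logger = logging.getLogger(__name__)

_printed_stack = False  # one-shot guard, mirrors self.print_stack_calls


def log_call_stack_once() -> None:
    """Log the current call chain, but only on the first invocation."""
    global _printed_stack
    if _printed_stack:
        return
    _printed_stack = True
    for frame_info in inspect.stack():
        logger.debug("File: %s, Line: %d, Function: %s",
                     frame_info.filename, frame_info.lineno,
                     frame_info.function)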

vllm/v1/worker/gpu_model_runner.py

Lines changed: 14 additions & 4 deletions
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-import dataclasses
+import dataclasses, inspect
 import gc
 import time
 from contextlib import contextmanager
@@ -328,6 +328,9 @@ def __init__(
         if self.cache_config.kv_sharing_fast_prefill:
             self.kv_sharing_fast_prefill_logits_indices = torch.zeros(
                 self.max_num_tokens, dtype=torch.int32, device=self.device)
+
+        # DIEGO: to avoid printing all the stack of calls every single time
+        self.print_stack_calls = True

     def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None:
         """
@@ -2516,7 +2519,7 @@ def profile_run(self) -> None:
         self.encoder_cache.clear()
         gc.collect()

-    def capture_model(self, specific_token_num: Optional[int]) -> None:
+    def capture_model(self, specific_token_num: Optional[int] = None) -> None:
         if not self.use_cuda_graph:
             logger.warning(
                 "Skipping CUDA graph capture. To turn on CUDA graph capture, "
@@ -2534,7 +2537,7 @@ def freeze_gc():
             # Optimize garbage collection during CUDA graph capture.
             # Clean up, then freeze all remaining objects from being included
             # in future collections.
-            gc.collect()
+            # gc.collect()
             should_freeze = not envs.VLLM_ENABLE_CUDAGRAPH_GC
             if should_freeze:
                 gc.freeze()
@@ -2573,8 +2576,15 @@ def freeze_gc():
         elapsed_time = end_time - start_time
         cuda_graph_size = start_free_gpu_memory - end_free_gpu_memory
         # This usually takes 5~20 seconds.
-        logger.info("Graph capturing finished in %.0f secs, took %.2f GiB",
+        logger.info("Graph capturing finished in %.3f secs, took %.2f GiB",
                     elapsed_time, cuda_graph_size / (1 << 30))
+
+        # DIEGO: print all the stack of calls just the first time
+        # if self.print_stack_calls == True:
+        #     self.print_stack_calls = False
+        #     for frame_info in inspect.stack():
+        #         logger.info(f"DIEGO: File: {frame_info.filename}, Line: {frame_info.lineno}, Function: {frame_info.function}")
+

     def _initialize_single_attn_backend(
         self, kv_cache_spec: KVCacheSpec, layer_names: list[str]
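The `freeze_gc` hunk above toggles whether `gc.collect()` runs before `gc.freeze()`. Here is a hedged, standalone sketch of the underlying pattern; vLLM additionally gates freezing on `envs.VLLM_ENABLE_CUDAGRAPH_GC`, which this toy omits.

import gc
from contextlib import contextmanager


@contextmanager
def freeze_gc(collect_first: bool = True):
    """Keep long-lived objects out of GC passes while capturing graphs."""
    if collect_first:
        gc.collect()   # drop existing garbage first (this commit skips it)
    gc.freeze()        # move tracked objects to the permanent generation
    try:
        yield
    finally:
        gc.unfreeze()  # make frozen objects collectable again


# Usage: wrap the capture loop so allocations made during capture are the
# only work the collector has to scan.
with freeze_gc():
    pass  # capture CUDA graphs here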

vllm/v1/worker/gpu_worker.py

Lines changed: 15 additions & 3 deletions
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A GPU worker class."""
-import copy
+import copy, time
 import gc
 import os
 from contextlib import AbstractContextManager, nullcontext
@@ -358,11 +358,23 @@ def execute_model(
                 get_pp_group().recv_tensor_dict(
                     all_gather_group=get_tp_group()))

+        # logger.info("DIEGO: Executing the model")
         # Adding capture model in execution time
-        if scheduler_output.total_num_scheduled_tokens not in self._token_compiled_cudagraphs:
+        # if scheduler_output.total_num_scheduled_tokens not in self._token_compiled_cudagraphs:
+        #     logger.info("DIEGO: CUDAgraph in execution time for %d input tokens", scheduler_output.total_num_scheduled_tokens)
+        #     self._token_compiled_cudagraphs.add(scheduler_output.total_num_scheduled_tokens)
+        #     self.model_runner.capture_model(scheduler_output.total_num_scheduled_tokens)
+
+        # Just compilation with dummy run
+        if scheduler_output.total_num_scheduled_tokens not in self._token_compiled_cudagraphs and scheduler_output.total_num_scheduled_tokens != 0:
             logger.info("DIEGO: CUDAgraph in execution time for %d input tokens", scheduler_output.total_num_scheduled_tokens)
             self._token_compiled_cudagraphs.add(scheduler_output.total_num_scheduled_tokens)
-            self.model_runner.capture_model(scheduler_output.total_num_scheduled_tokens)
+            start_time = time.perf_counter()
+            self.model_runner._dummy_run(scheduler_output.total_num_scheduled_tokens, capture_attn_cudagraph=False, skip_eplb=True)
+            end_time = time.perf_counter()
+            elapsed_time = end_time - start_time
+            logger.info("Graph capturing finished in %.3f secs", elapsed_time)
+

         output = self.model_runner.execute_model(scheduler_output,
                                                  intermediate_tensors)
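To make the new control flow concrete, here is a toy driver showing that only the first request at each nonzero token count pays the capture cost; `fake_dummy_run` is a stand-in for `model_runner._dummy_run` and is purely illustrative.

import time

_token_compiled_cudagraphs: set[int] = set()


def fake_dummy_run(num_tokens: int) -> None:
    time.sleep(0.001 * num_tokens)  # stand-in for real graph capture work


def execute(num_tokens: int) -> None:
    if (num_tokens not in _token_compiled_cudagraphs
            and num_tokens != 0):
        _token_compiled_cudagraphs.add(num_tokens)
        start_time = time.perf_counter()
        fake_dummy_run(num_tokens)
        elapsed_time = time.perf_counter() - start_time
        print(f"Graph capturing finished in {elapsed_time:.3f} secs")
    # ... the real forward pass would run here ...


for batch in (8, 8, 16, 8, 0, 16):
    execute(batch)  # only the first 8 and the first 16 trigger capture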
