Commit 6302a7d

full_cudagraph support for FA2
Signed-off-by: fhl <[email protected]>
1 parent: ee9a153

File tree: 10 files changed, +303 −36 lines

vllm/compilation/backends.py

Lines changed: 13 additions & 3 deletions
@@ -563,9 +563,7 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
 
         self._called = True
 
-        if not self.compilation_config.use_cudagraph or \
-                not self.compilation_config.cudagraph_copy_inputs:
-            return self.split_gm
+
         # if we need to copy input buffers for cudagraph
         from torch._guards import detect_fake_mode
@@ -585,6 +583,18 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
             any(is_symbolic(d) for d in x.size())
         ]
 
+        if self.compilation_config.full_cuda_graph:
+            assert self.compilation_config.use_cudagraph, \
+                "full_cuda_graph mode requires use_cudagraph to be True"
+            fullgraph_wrapper = resolve_obj_by_qualname(
+                current_platform.get_fullgraph_wrapper_cls())
+            self.split_gm = fullgraph_wrapper(self.split_gm, self.vllm_config,
+                self.graph_pool, self.sym_tensor_indices)
+
+        if not self.compilation_config.use_cudagraph or \
+                not self.compilation_config.cudagraph_copy_inputs:
+            return self.split_gm
+
         # compiler managed cudagraph input buffers
         # we assume the first run with symbolic shapes
         # has the maximum size among all the tensors

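The hunk above resolves the platform's wrapper class from a dotted string via resolve_obj_by_qualname(current_platform.get_fullgraph_wrapper_cls()) and rewraps split_gm with it. As a minimal, standalone sketch of that qualname-resolution pattern (standard library only; the helper name load_qualname below is illustrative, not vLLM's API):

    import importlib
    from typing import Any

    def load_qualname(qualname: str) -> Any:
        # "pkg.module.ClassName" -> the ClassName object, imported lazily.
        module_name, _, attr = qualname.rpartition(".")
        module = importlib.import_module(module_name)
        return getattr(module, attr)

    # e.g. load_qualname("collections.OrderedDict") returns the OrderedDict
    # class, just as the backend turns the platform-provided string into a
    # wrapper class at compile time.
    wrapper_cls = load_qualname("collections.OrderedDict")
    assert wrapper_cls.__name__ == "OrderedDict"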
vllm/compilation/base_piecewise_backend.py

Lines changed: 43 additions & 0 deletions
@@ -70,3 +70,46 @@ def __call__(self, *args) -> Any:
         or a replayed static graph.
         """
         raise NotImplementedError
+
+
+class AbstractFullgraphWrapper(Protocol):
+    """
+    FullgraphWrapper interface that allows platforms to wrap the piecewise graph
+    to be viewed or captured as a full graph.
+    """
+
+    def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig,
+                 graph_pool: Any, sym_shape_indices: list[int], **kwargs):
+        """
+        Initializes the FullgraphWrapper class with compilation and
+        execution-related configurations.
+
+        Args:
+            graph (fx.GraphModule): The graph represented in fx.
+            vllm_config (VllmConfig): Global configuration for vLLM.
+            graph_pool (Any):
+                Graph memory pool handle, e.g.,
+                `torch.cuda.graph_pool_handle()`.
+            sym_shape_indices (list[int]):
+                Indices of symbolic shape.
+
+        Keyword Args:
+            kwargs: Additional keyword arguments reserved for future
+                extensions or custom platforms.
+
+        """
+        raise NotImplementedError
+
+    def __call__(self, *args) -> Any:
+        """
+        Executes the wrapped graph for given input args.
+
+        Args:
+            *args: Variable length input arguments to be passed into the
+                graph. The symbolic shape is expected to be in position
+                `sym_shape_indices[0]`.
+
+        Returns:
+            Any: Output of the executed wrapped graph.
+        """
+        raise NotImplementedError

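To make the protocol concrete, here is a minimal sketch of a class that satisfies AbstractFullgraphWrapper by simply running the wrapped callable eagerly. The class name and the no-op behavior are hypothetical; a real implementation would capture and replay CUDA graphs the way the CUDA wrapper in the next file does.

    from typing import Any

    class EagerFullgraphWrapper:
        """Hypothetical no-op wrapper: matches the protocol's signature but
        never captures a graph, always running the wrapped callable eagerly."""

        def __init__(self, graph, vllm_config, graph_pool: Any,
                     sym_shape_indices: list[int], **kwargs):
            self.graph = graph
            self.sym_shape_indices = sym_shape_indices

        def __call__(self, *args) -> Any:
            # A real wrapper would read the runtime batch size from
            # args[self.sym_shape_indices[0]] and key captured graphs on it.
            return self.graph(*args)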
vllm/compilation/cuda_piecewise_backend.py

Lines changed: 145 additions & 5 deletions
@@ -96,6 +96,7 @@ def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig,
                 runtime_shape=shape,
                 need_to_compile=shape in self.compile_sizes,
                 use_cudagraph=shape in self.cudagraph_capture_sizes,
+                usage_type="piecewise(general)",  # for logging only
             )
 
     def check_for_ending_compilation(self):
@@ -139,27 +140,32 @@ def __call__(self, *args) -> Any:
                 self.check_for_ending_compilation()
 
         # Skip CUDA graphs if this entry doesn't use them OR
-        # if we're supposed to skip them globally
-        skip_cuda_graphs = get_forward_context().skip_cuda_graphs
-        if not entry.use_cudagraph or skip_cuda_graphs:
+        # if we're supposed to treat the piecewise graphs as a whole,
+        # which implies forward_context.skip_attention_cuda_graphs is False.
+        # In the latter case, we rely on a wrapper class to capture
+        # the full cudagraph outside the fx graph.
+        skip_attention_cuda_graphs = get_forward_context().skip_attention_cuda_graphs
+        if not entry.use_cudagraph or not skip_attention_cuda_graphs:
             return entry.runnable(*args)
 
         if entry.cudagraph is None:
            if entry.num_finished_warmup < self.compilation_config.cudagraph_num_of_warmups:  # noqa
                entry.num_finished_warmup += 1
                if self.is_first_graph:
                    logger.debug(
-                        "Warming up %s/%s for shape %s",
+                        "Warming up %s/%s of %s usage for shape %s",
                         entry.num_finished_warmup,
                         self.compilation_config.cudagraph_num_of_warmups,
+                        entry.usage_type,
                         runtime_shape)
                return entry.runnable(*args)
 
            if self.is_first_graph:
                # Since we capture cudagraph for many different shapes and
                # capturing is fast, we don't need to log it for every shape.
                # We only log it in the debug mode.
-                logger.debug("Capturing a cudagraph for shape %s",
+                logger.debug("Capturing a cudagraph of %s usage for shape %s",
+                             entry.usage_type,
                              runtime_shape)
 
            input_addresses = [
@@ -216,3 +222,137 @@ def __call__(self, *args) -> Any:
 
         entry.cudagraph.replay()
         return entry.output
+
+
+class FullCudagraphWrapper:
+    def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig,
+                 graph_pool: Any, sym_shape_indices: list[int],
+                 ):
+        self.graph = graph
+        self.vllm_config = vllm_config
+        self.compilation_config = vllm_config.compilation_config
+        self.graph_pool = graph_pool
+        self.sym_shape_indices = sym_shape_indices
+
+        self.separate_attention_routine = vllm_config.compilation_config.separate_attention_routine
+
+        self.is_debugging_mode = envs.VLLM_LOGGING_LEVEL == "DEBUG"
+
+        self.first_run_finished = False
+
+        self.cudagraph_capture_sizes: set[int] = set(
+            self.compilation_config.cudagraph_capture_sizes
+        ) if self.compilation_config.use_cudagraph else set()
+
+        self.concrete_size_entries: dict[int, ConcreteSizeEntry] = {}
+        self.concrete_size_entries_decode: dict[int, ConcreteSizeEntry] = {}
+
+        for shape in self.cudagraph_capture_sizes:
+            self.concrete_size_entries[shape] = ConcreteSizeEntry(
+                runtime_shape=shape,
+                need_to_compile=False,
+                use_cudagraph=True,
+                usage_type="general",
+            )
+            if self.separate_attention_routine:
+                self.concrete_size_entries_decode[shape] = ConcreteSizeEntry(
+                    runtime_shape=shape,
+                    need_to_compile=False,
+                    use_cudagraph=True,
+                    usage_type="decode",
+                )
+
+    def __call__(self, *args) -> Any:
+        if not self.first_run_finished:
+            self.first_run_finished = True
+            return self.graph(*args)
+        list_args = list(args)
+        runtime_shape = list_args[self.sym_shape_indices[0]].shape[0]
+        forward_context = get_forward_context()
+
+        if forward_context.skip_attention_cuda_graphs:
+            # fall back to the piecewise cudagraph backend, which is
+            # responsible for capturing and running the piecewise cudagraphs.
+            return self.graph(*args)
+
+        # if not skipped, the fx graph and its sub-graphs are only supposed to
+        # eagerly run the compiled graphs, which should be cudagraph capturable
+        # as a whole.
+
+        concrete_size_entries = self.concrete_size_entries  # default to general usage
+        if self.separate_attention_routine and forward_context.is_pure_decoding:
+            concrete_size_entries = self.concrete_size_entries_decode
+
+        if runtime_shape not in concrete_size_entries:
+            # we don't need to do anything for this shape.
+            return self.graph(*args)
+
+        entry = concrete_size_entries[runtime_shape]
+
+        if entry.runnable is None:
+            entry.runnable = self.graph
+
+        if not entry.use_cudagraph:
+            return entry.runnable(*args)
+
+        if entry.cudagraph is None:
+            if entry.num_finished_warmup < self.compilation_config.cudagraph_num_of_warmups:  # noqa
+                entry.num_finished_warmup += 1
+                logger.debug(
+                    "Warming up %s/%s of %s usage for shape %s",
+                    entry.num_finished_warmup,
+                    self.compilation_config.cudagraph_num_of_warmups,
+                    entry.usage_type,
+                    runtime_shape)
+                return entry.runnable(*args)
+
+            # Since we capture cudagraph for many different shapes and
+            # capturing is fast, we don't need to log it for every shape.
+            # We only log it in the debug mode.
+            logger.debug("Capturing a cudagraph of %s usage for shape %s",
+                         entry.usage_type,
+                         runtime_shape)
+
+            input_addresses = [
+                x.data_ptr() for x in args if isinstance(x, torch.Tensor)
+            ]
+            entry.input_addresses = input_addresses
+            cudagraph = torch.cuda.CUDAGraph()
+
+            with ExitStack() as stack:
+                # mind-exploding: carefully manage the reference and memory.
+                with torch.cuda.graph(cudagraph, pool=self.graph_pool):
+                    # `output` is managed by pytorch's cudagraph pool
+                    output = entry.runnable(*args)
+                    # by converting it to weak ref,
+                    # the original `output` will immediately be released
+                    # to save memory.
+                    output = weak_ref_tensors(output)
+
+            # here we always use weak ref for the output
+            # to save memory
+            entry.output = weak_ref_tensors(output)
+            entry.cudagraph = cudagraph
+
+            compilation_counter.num_cudagraph_captured += 1
+
+            # important: we need to return the output, rather than
+            # the weak ref of the output, so that pytorch can correctly
+            # manage the memory during cuda graph capture
+            return output
+
+        if self.is_debugging_mode:
+            # check if the input addresses are the same
+            new_input_addresses = [
+                x.data_ptr() for x in args if isinstance(x, torch.Tensor)
+            ]
+            assert new_input_addresses == entry.input_addresses, (
+                "Input addresses for cudagraphs are different during replay."
+                f" Expected {entry.input_addresses}, got {new_input_addresses}"
+            )
+
+        entry.cudagraph.replay()
+        return entry.output

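The capture-then-replay mechanics that FullCudagraphWrapper relies on are standard PyTorch CUDA graph usage: capture once per (shape, routine) into a shared memory pool, keep the input/output buffers at stable addresses, and replay afterwards. A minimal standalone sketch of that pattern (a toy matmul stands in for the model; requires a CUDA device):

    import torch

    def capture_and_replay_demo() -> torch.Tensor:
        # Static buffers: their addresses must not change between capture and
        # replay, which is why the wrapper asserts on input data_ptr()s above.
        static_x = torch.randn(8, 64, device="cuda")
        weight = torch.randn(64, 64, device="cuda")

        # Warm up before capture so lazy initialization happens outside the graph.
        for _ in range(3):
            _ = static_x @ weight
        torch.cuda.synchronize()

        pool = torch.cuda.graph_pool_handle()   # plays the role of self.graph_pool
        graph = torch.cuda.CUDAGraph()
        with torch.cuda.graph(graph, pool=pool):
            static_out = static_x @ weight      # output memory owned by the pool

        # Replay with fresh data: overwrite the static input in place, then replay.
        static_x.copy_(torch.randn(8, 64, device="cuda"))
        graph.replay()
        torch.cuda.synchronize()
        return static_out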
vllm/config.py

Lines changed: 16 additions & 6 deletions
@@ -3981,6 +3981,14 @@ class CompilationConfig:
     splitting certain operations such as attention into subgraphs. Thus this
     flag cannot be used together with splitting_ops. This may provide
     performance benefits for smaller models."""
+    separate_attention_routine: bool = False
+    """
+    Enable a distinct attention routine within the attention backend for full
+    cuda graph capturing. Some attention backends, such as FlashMLA,
+    FlashInfer, and FA2, implement different branches for mixed prefill-decode
+    and pure decode cases. This flag allows the cudagraph to be captured
+    separately for each branch.
+    """
 
     pass_config: PassConfig = field(default_factory=PassConfig)
     """Custom inductor passes, see PassConfig for more details"""
@@ -4179,13 +4187,15 @@ def init_with_cudagraph_sizes(self,
 
     def set_splitting_ops_for_v1(self):
         # NOTE: this function needs to be called
-        if self.splitting_ops and self.full_cuda_graph:
-            raise ValueError("full_cuda_graph cannot be used together with "
-                             "splitting_ops, as Full CUDA graph will override "
-                             f"the splitting_ops: {self.splitting_ops}")
-
+        # NOTE: When full_cuda_graph is True, instead of setting an empty list
+        # and capturing the full cudagraph inside the flattened fx graph, we
+        # keep the piecewise fx graph structure but capture the full cudagraph
+        # outside the fx graph. This reduces some cpu overhead when the runtime
+        # batch_size is not cudagraph captured.
+        if self.separate_attention_routine:
+            assert self.full_cuda_graph, "separate_attention_routine requires full_cuda_graph to be True"
         if not self.splitting_ops:
-            self.splitting_ops = [] if self.full_cuda_graph else [
+            self.splitting_ops = [
                 "vllm.unified_attention",
                 "vllm.unified_attention_with_output",
             ]

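As a hedged usage sketch of the two flags touched above (the field names come from this diff; CompilationConfig construction and how the config is wired into an engine may differ between vLLM versions):

    from vllm.config import CompilationConfig

    # Assumed construction path: CompilationConfig is a dataclass-style config,
    # so keyword arguments for its fields should work.
    cfg = CompilationConfig(
        full_cuda_graph=True,             # capture the whole forward as one graph
        separate_attention_routine=True,  # distinct captures for decode vs. mixed batches
    )

    # Per the diff, set_splitting_ops_for_v1() now keeps the piecewise structure:
    # with no explicit splitting_ops, it fills in the attention splitting ops even
    # when full_cuda_graph is enabled (the wrapper captures outside the fx graph).
    cfg.set_splitting_ops_for_v1()
    print(cfg.splitting_ops)  # expected: the two vllm.unified_attention* ops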
vllm/forward_context.py

Lines changed: 9 additions & 3 deletions
@@ -94,7 +94,11 @@ class ForwardContext:
     virtual_engine: int  # set dynamically for each forward pass
     # set dynamically for each forward pass
     dp_metadata: Optional[DPMetadata] = None
-    skip_cuda_graphs: bool = False
+    # Determines whether to use a full cudagraph for attention or piecewise
+    # cudagraphs that skip the attention part. Defaults to True, i.e. use
+    # piecewise cudagraphs.
+    skip_attention_cuda_graphs: bool = True
+    is_pure_decoding: bool = False
 
 
 _forward_context: Optional[ForwardContext] = None
@@ -115,7 +119,8 @@ def set_forward_context(
     virtual_engine: int = 0,
     num_tokens: Optional[int] = None,
     num_tokens_across_dp: Optional[torch.Tensor] = None,
-    skip_cuda_graphs: bool = False,
+    skip_attention_cuda_graphs: bool = True,
+    is_pure_decoding: bool = False,
 ):
     """A context manager that stores the current forward context,
     can be attention metadata, etc.
@@ -140,7 +145,8 @@ def set_forward_context(
         virtual_engine=virtual_engine,
         attn_metadata=attn_metadata,
         dp_metadata=dp_metadata,
-        skip_cuda_graphs=skip_cuda_graphs,
+        skip_attention_cuda_graphs=skip_attention_cuda_graphs,
+        is_pure_decoding=is_pure_decoding,
     )
 
     try:

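The two new context fields drive a three-way dispatch at runtime. The sketch below restates that decision logic in isolation (a plain illustrative function, not vLLM code), mirroring how CUDAPiecewiseBackend and FullCudagraphWrapper read the forward context above:

    def choose_cudagraph_route(skip_attention_cuda_graphs: bool,
                               is_pure_decoding: bool,
                               separate_attention_routine: bool) -> str:
        # Mirrors the checks in CUDAPiecewiseBackend.__call__ and
        # FullCudagraphWrapper.__call__ (names reused for clarity only).
        if skip_attention_cuda_graphs:
            return "piecewise cudagraphs (attention excluded)"
        if separate_attention_routine and is_pure_decoding:
            return "full cudagraph, decode-only routine"
        return "full cudagraph, general (mixed prefill-decode) routine"

    assert choose_cudagraph_route(True, False, True).startswith("piecewise")
    assert choose_cudagraph_route(False, True, True).endswith("decode-only routine")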
vllm/platforms/cuda.py

Lines changed: 4 additions & 0 deletions
@@ -359,6 +359,10 @@ def use_custom_allreduce(cls) -> bool:
     @classmethod
     def get_piecewise_backend_cls(cls) -> str:
         return "vllm.compilation.cuda_piecewise_backend.CUDAPiecewiseBackend"  # noqa
+
+    @classmethod
+    def get_fullgraph_wrapper_cls(cls) -> str:
+        return "vllm.compilation.cuda_piecewise_backend.FullCudagraphWrapper"  # noqa
 
     @classmethod
     def stateless_init_device_torch_dist_pg(

vllm/platforms/interface.py

Lines changed: 7 additions & 0 deletions
@@ -524,6 +524,13 @@ def get_piecewise_backend_cls(cls) -> str:
         Get piecewise backend class for piecewise graph.
         """
         return "vllm.compilation.base_piecewise_backend.AbstractPiecewiseBackend"  # noqa
+
+    @classmethod
+    def get_fullgraph_wrapper_cls(cls) -> str:
+        """
+        Get fullgraph wrapper class for fullgraph static graph.
+        """
+        return "vllm.compilation.base_piecewise_backend.AbstractFullgraphWrapper"  # noqa
 
     @classmethod
     def stateless_init_device_torch_dist_pg(

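An out-of-tree platform would override this hook the same way the CUDA platform does above, returning the qualified name of its own wrapper class. A brief sketch with hypothetical names (the plugin module and wrapper do not exist in vLLM):

    from vllm.platforms.interface import Platform

    class MyPluginPlatform(Platform):
        """Hypothetical plugin platform; only the new hook is shown."""

        @classmethod
        def get_fullgraph_wrapper_cls(cls) -> str:
            # Must be resolvable by resolve_obj_by_qualname at compile time.
            return "my_plugin.compilation.MyFullgraphWrapper"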
vllm/v1/attention/backends/flash_attn.py

Lines changed: 2 additions & 6 deletions
@@ -158,9 +158,6 @@ def __init__(self, runner: "GPUModelRunner", kv_cache_spec: AttentionSpec,
 
         self.aot_schedule = (get_flash_attn_version() == 3)
         self.use_full_cuda_graph = compilation_config.full_cuda_graph
-        if self.use_full_cuda_graph and not self.aot_schedule:
-            raise ValueError("Full CUDA graph mode requires AOT scheduling, "
-                             "which requires FlashAttention 3.")
         self.scheduler_metadata = torch.zeros(self.runner.max_num_reqs + 1,
                                               dtype=torch.int32,
                                               device=self.runner.device)
@@ -299,8 +296,7 @@ def schedule(batch_size, cu_query_lens, max_query_len, seqlens,
             max_seq_len=max_seq_len,
             causal=True)
 
-        if self.use_full_cuda_graph:
-            assert scheduler_metadata is not None
+        if scheduler_metadata is not None:
             n = scheduler_metadata.shape[0]
             self.scheduler_metadata[:n].copy_(scheduler_metadata,
                                               non_blocking=True)
@@ -332,7 +328,7 @@ def schedule(batch_size, cu_query_lens, max_query_len, seqlens,
 
     def can_run_in_cudagraph(
             self, common_attn_metadata: CommonAttentionMetadata) -> bool:
-        # Full CUDA Graph always supported (FA2 support checked separately)
+        # Full CUDA Graph always supported (FA2 and FA3 support)
         return True
 
     def use_cascade_attention(self, *args, **kwargs) -> bool:

vllm/v1/attention/backends/flashinfer.py

Lines changed: 5 additions & 1 deletion
@@ -501,7 +501,11 @@ def build(self, common_prefix_len: int,
         self._plan(attn_metadata)
 
         return attn_metadata
-
+
+    def can_run_in_cudagraph(
+            self, common_attn_metadata: CommonAttentionMetadata) -> bool:
+        return common_attn_metadata.max_query_len == 1
+
     def use_cascade_attention(self, *args, **kwargs) -> bool:
         if self.kv_cache_spec.dtype != self.runner.model_config.dtype:
             # TODO: The cascade wrapper currently does not support setting
