2 changes: 1 addition & 1 deletion benchmarks/kernels/benchmark_device_communicators.py
@@ -293,7 +293,7 @@ def benchmark_allreduce_single(
graph = torch.cuda.CUDAGraph()
graph_pool = torch.cuda.graph_pool_handle()
set_graph_pool_id(graph_pool)
-with torch.cuda.graph(graph, pool=graph_pool):
+with torch.cuda.graph(graph, pool=graph_pool, stream=stream):
Collaborator:

What would be the issue without this change?

Contributor Author:

This benchmark would fail with NCCL 2.28, because torch.cuda.graph would otherwise create a new side stream for capture.

for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
allreduce_fn(graph_input)

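The failure mode discussed in this thread can be sketched outside the benchmark. The snippet below is an illustration, not code from this PR: the helper name, warm-up count, and tensor shapes are made up, but the `torch.cuda.graph(..., stream=...)` usage matches the change.

```python
import torch

def warmup_and_capture(fn, inp, n_warmup=3):
    """Warm up and capture a CUDA graph on one shared side stream.

    Without stream=..., torch.cuda.graph() captures on an internal side
    stream of its own, so stream-ordered allocations made during warm-up
    (e.g. from an ncclMemAlloc-backed pool) are not reused, and fresh
    allocations (and thus window registrations) can occur mid-capture.
    """
    stream = torch.cuda.Stream()
    with torch.cuda.stream(stream):
        for _ in range(n_warmup):
            fn(inp)
    stream.synchronize()

    graph = torch.cuda.CUDAGraph()
    # Capture on the same (non-default) stream used for warm-up.
    with torch.cuda.graph(graph, stream=stream):
        out = fn(inp)
    return graph, out
```

Calling `graph.replay()` afterwards re-runs the captured work; the key point is that the warm-up loop and the capture share the same explicitly created stream.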
16 changes: 10 additions & 6 deletions tests/v1/cudagraph/test_cudagraph_dispatch.py
@@ -165,13 +165,16 @@ def test_capture_and_replay(self):
self.model, self.vllm_config, runtime_mode=CUDAGraphMode.FULL
)
batch_descriptor = BatchDescriptor(num_tokens=10)

stream = torch.cuda.Stream()
# 0. global warmup
with set_forward_context(
attn_metadata=None,
vllm_config=self.vllm_config,
cudagraph_runtime_mode=CUDAGraphMode.NONE,
batch_descriptor=None,
with (
set_forward_context(
attn_metadata=None,
vllm_config=self.vllm_config,
cudagraph_runtime_mode=CUDAGraphMode.NONE,
batch_descriptor=None,
),
torch.cuda.stream(stream),
):
wrapper(self.input_tensor)

@@ -184,6 +187,7 @@
batch_descriptor=batch_descriptor,
),
patch("torch.cuda.graph", wraps=torch.cuda.graph) as mock_cuda_graph,
torch.cuda.stream(stream),
Collaborator:

Setting the context via torch.cuda.stream(stream) does not pass the stream to cudagraph capture; this is a no-op.

Would there be an issue without the change?

Contributor Author:

Without this, the stream=torch.cuda.current_stream() call in cuda_graph.py would return the default stream, and capturing on the default stream is not allowed.
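The behaviour being discussed, that torch.cuda.stream changes what torch.cuda.current_stream() returns inside its scope, can be checked directly with a minimal standalone sketch (not code from this PR):

```python
import torch

if torch.cuda.is_available():
    side = torch.cuda.Stream()
    # Outside the context manager, the default stream is current.
    assert torch.cuda.current_stream() == torch.cuda.default_stream()
    with torch.cuda.stream(side):
        # Inside it, current_stream() reports the side stream, so a callee
        # doing torch.cuda.graph(..., stream=torch.cuda.current_stream())
        # captures on this stream rather than the disallowed default one.
        assert torch.cuda.current_stream() == side
    # On exit, the previous stream is restored.
    assert torch.cuda.current_stream() == torch.cuda.default_stream()
```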

Member:

It should not be necessary after 73094c7.

):
output1 = wrapper(self.input_tensor)
# capturing phase should generate a zero output
6 changes: 5 additions & 1 deletion vllm/compilation/cuda_graph.py
@@ -169,7 +169,11 @@ def __call__(self, *args, **kwargs):
else:
set_graph_pool_id(current_platform.graph_pool_handle())
# mind-exploding: carefully manage the reference and memory.
with torch.cuda.graph(cudagraph, pool=self.graph_pool):
with torch.cuda.graph(
Collaborator:

@Amir-19 what's the context behind this PR?

Contributor Author:

@zou3519 This came out of investigating #28901. With NCCL 2.28+, window registration during cuda graph capture crashes, with NCCL complaining about the registration. Since memory from ncclMemAlloc is tied to a stream, running warm-up and cuda graph capture on separate streams causes new memory allocations and thus new window registrations. In this PR, we explicitly set the stream for cuda graph capture, forcing it onto the same stream as the warm-up iterations. Before this PR, the cuda graph didn't have an explicit stream, so it would create a new side stream; was that intentional?

Collaborator (@BoyuanFeng, Dec 6, 2025):

> before this PR, the cuda graph didn't have an explicit stream, so it would create a new side stream; was that intentional?

Yes. torch.cuda.graph(...) automatically creates a side stream unless it is called with an explicit stream, as in torch.cuda.graph(..., stream=explicit_stream).

> With NCCL 2.28+, window registration during cuda graph capture crashes, with NCCL complaining about the registration.

Could you elaborate on what window registration is?

> we explicitly set the stream for cuda graph capture, forcing it onto the same stream as the warm-up iterations

In general, warming up on one stream and capturing the graph on another is fine, apart from some extra memory consumption, so using the same stream for warmup and capture is an optimization.

However, using different streams should not lead to an error. Could you elaborate a bit on why it errors? E.g., is the window registration a cudagraph-unsafe op?

Contributor Author:

@BoyuanFeng ncclCommWindowRegister is used to register local buffers into an NCCL window, which enables us to use symmetric kernels. Window registration also requires the memory to come from a VMM-based allocator like ncclMemAlloc. Since memory allocated with ncclMemAlloc is tied to a stream, using the mempool associated with ncclMemAlloc and ncclCommWindowRegister on different streams means that, when there are no available segments, new allocations are needed, and thus new registrations.

Intuitively, graph capture means "do this every time the graph replays", so even if registration were allowed during cuda graph capture, it would have led to creating new window handles on each replay, which would then need to be destroyed later. This is neither efficient nor useful; the proper pattern is to register the window once before capture, then reuse it.

Starting with NCCL 2.28, there is a restriction that ncclCommWindowRegister must not be called during graph capture, which caused the failure reported in #28901.

To fix this, we need to make sure that the warm-up and the cuda graph capture run on the same side stream.

cudagraph,
pool=self.graph_pool,
stream=torch.cuda.current_stream(),
Collaborator:

IIUC, graph capture should happen on a side stream instead of the current main stream?

We can add self.stream = torch.cuda.Stream() in the __init__ of CUDAGraphWrapper, and use this stream for both warmup and graph capture.

Contributor Author:

Isn't the current_stream here already a non-default stream shared with the warm-up iterations?

Collaborator:

Do you have a code pointer to a non-default stream shared with warm-up iterations? It looks like there is no explicit CUDA stream in cuda_graph.py.

Contributor Author:

I will check the flow again to see where a context manager sets a new stream.

):
Comment on lines +266 to +270
Contributor:

high

For performance reasons, it's better to use current_stream from vllm.utils.torch_utils instead of torch.cuda.current_stream(). The vLLM helper caches the stream, avoiding the overhead of creating a new stream object on each call, as documented in vllm/utils/torch_utils.py.

You'll need to update the import on line 20:

from vllm.utils.torch_utils import current_stream, weak_ref_tensors
Suggested change:
-with torch.cuda.graph(
-    cudagraph,
-    pool=self.graph_pool,
-    stream=torch.cuda.current_stream(),
-):
+with torch.cuda.graph(
+    cudagraph,
+    pool=self.graph_pool,
+    stream=current_stream(),
+):

# `output` is managed by pytorch's cudagraph pool
output = self.runnable(*args, **kwargs)
if self.cudagraph_options.weak_ref_output:
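The caching idea behind the suggested current_stream helper can be sketched as follows. This is a simplified illustration, not vLLM's implementation: the real helper in vllm/utils/torch_utils.py also invalidates the cache when the active stream changes, which this sketch deliberately omits.

```python
import torch

_cached_stream = None  # module-level cache, populated on first use

def current_stream() -> "torch.cuda.Stream":
    # torch.cuda.current_stream() constructs a new Stream object on every
    # call; caching the result avoids that overhead on hot paths. This
    # sketch assumes the active stream is never switched after first use.
    global _cached_stream
    if _cached_stream is None:
        _cached_stream = torch.cuda.current_stream()
    return _cached_stream
```

Repeated calls then return the same cached Stream object instead of constructing a fresh wrapper each time.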