Commit 63a3074

bugfix: ensure graph is captured and executed on the same stream to avoid rep… (#1303)
## 📌 Description

Fix a CUDA graph replay issue when integrating the mm_fp4 API: ensure the graph is captured and executed on the same stream.

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [x] All tests are passing (`unittest`, etc.).

## Reviewer Notes
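For background on why same-stream capture matters: CUDA graph replay re-enqueues exactly the work recorded during capture, and capture is bound to one stream, so work issued on any other stream is either missed by the graph or aborts the capture. Below is a minimal sketch of the capture/replay pattern involved, using plain PyTorch CUDA graph APIs with illustrative shapes; it is not FlashInfer code.

```python
import torch

device = torch.device("cuda:0")
static_a = torch.randn(64, 64, device=device)
static_b = torch.randn(64, 64, device=device)

# Warm up on a side stream first, as the PyTorch CUDA-graphs docs recommend.
s = torch.cuda.Stream(device=device)
s.wait_stream(torch.cuda.current_stream(device))
with torch.cuda.stream(s):
    static_out = static_a @ static_b
torch.cuda.current_stream(device).wait_stream(s)

# Capture: torch.cuda.graph() makes its capture stream current for the
# duration of the block, so all recorded work lands on a single stream.
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    static_out = static_a @ static_b

# Replay: refresh the static input buffers in place, then re-enqueue.
static_a.copy_(torch.randn(64, 64, device=device))
g.replay()
torch.cuda.synchronize(device)
```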
1 parent 04b9a2a commit 63a3074

1 file changed: +15 −10 lines

flashinfer/gemm.py

File mode changed: 100644 → 100755
Lines changed: 15 additions & 10 deletions
```diff
@@ -822,10 +822,11 @@ def build_cudnn_gemm_block_scale_dequantize_graph(
     scale_type,
     o_type,
     block_size,
+    device,
 ):
     _check_cudnn_availability()
-
-    with cudnn.graph(_get_cudnn_handle(torch.cuda.current_stream())) as (graph, _):
+    stream = torch.cuda.current_stream(device)
+    with cudnn.graph(_get_cudnn_handle(stream)) as (graph, _):
         a_cudnn_tensor = graph.tensor(
             name="a", dim=a_shape, stride=a_stride, data_type=ab_type
         )
```
```diff
@@ -911,17 +912,17 @@ def execute_cudnn_gemm_fp4_graph(graph, a, b, a_descale, b_descale, alpha, c_final):
     }

     workspace = torch.empty(
-        graph.get_workspace_size(), device="cuda", dtype=torch.uint8
+        graph.get_workspace_size(), device=a.device, dtype=torch.uint8
     )

-    graph.execute(
-        variant_pack, workspace, handle=_get_cudnn_handle(torch.cuda.current_stream())
-    )
+    stream = torch.cuda.current_stream(a.device)
+
+    graph.execute(variant_pack, workspace, handle=_get_cudnn_handle(stream))


 @functools.lru_cache(maxsize=128)
 def build_cudnn_gemm_with_per_tensor_q_graph(
-    a_shape, a_stride, b_shape, b_stride, a_type, b_type, o_type
+    a_shape, a_stride, b_shape, b_stride, a_type, b_type, o_type, device
 ):
     """Build a cuDNN graph for GEMM with per-tensor quantization.
```
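One detail the hunk above surfaces: `build_cudnn_gemm_with_per_tensor_q_graph` is memoized with `functools.lru_cache`, so threading `device` through the signature also keys the cached graph by device. A small sketch of that effect; the builder below is a hypothetical stand-in, not the FlashInfer implementation:

```python
import functools

@functools.lru_cache(maxsize=128)
def build_graph(shape, device):
    # lru_cache keys on all arguments, so each (shape, device) pair gets
    # its own entry; a graph built against cuda:0's stream is never
    # handed back for tensors living on cuda:1.
    print(f"building graph for {shape} on {device}")
    return object()  # placeholder for a real cuDNN graph handle

build_graph((64, 64), "cuda:0")  # builds
build_graph((64, 64), "cuda:0")  # cache hit, nothing printed
build_graph((64, 64), "cuda:1")  # builds again for the other device
```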
```diff
@@ -941,7 +942,8 @@ def build_cudnn_gemm_with_per_tensor_q_graph(
     """
     _check_cudnn_availability()

-    with cudnn.graph(_get_cudnn_handle(torch.cuda.current_stream())) as (graph, _):
+    stream = torch.cuda.current_stream(device)
+    with cudnn.graph(_get_cudnn_handle(stream)) as (graph, _):

         a_cudnn_tensor = graph.tensor(
             name="a", dim=a_shape, stride=a_stride, data_type=a_type
```
```diff
@@ -992,10 +994,11 @@ def execute_cudnn_gemm_with_per_tensor_q_graph(graph, a, b, alpha, c_final):
         UIDs.O_UID.value: c_final,
     }

-    cudnn_handle = _get_cudnn_handle(torch.cuda.current_stream())
+    stream = torch.cuda.current_stream(a.device)
+    cudnn_handle = _get_cudnn_handle(stream)

     workspace = torch.empty(
-        graph.get_workspace_size(), device="cuda", dtype=torch.uint8
+        graph.get_workspace_size(), device=a.device, dtype=torch.uint8
     )

     graph.execute(variant_pack, workspace, handle=cudnn_handle)
```
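The `device="cuda"` → `device=a.device` change in the workspace allocation matters on multi-GPU hosts: the bare string `"cuda"` resolves to the current CUDA device, which need not be the device holding `a`. A short illustration, assuming a machine with at least two visible GPUs:

```python
import torch

a = torch.randn(16, device="cuda:1")

# "cuda" resolves to the current device (cuda:0 unless changed), so this
# workspace can land on a different GPU than the operands:
ws_default = torch.empty(1024, device="cuda", dtype=torch.uint8)

# Allocating on a.device keeps the workspace, the operands, and the
# stream from torch.cuda.current_stream(a.device) on the same GPU:
ws_matched = torch.empty(1024, device=a.device, dtype=torch.uint8)

print(ws_default.device, ws_matched.device)  # cuda:0 cuda:1
```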
```diff
@@ -1036,6 +1039,7 @@ def _cudnn_gemm_fp8(
         _torch_data_type_to_cudnn_data_type(a.dtype),
         _torch_data_type_to_cudnn_data_type(b.dtype),
         _torch_data_type_to_cudnn_data_type(torch_out_dtype),
+        a.device,
     )

     execute_cudnn_gemm_with_per_tensor_q_graph(graph, a, b, dq_scale, out)
```
```diff
@@ -1223,6 +1227,7 @@ def mm_fp4(
         torch.float8_e4m3fn,
         _torch_data_type_to_cudnn_data_type(out_dtype),
         block_size,
+        a.device,
     )

     # execute the fp4 cudnn graph
```
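The common thread in all of these hunks is that graph construction and execution now both resolve the current stream of the input's device, so when a caller captures `mm_fp4` inside `torch.cuda.graph(...)`, the cuDNN handle follows the capture stream rather than a default one. A sketch of the behavior this relies on, assuming PyTorch's documented semantics that `torch.cuda.graph` makes an internal capture stream current inside the block:

```python
import torch

device = torch.device("cuda:0")
x = torch.zeros(1, device=device)

before = torch.cuda.current_stream(device)

g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    # Inside capture the capture stream is current, so any handle keyed
    # off current_stream() here is bound to the stream being recorded.
    during = torch.cuda.current_stream(device)
    x += 1  # a trivial captured op so the graph is not empty

print(before == during)  # False: capture runs on its own stream
```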
