
Commit 40df947

tests: upgrade cutlass, fix import and skip non-SM100 cutedsl two shot allreduce (#1812)

## 📌 Description

Upgrade the cutlass-dsl Python package to 4.2.1 to support `distributed_helpers`. Fix a module-level import problem and skip the cutedsl two-shot allreduce unit tests on non-SM100 GPUs.

## 🔍 Related Issues

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [ ] All tests are passing (`unittest`, etc.).

## Reviewer Notes

1 parent ba2b4aa · commit 40df947

File tree

3 files changed: +7 −15 lines changed


flashinfer/cute_dsl/gemm_allreduce_two_shot.py

Lines changed: 1 addition & 15 deletions

```diff
@@ -3,20 +3,6 @@
 import torch
 import torch.distributed as dist
 
-try:
-    # cuda-python >= 12.9 (has cuda.bindings.driver)
-    from cuda.bindings import driver as cuda
-except ImportError:
-    try:
-        # cuda-python < 12.9 (no cuda.bindings.driver, use cuda as driver)
-        # from cuda import cuda is not available in cuda-python >= 13.0
-        from cuda import cuda
-    except ImportError as e:
-        raise ImportError(
-            "Could not import the 'cuda' module. "
-            "Please install cuda-python that matches your CUDA version."
-        ) from e
-
 import cutlass
 import cutlass.cute as cute
 import cutlass.utils as utils
@@ -380,7 +366,7 @@ def __call__(
         b: cute.Tensor,
         c: cute.Tensor,
         max_active_clusters: cutlass.Constexpr,
-        stream: cuda.CUstream,
+        stream,
         epilogue_op: cutlass.Constexpr = lambda x: x,
         c_mc: cute.Tensor = None,
         barrier_flag: cute.Tensor = None,
```
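The removed block is the module-level cuda-python version shim that made `import flashinfer.cute_dsl.gemm_allreduce_two_shot` fail outright whenever cuda-python was missing or laid out differently; dropping the `cuda.CUstream` annotation on `stream` removes the last import-time need for the driver bindings. If such a shim were ever wanted again, a minimal sketch (not part of this commit) would defer it into a helper so the failure happens at call time rather than at import:

```python
def _load_cuda_driver():
    """Hypothetical helper: resolve the cuda-python driver module lazily.

    Merely importing the enclosing module can then never fail; only code
    that actually needs the driver pays for (or surfaces) a bad install.
    """
    try:
        # cuda-python >= 12.9 exposes the driver under cuda.bindings
        from cuda.bindings import driver as cuda
    except ImportError:
        try:
            # older layout; `from cuda import cuda` was removed in >= 13.0
            from cuda import cuda
        except ImportError as e:
            raise ImportError(
                "Could not import the 'cuda' module. "
                "Please install cuda-python that matches your CUDA version."
            ) from e
    return cuda
```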

setup.py

File mode changed: 100644 → 100755

Lines changed: 1 addition & 0 deletions

```diff
@@ -94,6 +94,7 @@ def generate_build_meta(aot_build_meta: dict) -> None:
     "apache-tvm-ffi==0.1.0b11",
     "packaging>=24.2",
     "nvidia-cudnn-frontend>=1.13.0",
+    "nvidia-cutlass-dsl>=4.2.1",
 ]
 generate_build_meta({})
 
```
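With the floor raised to 4.2.1 (the release the description says adds `distributed_helpers`), one illustrative way to confirm an existing environment already satisfies the new minimum; this check is not part of the commit, and it leans on the `packaging` dependency visible in the diff above:

```python
# Illustrative version check for the raised nvidia-cutlass-dsl requirement.
from importlib.metadata import version

from packaging.version import Version

installed = Version(version("nvidia-cutlass-dsl"))
assert installed >= Version("4.2.1"), f"found {installed}, need >= 4.2.1"
```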

tests/unlisted/test_cute_dsl_gemm_allreduce_two_shot.py

Lines changed: 5 additions & 0 deletions

```diff
@@ -30,6 +30,7 @@
 import torch.distributed._symmetric_memory as symm_mem
 
 from flashinfer.cute_dsl.gemm_allreduce_two_shot import PersistentDenseGemmKernel
+from flashinfer.utils import get_compute_capability
 
 
 logger = logging.getLogger(__name__)
@@ -482,6 +483,10 @@ def test_cute_dsl_gemm_allreduce_two_shot(world_size):
         pytest.skip(
             f"world_size {world_size} is greater than available_gpus {available_gpus}"
         )
+
+    if get_compute_capability(torch.device("cuda")) != (10, 0):
+        pytest.skip("cute_dsl_gemm_allreduce_two_shot requires SM100")
+
     print(f"Running test for world_size={world_size}")
     multi_process_parallel(
         world_size,
```
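The new guard compares the device's compute capability against `(10, 0)`, i.e. SM100 (Blackwell). A minimal sketch of the behavior `get_compute_capability` is assumed to have (the real helper lives in `flashinfer.utils`), built on PyTorch's standard device query:

```python
import torch

def get_compute_capability(device: torch.device) -> tuple[int, int]:
    # Assumed behavior: return the (major, minor) CUDA compute capability
    # of the given device, so an SM100 GPU reports (10, 0) and any other
    # architecture trips the pytest.skip in the test above.
    return torch.cuda.get_device_capability(device)
```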
