xfail the cute dsl tests for l=1 (#1868)

cyx-6 · web-flow · commit 91e6140e851a · 2025-10-06T09:31:41.000-07:00
## 📌 Description

With the latest version of nvidia-cutlass-dsl, `mark_layout_dynamic` may throw errors like

```
> self._dltensor_wrapper.mark_layout_dynamic(leading_dim)
E RuntimeError: Expected strides[leading_dim] == 1, but got 7340032.
```

when `cutlass.torch.cute_tensor_like` is called and `l = 1` in the gemm problem size. This issue has been reported to [cutlass#2673](NVIDIA/cutlass#2673). So this PR marks the cute dsl blockscaled gemm tests where `l = 1` as xfail, due to this issue in nvidia-cutlass-dsl.

This PR also removes the `sm_count` test parametrization: `sm_count` is now taken from `get_num_sm(device)` when `enable_dst_signals` is set and is `None` otherwise, so the "dst_signals require sm_count" skip is no longer needed.

## 🔍 Related Issues

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [ ] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [ ] I have installed the hooks with `pre-commit install`.
- [ ] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [ ] Tests have been added or updated as needed.
- [ ] All tests are passing (`unittest`, etc.).

## Reviewer Notes
diff --git a/tests/gemm/test_cute_dsl_blockscaled_gemm.py b/tests/gemm/test_cute_dsl_blockscaled_gemm.py
@@ -19,6 +19,7 @@
 )
 from flashinfer.cute_dsl.utils import (
     get_cutlass_dtype,
+    get_num_sm,
     is_cute_dsl_available,
 )
 
@@ -56,7 +57,6 @@
 @pytest.mark.parametrize("alpha_dtype", ["float32"])
 @pytest.mark.parametrize("mma_tiler_mn", [(128, 128)])
 @pytest.mark.parametrize("cluster_shape_mn", [(1, 1)])
-@pytest.mark.parametrize("sm_count", [132, None])
 @pytest.mark.parametrize("tolerance", [1e-01])
 @pytest.mark.parametrize("iterations", [3])
 @pytest.mark.parametrize("enable_dst_signals", [False, True])
@@ -74,7 +74,6 @@ def test_blockscaled_gemm_python_interface(
     alpha_dtype: cutlass.dtype,
     mma_tiler_mn: Tuple[int, int],
     cluster_shape_mn: Tuple[int, int],
-    sm_count: int,
     tolerance: float,
     iterations: int,
     enable_dst_signals: int,
@@ -85,11 +84,13 @@ def test_blockscaled_gemm_python_interface(
 
     if not (major == 10 and minor == 0):
         pytest.skip("Cute-dsl backend is only supported on SM100.")
-    if enable_dst_signals and (sm_count is None):
-        pytest.skip("dst_signals require sm_count")
 
     l, m = lm
     k, n = kn
+    if l == 1:
+        pytest.xfail("nvidia-cutlass-dsl has issue when l=1")
+
+    sm_count = get_num_sm(device) if enable_dst_signals else None
 
     print(f"device: {device}")