
Commit b7be894

Added xfail for mx_fp4 matmul on SM120 (#1766)
## 📌 Description

* A library bug is preventing mx_fp4 matmul on SM120.
* While waiting for the patch, the affected test is now xfailed.
* A `LibraryError` class was added to handle these issues in general.

## 🔍 Related Issues

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [ ] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [ ] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [ ] All tests are passing (`unittest`, etc.).

## Reviewer Notes
1 parent 4e7969e commit b7be894
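
Condensed into a minimal, self-contained sketch (an illustration only; `mm_fp4_mxfp4_sm120` and the test name below are hypothetical stand-ins, and the real changes are in the diffs that follow): the library raises a dedicated `LibraryError` when the known cuDNN bug would be hit, and the test turns that error into an expected failure via `pytest.xfail` instead of reporting a hard failure.

```python
import pytest


class LibraryError(Exception):
    """Custom exception for library-related errors (mirrors flashinfer/utils.py)."""


def mm_fp4_mxfp4_sm120():
    # Hypothetical stand-in for the mm_fp4 path affected by the cuDNN bug:
    # on SM120 with cuDNN backend < 9.14.0 the library raises a clear error
    # instead of failing somewhere deep inside the cuDNN call.
    raise LibraryError(
        "cudnn FP4 GEMM with mxfp4 quantization is not supported on SM120 "
        "with cuDNN backend version < 9.14.0."
    )


def test_mx_fp4_matmul_sm120():
    try:
        mm_fp4_mxfp4_sm120()
    except LibraryError as err:
        # Known library bug: record an expected failure until the cuDNN fix lands.
        pytest.xfail(str(err))
```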

File tree

3 files changed (+53 -26 lines): flashinfer/gemm.py, flashinfer/utils.py, tests/test_mm_fp4.py


flashinfer/gemm.py

Lines changed: 15 additions & 1 deletion
@@ -38,7 +38,12 @@
     last_positive_power_of_2,
 )
 from .jit.cubin_loader import get_cubin
-from .utils import is_sm100a_supported, is_sm120a_supported, is_sm121a_supported
+from .utils import (
+    is_sm100a_supported,
+    is_sm120a_supported,
+    is_sm121a_supported,
+    LibraryError,
+)

 CUDNN_AVAILABLE = False
 try:
@@ -2112,6 +2117,15 @@ def mm_fp4(
         raise ValueError("TRTLLM FP4 GEMM is not supported on SM110.")
     if backend != "cudnn" and not use_nvfp4:
         raise ValueError("Only cudnn FP4 GEMM supports mxfp4 quantization.")
+    if (
+        backend == "cudnn"
+        and not use_nvfp4
+        and _match_sm_version(a.device, ["120"])
+        and cudnn.backend_version() < 91400
+    ):
+        raise LibraryError(
+            "cudnn FP4 GEMM with mxfp4 quantization is not supported on SM120 with cuDNN backend version < 9.14.0."
+        )

     # allocate the output tensor if not provided
     if out is None:

flashinfer/utils.py

Lines changed: 7 additions & 7 deletions
@@ -52,15 +52,15 @@ class TensorLayout(Enum):


 class GPUArchitectureError(Exception):
-    def __init__(self, msg: str):
-        self.msg = msg
-        super().__init__(self.msg)
+    """Custom exception for GPU architecture-related errors."""

-    def __str__(self):
-        return self.msg
+    pass

-    def __repr__(self):
-        return self.msg
+
+class LibraryError(Exception):
+    """Custom exception for library-related errors."""
+
+    pass


 def _expand_5d(x: torch.Tensor, kv_layout: str) -> torch.Tensor:
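
As a rough illustration of the split between the two exception classes (a sketch under assumptions; `check_mxfp4_cudnn_support` is a hypothetical helper, not FlashInfer API): `GPUArchitectureError` is meant for hardware that cannot run a kernel at all, while `LibraryError` is meant for cases like this commit, where the GPU is capable but a dependency version blocks the path.

```python
# Hypothetical helper; only the two exception classes come from flashinfer/utils.py.
from flashinfer.utils import GPUArchitectureError, LibraryError


def check_mxfp4_cudnn_support(sm_major: int, sm_minor: int, cudnn_version: int) -> None:
    """Raise the appropriate error when the cuDNN mxfp4 GEMM path is unavailable."""
    if sm_major < 10:
        # The hardware itself lacks FP4 support: a permanent architecture limitation.
        raise GPUArchitectureError("FP4 GEMM requires an SM100-class or newer GPU.")
    if (sm_major, sm_minor) == (12, 0) and cudnn_version < 91400:
        # The GPU is capable; a dependency bug blocks the path until cuDNN 9.14.0.
        raise LibraryError(
            "cudnn FP4 GEMM with mxfp4 quantization on SM120 requires cuDNN backend >= 9.14.0."
        )
```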

tests/test_mm_fp4.py

Lines changed: 31 additions & 18 deletions
@@ -8,7 +8,7 @@
     nvfp4_quantize,
     mxfp4_quantize,
 )
-from flashinfer.utils import get_compute_capability
+from flashinfer.utils import get_compute_capability, LibraryError


 # TODO: Consdier splitting this function up for the various backends
@@ -25,10 +25,10 @@ def test_mm_fp4(
 ):
     use_nvfp4 = fp4_type == "nvfp4"

+    compute_capability = get_compute_capability(torch.device(device="cuda"))
     if backend == "trtllm":
         if res_dtype == torch.float16:
             pytest.skip("Skipping test for trtllm fp4 with float16")
-        compute_capability = get_compute_capability(torch.device(device="cuda"))
         if compute_capability[0] in [11, 12]:
             pytest.skip("trtllm gemm does not support SM110/SM120/SM121 GPUs.")
     if not use_128x4_sf_layout and backend != "trtllm":
@@ -71,23 +71,36 @@ def test_mm_fp4(

     res = torch.empty([m, n], device="cuda", dtype=res_dtype)

-    with autotune(auto_tuning):
-        mm_fp4(
-            input_fp4,
-            mat2_fp4.T,
-            input_inv_s,
-            mat2_inv_s.T,
-            alpha,
-            res_dtype,
-            res,
-            block_size=block_size,
-            use_8x4_sf_layout=not use_128x4_sf_layout,
-            backend=backend,
-            use_nvfp4=use_nvfp4,
-        )
+    try:
+        with autotune(auto_tuning):
+            mm_fp4(
+                input_fp4,
+                mat2_fp4.T,
+                input_inv_s,
+                mat2_inv_s.T,
+                alpha,
+                res_dtype,
+                res,
+                block_size=block_size,
+                use_8x4_sf_layout=not use_128x4_sf_layout,
+                backend=backend,
+                use_nvfp4=use_nvfp4,
+            )

-    cos_sim = F.cosine_similarity(reference.reshape(-1), res.reshape(-1), dim=0)
-    assert cos_sim > 0.97
+        cos_sim = F.cosine_similarity(reference.reshape(-1), res.reshape(-1), dim=0)
+        assert cos_sim > 0.97
+    except LibraryError:
+        # TODO: Remove this check once cuDNN backend version is updated to 9.14.0
+        if (
+            backend == "cudnn"
+            and not use_nvfp4
+            and (compute_capability[0] == 12 and compute_capability[1] == 0)
+        ):
+            pytest.xfail(
+                "cudnn FP4 GEMM with mxfp4 quantization is not supported on SM120 with cuDNN backend version < 9.14.0."
+            )
+        else:
+            pytest.fail("Unexpected LibraryError")


 if __name__ == "__main__":
