
Commit 2d4aafe

Commit message: resolve
Merge of 2 parents: b918359 + ecd6648

10 files changed, +335 -148 lines changed

.github/workflows/new-issue.yml

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+name: Triage new issues
+
+on:
+  issues:
+    types: [opened]
+
+permissions:
+  issues: write
+
+jobs:
+  triage:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Add needs-triage label
+        uses: actions/github-script@v7
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          script: |
+            const issueNumber = context.issue.number;
+            const { owner, repo } = context.repo;
+            const labelName = 'needs-triage';
+            try {
+              await github.rest.repos.getLabel({ owner, repo, name: labelName });
+            } catch (error) {
+              if (error.status === 404) {
+                throw new Error(`Required label '${labelName}' does not exist in ${owner}/${repo}. Please create it in the repository settings.`);
+              }
+              throw error;
+            }
+            await github.rest.issues.addLabels({
+              owner,
+              repo,
+              issue_number: issueNumber,
+              labels: [labelName],
+            });
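A note on the failure path: the script intentionally does not create the needs-triage label when it is missing. In that case the github-script step throws with the message above, and a maintainer has to create the label once in the repository's label settings before the triage job can start applying it to newly opened issues.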

flashinfer/artifacts.py

Lines changed: 2 additions & 2 deletions
@@ -70,7 +70,7 @@ def get_available_cubin_files(source, retries=3, delay=5, timeout=10):
 
 
 class ArtifactPath:
-    TRTLLM_GEN_FMHA: str = "9ef9e6243df03ab2c3fca1f0398a38cf1011d1e1/fmha/trtllm-gen/"
+    TRTLLM_GEN_FMHA: str = "7206d64e67f4c8949286246d6e2e07706af5d223/fmha/trtllm-gen/"
     TRTLLM_GEN_BMM: str = (
         "9ef9e6243df03ab2c3fca1f0398a38cf1011d1e1/batched_gemm-45beda1-7bdba93/"
     )
@@ -83,7 +83,7 @@ class ArtifactPath:
 
 class MetaInfoHash:
     TRTLLM_GEN_FMHA: str = (
-        "875f50e8f466120b1a59b94397835b86fad785942b4036823230465bc618b919"
+        "2f605255e71d673768f5bece66dde9e2e9f4c873347bfe8fefcffbf86a3c847d"
     )
     TRTLLM_GEN_BMM: str = (
         "9490085267aed30a387bfff024a0605e1ca4d39dfe06a5abc159d7d7e129bdf4"

flashinfer/gemm.py

Lines changed: 25 additions & 1 deletion
@@ -38,7 +38,12 @@
     last_positive_power_of_2,
 )
 from .jit.cubin_loader import get_cubin
-from .utils import is_sm100a_supported, is_sm120a_supported, is_sm121a_supported
+from .utils import (
+    is_sm100a_supported,
+    is_sm120a_supported,
+    is_sm121a_supported,
+    LibraryError,
+)
 
 CUDNN_AVAILABLE = False
 try:
@@ -2112,6 +2117,15 @@ def mm_fp4(
         raise ValueError("TRTLLM FP4 GEMM is not supported on SM110.")
     if backend != "cudnn" and not use_nvfp4:
         raise ValueError("Only cudnn FP4 GEMM supports mxfp4 quantization.")
+    if (
+        backend == "cudnn"
+        and not use_nvfp4
+        and _match_sm_version(a.device, ["120"])
+        and cudnn.backend_version() < 91400
+    ):
+        raise LibraryError(
+            "cudnn FP4 GEMM with mxfp4 quantization is not supported on SM120 with cuDNN backend version < 9.14.0."
+        )
 
     # allocate the output tensor if not provided
     if out is None:
@@ -3078,6 +3092,11 @@ def group_deepgemm_fp8_nt_groupwise(
     """
     from flashinfer.deep_gemm import m_grouped_fp8_gemm_nt_contiguous
 
+    if not _match_sm_version(a.device, ["100", "103"]):
+        raise ValueError(
+            "m_grouped_fp8_gemm_nt_contiguous is only supported on SM100, SM103."
+        )
+
     if out is None:
         out_dtype = out_dtype or torch.bfloat16
         out = torch.empty(a.shape[0], b.shape[1], dtype=out_dtype, device=a.device)
@@ -3206,6 +3225,11 @@ def batch_deepgemm_fp8_nt_groupwise(
     """
     from flashinfer.deep_gemm import m_grouped_fp8_gemm_nt_masked
 
+    if not _match_sm_version(a.device, ["100", "103"]):
+        raise ValueError(
+            "m_grouped_fp8_gemm_nt_masked is only supported on SM100, SM103."
+        )
+
     if out is None:
         out_dtype = out_dtype or torch.bfloat16
         out = torch.empty(
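For illustration, a sketch of how a caller might react to the new guard in mm_fp4. The wrapper below is hypothetical (not part of this commit); it assumes the tensors are prepared the same way as in tests/test_mm_fp4.py and follows the same positional argument order.

import torch
from flashinfer.gemm import mm_fp4
from flashinfer.utils import LibraryError

def mxfp4_gemm_or_skip(a_fp4, b_fp4, a_scale, b_scale, alpha, out):
    """Run a cudnn mxfp4 GEMM, reporting the new SM120/cuDNN<9.14 guard instead of crashing.

    The argument order mirrors the mm_fp4 call in tests/test_mm_fp4.py; this
    helper and its name are illustrative only.
    """
    try:
        mm_fp4(
            a_fp4, b_fp4, a_scale, b_scale, alpha, torch.bfloat16, out,
            backend="cudnn", use_nvfp4=False,
        )
        return out
    except LibraryError as e:
        # Raised by mm_fp4 only for cudnn + mxfp4 on SM120 with cuDNN backend < 9.14.0.
        print(f"unsupported configuration: {e}")
        return None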

flashinfer/utils.py

Lines changed: 7 additions & 7 deletions
@@ -52,15 +52,15 @@ class TensorLayout(Enum):
 
 
 class GPUArchitectureError(Exception):
-    def __init__(self, msg: str):
-        self.msg = msg
-        super().__init__(self.msg)
+    """Custom exception for GPU architecture-related errors."""
 
-    def __str__(self):
-        return self.msg
+    pass
 
-    def __repr__(self):
-        return self.msg
+
+class LibraryError(Exception):
+    """Custom exception for library-related errors."""
+
+    pass
 
 
 def _expand_5d(x: torch.Tensor, kv_layout: str) -> torch.Tensor:
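With the simplified definition, Exception itself stores the constructor message and str(e) returns it, which is what lets the call sites below switch from e.msg to str(e). A minimal demonstration (the error message is illustrative):

class GPUArchitectureError(Exception):
    """Custom exception for GPU architecture-related errors."""

    pass


try:
    raise GPUArchitectureError("unsupported GPU architecture for this kernel")
except GPUArchitectureError as e:
    # pytest.skip(str(e)) receives exactly the message passed to the constructor.
    assert str(e) == "unsupported GPU architecture for this kernel"
    assert e.args == ("unsupported GPU architecture for this kernel",)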

tests/test_attention_sink_blackwell.py

Lines changed: 1 addition & 8 deletions
@@ -18,7 +18,6 @@
 import pytest
 import torch
 from sink_attention_reference import sink_attention_unified
-from conftest import assert_close_with_mismatch_tolerance
 
 import flashinfer
 from flashinfer.utils import get_compute_capability
@@ -122,13 +121,7 @@ def test_blackwell_trtllm_gen_decode_attention_sink(
     else:
         raise ValueError(f"Unsupported dtype: {dtype}")
 
-    assert_close_with_mismatch_tolerance(
-        o_ref,
-        output,
-        atol=atol,
-        rtol=rtol,
-        max_mismatched_elements=int(output.numel() * 0.01),
-    )
+    torch.testing.assert_close(o_ref, output, atol=atol, rtol=rtol)
 
 
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
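This is a behavioral tightening as well as a cleanup: the removed helper accepted up to 1% mismatched elements (max_mismatched_elements=int(output.numel() * 0.01)), whereas torch.testing.assert_close fails if any element falls outside the given atol/rtol.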

tests/test_groupwise_scaled_gemm_fp8.py

Lines changed: 10 additions & 0 deletions
@@ -202,6 +202,11 @@ def test_fp8_groupwise_group_deepgemm(
     group_size,
     out_dtype,
 ):
+    compute_capability = get_compute_capability(torch.device(device="cuda"))
+    if compute_capability[0] != 10:
+        pytest.skip(
+            "group_deepgemm_fp8_nt_groupwise is only supported on SM100, SM103 in trtllm backend."
+        )
     torch.random.manual_seed(0)
     m_per_group = m // group_size
     if m_per_group < 128:
@@ -245,6 +250,11 @@ def test_fp8_groupwise_batch_deepgemm_masked(
     group_size,
     out_dtype,
 ):
+    compute_capability = get_compute_capability(torch.device(device="cuda"))
+    if compute_capability[0] != 10:
+        pytest.skip(
+            "batch_deepgemm_fp8_nt_groupwise is only supported on SM100, SM103."
+        )
     torch.random.manual_seed(0)
     n, k = nk
     a = torch.randn((group_size, m, k), device="cuda", dtype=torch.float32)
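The skip works because get_compute_capability returns an indexable (major, minor) pair, so SM100 and SM103 both report major version 10 while SM120 reports 12. A small sketch of the same gate, written against torch.cuda.get_device_capability as a stand-in for the flashinfer helper:

import torch

def is_sm10x(device: torch.device) -> bool:
    """True for SM100/SM103-class GPUs (compute capability 10.x), the only
    architectures the deepgemm groupwise kernels are gated to above."""
    major, minor = torch.cuda.get_device_capability(device)
    # SM100 -> (10, 0), SM103 -> (10, 3); SM120 would be (12, 0) and is skipped.
    return major == 10

# Usage: is_sm10x(torch.device("cuda"))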

tests/test_mm_fp4.py

Lines changed: 31 additions & 18 deletions
@@ -8,7 +8,7 @@
     nvfp4_quantize,
     mxfp4_quantize,
 )
-from flashinfer.utils import get_compute_capability
+from flashinfer.utils import get_compute_capability, LibraryError
 
 
 # TODO: Consdier splitting this function up for the various backends
@@ -25,10 +25,10 @@ def test_mm_fp4(
 ):
     use_nvfp4 = fp4_type == "nvfp4"
 
+    compute_capability = get_compute_capability(torch.device(device="cuda"))
     if backend == "trtllm":
         if res_dtype == torch.float16:
             pytest.skip("Skipping test for trtllm fp4 with float16")
-        compute_capability = get_compute_capability(torch.device(device="cuda"))
         if compute_capability[0] in [11, 12]:
             pytest.skip("trtllm gemm does not support SM110/SM120/SM121 GPUs.")
     if not use_128x4_sf_layout and backend != "trtllm":
@@ -71,23 +71,36 @@ def test_mm_fp4(
 
     res = torch.empty([m, n], device="cuda", dtype=res_dtype)
 
-    with autotune(auto_tuning):
-        mm_fp4(
-            input_fp4,
-            mat2_fp4.T,
-            input_inv_s,
-            mat2_inv_s.T,
-            alpha,
-            res_dtype,
-            res,
-            block_size=block_size,
-            use_8x4_sf_layout=not use_128x4_sf_layout,
-            backend=backend,
-            use_nvfp4=use_nvfp4,
-        )
+    try:
+        with autotune(auto_tuning):
+            mm_fp4(
+                input_fp4,
+                mat2_fp4.T,
+                input_inv_s,
+                mat2_inv_s.T,
+                alpha,
+                res_dtype,
+                res,
+                block_size=block_size,
+                use_8x4_sf_layout=not use_128x4_sf_layout,
+                backend=backend,
+                use_nvfp4=use_nvfp4,
+            )
 
-    cos_sim = F.cosine_similarity(reference.reshape(-1), res.reshape(-1), dim=0)
-    assert cos_sim > 0.97
+        cos_sim = F.cosine_similarity(reference.reshape(-1), res.reshape(-1), dim=0)
+        assert cos_sim > 0.97
+    except LibraryError:
+        # TODO: Remove this check once cuDNN backend version is updated to 9.14.0
+        if (
+            backend == "cudnn"
+            and not use_nvfp4
+            and (compute_capability[0] == 12 and compute_capability[1] == 0)
+        ):
+            pytest.xfail(
+                "cudnn FP4 GEMM with mxfp4 quantization is not supported on SM120 with cuDNN backend version < 9.14.0."
+            )
+        else:
+            pytest.fail("Unexpected LibraryError")
 
 
 if __name__ == "__main__":
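The try/except keeps the existing parameter matrix intact: on SM120 the cudnn + mxfp4 combination now raises LibraryError inside mm_fp4, and the test converts exactly that case into an expected failure via pytest.xfail, while a LibraryError from any other combination still fails the test outright. The TODO marks this as temporary until the cuDNN backend reaches 9.14.0.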

tests/test_triton_cascade.py

Lines changed: 4 additions & 4 deletions
@@ -21,7 +21,7 @@ def test_merge_state(seq_len, num_heads, head_dim):
         assert torch.allclose(v_merged, v_merged_std, atol=1e-2)
         assert torch.allclose(s_merged, s_merged_std, atol=1e-2)
     except GPUArchitectureError as e:
-        pytest.skip(e.msg)
+        pytest.skip(str(e))
 
 
 @pytest.mark.parametrize("seq_len", [2048])
@@ -44,7 +44,7 @@ def test_merge_state_in_place(seq_len, num_heads, head_dim):
         assert torch.allclose(s, s_std, atol=1e-2)
 
     except GPUArchitectureError as e:
-        pytest.skip(e.msg)
+        pytest.skip(str(e))
 
 
 @pytest.mark.parametrize("seq_len", [2048])
@@ -63,7 +63,7 @@ def test_merge_states(seq_len, num_states, num_heads, head_dim):
         assert torch.allclose(v_merged, v_merged_std, atol=1e-2)
         assert torch.allclose(s_merged, s_merged_std, atol=1e-2)
     except GPUArchitectureError as e:
-        pytest.skip(e.msg)
+        pytest.skip(str(e))
 
 
 @pytest.mark.parametrize("seq_len", [2048])
@@ -94,4 +94,4 @@ def test_variable_length_merge_states(seq_len, num_heads, head_dim):
         assert torch.allclose(v_merged[i], v_merged_std, atol=1e-2)
         assert torch.allclose(s_merged[i], s_merged_std, atol=1e-2)
     except GPUArchitectureError as e:
-        pytest.skip(e.msg)
+        pytest.skip(str(e))
