
Commit b6cfc2c

Test refactoring and fixes (#1736)
## 📌 Description

Unit test fixes:

* Refactored test_mla_decode_kernel to run from pytest
* Added skip to test_mnnvl_custom_comm when world size is too large
* Added asserts to the cascade API when not using Hopper

## 🔍 Related Issues

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [ ] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [V] I have installed the hooks with `pre-commit install`.
- [ ] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [V] Tests have been added or updated as needed.
- [V] All tests are passing (`unittest`, etc.).

## Reviewer Notes

---------

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
1 parent 7d7aa87 commit b6cfc2c

File tree

6 files changed: +126 additions, -78 deletions

flashinfer/triton/cascade.py

Lines changed: 6 additions & 4 deletions

@@ -10,6 +10,8 @@
 )
 from .utils import check_device, check_dim, check_input, check_shape
 
+EXPECT_HOPPER = 9
+
 
 def merge_state(
     v_a: torch.Tensor, s_a: torch.Tensor, v_b: torch.Tensor, s_b: torch.Tensor
@@ -18,7 +20,7 @@ def merge_state(
     check_input(s_a)
     check_input(v_b)
     check_input(s_b)
-    check_device([v_a, s_a, v_b, s_b])
+    check_device([v_a, s_a, v_b, s_b], major=[EXPECT_HOPPER])
     check_dim(3, v_a)
     check_dim(2, s_a)
     check_dim(3, v_b)
@@ -55,7 +57,7 @@ def merge_state_in_place(
     check_input(s)
     check_input(v_other)
     check_input(s_other)
-    check_device([v, s, v_other, s_other])
+    check_device([v, s, v_other, s_other], major=[EXPECT_HOPPER])
     check_dim(3, v)
     check_dim(2, s)
     check_dim(3, v_other)
@@ -84,7 +86,7 @@ def merge_state_in_place(
 def merge_states(v: torch.Tensor, s: torch.Tensor):
     check_input(v)
     check_input(s)
-    check_device([v, s])
+    check_device([v, s], major=[EXPECT_HOPPER])
     check_dim(4, v)
     check_dim(3, s)
     assert v.size(0) == s.size(0)
@@ -121,7 +123,7 @@ def variable_length_merge_states(
 ):
     check_input(v)
     check_input(s)
-    check_device([v, s])
+    check_device([v, s], major=[EXPECT_HOPPER])
     check_dim(3, v)
     check_dim(2, s)
     assert v.size(0) == s.size(0)
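
For context, a minimal sketch of how the new guard surfaces to callers (tensor shapes and the non-Hopper scenario below are assumptions, not part of this change): when the device's compute-capability major is not 9, the Triton cascade kernels now raise `GPUArchitectureError` instead of running.

```python
# Illustrative sketch only: on a non-Hopper GPU (compute capability major != 9),
# merge_state now raises GPUArchitectureError via the check_device guard.
import torch

import flashinfer.triton
from flashinfer.utils import GPUArchitectureError

v_a = torch.randn(128, 32, 128, dtype=torch.float16, device="cuda:0")
s_a = torch.randn(128, 32, dtype=torch.float32, device="cuda:0")
v_b = torch.randn(128, 32, 128, dtype=torch.float16, device="cuda:0")
s_b = torch.randn(128, 32, dtype=torch.float32, device="cuda:0")

try:
    v, s = flashinfer.triton.cascade.merge_state(v_a, s_a, v_b, s_b)
except GPUArchitectureError as e:
    print(f"Triton cascade kernel unavailable on this GPU: {e}")
```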

flashinfer/triton/utils.py

Lines changed: 14 additions & 1 deletion

@@ -1,6 +1,7 @@
 from typing import List
 
 import torch
+from flashinfer.utils import get_compute_capability, GPUArchitectureError
 
 
 def check_input(x: torch.Tensor):
@@ -20,9 +21,21 @@ def check_shape(a: torch.Tensor, b: torch.Tensor):
     )
 
 
-def check_device(tensors: List[torch.Tensor]):
+def check_device(
+    tensors: List[torch.Tensor], major: List[int] = None, minor: List[int] = None
+):
     device = tensors[0].device
     for t in tensors:
         assert t.device == device, (
             f"All tensors should be on the same device, but got {device} and {t.device}"
         )
+    if major is not None or minor is not None:
+        actual_major, actual_minor = get_compute_capability(device)
+        if major is not None and actual_major not in major:
+            raise GPUArchitectureError(
+                f"Device major should be in {major}, but got {actual_major}"
+            )
+        if minor is not None and actual_minor not in minor:
+            raise GPUArchitectureError(
+                f"Device minor should be in {minor}, but got {actual_minor}"
+            )
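
A small usage sketch of the extended helper (the capability values below are assumptions): the device-consistency assert behaves exactly as before, and the optional `major`/`minor` lists gate on the compute capability reported by `get_compute_capability`.

```python
# Illustrative only: check_device keeps its original device-consistency assert,
# and optionally restricts the allowed compute-capability major/minor versions.
import torch

from flashinfer.triton.utils import check_device
from flashinfer.utils import GPUArchitectureError

x = torch.zeros(4, device="cuda:0")
y = torch.zeros(4, device="cuda:0")

check_device([x, y])  # same-device check only, unchanged behavior

try:
    check_device([x, y], major=[9])  # additionally require a Hopper-class (9.x) GPU
except GPUArchitectureError as e:
    print(e)  # e.g. "Device major should be in [9], but got 8" on an older card
```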

flashinfer/utils.py

Lines changed: 12 additions & 0 deletions

@@ -51,6 +51,18 @@ class TensorLayout(Enum):
 log2e = 1.44269504088896340736
 
 
+class GPUArchitectureError(Exception):
+    def __init__(self, msg: str):
+        self.msg = msg
+        super().__init__(self.msg)
+
+    def __str__(self):
+        return self.msg
+
+    def __repr__(self):
+        return self.msg
+
+
 def _expand_5d(x: torch.Tensor, kv_layout: str) -> torch.Tensor:
     if x.ndim not in [4, 5]:
         raise ValueError("x must be 4D or 5D")
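
The new exception keeps its message on `.msg`, which is what the updated tests pass to `pytest.skip`; a quick sketch (the message text is just an example):

```python
from flashinfer.utils import GPUArchitectureError

try:
    raise GPUArchitectureError("Device major should be in [9], but got 8")
except GPUArchitectureError as e:
    # __str__ and __repr__ both return the stored message, so str(e) == e.msg
    assert str(e) == e.msg == repr(e)
    print(e.msg)
```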

tests/test_mla_decode_kernel.py

Lines changed: 29 additions & 8 deletions

@@ -1,10 +1,12 @@
 from typing import Optional, Tuple
+import pytest
 
 import torch
 import torch.nn.functional as F
 from torch import nn
 
 import flashinfer
+from rope_reference import apply_rotary_emb, precompute_freqs_cis
 
 
 def wmape(target: torch.Tensor, preds: torch.Tensor):
@@ -13,9 +15,6 @@ def wmape(target: torch.Tensor, preds: torch.Tensor):
     return sum_abs_error / sum_scale
 
 
-from rope_reference import *
-
-
 class DeepseekV2RMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         """
@@ -247,6 +246,10 @@ def run_proof_of_concept(
         k_pe_cache: torch.Tensor,
         use_flashinfer_kernel: bool,
         convert_float16: bool,
+        bsz: int,
+        kv_len: int,
+        page_size: int,
+        dev_id: int,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         c_Q = torch.matmul(hidden_states, self.W_DQ)
         # c_Q ~ [bsz, q_lora_rank:1536]
@@ -392,18 +395,17 @@ def run_proof_of_concept(
         return output
 
 
-if __name__ == "__main__":
+@pytest.mark.parametrize("bsz", [6])
+@pytest.mark.parametrize("kv_len", [640])
+@pytest.mark.parametrize("page_size", [16])
+def test_mla_decode_kernel(bsz, kv_len, page_size):
     dev_id = 0
 
     torch.manual_seed(666)
     torch.set_grad_enabled(False)
 
     mla_vanilla = DeepseekV2AttentionVanilla().cuda(device=dev_id)
 
-    bsz = 6
-    kv_len = 640
-    page_size = 16
-
     hidden_states = torch.randn([bsz, 1, mla_vanilla.hidden_size]).to(dev_id)
     compressed_kv_normed_cache = torch.randn(
         [bsz, kv_len, mla_vanilla.kv_lora_rank]
@@ -421,20 +423,32 @@ def run_proof_of_concept(
         k_pe_cache,
         use_flashinfer_kernel=False,
         convert_float16=False,
+        bsz=bsz,
+        kv_len=kv_len,
+        page_size=page_size,
+        dev_id=dev_id,
     )
     output_mat_absorbed_use_torch_f16 = mla_mat_absorb.run_proof_of_concept(
         hidden_states.squeeze(1),
         compressed_kv_normed_cache,
         k_pe_cache,
        use_flashinfer_kernel=False,
         convert_float16=True,
+        bsz=bsz,
+        kv_len=kv_len,
+        page_size=page_size,
+        dev_id=dev_id,
     )
     output_mat_absorbed_use_flashinfer = mla_mat_absorb.run_proof_of_concept(
         hidden_states.squeeze(1),
         compressed_kv_normed_cache,
         k_pe_cache,
         use_flashinfer_kernel=True,
         convert_float16=True,
+        bsz=bsz,
+        kv_len=kv_len,
+        page_size=page_size,
+        dev_id=dev_id,
     )
 
     cos_use_torch_f32 = F.cosine_similarity(
@@ -489,3 +503,10 @@ def run_proof_of_concept(
         output_vanilla.reshape(-1), output_mat_absorbed_use_flashinfer.reshape(-1)
     )
     print(f"mse_use_flashinfer = {mse_use_flashinfer}")
+
+
+if __name__ == "__main__":
+    bsz = 6
+    kv_len = 640
+    page_size = 16
+    test_mla_decode_kernel(bsz, kv_len, page_size)
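
Since the script is now a parametrized pytest test, it can be collected by pytest directly or still run as a script through its `__main__` block. A hedged sketch of a programmatic invocation (the file path is assumed from this repository layout):

```python
# Illustrative: drive the refactored test through pytest instead of a bare script.
import pytest

if __name__ == "__main__":
    raise SystemExit(pytest.main(["-v", "tests/test_mla_decode_kernel.py"]))
```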

tests/test_mnnvl_custom_comm.py

Lines changed: 2 additions & 1 deletion

@@ -169,9 +169,10 @@ def test_mnnvl_custom_communicator(world_size):
     dtype = torch.float16
     available_gpus = torch.cuda.device_count()
     if world_size > available_gpus:
-        raise ValueError(
+        pytest.skip(
             f"world_size {world_size} is greater than available_gpus {available_gpus}"
         )
+
     print(f"Running test for world_size={world_size}")
 
     multi_process_parallel(

tests/test_triton_cascade.py

Lines changed: 63 additions & 64 deletions

@@ -3,96 +3,95 @@
 
 import flashinfer
 import flashinfer.triton
-from flashinfer.utils import get_compute_capability
+from flashinfer.utils import GPUArchitectureError
 
 
 @pytest.mark.parametrize("seq_len", [2048])
 @pytest.mark.parametrize("num_heads", [32])
 @pytest.mark.parametrize("head_dim", [128])
 def test_merge_state(seq_len, num_heads, head_dim):
-    compute_capability = get_compute_capability(torch.device(device="cuda"))
-    if compute_capability[0] != 9:
-        pytest.skip("These tests are only guaranteed to work on Hopper GPUs.")
+    try:
+        va = torch.randn(seq_len, num_heads, head_dim).half().to("cuda:0")
+        sa = torch.randn(seq_len, num_heads, dtype=torch.float32).to("cuda:0")
+        vb = torch.randn(seq_len, num_heads, head_dim).half().to("cuda:0")
+        sb = torch.randn(seq_len, num_heads, dtype=torch.float32).to("cuda:0")
+        v_merged, s_merged = flashinfer.triton.cascade.merge_state(va, sa, vb, sb)
+        v_merged_std, s_merged_std = flashinfer.merge_state(va, sa, vb, sb)
 
-    va = torch.randn(seq_len, num_heads, head_dim).half().to("cuda:0")
-    sa = torch.randn(seq_len, num_heads, dtype=torch.float32).to("cuda:0")
-    vb = torch.randn(seq_len, num_heads, head_dim).half().to("cuda:0")
-    sb = torch.randn(seq_len, num_heads, dtype=torch.float32).to("cuda:0")
-    v_merged, s_merged = flashinfer.triton.cascade.merge_state(va, sa, vb, sb)
-    v_merged_std, s_merged_std = flashinfer.merge_state(va, sa, vb, sb)
-
-    assert torch.allclose(v_merged, v_merged_std, atol=1e-2)
-    assert torch.allclose(s_merged, s_merged_std, atol=1e-2)
+        assert torch.allclose(v_merged, v_merged_std, atol=1e-2)
+        assert torch.allclose(s_merged, s_merged_std, atol=1e-2)
+    except GPUArchitectureError as e:
+        pytest.skip(e.msg)
 
 
 @pytest.mark.parametrize("seq_len", [2048])
 @pytest.mark.parametrize("num_heads", [32])
 @pytest.mark.parametrize("head_dim", [128])
 def test_merge_state_in_place(seq_len, num_heads, head_dim):
-    compute_capability = get_compute_capability(torch.device(device="cuda"))
-    if compute_capability[0] != 9:
-        pytest.skip("These tests are only guaranteed to work on Hopper GPUs.")
+    try:
+        v = torch.randn(seq_len, num_heads, head_dim).half()
+        v_std = v.clone()
+        v, v_std = v.to("cuda:0"), v_std.to("cuda:0")
+        s = torch.randn(seq_len, num_heads, dtype=torch.float32)
+        s_std = s.clone()
+        s, s_std = s.to("cuda:0"), s_std.to("cuda:0")
+        v_other = torch.randn(seq_len, num_heads, head_dim).half().to("cuda:0")
+        s_other = torch.randn(seq_len, num_heads, dtype=torch.float32).to("cuda:0")
+        flashinfer.merge_state_in_place(v_std, s_std, v_other, s_other)
+        flashinfer.triton.cascade.merge_state_in_place(v, s, v_other, s_other)
 
-    v = torch.randn(seq_len, num_heads, head_dim).half()
-    v_std = v.clone()
-    v, v_std = v.to("cuda:0"), v_std.to("cuda:0")
-    s = torch.randn(seq_len, num_heads, dtype=torch.float32)
-    s_std = s.clone()
-    s, s_std = s.to("cuda:0"), s_std.to("cuda:0")
-    v_other = torch.randn(seq_len, num_heads, head_dim).half().to("cuda:0")
-    s_other = torch.randn(seq_len, num_heads, dtype=torch.float32).to("cuda:0")
-    flashinfer.merge_state_in_place(v_std, s_std, v_other, s_other)
-    flashinfer.triton.cascade.merge_state_in_place(v, s, v_other, s_other)
+        assert torch.allclose(v, v_std, atol=1e-2)
+        assert torch.allclose(s, s_std, atol=1e-2)
 
-    assert torch.allclose(v, v_std, atol=1e-2)
-    assert torch.allclose(s, s_std, atol=1e-2)
+    except GPUArchitectureError as e:
+        pytest.skip(e.msg)
 
 
 @pytest.mark.parametrize("seq_len", [2048])
 @pytest.mark.parametrize("num_heads", [32])
 @pytest.mark.parametrize("head_dim", [128])
 @pytest.mark.parametrize("num_states", [100])
 def test_merge_states(seq_len, num_states, num_heads, head_dim):
-    compute_capability = get_compute_capability(torch.device(device="cuda"))
-    if compute_capability[0] != 9:
-        pytest.skip("These tests are only guaranteed to work on Hopper GPUs.")
-
-    v = torch.randn(seq_len, num_states, num_heads, head_dim).half().to("cuda:0")
-    s = torch.randn(seq_len, num_states, num_heads, dtype=torch.float32).to("cuda:0")
-    v_merged_std, s_merged_std = flashinfer.merge_states(v, s)
-    v_merged, s_merged = flashinfer.triton.cascade.merge_states(v, s)
+    try:
+        v = torch.randn(seq_len, num_states, num_heads, head_dim).half().to("cuda:0")
+        s = torch.randn(seq_len, num_states, num_heads, dtype=torch.float32).to(
+            "cuda:0"
+        )
+        v_merged_std, s_merged_std = flashinfer.merge_states(v, s)
+        v_merged, s_merged = flashinfer.triton.cascade.merge_states(v, s)
 
-    assert torch.allclose(v_merged, v_merged_std, atol=1e-2)
-    assert torch.allclose(s_merged, s_merged_std, atol=1e-2)
+        assert torch.allclose(v_merged, v_merged_std, atol=1e-2)
+        assert torch.allclose(s_merged, s_merged_std, atol=1e-2)
+    except GPUArchitectureError as e:
+        pytest.skip(e.msg)
 
 
 @pytest.mark.parametrize("seq_len", [2048])
 @pytest.mark.parametrize("num_heads", [32])
 @pytest.mark.parametrize("head_dim", [128])
 def test_variable_length_merge_states(seq_len, num_heads, head_dim):
-    compute_capability = get_compute_capability(torch.device(device="cuda"))
-    if compute_capability[0] != 9:
-        pytest.skip("These tests are only guaranteed to work on Hopper GPUs.")
-
-    max_index_sets = 512
-    lengths = torch.randint(low=1, high=max_index_sets, size=(seq_len,))
-    indptr = [0]
-    for i in range(seq_len):
-        indptr.append(indptr[-1] + lengths[i])
-    v = torch.randn(indptr[-1], num_heads, head_dim).half().to("cuda:0")
-    s = torch.randn(indptr[-1], num_heads, dtype=torch.float32).to("cuda:0")
-    indptr = torch.tensor(indptr, dtype=torch.int32).to("cuda:0")
-    v_merged, s_merged = flashinfer.triton.cascade.variable_length_merge_states(
-        v, s, indptr
-    )
-    for i in range(seq_len):
-        sub_v = v[indptr[i] : indptr[i + 1]]
-        sub_s = s[indptr[i] : indptr[i + 1]]
-        sub_v = torch.unsqueeze(sub_v, 0)
-        sub_s = torch.unsqueeze(sub_s, 0)
-        v_merged_std, s_merged_std = flashinfer.merge_states(sub_v, sub_s)
-        v_merged_std = torch.squeeze(v_merged_std, 0)
-        s_merged_std = torch.squeeze(s_merged_std, 0)
-        assert v_merged[i].shape == v_merged_std.shape
-        assert torch.allclose(v_merged[i], v_merged_std, atol=1e-2)
-        assert torch.allclose(s_merged[i], s_merged_std, atol=1e-2)
+    try:
+        max_index_sets = 512
+        lengths = torch.randint(low=1, high=max_index_sets, size=(seq_len,))
+        indptr = [0]
+        for i in range(seq_len):
+            indptr.append(indptr[-1] + lengths[i])
+        v = torch.randn(indptr[-1], num_heads, head_dim).half().to("cuda:0")
+        s = torch.randn(indptr[-1], num_heads, dtype=torch.float32).to("cuda:0")
+        indptr = torch.tensor(indptr, dtype=torch.int32).to("cuda:0")
+        v_merged, s_merged = flashinfer.triton.cascade.variable_length_merge_states(
+            v, s, indptr
+        )
+        for i in range(seq_len):
+            sub_v = v[indptr[i] : indptr[i + 1]]
+            sub_s = s[indptr[i] : indptr[i + 1]]
+            sub_v = torch.unsqueeze(sub_v, 0)
+            sub_s = torch.unsqueeze(sub_s, 0)
+            v_merged_std, s_merged_std = flashinfer.merge_states(sub_v, sub_s)
+            v_merged_std = torch.squeeze(v_merged_std, 0)
+            s_merged_std = torch.squeeze(s_merged_std, 0)
+            assert v_merged[i].shape == v_merged_std.shape
+            assert torch.allclose(v_merged[i], v_merged_std, atol=1e-2)
+            assert torch.allclose(s_merged[i], s_merged_std, atol=1e-2)
+    except GPUArchitectureError as e:
+        pytest.skip(e.msg)
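
The same pattern generalizes to other Triton-backed tests: let the library's own architecture check fire and convert it into a skip rather than probing the compute capability up front. A minimal sketch (the test name and tensor shapes are illustrative, not part of this change):

```python
# Illustrative pattern: rely on the cascade API's GPUArchitectureError instead of
# calling get_compute_capability directly, and turn it into a pytest skip.
import pytest
import torch

import flashinfer.triton
from flashinfer.utils import GPUArchitectureError


def test_merge_state_smoke():
    try:
        v = torch.randn(16, 4, 64, dtype=torch.float16, device="cuda:0")
        s = torch.randn(16, 4, dtype=torch.float32, device="cuda:0")
        flashinfer.triton.cascade.merge_state(v, s, v.clone(), s.clone())
    except GPUArchitectureError as e:
        pytest.skip(e.msg)
```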
