Skip to content

Commit 40d3fea

Browse files
authored
tests: Add batch size 1 cases to test_trtllm_gen_attention.py that fail, marked xfail (#1897)
<!-- .github/pull_request_template.md --> ## 📌 Description <!-- What does this PR do? Briefly describe the changes and why they’re needed. --> Trtllm-gen's attention kernels have been discovered to fail tests when batch size is 1. Current PR adds batch size 1 cases to: `test_trtllm_gen_prefill_deepseek`: that triggers an IMA with the newly added parameters ``` ## Running pytest ./tests/attention/test_trtllm_gen_attention.py::test_trtllm_gen_prefill_deepseek -v > default_generator.manual_seed(seed) E torch.AcceleratorError: CUDA error: an illegal memory access was encountered E CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. E For debugging consider passing CUDA_LAUNCH_BLOCKING=1 E Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. /opt/conda/envs/py312/lib/python3.12/site-packages/torch/cuda/random.py:129: AcceleratorError ``` `test_trtllm_batch_decode`: that produces incorrect outputs with newly added parameters ``` ## Running pytest ./tests/attention/test_trtllm_gen_attention.py::test_trtllm_batch_decode -v > torch.testing.assert_close( output.float(), output_wrapper.float(), rtol=1e-1, atol=1e-1, ) E AssertionError: Tensor-likes are not close! E E Mismatched elements: 1480 / 8192 (18.1%) E Greatest absolute difference: 64.021484375 at index (0, 46, 106) (up to 0.1 allowed) E Greatest relative difference: 1.625 at index (0, 56, 109) (up to 0.1 allowed) ``` **These test cases have been marked as `pytest.xfail()`.** To avoid a combinatorial growth of test parameter combinations, these batch size 1 cases were defined as separate test functions. B200 status before PR: `2052 passed, 264 skipped in 177.80s (0:02:57)` B200 status after PR: `2052 passed, 264 skipped, 3 xfailed in 195.14s (0:03:15)` Status tracked in [Issue 1898](#1898) ## 🔍 Related Issues <!-- Link any related issues here --> ## 🚀 Pull Request Checklist Thank you for contributing to FlashInfer! 
Before we review your pull request, please make sure the following items are complete. ### ✅ Pre-commit Checks - [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method). - [x] I have installed the hooks with `pre-commit install`. - [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues. > If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/). ## 🧪 Tests - [x] Tests have been added or updated as needed. - [x] All tests are passing (`unittest`, etc.). ## Reviewer Notes <!-- Optional: anything you'd like reviewers to focus on, concerns, etc. -->
1 parent 674843f commit 40d3fea

File tree

1 file changed

+68
-2
lines changed

1 file changed

+68
-2
lines changed

tests/attention/test_trtllm_gen_attention.py

Lines changed: 68 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -564,6 +564,7 @@ def test_trtllm_batch_prefill(
564564
)
565565
@pytest.mark.parametrize("enable_pdl", [True, False, None])
566566
@pytest.mark.parametrize("enable_sink", [True, False])
567+
@pytest.mark.parametrize("max_in_kv_len", [110])
567568
def test_trtllm_batch_decode(
568569
kv_layout,
569570
batch_size,
@@ -577,6 +578,7 @@ def test_trtllm_batch_decode(
577578
kv_dtype,
578579
enable_pdl,
579580
enable_sink,
581+
max_in_kv_len,
580582
):
581583
compute_capability = get_compute_capability(torch.device(device="cuda"))
582584
if compute_capability[0] != 10:
@@ -589,12 +591,11 @@ def test_trtllm_batch_decode(
589591
# Set up test parameters
590592
torch.manual_seed(0)
591593
head_dim = 128
592-
MAX_IN_KV_LEN = 110
593594

594595
# Generate random sequence lengths
595596
num_qo_heads = num_kv_heads * head_grp_size
596597
q_lens, in_kv_lens, seq_lens = generate_seq_lens_decode(
597-
batch_size, q_len_per_req, MAX_IN_KV_LEN
598+
batch_size, q_len_per_req, max_in_kv_len
598599
)
599600

600601
# Create query tensor and related data
@@ -805,6 +806,56 @@ def test_trtllm_batch_decode(
805806
assert (workspace_buffer[: 8192 * 256 * 4].cpu().numpy() == 0).all()
806807

807808

# trtllm-gen only supports the HND kv layout.
@pytest.mark.parametrize("kv_layout", ["HND"])
@pytest.mark.parametrize(
    "batch_size,q_len_per_req,page_size,num_kv_heads,head_grp_size",
    [(1, 1, 16, 8, 8)],
)
@pytest.mark.parametrize("window_left", [-1])
@pytest.mark.parametrize("q_dtype,kv_dtype,o_dtype", [("fp8", "fp8", "fp8")])
@pytest.mark.parametrize("enable_pdl", [None])
@pytest.mark.parametrize("enable_sink", [False])
@pytest.mark.parametrize("max_in_kv_len", [8192])
def test_trtllm_batch_decode_bs1(
    kv_layout,
    batch_size,
    q_len_per_req,
    page_size,
    num_kv_heads,
    head_grp_size,
    window_left,
    q_dtype,
    o_dtype,
    kv_dtype,
    enable_pdl,
    enable_sink,
    max_in_kv_len,
):
    """Known-failing batch-size-1 decode case, tracked in issue #1898.

    Delegates to ``test_trtllm_batch_decode`` so the case exercises the
    real kernel path once the underlying bug is fixed; until then the
    imperative xfail below short-circuits the run.
    """
    # pytest.xfail() raises immediately, so the delegate call below is
    # intentionally unreachable while the kernel bug persists.
    pytest.xfail("trtllm-gen decode gets incorrect output with bs1")
    test_trtllm_batch_decode(
        kv_layout=kv_layout,
        batch_size=batch_size,
        q_len_per_req=q_len_per_req,
        page_size=page_size,
        num_kv_heads=num_kv_heads,
        head_grp_size=head_grp_size,
        window_left=window_left,
        q_dtype=q_dtype,
        o_dtype=o_dtype,
        kv_dtype=kv_dtype,
        enable_pdl=enable_pdl,
        enable_sink=enable_sink,
        max_in_kv_len=max_in_kv_len,
    )
858+
808859
@pytest.mark.parametrize("batch_size", [4, 128, 256])
809860
@pytest.mark.parametrize("s_qo", [32, 64, 87])
810861
@pytest.mark.parametrize("s_kv", [32, 64, 87])
@@ -938,6 +989,21 @@ def test_trtllm_gen_prefill_deepseek(
938989
assert (workspace_buffer[: 8192 * 256 * 4].cpu().numpy() == 0).all()
939990

940991

@pytest.mark.parametrize("batch_size", [1])
@pytest.mark.parametrize("s_qo", [1024])
@pytest.mark.parametrize("s_kv", [1024])
@pytest.mark.parametrize("num_kv_heads", [128])
@pytest.mark.parametrize("head_grp_size", [1])
@pytest.mark.parametrize("causal", [True, False])
def test_trtllm_gen_prefill_deepseek_bs1(
    batch_size, s_qo, s_kv, num_kv_heads, head_grp_size, causal
):
    """Known-failing batch-size-1 deepseek prefill case, tracked in issue #1898.

    Delegates to ``test_trtllm_gen_prefill_deepseek``; the imperative xfail
    below raises first, so the delegate is not reached while the bug persists.
    """
    pytest.xfail("trtllm-gen prefill triggers an IMA with bs1")
    test_trtllm_gen_prefill_deepseek(
        batch_size=batch_size,
        s_qo=s_qo,
        s_kv=s_kv,
        num_kv_heads=num_kv_heads,
        head_grp_size=head_grp_size,
        causal=causal,
    )
1006+
9411007
if __name__ == "__main__":
9421008
test_trtllm_batch_prefill("HND", 128, 32, 2, 5, -1, "fp16", "fp16", "fp16", False)
9431009
test_trtllm_batch_decode("HND", 256, 3, 64, 4, 5, -1, "fp8", "fp8", "fp8", True)

0 commit comments

Comments
 (0)