test: minor update on trtllm-gen attn speculative-decoding test (#1760)

yyihuang · web-flow · commit 905f7554f643 · 2025-09-23T23:05:58.000-04:00
## 📌 Description

## 🔍 Related Issues

comments in #1453

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [x] All tests are passing (`unittest`, etc.).

## Reviewer Notes
diff --git a/tests/test_trtllm_gen_attention.py b/tests/test_trtllm_gen_attention.py
@@ -536,10 +536,6 @@ def test_trtllm_batch_decode(
     workspace_buffer = global_trtllm_gen_fmha_workspace_buffer
     workspace_buffer_ref = global_workspace_buffer
 
-    # Run reference wrapper
-    wrapper_ref = flashinfer.decode.BatchDecodeWithPagedKVCacheWrapper(
-        workspace_buffer_ref, kv_layout, use_tensor_cores=True
-    )
     plan_params = {
         "indptr": kv_indptr,
         "indices": all_page_ids,
@@ -553,11 +549,14 @@ def test_trtllm_batch_decode(
         "q_data_type": ref_q.dtype,
         "window_left": window_left,
     }
-    wrapper_ref.plan(**plan_params)
-    output_ref = wrapper_ref.run(ref_q, ref_kv_cache)
-
-    if q_len_per_req > 1:
-        # hide the output_ref from decode wrapper for speculative decoding test
+    # Run reference wrapper
+    if q_len_per_req == 1:
+        wrapper_ref = flashinfer.decode.BatchDecodeWithPagedKVCacheWrapper(
+            workspace_buffer_ref, kv_layout, use_tensor_cores=True
+        )
+        wrapper_ref.plan(**plan_params)
+        output_ref = wrapper_ref.run(ref_q, ref_kv_cache)
+    else:
         wrapper_ref = flashinfer.prefill.BatchPrefillWithPagedKVCacheWrapper(
             workspace_buffer_ref, kv_layout
         )