Commit 2b753a5
unittest: remove debug-print jit examples from unittest (#1851)
## 📌 Description

The debug print statements in `test_jit_examples` unittests clutter the CI output, making it difficult to identify useful information. This PR removes them from the unittests.

## 🔍 Related Issues

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [x] All tests are passing (`unittest`, etc.).

## Reviewer Notes
1 parent 989e82c commit 2b753a5

2 files changed, +1 -136 lines changed
README.md

Lines changed: 1 addition & 1 deletion

@@ -117,7 +117,7 @@ Check out [documentation](https://docs.flashinfer.ai/) for usage of batch decode
 
 ## Custom Attention Variants
 
-Starting from FlashInfer v0.2, users can customize their own attention variants with additional parameters. For more details, refer to our [JIT examples](https://github.com/flashinfer-ai/flashinfer/blob/main/tests/test_jit_example.py).
+Starting from FlashInfer v0.2, users can customize their own attention variants with additional parameters. For more details, refer to our [JIT examples](https://github.com/flashinfer-ai/flashinfer/blob/main/tests/utils/test_jit_example.py).
 
 ## C++ API and TVM Bindings
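The README link above points at the JIT examples that show how a custom attention variant is defined and compiled. As a rough sketch of that API, the snippet below keeps the structure of the first removed test but drops the debug `printf` that cluttered CI logs; the import paths, the `single_prefill_with_kv_cache_with_jit_module` helper, and the `IdentityLogits` / `identity_logits_example` names are assumptions for illustration, not the canonical example (that remains `tests/utils/test_jit_example.py`).

```python
# Sketch only: import locations may differ across FlashInfer versions
# (assumption: these names are importable as shown).
import functools
import math

import torch
from flashinfer.jit import gen_customize_single_prefill_module
from flashinfer.prefill import single_prefill_with_kv_cache_with_jit_module
from flashinfer.utils import MaskMode

# A pass-through variant: same structure as the removed DebugPrintLogits
# example, minus the printf inside the logits transform.
variant_decl = r"""
struct IdentityLogits : AttentionVariantBase {
  static constexpr bool use_softmax = true;

  uint32_t window_left, qo_len, kv_len;
  float sm_scale_log2;

  template <typename Params>
  __device__ __host__ IdentityLogits(const Params& params, uint32_t batch_idx,
                                     uint8_t* smem_ptr) {
    qo_len = params.get_qo_len(batch_idx);
    kv_len = params.get_kv_len(batch_idx);
    window_left = kv_len;
    sm_scale_log2 = params.sm_scale * math::log2e;
  }

  REGISTER_LOGITS_TRANSFORM(params, logits, batch_idx, qo_idx, kv_idx, qo_head_idx, kv_head_idx, {
    return logits;  // hook point for custom per-logit transforms
  });
};
"""

# Build a single-prefill kernel specialized for the custom variant.
jit_module = gen_customize_single_prefill_module(
    "fa2",                      # backend
    "identity_logits_example",  # uri (hypothetical)
    torch.float16,              # dtype_q
    torch.float16,              # dtype_kv
    torch.float16,              # dtype_o
    128,                        # hidden_dim_qk
    128,                        # hidden_dim_vo
    [],                         # additional_tensor_names
    [],                         # additional_tensor_dtypes
    ["sm_scale"],               # additional_scalar_names
    ["double"],                 # additional_scalar_dtypes
    "IdentityLogits",
    variant_decl,
).build_and_load()

# Run the compiled kernel on random q/k/v, as the removed tests did.
run = functools.partial(single_prefill_with_kv_cache_with_jit_module, jit_module)
q = torch.randn(128, 32, 128, dtype=torch.float16, device="cuda")
k = torch.randn(1023, 32, 128, dtype=torch.float16, device="cuda")
v = torch.randn(1023, 32, 128, dtype=torch.float16, device="cuda")
o = run(q, k, v, 1.0 / math.sqrt(128), mask_mode=MaskMode.NON_CAUSAL.value)
```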

tests/utils/test_jit_example.py

Lines changed: 0 additions & 135 deletions

@@ -572,145 +572,10 @@ def test_batch_prefill_sm90_flash_sigmoid():
     torch.testing.assert_close(o_paged, o_ref, rtol=2e-2, atol=2e-2)


-def test_debug_print_logits():
-    torch.manual_seed(42)
-    variant_decl = r"""
-struct DebugPrintLogits : AttentionVariantBase {
-  static constexpr bool use_softmax = true;
-
-  uint32_t window_left, qo_len, kv_len;
-  float sm_scale_log2;
-
-  // Create closure
-  template <typename Params>
-  __device__ __host__ DebugPrintLogits(const Params& params, uint32_t batch_idx,
-                                       uint8_t* smem_ptr) {
-    qo_len = params.get_qo_len(batch_idx);
-    kv_len = params.get_kv_len(batch_idx);
-    window_left = kv_len;
-    sm_scale_log2 = params.sm_scale * math::log2e;
-  }
-
-  REGISTER_LOGITS_TRANSFORM(params, logits, batch_idx, qo_idx, kv_idx, qo_head_idx, kv_head_idx, {
-    if (logits >= 5) {
-      printf("Large logits at qo_idx=%d, kv_idx=%d, qo_head_idx=%d, kv_head_idx=%d: %.3f\n",
-             qo_idx, kv_idx, qo_head_idx, kv_head_idx, float(logits));
-    }
-    return logits;
-  });
-};
-"""
-    jit_module = gen_customize_single_prefill_module(
-        "fa2",  # backend
-        "batch_prefill_debug_print_logits",  # uri
-        torch.float16,  # dtype_q
-        torch.float16,  # dtype_kv
-        torch.float16,  # dtype_o
-        128,  # hidden_dim_qk
-        128,  # hidden_dim_vo
-        [],  # additional_tensor_names
-        [],  # additional_tensor_dtypes
-        ["sm_scale"],  # additional_scalar_names
-        ["double"],  # additional_scalar_dtypes
-        "DebugPrintLogits",
-        variant_decl,
-    ).build_and_load()
-
-    f = functools.partial(single_prefill_with_kv_cache_with_jit_module, jit_module)
-
-    q = torch.randn(128, 32, 128, dtype=torch.float16, device="cuda")
-    k = torch.randn(1023, 32, 128, dtype=torch.float16, device="cuda")
-    v = torch.randn(1023, 32, 128, dtype=torch.float16, device="cuda")
-    sm_scale = 1.0 / math.sqrt(128)
-    o = f(q, k, v, sm_scale, mask_mode=MaskMode.NON_CAUSAL.value)
-
-    p = torch.einsum("mhd,nhd->hmn", q.float(), k.float()) * sm_scale
-    o_ref = torch.einsum("hmn,nhd->mhd", torch.softmax(p, dim=-1), v.float()).half()
-    torch.testing.assert_close(o, o_ref, rtol=1e-3, atol=1e-3)
-
-
-def test_sm90_debug_print_logits():
-    if not is_sm90a_supported(torch.device("cuda")):
-        pytest.skip("SM90A is not supported")
-
-    torch.manual_seed(42)
-    variant_decl = r"""
-struct DebugPrintLogits : AttentionVariantBase {
-  float sm_scale_log2;
-  int qo_len, kv_len;
-
-  // Init
-  template <typename MainloopParams, typename BlockCoord>
-  __device__ __host__ DebugPrintLogits(const MainloopParams& params, const BlockCoord& block_coord) {
-    sm_scale_log2 = params.additional_params.sm_scale * math::log2e;
-    auto [_, __, ___, ____, _____, qo_len_, kv_len_, batch_idx] =
-        block_coord;
-
-    qo_len = qo_len_;
-    kv_len = kv_len_;
-  }
-
-
-  template <int NUM_ROWS_PER_THREAD>
-  __device__ auto GetAttentionUpdater() {
-    return OnlineSoftmax<NUM_ROWS_PER_THREAD, /*WITH_SCALE*/false>(sm_scale_log2);
-  }
-
-
-  REGISTER_LOGITS_TRANSFORM(params, logits, batch_idx, qo_idx, kv_idx, qo_head_idx, kv_head_idx, {
-    if (qo_idx < qo_len && kv_idx < kv_len) {
-      printf(
-          "---> LOGITS DEBUG: "
-          "qo_idx=%-5d "
-          "kv_idx=%-5d "
-          "sm_scale_log2=%-12.5f "
-          "logits=%-12.5f "
-          "\n",
-          qo_idx,
-          kv_idx,
-          sm_scale_log2,
-          static_cast<float>(logits));
-    }
-    logits *= sm_scale_log2;
-    return logits;
-  })
-};
-"""
-    jit_module = gen_customize_single_prefill_module(
-        "fa3",  # backend
-        "debug_print_logits",  # uri
-        torch.float16,  # dtype_q
-        torch.float16,  # dtype_kv
-        torch.float16,  # dtype_o
-        128,  # hidden_dim_qk
-        128,  # hidden_dim_vo
-        [],  # additional_tensor_names
-        [],  # additional_tensor_dtypes
-        ["sm_scale"],  # additional_scalar_names
-        ["double"],  # additional_scalar_dtypes
-        "DebugPrintLogits",
-        variant_decl,
-    ).build_and_load()
-
-    f = functools.partial(single_prefill_with_kv_cache_with_jit_module, jit_module)
-
-    q = torch.randn(16, 2, 128, dtype=torch.float16, device="cuda")
-    k = torch.randn(16, 1, 128, dtype=torch.float16, device="cuda")
-    v = torch.randn(16, 1, 128, dtype=torch.float16, device="cuda")
-    sm_scale = 1.0 / math.sqrt(128)
-    o = f(q, k, v, sm_scale, mask_mode=MaskMode.NON_CAUSAL.value)
-
-    p = torch.einsum("mhd,nhd->hmn", q.float(), k.float()) * sm_scale
-    o_ref = torch.einsum("hmn,nhd->mhd", torch.softmax(p, dim=-1), v.float()).half()
-    torch.testing.assert_close(o, o_ref, rtol=1e-3, atol=1e-3)
-
-
 if __name__ == "__main__":
     test_single_decode_mask()
     test_flash_sigmoid()
     test_dump_logits()
-    test_debug_print_logits()
-    test_sm90_debug_print_logits()
     test_batch_decode_flash_sigmoid(False)
     test_batch_decode_flash_sigmoid(True)
     test_batch_prefill_flash_sigmoid()
