Commit 4fe837f

Fix sink attention accuracy regression, add sink test and cleanup. (#1758)
## 📌 Description

Update the trtllm-gen cubin to fix the accuracy regression involving attention sinks.

Integrate the sink test into the main attention test, since the old standalone sink test did not exercise or catch the fp8 accuracy issue.

Clean up the attention test code. The number of attention test cases had inflated considerably as parameters were added; reduce it by using pairwise combinations instead of the full Cartesian product (a sketch of the idea follows below).

## 🔍 Related Issues

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [ ] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [ ] All tests are passing (`unittest`, etc.).

## Reviewer Notes
1 parent c721fb7 commit 4fe837f
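The test-matrix reduction mentioned in the description swaps a full Cartesian product of parameters for pairwise combinations. A minimal sketch of the idea, with made-up parameter names and a deliberately simplified pairing rule (the actual selection logic in the test suite may differ):

```python
import itertools

import pytest

BATCH_SIZES = [1, 4, 16]
HEAD_DIMS = [64, 128]
CAUSAL = [False, True]

# Full Cartesian product: 3 * 2 * 2 = 12 cases.
CARTESIAN = list(itertools.product(BATCH_SIZES, HEAD_DIMS, CAUSAL))

# Simplified pairwise-style reduction: cycle the shorter axes so every value
# still appears, but only len(BATCH_SIZES) = 3 cases are generated. A true
# all-pairs generator would additionally guarantee every value *pair* co-occurs.
PAIRWISE = [
    (bs, HEAD_DIMS[i % len(HEAD_DIMS)], CAUSAL[i % len(CAUSAL)])
    for i, bs in enumerate(BATCH_SIZES)
]


@pytest.mark.parametrize("batch_size,head_dim,causal", PAIRWISE)
def test_attention_case(batch_size, head_dim, causal):
    # Placeholder body; the real tests build attention inputs and compare
    # against a reference implementation.
    assert batch_size in BATCH_SIZES and head_dim in HEAD_DIMS
```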

File tree

3 files changed (+206, -118 lines)

flashinfer/artifacts.py

Lines changed: 2 additions & 2 deletions
@@ -69,7 +69,7 @@ def get_available_cubin_files(source, retries=3, delay=5, timeout=10):


 class ArtifactPath:
-    TRTLLM_GEN_FMHA: str = "538f8e38ace07f701f61e26b138b2b8c70ce9e8e/fmha/trtllm-gen/"
+    TRTLLM_GEN_FMHA: str = "7206d64e67f4c8949286246d6e2e07706af5d223/fmha/trtllm-gen/"
     TRTLLM_GEN_BMM: str = (
         "e6f22dcc3fdeb29ff87af2f4a2cb3d30b8d273e0/batched_gemm-45beda1-ee6a802/"
     )
@@ -82,7 +82,7 @@ class ArtifactPath:

 class MetaInfoHash:
     TRTLLM_GEN_FMHA: str = (
-        "71f06a8fc03d28cc94ee6fc180fb7e37256a9e1c30ab2a6c0bf20a2d97af3eff"
+        "2f605255e71d673768f5bece66dde9e2e9f4c873347bfe8fefcffbf86a3c847d"
     )
     TRTLLM_GEN_BMM: str = (
         "c98b4ce69a39fd41556d67033c30ea814ef76b0a2fe16e798e55baf0104acc34"
tests/test_attention_sink_blackwell.py

Lines changed: 1 addition & 8 deletions
@@ -18,7 +18,6 @@
 import pytest
 import torch
 from sink_attention_reference import sink_attention_unified
-from conftest import assert_close_with_mismatch_tolerance

 import flashinfer
 from flashinfer.utils import get_compute_capability
@@ -122,13 +121,7 @@ def test_blackwell_trtllm_gen_decode_attention_sink(
     else:
         raise ValueError(f"Unsupported dtype: {dtype}")

-    assert_close_with_mismatch_tolerance(
-        o_ref,
-        output,
-        atol=atol,
-        rtol=rtol,
-        max_mismatched_elements=int(output.numel() * 0.01),
-    )
+    torch.testing.assert_close(o_ref, output, atol=atol, rtol=rtol)


 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
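The test change above drops the 1%-mismatch-tolerant comparison in favor of a strict elementwise `torch.testing.assert_close`. A small standalone illustration of the difference; the tensors and tolerances are made up for demonstration:

```python
import torch

o_ref = torch.zeros(1000)
output = o_ref.clone()
output[0] = 1.0  # a single badly-off element

# The old helper tolerated up to 1% mismatched elements, so a localized error
# like this could pass: int(output.numel() * 0.01) == 10 allowed mismatches.
mismatch_budget = int(output.numel() * 0.01)

# torch.testing.assert_close has no mismatch budget: one bad element fails.
try:
    torch.testing.assert_close(o_ref, output, atol=1e-2, rtol=1e-2)
except AssertionError:
    print(f"strict check flagged the outlier (old budget was {mismatch_budget})")
```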
