Skip to content

Commit 75df649

Browse files
authored
patch mm segfault & patch cubin avail. (#1628)
<!-- .github/pull_request_template.md --> ## 📌 Description <!-- What does this PR do? Briefly describe the changes and why they’re needed. --> ## 🔍 Related Issues <!-- Link any related issues here --> ## 🚀 Pull Request Checklist Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete. ### ✅ Pre-commit Checks - [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method). - [x] I have installed the hooks with `pre-commit install`. - [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues. > If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/). ## 🧪 Tests - [x] Tests have been added or updated as needed. - [x] All tests are passing (`unittest`, etc.). ## Reviewer Notes <!-- Optional: anything you'd like reviewers to focus on, concerns, etc. -->
1 parent 0296d06 commit 75df649

File tree

3 files changed

+7
-11
lines changed

3 files changed

+7
-11
lines changed

csrc/trtllm_gemm_runner.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@ class TrtllmGenGemmRunner {
207207
return optionsA.mUseUnrollLoop2xForMma;
208208
}
209209

210-
return true;
210+
return false;
211211
});
212212

213213
bool findLoop2xMma = false;

flashinfer/artifacts.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -48,27 +48,27 @@ def get_available_cubin_files(source, retries=3, delay=5, timeout=10):
4848

4949

5050
class ArtifactPath:
51-
TRTLLM_GEN_FMHA: str = "6c74964c96684c3e674340f7e35fc20ad909a9a0/fmha/trtllm-gen/"
51+
TRTLLM_GEN_FMHA: str = "037e528e719ec3456a7d7d654f26b805e44c63b1/fmha/trtllm-gen/"
5252
TRTLLM_GEN_BMM: str = (
53-
"6c74964c96684c3e674340f7e35fc20ad909a9a0/batched_gemm-8704aa4-ba3b00d/"
53+
"037e528e719ec3456a7d7d654f26b805e44c63b1/batched_gemm-8704aa4-ba3b00d/"
5454
)
5555
TRTLLM_GEN_GEMM: str = (
56-
"6c74964c96684c3e674340f7e35fc20ad909a9a0/gemm-8704aa4-f91dc9e/"
56+
"037e528e719ec3456a7d7d654f26b805e44c63b1/gemm-8704aa4-f91dc9e/"
5757
)
5858
CUDNN_SDPA: str = "4c623163877c8fef5751c9c7a59940cd2baae02e/fmha/cudnn/"
5959
DEEPGEMM: str = "d25901733420c7cddc1adf799b0d4639ed1e162f/deep-gemm/"
6060

6161

6262
class MetaInfoHash:
6363
TRTLLM_GEN_FMHA: str = (
64-
"5a41a165d4d5e956d4cccd0a7d1627dbdcaccf4d07a9cfcc8055ef0cb52e0c87"
64+
"0ff77215b86997665cf75973e13cd2932f551d46b4e008f851d32d47e1d9560f"
6565
)
6666
TRTLLM_GEN_BMM: str = (
67-
"3edf4847059d465182779436397ece3d5fb45c3360a1d1abda3b71e35f957caa"
67+
"34bdfe7acfd49f5fb8b48e06d56e6a5ad88b951c730552f228fc5f614f7632a8"
6868
)
6969
DEEPGEMM: str = "69aa277b7f3663ed929e73f9c57301792b8c594dac15a465b44a5d151b6a1d50"
7070
TRTLLM_GEN_GEMM: str = (
71-
"c6265cf047fc5d2208a37c54b5d720f4755b1215de5f7434ad24ffbc81c31c27"
71+
"0345358c916d990709f9670e113e93f35c76aa22715e2d5128ec2ca8740be5ba"
7272
)
7373

7474

tests/test_mm_fp4.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,6 @@ def test_mm_fp4(m, n, k, res_dtype, backend, use_128x4_sf_layout, auto_tuning):
2525
if auto_tuning and backend == "cudnn":
2626
pytest.skip("Skipping test for cudnn fp4 with auto_tuning=True")
2727

28-
if not use_128x4_sf_layout and backend == "trtllm":
29-
# FIXME (bringup) quantization failure from main
30-
pytest.xfail("Skipping test for non-trtllm fp4 with use_128x4_sf_layout=False")
31-
3228
input = torch.randn([m, k], device="cuda", dtype=torch.bfloat16)
3329
mat2 = torch.randn([n, k], device="cuda", dtype=torch.bfloat16)
3430
a_sf_layout = SfLayout.layout_128x4 if use_128x4_sf_layout else SfLayout.layout_8x4

0 commit comments

Comments (0)