Commit 0e3403b

bugfix: fix flashinfer_benchmark.py IMA when running a test list (#1625)
## 📌 Description

The current `flashinfer_benchmark.py` script can trigger an IMA (illegal memory access) when a testlist is provided to batch-benchmark multiple test cases.

This PR:
* Fixes the bug by clearing torch's CUDA cache and synchronizing the device at the beginning of each test.
  - The IMA occurs between test cases and can only be reproduced when running a testlist.
  - The fix is to call `torch.cuda.empty_cache()` and `torch.cuda.synchronize()` at the beginning of each test.
* Makes miscellaneous improvements to `flashinfer_benchmark.py`:
  - Attention benchmarks:
    - Reduces unnecessary reference calculations.
    - Prints mismatch statistics when a reference check fails.
  - GEMM benchmarks:
    - Allows testing the `trtllm` backend in the `testGemmFp8NtGroupwise` routine.

*No changes to the library code.*

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [x] All tests are passing (`unittest`, etc.).
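The device-reset pattern, as a minimal standalone sketch (the actual change lives at the top of `get_device()` in `benchmarks/routines/flashinfer_benchmark_utils.py`, shown in the diff below; `run_one_test` here is a hypothetical stand-in for the per-test entry point):

```python
import torch

def run_one_test(test_args):
    # Hypothetical per-test entry point: release cached allocations left over
    # from the previous test case and wait for any in-flight kernels to finish
    # before building new inputs, so stale device state cannot trigger an IMA.
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    # ... set seeds, build inputs, and benchmark the requested backends ...
```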
1 parent 75df649 · commit 0e3403b

File tree

3 files changed: +86 −70 lines

benchmarks/routines/attention.py

Lines changed: 57 additions & 44 deletions
```diff
@@ -15,6 +15,7 @@
     dtype_str_to_torch_dtype,
     get_device,
     print_perf_metrics,
+    is_close_stats,
 )
 
 
@@ -485,7 +486,7 @@ def run_backend_wrapper(backend):
             )
         elif backend == "trtllm-gen-native":
             return flashinfer.decode.trtllm_batch_decode_with_kv_cache(
-                query=q,
+                query=q.contiguous(),
                 kv_cache=kv_cache,
                 workspace_buffer=workspace_buffer,
                 block_tables=block_tables,
@@ -498,19 +499,14 @@ def run_backend_wrapper(backend):
         raise ValueError(f"Backend {backend} not supported")
 
     has_reference_output = False
-    if run_refcheck and "fa2" in backends:
-        reference_output = (
-            backend_wrappers["fa2"]
-            .run(q, kv_cache, k_scale=k_scale, v_scale=v_scale)
-            .detach()
-        )
-        has_reference_output = True
-
     # Iterate over each backend:
     for cur_backend in backends:
         if run_refcheck:
-            outputs[cur_backend] = run_backend_wrapper(cur_backend).detach()
-        if is_cuda_graph_compatible:
+            outputs[cur_backend] = run_backend_wrapper(cur_backend).detach().clone()
+            if cur_backend == "fa2":
+                has_reference_output = True
+                reference_output = outputs[cur_backend]
+        if is_cuda_graph_compatible and cur_backend != "fa2":
             backend_times[cur_backend] = bench_gpu_time_with_cudagraph(
                 fn=lambda: run_backend_wrapper(cur_backend),
                 dry_run_iters=args.dry_run_iters,
@@ -550,8 +546,14 @@ def run_backend_wrapper(backend):
                     reference_output, tested_outputs[i], rtol=rtol, atol=atol
                 )
             except AssertionError as e:
+                (
+                    num_different_elements,
+                    num_elements,
+                    num_different_elements_percentage,
+                ) = is_close_stats(reference_output, tested_outputs[i], rtol, atol)
                 print(
-                    f"[ERROR] Output tensor mismatch between backends {tested_backends[0]} and {tested_backends[i]}"
+                    f"[ERROR] Output tensor mismatch between backends {tested_backends[0]} and {tested_backends[i]}: "
+                    f"{num_different_elements} / {num_elements} ({num_different_elements_percentage:.2f}%) elements are different"
                 )
                 if not args.allow_output_mismatch:
                     print(e)
@@ -721,9 +723,6 @@ def testBatchPrefillWithPagedKVCacheWrapper(args):
 
     # Check for layer-specific constraints
     layer_not_supported = False
-    if not ((head_dim_qk == 128 and head_dim_qk == head_dim_vo) or head_dim_qk == 192):
-        print("[ERROR] Head dimension must be 128 or 192")
-        layer_not_supported = True
     if layer_not_supported:
         print("[ERROR] Layer not supported. Exiting.")
         return
@@ -882,7 +881,9 @@ def testBatchPrefillWithPagedKVCacheWrapper(args):
         flashinfer.prefill.BatchPrefillWithPagedKVCacheWrapper(
             workspace_buffer,
             "HND",
-            use_cuda_graph=is_cuda_graph_compatible,
+            use_cuda_graph=is_cuda_graph_compatible
+            if backend != "fa2"
+            else False,
             qo_indptr_buf=qo_indptr,
             paged_kv_indptr_buf=kv_indptr,
             paged_kv_indices_buf=kv_indices,
@@ -958,17 +959,14 @@ def run_backend_wrapper(backend):
         raise ValueError(f"Backend {backend} not supported")
 
     has_reference_output = False
-    if run_refcheck and "fa2" in backends:
-        reference_output = backend_wrappers["fa2"].run(
-            q, kv_cache, k_scale=k_scale, v_scale=v_scale
-        )
-        has_reference_output = True
-
     # Iterate over each backend:
     for cur_backend in backends:
         if run_refcheck:
-            outputs[cur_backend] = run_backend_wrapper(cur_backend)
-        if is_cuda_graph_compatible:
+            outputs[cur_backend] = run_backend_wrapper(cur_backend).detach().clone()
+            if cur_backend == "fa2":
+                has_reference_output = True
+                reference_output = outputs[cur_backend]
+        if is_cuda_graph_compatible and cur_backend != "fa2":
             backend_times[cur_backend] = bench_gpu_time_with_cudagraph(
                 fn=lambda: run_backend_wrapper(cur_backend),
                 dry_run_iters=args.dry_run_iters,
@@ -1008,8 +1006,14 @@ def run_backend_wrapper(backend):
                     reference_output, tested_outputs[i], rtol=rtol, atol=atol
                 )
             except AssertionError as e:
+                (
+                    num_different_elements,
+                    num_elements,
+                    num_different_elements_percentage,
+                ) = is_close_stats(reference_output, tested_outputs[i], rtol, atol)
                 print(
-                    f"[ERROR] Output tensor mismatch between backends {tested_backends[0]} and {tested_backends[i]}"
+                    f"[ERROR] Output tensor mismatch between backends {tested_backends[0]} and {tested_backends[i]}: "
+                    f"{num_different_elements} / {num_elements} ({num_different_elements_percentage:.2f}%) elements are different"
                 )
                 if not args.allow_output_mismatch:
                     print(e)
@@ -1295,7 +1299,9 @@ def testBatchPrefillWithRaggedKVCacheWrapper(args):
         flashinfer.prefill.BatchPrefillWithRaggedKVCacheWrapper(
             workspace_buffer,
             "NHD",
-            use_cuda_graph=is_cuda_graph_compatible,
+            use_cuda_graph=is_cuda_graph_compatible
+            if backend != "fa2"
+            else False,
             qo_indptr_buf=qo_indptr,
             kv_indptr_buf=kv_indptr,
             backend=backend,
@@ -1350,15 +1356,14 @@ def run_backend_wrapper(backend):
         raise ValueError(f"Backend {backend} not supported")
 
     has_reference_output = False
-    if run_refcheck and "fa2" in backends:
-        reference_output = backend_wrappers["fa2"].run_return_lse(q, k, v)[0]
-        has_reference_output = True
-
     # Iterate over each backend:
     for cur_backend in backends:
         if run_refcheck:
-            outputs[cur_backend] = run_backend_wrapper(cur_backend)
-        if is_cuda_graph_compatible:
+            outputs[cur_backend] = run_backend_wrapper(cur_backend).detach().clone()
+            if cur_backend == "fa2":
+                has_reference_output = True
+                reference_output = outputs[cur_backend]
+        if is_cuda_graph_compatible and cur_backend != "fa2":
             backend_times[cur_backend] = bench_gpu_time_with_cudagraph(
                 fn=lambda: run_backend_wrapper(cur_backend),
                 dry_run_iters=args.dry_run_iters,
@@ -1398,8 +1403,14 @@ def run_backend_wrapper(backend):
                    reference_output, tested_outputs[i], rtol=rtol, atol=atol
                 )
             except AssertionError as e:
+                (
+                    num_different_elements,
+                    num_elements,
+                    num_different_elements_percentage,
+                ) = is_close_stats(reference_output, tested_outputs[i], rtol, atol)
                 print(
-                    f"[ERROR] Output tensor mismatch between backends {tested_backends[0]} and {tested_backends[i]}"
+                    f"[ERROR] Output tensor mismatch between backends {tested_backends[0]} and {tested_backends[i]}: "
+                    f"{num_different_elements} / {num_elements} ({num_different_elements_percentage:.2f}%) elements are different"
                 )
                 if not args.allow_output_mismatch:
                     print(e)
@@ -1693,19 +1704,15 @@ def run_backend_wrapper(backend):
        else:
            raise ValueError(f"Unsupported backend: {backend}")
 
-    if run_refcheck and "fa2" in backends:
-        reference_output = fi_fa2_mla_wrapper.run(
-            q_nope, q_pe, ckv_cache, kpe_cache, return_lse=False
-        )
-        has_reference_output = True
-    else:
-        has_reference_output = False
-
+    has_reference_output = False
     # Iterate over each backend:
     for cur_backend in backends:
         if run_refcheck:
-            outputs[cur_backend] = run_backend_wrapper(cur_backend).detach()
-        if is_cuda_graph_compatible:
+            outputs[cur_backend] = run_backend_wrapper(cur_backend).detach().clone()
+            if cur_backend == "fa2":
+                has_reference_output = True
+                reference_output = outputs[cur_backend]
+        if is_cuda_graph_compatible and cur_backend != "fa2":
             backend_times[cur_backend] = bench_gpu_time_with_cudagraph(
                 fn=lambda: run_backend_wrapper(cur_backend),
                 dry_run_iters=args.dry_run_iters,
@@ -1741,8 +1748,14 @@ def run_backend_wrapper(backend):
                     reference_output, tested_outputs[i], rtol=rtol, atol=atol
                 )
             except AssertionError as e:
+                (
+                    num_different_elements,
+                    num_elements,
+                    num_different_elements_percentage,
+                ) = is_close_stats(reference_output, tested_outputs[i], rtol, atol)
                 print(
-                    f"[ERROR] Output tensor mismatch between backends {tested_backends[0]} and {tested_backends[i]}"
+                    f"[ERROR] Output tensor mismatch between backends {tested_backends[0]} and {tested_backends[i]}: "
+                    f"{num_different_elements} / {num_elements} ({num_different_elements_percentage:.2f}%) elements are different"
                 )
                 if not args.allow_output_mismatch:
                     print(e)
```
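Condensed from the hunks above, a sketch of the restructured reference-check flow: each backend is run exactly once, its output is cloned, and the `fa2` output doubles as the reference, so no separate reference pass is needed (variable names follow the diff; the surrounding setup is elided):

```python
outputs = {}
reference_output = None
has_reference_output = False
for cur_backend in backends:
    if run_refcheck:
        # One run per backend; clone so later benchmark iterations cannot
        # overwrite the tensor used for the reference comparison.
        outputs[cur_backend] = run_backend_wrapper(cur_backend).detach().clone()
        if cur_backend == "fa2":
            has_reference_output = True
            reference_output = outputs[cur_backend]
```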

benchmarks/routines/flashinfer_benchmark_utils.py

Lines changed: 14 additions & 0 deletions
```diff
@@ -117,6 +117,9 @@ def print_perf_metrics(backend, median_time, std_time, tflops, tb_per_sec):
 
 
 def get_device(args):
+    # Synchronize to ensure that the device is ready after previous tests
+    torch.cuda.empty_cache()
+    torch.cuda.synchronize()
     set_seed(args.random_seed)
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     gpu_name = torch.cuda.get_device_name(torch.cuda.current_device()).replace(" ", "_")
@@ -125,6 +128,17 @@ def get_device(args):
     return device
 
 
+def is_close_stats(input, other, rtol=1e-5, atol=1e-8):
+    close_tensor = torch.isclose(input, other, rtol=rtol, atol=atol)
+    num_elements = close_tensor.numel()
+    num_different_elements = num_elements - close_tensor.sum().item()
+    return (
+        num_different_elements,  # number of different elements
+        num_elements,  # total number of elements in tensor
+        num_different_elements / num_elements * 100.0,
+    )
+
+
 def dtype_str_to_torch_dtype(dtype_str):
     if dtype_str == "bfloat16":
         return torch.bfloat16
```
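For reference, a small usage sketch of the new `is_close_stats` helper; the import path is an assumption (inside the benchmark routines it would come from `flashinfer_benchmark_utils`, alongside `get_device`):

```python
import torch

from flashinfer_benchmark_utils import is_close_stats  # assumed import path

ref = torch.randn(1024, device="cuda")
out = ref + 1e-3 * torch.randn(1024, device="cuda")

# Returns (number of mismatched elements, total elements, mismatch percentage).
num_diff, num_total, pct = is_close_stats(ref, out, rtol=1e-2, atol=1e-2)
print(f"{num_diff} / {num_total} ({pct:.2f}%) elements are different")
```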

benchmarks/routines/gemm.py

Lines changed: 15 additions & 26 deletions
```diff
@@ -197,11 +197,22 @@ def testGemmFp8NtGroupwise(args):
     ## Done parsing input arguments
 
     if "trtllm" in backends:
-        remove_trtllm = True
-        print("[INFO] trtllm backend testing not supported yet")
+        remove_trtllm = False
+        if scale_major_mode != "MN":
+            print(
+                "[INFO] trtllm only supports MN scale_major_mode, removing trtllm from backends"
+            )
+            remove_trtllm = True
+        if k < 256:
+            print("[INFO] trtllm only supports k >= 256, removing trtllm from backends")
+            remove_trtllm = True
         if remove_trtllm:
             backends.remove("trtllm")
 
+    if len(backends) == 0:
+        print("[ERROR] No backends to test. Exiting.")
+        return
+
     ## Prepare input tensors
     a_val = torch.randn((m, k), dtype=torch.float, device=device)
     b_val = torch.randn((n, k), dtype=torch.float, device=device) / np.sqrt(k)
@@ -223,17 +234,6 @@ def testGemmFp8NtGroupwise(args):
     a_fp8, a_scale = quantize_fp8(a_val, a_scale_shape, a_tile_shape, scale_major_mode)
     b_fp8, b_scale = quantize_fp8(b_val, b_scale_shape, b_tile_shape, scale_major_mode)
 
-    if "trtllm" in backends:
-        a_scale_shape_trtllm = (m, k // tile_size)
-        b_scale_shape_trtllm = (k // tile_size, n // tile_size)
-
-        a_fp8_trtllm, a_scale_trtllm = quantize_fp8(
-            a_val, a_scale_shape_trtllm, a_tile_shape, "K"
-        )
-        b_fp8_trtllm, b_scale_trtllm = quantize_fp8(
-            b_val, b_scale_shape_trtllm, b_tile_shape, "MN"
-        )
-
     if args.verbose >= 2:
         print(f"[VVERBOSE] {a_fp8.shape = }")
         print(f"[VVERBOSE] {b_fp8.shape = }")
@@ -244,7 +244,7 @@
     b_dequant = dequantize_fp8(b_fp8, b_scale, scale_major_mode)
 
     def run_backend(backend):
-        if backend == "cutlass":
+        if backend in ["cutlass", "trtllm"]:
             return flashinfer.gemm.gemm_fp8_nt_groupwise(
                 a=a_fp8,
                 b=b_fp8,
@@ -253,18 +253,7 @@ def run_backend(backend):
                 scale_major_mode=scale_major_mode,
                 out_dtype=out_dtype,
                 mma_sm=mma_sm,
-                backend="cutlass",
-            )
-        elif backend == "trtllm":
-            return flashinfer.gemm.gemm_fp8_nt_groupwise(
-                a=a_fp8,
-                b=b_fp8,
-                a_scale=a_scale,
-                b_scale=b_scale,
-                scale_major_mode=None,
-                out_dtype=out_dtype,
-                mma_sm=mma_sm,
-                backend="trtllm",
+                backend=backend,
             )
         else:
             raise ValueError(f"Unsupported backend: {backend}")
```
