flashinfer-ai
diff --git a/benchmarks/bench_append_paged_kv_cache.py b/benchmarks/bench_append_paged_kv_cache.py
Lines changed: 6 additions & 3 deletions
diff --git a/benchmarks/bench_append_paged_mla_kv_cache.py b/benchmarks/bench_append_paged_mla_kv_cache.py
Lines changed: 6 additions & 3 deletions
diff --git a/benchmarks/bench_batch_attention.py b/benchmarks/bench_batch_attention.py
Lines changed: 5 additions & 3 deletions
diff --git a/benchmarks/bench_batch_decode.py b/benchmarks/bench_batch_decode.py
Lines changed: 3 additions & 2 deletions
diff --git a/benchmarks/bench_blackwell_attention.py b/benchmarks/bench_blackwell_attention.py
Lines changed: 6 additions & 4 deletions
diff --git a/benchmarks/bench_block_sparse_attention.py b/benchmarks/bench_block_sparse_attention.py
Lines changed: 21 additions & 13 deletions
diff --git a/benchmarks/bench_cutlass_fused_moe.py b/benchmarks/bench_cutlass_fused_moe.py
Lines changed: 6 additions & 5 deletions
diff --git a/benchmarks/bench_deepgemm_blackwell.py b/benchmarks/bench_deepgemm_blackwell.py
Lines changed: 10 additions & 9 deletions
diff --git a/benchmarks/bench_deepseek_mla.py b/benchmarks/bench_deepseek_mla.py
Lines changed: 6 additions & 4 deletions
diff --git a/benchmarks/bench_fused_add_rmsnorm.py b/benchmarks/bench_fused_add_rmsnorm.py
Lines changed: 4 additions & 2 deletions
@@ -2,10 +2,11 @@
 import dataclasses
 from typing import Tuple, cast
 
+import numpy as np
 import torch
-from triton.testing import do_bench
 
 import flashinfer
+from flashinfer.testing.utils import bench_gpu_time
 
 
 @dataclasses.dataclass(kw_only=True)
@@ -108,7 +109,8 @@ def fn_convert() -> Tuple[torch.Tensor, torch.Tensor]:
                 )
 
             batch_indices, positions = fn_convert()
-            convert_latency_ms = cast(float, do_bench(fn_convert))
+            convert_latencies = bench_gpu_time(fn_convert)
+            convert_latency_ms = np.median(convert_latencies)
 
             @torch.cuda.nvtx.range(f"append model={model_name}, seqlens={seqlens}")
             def fn() -> None:
@@ -124,7 +126,8 @@ def fn() -> None:
                     "NHD",
                 )
 
-            latency_ms = cast(float, do_bench(fn))
+            latencies = bench_gpu_time(fn)
+            latency_ms = np.median(latencies)
             all_layers_latency_ms = convert_latency_ms + latency_ms * model.num_layers
             throughput = (
                 k.numel()
 
@@ -2,10 +2,11 @@
 import dataclasses
 from typing import Tuple, cast
 
+import numpy as np
 import torch
-from triton.testing import do_bench
 
 import flashinfer
+from flashinfer.testing.utils import bench_gpu_time
 
 
 @dataclasses.dataclass(kw_only=True)
@@ -92,7 +93,8 @@ def fn_convert() -> Tuple[torch.Tensor, torch.Tensor]:
                 )
 
             batch_indices, positions = fn_convert()
-            convert_latency_ms = cast(float, do_bench(fn_convert))
+            convert_latencies = bench_gpu_time(fn_convert)
+            convert_latency_ms = np.median(convert_latencies)
 
             @torch.cuda.nvtx.range(f"append model={model_name}, seqlens={seqlens}")
             def fn() -> None:
@@ -108,7 +110,8 @@ def fn() -> None:
                     kv_last_page_len,
                 )
 
-            latency_ms = cast(float, do_bench(fn))
+            latencies = bench_gpu_time(fn)
+            latency_ms = np.median(latencies)
             all_layers_latency_ms = convert_latency_ms + latency_ms * model.num_layers
             throughput = (
                 (ckv.numel() + kpe.numel())
 
@@ -6,9 +6,9 @@
 import numpy as np
 import pandas as pd
 import torch
-from triton.testing import do_bench
 
 import flashinfer
+from flashinfer.testing.utils import bench_gpu_time
 
 
 def run_bench(
@@ -65,7 +65,8 @@ def run_bench(
         q_data_type=torch.bfloat16,
         kv_data_type=torch.bfloat16,
     )
-    ms_old = do_bench(lambda: wrapper_old.run(q, kv_data))
+    measurements_old = bench_gpu_time(lambda: wrapper_old.run(q, kv_data))
+    ms_old = np.mean(measurements_old)
 
     # new
     wrapper = flashinfer.BatchAttention(kv_layout="NHD")
@@ -83,7 +84,8 @@ def run_bench(
         q_data_type=torch.bfloat16,
         kv_data_type=torch.bfloat16,
     )
-    ms_new = do_bench(lambda: wrapper.run(q, kv_data))
+    measurements_new = bench_gpu_time(lambda: wrapper.run(q, kv_data))
+    ms_new = np.mean(measurements_new)
 
     total_bytes = (
         q.numel() * q.element_size() + kv_data.numel() * kv_data.element_size()
 
@@ -16,9 +16,9 @@
 
 import numpy as np
 import torch
-from triton.testing import do_bench
 
 import flashinfer
+from flashinfer.testing.utils import bench_gpu_time
 
 page_block_size = 16
 num_kv_heads = 4
@@ -67,7 +67,8 @@ def bench_batch_decode(
         q_data_type=q_dtype,
     )
 
-    ms = do_bench(lambda: wrapper.run(q, kv_data))
+    measurements = bench_gpu_time(lambda: wrapper.run(q, kv_data))
+    ms = np.median(measurements)
 
     io = q.numel() * q.element_size() + kv_data.numel() * kv_data.element_size()
     print(
 
@@ -14,10 +14,11 @@
 limitations under the License.
 """
 
+import numpy as np
 import torch
-from triton.testing import do_bench
 
 import flashinfer
+from flashinfer.testing.utils import bench_gpu_time
 
 
 def bench_fmha_blackwell(
@@ -61,11 +62,12 @@ def bench_fmha_blackwell(
         kv_data_type=dtype,
     )
     o = wrapper.run(q, k, v)
-    ms = do_bench(
+    measurements = bench_gpu_time(
         lambda: wrapper.run(q, k, v),
-        warmup=100,
-        rep=1000,
+        dry_run_time_ms=100,
+        repeat_time_ms=1000,
     )
+    ms = np.median(measurements)
 
     def flops(ms):
         if causal:
 
@@ -14,10 +14,11 @@
 limitations under the License.
 """
 
+import numpy as np
 import torch
-import triton
 
 import flashinfer
+from flashinfer.testing.utils import bench_gpu_time
 
 
 def bench_variable_block_sparse_attention(
@@ -86,27 +87,34 @@ def bench_variable_block_sparse_attention(
         q_data_type=torch.half,
     )
 
-    sparse_ms_fa2 = triton.testing.do_bench(
+    # Benchmark sparse attention with FA2
+    measurements_fa2 = bench_gpu_time(
         lambda: sparse_wrapper_fa2.run(q, k, v),
-        warmup=100,
-        rep=1000,
+        dry_run_time_ms=100,
+        repeat_time_ms=1000,
     )
-    sparse_ms_fa3 = triton.testing.do_bench(
+    sparse_ms_fa2 = np.median(measurements_fa2)
+
+    # Benchmark sparse attention with FA3
+    measurements_fa3 = bench_gpu_time(
         lambda: sparse_wrapper_fa3.run(q, k, v),
-        warmup=100,
-        rep=1000,
+        dry_run_time_ms=100,
+        repeat_time_ms=1000,
     )
+    sparse_ms_fa3 = np.median(measurements_fa3)
 
     q = torch.randn(seq_len, num_qo_heads, head_dim, dtype=torch.half, device="cuda")
     k = torch.randn(seq_len, num_kv_heads, head_dim, dtype=torch.half, device="cuda")
     v = torch.randn(seq_len, num_kv_heads, head_dim, dtype=torch.half, device="cuda")
     dense_sm80_ms, dense_sm90_ms = (
-        triton.testing.do_bench(
-            lambda: flashinfer.single_prefill_with_kv_cache_return_lse(
-                q, k, v, causal=False, backend=backend
-            ),
-            warmup=100,
-            rep=1000,
+        np.median(
+            bench_gpu_time(
+                lambda: flashinfer.single_prefill_with_kv_cache_return_lse(
+                    q, k, v, causal=False, backend=backend
+                ),
+                dry_run_time_ms=100,
+                repeat_time_ms=1000,
+            )
         )
         for backend in ["fa2", "fa3"]
     )
 
@@ -17,13 +17,14 @@
 import argparse
 import pprint
 
+import numpy as np
 import torch
 from torch.nn import functional as F
 
 import flashinfer.fused_moe as fused_moe
 from flashinfer import fp4_quantize
 from flashinfer.autotuner import AutoTuner, autotune, get_config_path
-from flashinfer.testing.utils import bench_gpu_time_with_cudagraph
+from flashinfer.testing.utils import bench_gpu_time
 
 FLOAT4_E2M1_MAX = 6.0
 FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
@@ -173,7 +174,7 @@ def bench_cutlass_fused_moe(
                 output=flash_output,
                 tune_max_num_tokens=16384,
             )
-    ms_list = bench_gpu_time_with_cudagraph(
+    ms_list = bench_gpu_time(
         lambda: fused_moe.cutlass_fused_moe(
             hidden_states,
             selected_experts.to(torch.int),
@@ -184,12 +185,12 @@ def bench_cutlass_fused_moe(
             quant_scales=quant_scales,
             input_sf=input_sf,
             output=flash_output,
-        )
+        ),
     )
-    avg_ms = sum(ms_list) / len(ms_list)
+    median_ms = np.median(ms_list)
     print(f"{'input':<15} {'weight1':<20} {'weight2':<20} {'time(ms)'}")
     print(
-        f"{str(tuple(hidden_states.shape)):<15} {str(tuple(w1.shape)):<20} {str(tuple(w2.shape)):<20} {avg_ms:.3f}"
+        f"{str(tuple(hidden_states.shape)):<15} {str(tuple(w1.shape)):<20} {str(tuple(w2.shape)):<20} {median_ms:.3f}"
     )
 
 
 
@@ -14,14 +14,14 @@
 limitations under the License.
 """
 
+import numpy as np
 import torch
-from triton.testing import do_bench
 
 from flashinfer.gemm import (
     batch_deepgemm_fp8_nt_groupwise,
     group_deepgemm_fp8_nt_groupwise,
 )
-from flashinfer.testing.utils import quantize_fp8
+from flashinfer.testing.utils import bench_gpu_time, quantize_fp8
 
 
 def bench_deepgemm_grouped_fp8_blackwell(batch_size, m, n, k, in_dtype, out_dtype):
@@ -48,14 +48,14 @@ def bench_deepgemm_grouped_fp8_blackwell(batch_size, m, n, k, in_dtype, out_dtyp
     out = torch.empty(batch_size * m, n, device="cuda", dtype=out_dtype)
 
     # Benchmark the DeepGEMM function
-    ms = do_bench(
+    measurements = bench_gpu_time(
         lambda: group_deepgemm_fp8_nt_groupwise(
             a_fp8, b_fp8, a_scale, b_scale, m_indices, out=out, out_dtype=out_dtype
         ),
-        warmup=100,
-        rep=1000,
+        dry_run_time_ms=100,
+        repeat_time_ms=1000,
     )
-
+    ms = np.median(measurements)
     tflops_per_second = 2 * batch_size * m * n * k * 1e-9 / ms
     memory_bandwidth_per_second = (
         sum(
@@ -91,7 +91,7 @@ def bench_deepgemm_batch_fp8_blackwell(batch_size, m, n, k, in_dtype, out_dtype)
     out = torch.empty((batch_size, m, n), device="cuda", dtype=out_dtype)
 
     # Benchmark the DeepGEMM function
-    ms = do_bench(
+    measurements = bench_gpu_time(
         lambda: batch_deepgemm_fp8_nt_groupwise(
             a_fp8,
             b_fp8,
@@ -102,9 +102,10 @@ def bench_deepgemm_batch_fp8_blackwell(batch_size, m, n, k, in_dtype, out_dtype)
             out=out,
             out_dtype=out_dtype,
         ),
-        warmup=100,
-        rep=1000,
+        dry_run_time_ms=100,
+        repeat_time_ms=1000,
     )
+    ms = np.median(measurements)
 
     tflops_per_second = 2 * batch_size * m * n * k * 1e-9 / ms
     memory_bandwidth_per_second = (
 
@@ -14,10 +14,11 @@
 limitations under the License.
 """
 
+import numpy as np
 import torch
-import triton
 
 import flashinfer
+from flashinfer.testing.utils import bench_gpu_time
 
 
 def bench_deepseek_mla_decode(batch_size, seq_len, num_heads, backend):
@@ -61,11 +62,12 @@ def bench_deepseek_mla_decode(batch_size, seq_len, num_heads, backend):
     )
     o = wrapper.run(q_nope, q_pe, ckv, kpe, return_lse=False)
 
-    ms = triton.testing.do_bench(
+    measurements = bench_gpu_time(
         lambda: wrapper.run(q_nope, q_pe, ckv, kpe),
-        warmup=100,
-        rep=1000,
+        dry_run_time_ms=100,
+        repeat_time_ms=1000,
     )
+    ms = np.median(measurements)
 
     io = sum([_.numel() * _.element_size() for _ in [q_nope, q_pe, ckv, kpe, o]])
     flops = 2 * batch_size * num_heads * (2 * head_dim_ckv + head_dim_kpe) * seq_len
 
@@ -1,10 +1,11 @@
 import argparse
 from typing import cast
 
+import numpy as np
 import torch
-from triton.testing import do_bench
 
 import flashinfer
+from flashinfer.testing.utils import bench_gpu_time
 
 
 @torch.inference_mode()
@@ -42,7 +43,8 @@ def fn() -> None:
                     flashinfer.fused_add_rmsnorm(x, residual, weight, eps)
 
                 # Run benchmarking
-                latency_ms = cast(float, do_bench(fn))
+                measurements = bench_gpu_time(fn)
+                latency_ms = np.median(measurements)
                 throughput = (
                     x.numel() * x.element_size() * 2
                     + residual.numel() * residual.element_size() * 2