import torch
import triton
+ from torch._inductor.runtime.benchmarking import benchmarker

NS_TO_MS = 1e-6
@@ -125,6 +126,42 @@ def _summarize_statistics(times, quantiles, return_mode):
    return getattr(torch, return_mode)(times).item()


+ def _do_bench_inductor(fn, warmup, rep, grad_to_none=None):
+     """Measure latency using the inductor benchmarker.
+
+     Args:
+         warmup: Target warmup time in milliseconds (matches triton.testing.do_bench).
+         rep: Target total measurement time in milliseconds (matches triton.testing.do_bench).
+         grad_to_none: Tensors whose gradients should be cleared before each measurement.
+
+     Returns:
+         List of measured times in milliseconds.
+     """
+     # First, estimate the runtime with a quick estimation pass.
+     estimate_ms = benchmarker.benchmark_gpu(fn, estimation_iters=5, benchmark_iters=10)
+
+     # Derive the number of iterations from the target rep time, the same way
+     # triton.testing.do_bench calculates its repeat count.
+     if estimate_ms == 0:
+         n_repeat = 1000  # Fallback if the function is too fast to measure.
+     else:
+         n_repeat = max(1, int(rep / estimate_ms))
+
+     # Collect one measurement per iteration, like triton.testing.do_bench
+     # with return_mode="all".
+     times_ms = []
+     for _ in range(n_repeat):
+         # Clear gradients BEFORE timing (like triton.testing.do_bench).
+         if grad_to_none is not None:
+             for x in grad_to_none:
+                 x.grad = None
+
+         # Measure only the function execution time.
+         ms_time = benchmarker.benchmark_gpu(fn)
+         times_ms.append(ms_time)
+
+     return times_ms
+
+
def _do_bench_cpu(
    fn, warmup, rep=20, grad_to_none=None, quantiles=None, return_mode="mean"
):
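Note on the helper above: warmup is accepted for signature parity with triton.testing.do_bench but is never read in the body; benchmarker.benchmark_gpu presumably performs its own warmup internally. As a worked sketch of the repeat-count heuristic (the numbers here are hypothetical, not taken from the patch):

    rep = 100           # target total measurement time in ms (hypothetical)
    estimate_ms = 0.05  # estimated single-call latency in ms (hypothetical)
    n_repeat = max(1, int(rep / estimate_ms))  # -> 2000 measurements collected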
@@ -174,8 +211,13 @@ def do_bench_wrapper(
    device: str = "cuda",
    use_cuda_graphs: bool = False,
    bypass_fail: bool = False,
+     latency_measure_mode: str = "triton_do_bench",
) -> Optional[Latency]:
-     """Wrapper to triton's do_bench to gain latency."""
+     """Wrapper around triton's do_bench to measure latency.
+
+     Args:
+         latency_measure_mode: Either "triton_do_bench" (default) or "inductor_benchmarker".
+     """
    try:
        if device == "cpu":
            return Latency(
@@ -198,15 +240,25 @@ def do_bench_wrapper(
                )
            )
        else:
-             return Latency(
-                 times=triton.testing.do_bench(
-                     fn,
-                     warmup=warmup,
-                     rep=rep,
-                     return_mode="all",
-                     grad_to_none=grad_to_none,
+             if latency_measure_mode == "inductor_benchmarker":
+                 return Latency(
+                     times=_do_bench_inductor(
+                         fn,
+                         warmup=warmup,
+                         rep=rep,
+                         grad_to_none=grad_to_none,
+                     )
+                 )
+             else:  # default to triton do_bench
+                 return Latency(
+                     times=triton.testing.do_bench(
+                         fn,
+                         warmup=warmup,
+                         rep=rep,
+                         return_mode="all",
+                         grad_to_none=grad_to_none,
+                     )
                )
-             )
    except Exception as e:
        if not bypass_fail:
            raise e
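For reference, a minimal usage sketch of the new path (assumptions: a CUDA device is available, and the matmul workload, tensor shapes, and warmup/rep values are illustrative rather than from the patch):

    import torch

    a = torch.randn(1024, 1024, device="cuda")
    b = torch.randn(1024, 1024, device="cuda")

    # warmup/rep follow the triton.testing.do_bench convention (milliseconds).
    times_ms = _do_bench_inductor(lambda: torch.mm(a, b), warmup=25, rep=100)
    print(f"median latency: {torch.tensor(times_ms).median().item():.4f} ms")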