Skip to content

Commit b220c76

Browse files
Add launch overhead microbenchmark (#7849)
Adding a script from @apgoucher to track dispatch overhead. The script never fails, but it lets us see the launch overhead whenever we make front-end changes, so we can tell whether there are significant regressions. --------- Co-authored-by: peterbell10 <[email protected]>
1 parent f971e9f commit b220c76

File tree

4 files changed

+105
-0
lines changed

4 files changed

+105
-0
lines changed

.github/workflows/integration-tests-amd.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,9 @@ jobs:
164164
# Reenable test_functional_regression.py once it's fixed
165165
cd python/test/regression
166166
python3 -m pytest -s -n 8 ./test_cast_matmul.py
167+
- name: Run microbenchmark tests
168+
run: |
169+
python3 python/test/microbenchmark/launch_overhead.py
167170
- name: Run Proton tests
168171
run: |
169172
unset HIP_VISIBLE_DEVICES

.github/workflows/integration-tests-nvidia.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,9 @@ jobs:
9696
run: make test-interpret
9797
- name: Run regression tests
9898
run: make test-regression
99+
- name: Run microbenchmark tests
100+
# Microbenchmarks never fail, but running them gives us an easy way to track performance changes.
101+
run: make test-microbenchmark
99102
- name: Run C++ unittests
100103
run: make test-cpp
101104
- name: Run Proton tests

Makefile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,10 @@ test-gluon: all
6060
test-regression: all
6161
$(PYTEST) -s -n $(NUM_PROCS) python/test/regression
6262

63+
.PHONY: test-microbenchmark
64+
test-microbenchmark: all
65+
$(PYTHON) python/test/microbenchmark/launch_overhead.py
66+
6367
.PHONY: test-interpret
6468
test-interpret: all
6569
cd python/test/unit && TRITON_INTERPRET=1 $(PYTEST) -s -n 16 -m interpreter cuda language/test_core.py language/test_standard.py \
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
"""
2+
Original code by @bertmaher; profiling added by @apgoucher
3+
"""
4+
5+
import cProfile
6+
import pstats
7+
import time
8+
9+
import numpy as np
10+
import torch
11+
12+
import triton
13+
import triton.language as tl
14+
15+
16+
@triton.jit
def nop_args(
    t1, t2, t3, t4, t5,
    i1, i2, i3, i4, i5, i6, i7, i8, i9,
    c1: tl.constexpr,
    c2: tl.constexpr,
    c3: tl.constexpr,
    c4: tl.constexpr,
    c5: tl.constexpr,
):
    """No-op kernel taking 5 tensor, 9 integer, and 5 constexpr arguments.

    The body is intentionally empty so that the measured time is dominated
    by Python-side argument processing and launch/dispatch overhead.
    """
    pass
39+
40+
41+
def do_bench_walltime(fn):
    """Benchmark ``fn`` by wall-clock time, then profile it.

    Runs ``fn`` once to trigger compilation, warms up with 1000 calls,
    then measures 25 rounds of ``n_repeat`` calls each, recording the mean
    wall time per call (in milliseconds) for every round. Finally runs
    ``fn`` under cProfile for another ``n_repeat`` calls and prints the
    functions sorted by internal time.

    Parameters:
        fn: zero-argument callable to benchmark (expected to enqueue CUDA
            work; we synchronize around each timed region).

    Returns:
        numpy array of 25 per-call wall times in milliseconds.
    """
    print("Compiling...")
    fn()
    torch.cuda.synchronize()

    # Warm-up so caches (kernel binaries, launch metadata) are hot.
    for _ in range(1000):
        fn()
    torch.cuda.synchronize()

    n_repeat = 10000

    wall_times_ms = []

    for _ in range(25):
        print("Running %d benchmarking iterations..." % n_repeat)
        # Benchmark
        torch.cuda.synchronize()
        # perf_counter is monotonic and high-resolution; time.time can jump
        # with wall-clock adjustments and has coarser granularity.
        start_time = time.perf_counter()
        for _ in range(n_repeat):
            fn()
        torch.cuda.synchronize()
        end_time = time.perf_counter()
        wall_times_ms.append((end_time - start_time) * 1e3 / n_repeat)

    print("Running profiler...")
    profile = cProfile.Profile()
    profile.enable()
    for _ in range(n_repeat):
        fn()
    torch.cuda.synchronize()
    profile.disable()
    stats = pstats.Stats(profile)
    stats.sort_stats("time")
    stats.print_stats()
    return np.array(wall_times_ms)
79+
80+
81+
def main():
    """Benchmark the launch overhead of a no-op kernel and print it in µs."""
    tensor_args = [torch.zeros(1, device="cuda") for _ in range(5)]
    int_args = [1] * 9
    constexpr_args = [32] * 5

    # Measure wall time per launch; convert milliseconds -> microseconds.
    usecs = do_bench_walltime(lambda: nop_args[
        1,
    ](*tensor_args, *int_args, *constexpr_args)) * 1000.0

    print(usecs)
    # Print the median of the 25 measurements (odd count => middle element).
    ordered = sorted(usecs)
    print(ordered[len(ordered) // 2])
92+
93+
94+
# Script entry point: run the microbenchmark when executed directly.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)