Commit e1b256f
Merge commit 'b220c76447831e169df2c2d67f950e97774b2cd3'
2 parents: 1a94e46 + b220c76

13 files changed: +330 −115 lines

.github/workflows/integration-tests-amd.yml

Lines changed: 3 additions & 0 deletions
@@ -166,6 +166,9 @@ jobs:
           # Reenable test_functional_regression.py once it's fixed
           cd python/test/regression
           python3 -m pytest -s -n 8 ./test_cast_matmul.py
+      - name: Run microbenchmark tests
+        run: |
+          python3 python/test/microbenchmark/launch_overhead.py
       - name: Run Proton tests
         run: |
           unset HIP_VISIBLE_DEVICES

.github/workflows/integration-tests-nvidia.yml

Lines changed: 3 additions & 0 deletions
@@ -98,6 +98,9 @@ jobs:
        run: make test-interpret
      - name: Run regression tests
        run: make test-regression
+     - name: Run microbenchmark tests
+       # Microbenchmarks never fail, but running them gives us an easy way to track performance changes.
+       run: make test-microbenchmark
      - name: Run C++ unittests
        run: make test-cpp
      - name: Run Proton tests

Makefile

Lines changed: 4 additions & 0 deletions
@@ -60,6 +60,10 @@ test-gluon: all
 test-regression: all
 	$(PYTEST) -s -n $(NUM_PROCS) python/test/regression
 
+.PHONY: test-microbenchmark
+test-microbenchmark: all
+	$(PYTHON) python/test/microbenchmark/launch_overhead.py
+
 .PHONY: test-interpret
 test-interpret: all
 	cd python/test/unit && TRITON_INTERPRET=1 $(PYTEST) -s -n 16 -m interpreter cuda language/test_core.py language/test_standard.py \

python/src/gluon_ir.cc

Lines changed: 5 additions & 6 deletions
@@ -217,8 +217,8 @@ py::object layoutToGluon(Attribute layout) {
 
     return layouts.AMDMFMALayout(
         amdMfma.getVersion(), instrShape, amdMfma.getIsTransposed(),
-        toStdVector(amdMfma.getWarpsPerCTA()),
-        toStdVector(amdMfma.getTilesPerWarp()), layouts.GluonDType(typeName),
+        toStdVector(amdMfma.getWarpsPerCTA()), layouts.GluonDType(typeName),
+        toStdVector(amdMfma.getTilesPerWarp()),
         toStdVector(ctaLayout.getCTAsPerCGA()),
         toStdVector(ctaLayout.getCTASplitNum()),
         toStdVector(ctaLayout.getCTAOrder()));

@@ -325,13 +325,12 @@ void init_gluon_ir(py::module &&m) {
       })
       .def("get_amd_mfma_layout",
           [](GluonOpBuilder &self, unsigned version,
+             std::vector<unsigned> &instrShape, bool transposed,
+             std::vector<unsigned> &warpsPerCta, mlir::Type elemType,
              std::vector<unsigned> &tilesPerWarp,
-             std::vector<unsigned> &warpsPerCta,
             std::vector<unsigned> &ctasPerCga,
             std::vector<unsigned> &ctaSplitNum,
-             std::vector<unsigned> &ctaOrder,
-             std::vector<unsigned> &instrShape, bool transposed,
-             mlir::Type elemType) -> Attribute {
+             std::vector<unsigned> &ctaOrder) -> Attribute {
            auto ctx = self.getContext();
            auto ctaLayout = self.getChecked<ttg::CTALayoutAttr>(
                ctx, ctasPerCga, ctaSplitNum, ctaOrder);
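
Editor's note: the reordered binding above lines the C++ lambda's parameters up with the Python-side AMDMFMALayout constructor (version, instr_shape, transposed, warps_per_cta, then elem_type / tiles_per_warp and the CTA fields). A minimal sketch of the corresponding Gluon-side construction, mirroring the keyword order used in the tests below; the omitted CTA arguments falling back to defaults is inferred from those tests, not stated here:

from triton.experimental.gluon import language as ttgl

# Keyword order mirrors the reordered binding: version, instr_shape, transposed,
# warps_per_cta; elem_type, tiles_per_warp and the CTA fields default when omitted.
mfma = ttgl.amd.AMDMFMALayout(version=3, instr_shape=[32, 32], transposed=True,
                              warps_per_cta=[4, 1])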

python/test/gluon/test_core.py

Lines changed: 64 additions & 1 deletion
@@ -1,7 +1,7 @@
 import torch
 import pytest
 
-from triton._internal_testing import is_cuda, is_ampere_or_newer, is_hopper_or_newer, is_hopper
+from triton._internal_testing import is_cuda, is_ampere_or_newer, is_hip_cdna3, is_hip_cdna4, is_hopper_or_newer, is_hopper
 from triton.experimental import gluon
 from triton.experimental.gluon import language as ttgl
 from triton.experimental.gluon.language.nvidia.ampere import async_copy, mbarrier

@@ -143,3 +143,66 @@ def test_warpgroup_mma(ASYNC):
     ref = torch.matmul(a, b)
 
     torch.testing.assert_close(out, ref, atol=1e-3, rtol=1e-1)
+
+
+@pytest.mark.parametrize("M, N, K", [(32, 32, 16), (16, 16, 32)])
+@pytest.mark.parametrize("in_dtype", ['float16', 'bfloat16'])
+@pytest.mark.parametrize("num_warps", [4, 8])
+@pytest.mark.parametrize("cdna_version", [3, 4])
+def test_amd_mfma(M, N, K, in_dtype, num_warps, cdna_version):
+
+    @gluon.jit
+    def kernel(a_ptr, b_ptr, c_ptr, stride_am, stride_ak,  #
+               stride_bk, stride_bn,  #
+               stride_cm, stride_cn, BLOCK_SIZE_M: ttgl.constexpr, BLOCK_SIZE_N: ttgl.constexpr,
+               BLOCK_SIZE_K: ttgl.constexpr, blocked: ttgl.constexpr, mfma_layout: ttgl.constexpr):
+        dot_a_layout: ttgl.constexpr = ttgl.DotOperandLayout(operand_index=0, parent=mfma_layout, k_width=8)
+        dot_b_layout: ttgl.constexpr = ttgl.DotOperandLayout(operand_index=1, parent=mfma_layout, k_width=8)
+
+        offs_am = ttgl.arange(0, BLOCK_SIZE_M, layout=ttgl.SliceLayout(1, blocked))
+        offs_bn = ttgl.arange(0, BLOCK_SIZE_N, layout=ttgl.SliceLayout(0, blocked))
+
+        offs_ak = ttgl.arange(0, BLOCK_SIZE_K, layout=ttgl.SliceLayout(0, blocked))
+        offs_bk = ttgl.arange(0, BLOCK_SIZE_K, layout=ttgl.SliceLayout(1, blocked))
+        offs_a = offs_am[:, None] * stride_am + offs_ak[None, :] * stride_ak
+        offs_b = offs_bk[:, None] * stride_bk + offs_bn[None, :] * stride_bn
+
+        a = ttgl.amd.cdna3.buffer_load(ptr=a_ptr, offsets=offs_a)
+        b = ttgl.amd.cdna3.buffer_load(ptr=b_ptr, offsets=offs_b)
+        a1 = ttgl.convert_layout(a, layout=dot_a_layout)
+        b1 = ttgl.convert_layout(b, layout=dot_b_layout)
+        acc = ttgl.zeros([BLOCK_SIZE_M, BLOCK_SIZE_N], ttgl.float32, mfma_layout)
+        c = ttgl.amd.cdna3.mfma(a1, b1, acc)
+        c = ttgl.convert_layout(c, layout=blocked)
+        c = c.to(a_ptr.dtype.element_ty)
+
+        offs_cm = ttgl.arange(0, BLOCK_SIZE_M, layout=ttgl.SliceLayout(1, blocked))
+        offs_cn = ttgl.arange(0, BLOCK_SIZE_N, layout=ttgl.SliceLayout(0, blocked))
+        offs_c = offs_cm[:, None] * stride_cm + offs_cn[None, :] * stride_cn
+        ttgl.amd.cdna3.buffer_store(stored_value=c, ptr=c_ptr, offsets=offs_c)
+
+    if not is_hip_cdna4() and not is_hip_cdna3():
+        pytest.skip("mfma requires target to be CDNA3 or CDNA4")
+
+    if is_hip_cdna3() and cdna_version != 3:
+        pytest.skip("On CDNA3 target, skip if mfma version is not 3")
+
+    if is_hip_cdna4() and cdna_version != 4:
+        pytest.skip("On CDNA4 target, skip if mfma version is not 4")
+
+    elem_type = torch.float16 if in_dtype == 'float16' else torch.bfloat16
+    a = torch.randn((M, K), device='cuda', dtype=elem_type) - 0.5
+    b = torch.randn((K, N), device='cuda', dtype=elem_type) - 0.5
+    c = torch.empty((M, N), device=a.device, dtype=elem_type)
+    nonkdim: ttgl.constexpr = 32
+    blocked: ttgl.constexpr = ttgl.BlockedLayout(size_per_thread=[4, 4], threads_per_warp=[4, 16],
+                                                 warps_per_cta=[num_warps, 1], order=[1, 0])
+    mfma_layout: ttgl.constexpr = ttgl.amd.AMDMFMALayout(version=cdna_version, instr_shape=[nonkdim, nonkdim],
+                                                         transposed=True, warps_per_cta=[num_warps, 1])
+
+    kernel[1, 1](a, b, c, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1), BLOCK_SIZE_M=M,
+                 BLOCK_SIZE_N=N, BLOCK_SIZE_K=K, blocked=blocked, mfma_layout=mfma_layout, num_warps=num_warps)
+
+    ref = torch.matmul(a, b)
+    triton_output = c
+    torch.testing.assert_close(ref, triton_output)

python/test/gluon/test_frontend.py

Lines changed: 78 additions & 35 deletions
@@ -1413,31 +1413,27 @@ def test_atomic_cas():
 
 @gluon.jit
 def amd_mfma_layout_kernel():
-    mfma_layout_fp32: ttgl.constexpr = amd_layouts.AMDMFMALayout(version=3, instr_shape=[32, 32], transposed=True,
-                                                                 warps_per_cta=[4, 1], tiles_per_warp=[4, 1],
-                                                                 ctas_per_cga=[1,
-                                                                               1], cta_split_num=[1,
-                                                                                                  1], cta_order=[1, 0])
+    ttgl.full([128, 32], 0, ttgl.float32, layout=amd_layouts.AMDMFMALayout(version=3, instr_shape=[32, 32],
+                                                                           transposed=True, warps_per_cta=[4, 1]))
 
-    mfma_layout_fp64: ttgl.constexpr = amd_layouts.AMDMFMALayout(version=3, instr_shape=[16, 16], transposed=True,
-                                                                 warps_per_cta=[4, 1], tiles_per_warp=[4, 1],
-                                                                 elem_type=ttgl.float64, ctas_per_cga=[1, 1],
-                                                                 cta_split_num=[1, 1], cta_order=[1, 0])
+    ttgl.full([128, 32], 0, ttgl.float32,
+              layout=amd_layouts.AMDMFMALayout(version=3, instr_shape=[32, 32], tiles_per_warp=[4, 1], transposed=True,
+                                               warps_per_cta=[4, 1]))
 
-    mfma_layout_int32: ttgl.constexpr = amd_layouts.AMDMFMALayout(version=3, instr_shape=[16, 16], transposed=True,
-                                                                  warps_per_cta=[4, 1], tiles_per_warp=[4, 1],
-                                                                  elem_type=ttgl.int32, ctas_per_cga=[1, 1],
-                                                                  cta_split_num=[1, 1], cta_order=[1, 0])
+    ttgl.full([128, 32], 0, ttgl.float32,
+              layout=amd_layouts.AMDMFMALayout(version=3, instr_shape=[32, 32], transposed=True, warps_per_cta=[4, 1],
+                                               ctas_per_cga=[1, 1], tiles_per_warp=[1, 1], cta_split_num=[1, 1],
+                                               cta_order=[1, 0]))
 
-    layout: ttgl.constexpr = ttgl.BlockedLayout([1, 1], [1, 64], [4, 1], [1, 0])
+    ttgl.full([128, 32], 0, ttgl.float64,
+              layout=amd_layouts.AMDMFMALayout(version=3, instr_shape=[16, 16], transposed=True, warps_per_cta=[4, 1],
+                                               elem_type=ttgl.float64, tiles_per_warp=[1, 1], ctas_per_cga=[1, 1],
+                                               cta_split_num=[1, 1], cta_order=[1, 0]))
 
-    x_fp32 = ttgl.full([128, 32], 0, ttgl.float32, layout)
-    x_fp64 = ttgl.full([128, 32], 0, ttgl.float64, layout)
-    x_int32 = ttgl.full([128, 32], 0, ttgl.int32, layout)
-
-    ttgl.convert_layout(x_fp32, mfma_layout_fp32)
-    ttgl.convert_layout(x_fp64, mfma_layout_fp64)
-    ttgl.convert_layout(x_int32, mfma_layout_int32)
+    ttgl.full([128, 32], 0, ttgl.int32,
+              layout=amd_layouts.AMDMFMALayout(version=3, instr_shape=[16, 16], transposed=True, warps_per_cta=[4, 1],
+                                               elem_type=ttgl.int32, tiles_per_warp=[1, 1], ctas_per_cga=[1, 1],
+                                               cta_split_num=[1, 1]))
 
 
 @pytest.mark.parametrize("target", [HIP_TARGET_CDNA3, HIP_TARGET_CDNA4])

@@ -1446,21 +1442,22 @@ def test_amd_mfma_layout(target):
     module = run_parser(amd_mfma_layout_kernel, target=target)
     expecttest.assert_expected_inline(
         anonymize_ir(module.str_nodebug()), """\
-#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}>
-#mma = #ttg.amd_mfma<{version = 3, warpsPerCTA = [4, 1], tilesPerWarp = [4, 1], instrShape = [32, 32], isTransposed = true}>
-#mma1 = #ttg.amd_mfma<{version = 3, warpsPerCTA = [4, 1], tilesPerWarp = [4, 1], instrShape = [16, 16], isTransposed = true, elementType = f64}>
-#mma2 = #ttg.amd_mfma<{version = 3, warpsPerCTA = [4, 1], tilesPerWarp = [4, 1], instrShape = [16, 16], isTransposed = true, elementType = i32}>
+#mma = #ttg.amd_mfma<{version = 3, warpsPerCTA = [4, 1], instrShape = [32, 32], isTransposed = true}>
+#mma1 = #ttg.amd_mfma<{version = 3, warpsPerCTA = [4, 1], tilesPerWarp = [4, 1], instrShape = [32, 32], isTransposed = true}>
+#mma2 = #ttg.amd_mfma<{version = 3, warpsPerCTA = [4, 1], instrShape = [16, 16], isTransposed = true, elementType = f64}>
+#mma3 = #ttg.amd_mfma<{version = 3, warpsPerCTA = [4, 1], instrShape = [16, 16], isTransposed = true, elementType = i32}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 64 : i32} {
   tt.func public @amd_mfma_layout_kernel() attributes {noinline = false} {
     %cst = arith.constant 0.000000e+00 : f32
-    %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x32xf32, #blocked>
-    %cst_1 = arith.constant 0.000000e+00 : f64
-    %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x32xf64, #blocked>
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x32xf32, #mma>
+    %cst_1 = arith.constant 0.000000e+00 : f32
+    %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x32xf32, #mma1>
+    %cst_3 = arith.constant 0.000000e+00 : f32
+    %cst_4 = arith.constant dense<0.000000e+00> : tensor<128x32xf32, #mma>
+    %cst_5 = arith.constant 0.000000e+00 : f64
+    %cst_6 = arith.constant dense<0.000000e+00> : tensor<128x32xf64, #mma2>
     %c0_i32 = arith.constant 0 : i32
-    %cst_3 = arith.constant dense<0> : tensor<128x32xi32, #blocked>
-    %0 = ttg.convert_layout %cst_0 : tensor<128x32xf32, #blocked> -> tensor<128x32xf32, #mma>
-    %1 = ttg.convert_layout %cst_2 : tensor<128x32xf64, #blocked> -> tensor<128x32xf64, #mma1>
-    %2 = ttg.convert_layout %cst_3 : tensor<128x32xi32, #blocked> -> tensor<128x32xi32, #mma2>
+    %cst_7 = arith.constant dense<0> : tensor<128x32xi32, #mma3>
     tt.return
   }
 }

@@ -1475,8 +1472,8 @@ def add_int(a, b):
 @gluon.jit
 def infer_layout_for_amd_mfma_kernel():
     layout: ttgl.constexpr = amd_layouts.AMDMFMALayout(version=3, instr_shape=[32, 32], transposed=True,
-                                                       elem_type=ttgl.int32, warps_per_cta=[4,
-                                                                                            1], tiles_per_warp=[4, 1],
+                                                       warps_per_cta=[4,
+                                                                      1], elem_type=ttgl.int32, tiles_per_warp=[1, 1],
                                                        ctas_per_cga=[1, 1], cta_split_num=[1, 1], cta_order=[1, 0])
     a = ttgl.full([128, 32], 1, ttgl.int32, layout)
     b = ttgl.reduce(a, 1, add_int)

@@ -1489,7 +1486,7 @@ def test_infer_layout_for_amd_mfma(target):
 
     expecttest.assert_expected_inline(
         anonymize_ir(module.str_nodebug()), """\
-#mma = #ttg.amd_mfma<{version = 3, warpsPerCTA = [4, 1], tilesPerWarp = [4, 1], instrShape = [32, 32], isTransposed = true, elementType = i32}>
+#mma = #ttg.amd_mfma<{version = 3, warpsPerCTA = [4, 1], instrShape = [32, 32], isTransposed = true, elementType = i32}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 64 : i32} {
   tt.func public @infer_layout_for_amd_mfma_kernel() attributes {noinline = false} {
     %c1_i32 = arith.constant 1 : i32

@@ -1719,3 +1716,49 @@ def test_buffer_load_store_with_broadcast(target):
   }
 }
 """)
+
+
+@pytest.mark.parametrize("target", [HIP_TARGET_CDNA3, HIP_TARGET_CDNA4])
+def test_amd_mfma(target):
+
+    @gluon.jit
+    def kernel():
+        mfma_layout: ttgl.constexpr = ttgl.amd.AMDMFMALayout(version=3, instr_shape=[32, 32], transposed=True,
+                                                             warps_per_cta=[4, 1])
+
+        a = ttgl.full([64, 32], 1.0, ttgl.float32, layout=ttgl.DotOperandLayout(operand_index=0, parent=mfma_layout,
+                                                                                k_width=8))
+        b = ttgl.full([32, 64], 2.0, ttgl.float32, layout=ttgl.DotOperandLayout(operand_index=1, parent=mfma_layout,
+                                                                                k_width=8))
+
+        acc = ttgl.zeros([64, 64], ttgl.float32, mfma_layout)
+        acc = ttgl.amd.cdna3.mfma(a, b, acc)
+        ttgl.static_assert(isinstance(acc, ttgl.tensor))
+        ttgl.static_assert(acc.type.layout == mfma_layout)
+
+    module = run_parser(kernel, target=target)
+
+    expecttest.assert_expected_inline(
+        anonymize_ir(module.str_nodebug()), """\
+#mma = #ttg.amd_mfma<{version = 3, warpsPerCTA = [4, 1], instrShape = [32, 32], isTransposed = true}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @kernel() attributes {noinline = false} {
+    %cst = arith.constant 1.000000e+00 : f32
+    %cst_0 = arith.constant dense<1.000000e+00> : tensor<64x32xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
+    %cst_1 = arith.constant 2.000000e+00 : f32
+    %cst_2 = arith.constant dense<2.000000e+00> : tensor<32x64xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>
+    %0 = tt.call @"triton.experimental.gluon.language._standard.zeros____(0, 0)cconstexpr_64__(0, 1)cconstexpr_64__(1,)cconstexpr_fp32__(2,)cconstexpr_AMDMFMALayout(version=3, instr_shape=(32 ,32), transposed=True, warps_per_cta=(4 ,1), elem_type=triton_d_language_d_float32, tiles_per_warp=_1, 1_, ctas_per_cga=_1, 1_, cta_split_num=_1, 1_, cta_order=_1, 0_)_"() : () -> tensor<64x64xf32, #mma>
+    %cst_3 = arith.constant 0.000000e+00 : f32
+    %1 = tt.dot %cst_0, %cst_2, %0 : tensor<64x32xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>> * tensor<32x64xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>> -> tensor<64x64xf32, #mma>
+    tt.return
+  }
+  tt.func private @"triton.experimental.gluon.language._standard.zeros____(0, 0)cconstexpr_64__(0, 1)cconstexpr_64__(1,)cconstexpr_fp32__(2,)cconstexpr_AMDMFMALayout(version=3, instr_shape=(32 ,32), transposed=True, warps_per_cta=(4 ,1), elem_type=triton_d_language_d_float32, tiles_per_warp=_1, 1_, ctas_per_cga=_1, 1_, cta_split_num=_1, 1_, cta_order=_1, 0_)_"() -> tensor<64x64xf32, #mma> attributes {noinline = false} {
+    %cst = arith.constant 0.000000e+00 : f32
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #mma>
+    tt.return %cst_0 : tensor<64x64xf32, #mma>
+  ^bb1:  // no predecessors
+    %0 = ub.poison : tensor<64x64xf32, #mma>
+    tt.return %0 : tensor<64x64xf32, #mma>
+  }
+}
+""")
python/test/microbenchmark/launch_overhead.py

Lines changed: 95 additions & 0 deletions

@@ -0,0 +1,95 @@
+"""
+Original code by @bertmaher; profiling added by @apgoucher
+"""
+
+import cProfile
+import pstats
+import time
+
+import numpy as np
+import torch
+
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def nop_args(
+    t1,
+    t2,
+    t3,
+    t4,
+    t5,
+    i1,
+    i2,
+    i3,
+    i4,
+    i5,
+    i6,
+    i7,
+    i8,
+    i9,
+    c1: tl.constexpr,
+    c2: tl.constexpr,
+    c3: tl.constexpr,
+    c4: tl.constexpr,
+    c5: tl.constexpr,
+):
+    pass
+
+
+def do_bench_walltime(fn):
+    print("Compiling...")
+    fn()
+    torch.cuda.synchronize()
+
+    for _ in range(1000):
+        fn()
+    torch.cuda.synchronize()
+
+    n_repeat = 10000
+
+    mses = []
+
+    for _ in range(25):
+        print("Running %d benchmarking iterations..." % n_repeat)
+        # Benchmark
+        torch.cuda.synchronize()
+        start_time = time.time()
+        for _ in range(n_repeat):
+            fn()
+        torch.cuda.synchronize()
+        end_time = time.time()
+        wall_time_ms = (end_time - start_time) * 1e3 / n_repeat
+        mses.append(wall_time_ms)
+
+    mses = np.array(mses)
+
+    print("Running profiler...")
+    profile = cProfile.Profile()
+    profile.enable()
+    for _ in range(n_repeat):
+        fn()
+    torch.cuda.synchronize()
+    profile.disable()
+    stats = pstats.Stats(profile)
+    stats.sort_stats("time")
+    stats.print_stats()
+    return mses
+
+
+def main():
+    targs = [torch.zeros(1, device="cuda") for _ in range(5)]
+    iargs = [1 for _ in range(9)]
+    cargs = [32 for _ in range(5)]
+
+    usecs = do_bench_walltime(lambda: nop_args[
+        1,
+    ](*targs, *iargs, *cargs)) * 1000.0
+
+    print(usecs)
+    print(sorted(usecs)[len(usecs) >> 1])
+
+
+if __name__ == "__main__":
+    main()
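
Editor's note: do_bench_walltime is generic over the callable it times, so the same helper can be pointed at other launches to track their wall-clock overhead. A hedged sketch of such reuse; the add_one kernel below is a made-up placeholder, and the helper is assumed to be imported or copied alongside it (this snippet is not part of the commit):

import torch
import triton
import triton.language as tl


@triton.jit
def add_one(x_ptr, n, BLOCK: tl.constexpr):
    # Trivial kernel used only to exercise the launch path.
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n
    tl.store(x_ptr + offs, tl.load(x_ptr + offs, mask=mask) + 1, mask=mask)


x = torch.zeros(1024, device="cuda")
# Reuse the walltime helper above to measure this launch's per-call time in ms.
ms = do_bench_walltime(lambda: add_one[(1, )](x, x.numel(), BLOCK=1024))
print(ms.mean())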

python/test/unit/runtime/test_driver.py

Lines changed: 4 additions & 4 deletions
@@ -10,11 +10,11 @@ def test_is_lazy():
     from importlib import reload
     reload(sys.modules["triton.runtime.driver"])
     reload(sys.modules["triton.runtime"])
-    mod = sys.modules[triton.runtime.driver.__module__]
-    assert isinstance(triton.runtime.driver.active, getattr(mod, "LazyProxy"))
-    assert triton.runtime.driver.active._obj is None
+    assert triton.runtime.driver._active is None
+    assert triton.runtime.driver._default is None
+    assert isinstance(triton.runtime.driver.active, getattr(triton.backends.driver, "DriverBase"))
+    assert isinstance(triton.runtime.driver.default, getattr(triton.backends.driver, "DriverBase"))
     utils = triton.runtime.driver.active.utils  # noqa: F841
-    assert issubclass(triton.runtime.driver.active._obj.__class__, getattr(triton.backends.driver, "DriverBase"))
 
 
 def test_kernel_in_thread(device):
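
Editor's note: per the updated assertions, triton.runtime.driver.active and triton.runtime.driver.default now resolve lazily (the private _active/_default slots start as None) and return concrete DriverBase instances rather than LazyProxy wrappers. A minimal usage sketch, assuming a working GPU backend is installed:

import triton
from triton.backends.driver import DriverBase

drv = triton.runtime.driver.active   # first access resolves the backend driver
assert isinstance(drv, DriverBase)
utils = drv.utils                    # backend utilities, as exercised in the test above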
