
Commit aac457e

[TUTORIAL] Replace legacy host side TMA with TensorDescriptor (#6465)
1 parent 0fd62f6 commit aac457e

File tree: 3 files changed (+56, -153 lines)


python/triton/tools/experimental_descriptor.py

Lines changed: 8 additions & 0 deletions
@@ -46,3 +46,11 @@ class TensorDescriptor:
     shape: List[int]
     strides: List[int]
     block_shape: List[int]
+
+    def from_tensor(tensor: Any, block_shape: List[int]):
+        return TensorDescriptor(
+            tensor,
+            tensor.shape,
+            tensor.stride(),
+            block_shape,
+        )
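A minimal host-side usage sketch of the new helper (illustrative only, not part of this diff); it assumes a contiguous CUDA tensor and a Triton build that ships this TensorDescriptor class:

import torch
from triton.tools.experimental_descriptor import TensorDescriptor

x = torch.randn(1024, 128, device="cuda", dtype=torch.float16)

# from_tensor reads shape and strides from the tensor itself; only the
# per-copy block shape has to be spelled out.
desc = TensorDescriptor.from_tensor(x, [64, 128])

# The explicit constructor remains useful when the logical 2D view differs
# from the tensor's own metadata (e.g. flattened batch/head dimensions).
flat_desc = TensorDescriptor(x, shape=[1024, 128], strides=[128, 1], block_shape=[64, 128])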

python/tutorials/06-fused-attention.py

Lines changed: 21 additions & 92 deletions
@@ -15,7 +15,7 @@
 
 import pytest
 import torch
-import triton.tools.experimental_descriptor
+from triton.tools.experimental_descriptor import TensorDescriptor
 
 import triton
 import triton.language as tl
@@ -43,68 +43,6 @@ def supports_tma():
     print("TMA benchmarks will be running without grid constant TMA descriptor.", )
 
 
-# TmaAutoTuneHelper used in htyu's PR #5622
-class TmaAutoTuneHelper:
-
-    # duck typing wrapper to implement the same interface as TmaDescKernelParam in Triton PR #4498
-    class KernelParamWrapper:
-
-        def __init__(self, desc):
-            self.desc = desc
-
-        def tma_desc_cpu_ptr(self):
-            return self.desc.data_ptr()
-
-    TMA_SIZE = 128
-
-    def __init__(self):
-        self.fill_1d_tma_descriptor_inner = (triton.runtime.driver.active.utils.fill_1d_tma_descriptor)
-        self.fill_2d_tma_descriptor_inner = (triton.runtime.driver.active.utils.fill_2d_tma_descriptor)
-        if HAS_TMA_DESC:
-            self.descriptors = {}
-        else:
-            self.cuda_descriptors = {}
-
-    # Call this method outside of the lambda function for grid size
-    def init_tma_descriptor(self, name):
-        if HAS_TMA_DESC:
-            self.descriptors[name] = torch.empty(TmaAutoTuneHelper.TMA_SIZE, device="cpu", dtype=torch.int8)
-        else:
-            self.cuda_descriptors[name] = torch.empty(TmaAutoTuneHelper.TMA_SIZE, device="cuda", dtype=torch.int8)
-
-    # Call this method inside the lambda function for grid size
-    def fill_1d_tma_descriptor(self, name, ptr, dim, block_dim, element_size):
-        if HAS_TMA_DESC:
-            desc_x = self.descriptors[name]
-            assert desc_x.data_ptr() % 64 == 0
-            self.fill_1d_tma_descriptor_inner(ptr, dim, block_dim, element_size, desc_x.data_ptr())
-        else:
-            desc_x = self.cuda_descriptors[name]
-            buf_x = torch.empty_like(desc_x, device="cpu", pin_memory=True)
-            self.fill_1d_tma_descriptor_inner(ptr, dim, block_dim, element_size, buf_x.data_ptr())
-            desc_x.copy_(buf_x, non_blocking=True)
-
-    # Call this method inside the lambda function for grid size
-    def fill_2d_tma_descriptor(self, name, ptr, dim1, dim0, block_dim1, block_dim0, element_size):
-        if HAS_TMA_DESC:
-            desc_x = self.descriptors[name]
-            assert desc_x.data_ptr() % 64 == 0
-            self.fill_2d_tma_descriptor_inner(ptr, dim1, dim0, block_dim1, block_dim0, element_size, desc_x.data_ptr())
-        else:
-            desc_x = self.cuda_descriptors[name]
-            buf_x = torch.empty_like(desc_x, device="cpu", pin_memory=True)
-            self.fill_2d_tma_descriptor_inner(ptr, dim1, dim0, block_dim1, block_dim0, element_size, buf_x.data_ptr())
-            desc_x.copy_(buf_x, non_blocking=True)
-
-    def get_tma_descriptor_kernel_param(self, name):
-        if HAS_TMA_DESC:
-            assert self.descriptors[name] is not None
-            return self.KernelParamWrapper(self.descriptors[name])
-        else:
-            assert self.cuda_descriptors[name] is not None
-            return self.cuda_descriptors[name]
-
-
 @triton.jit
 def _attn_fwd_inner(acc, l_i, m_i, q, #
                     K_block_ptr, V_block_ptr, #
@@ -179,7 +117,7 @@ def _attn_fwd_inner_tma(acc, l_i, m_i, q, #
     for start_n in range(lo, hi, BLOCK_N):
         start_n = tl.multiple_of(start_n, BLOCK_N)
         # -- compute qk ----
-        k = tl._experimental_descriptor_load(desc_k, [offsetkv_y, 0], [BLOCK_N, HEAD_DIM], dtype).T
+        k = desc_k.load([offsetkv_y, 0]).T
         qk = tl.dot(q, k)
         if STAGE == 2:
             mask = offs_m[:, None] >= (start_n + offs_n[None, :])
@@ -197,7 +135,7 @@ def _attn_fwd_inner_tma(acc, l_i, m_i, q, #
         # -- update output accumulator --
         acc = acc * alpha[:, None]
         # update acc
-        v = tl._experimental_descriptor_load(desc_v, [offsetkv_y, 0], [BLOCK_N, HEAD_DIM], dtype)
+        v = desc_v.load([offsetkv_y, 0])
         p = p.to(dtype)
         # note that this non transposed v for FP8 is only supported on Blackwell
         acc = tl.dot(p, v, acc)
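The kernel-side pattern introduced here (the tile shape comes from the descriptor's block_shape instead of being repeated at each load/store call) is sketched below in isolation. This is an illustrative snippet, not part of the commit; it assumes a TMA-capable GPU (Hopper or newer), a contiguous fp16 input whose inner dimension meets TMA alignment constraints (e.g. 128 columns), and the descriptor .load()/.store() methods used throughout this diff.

import torch
import triton
import triton.language as tl
from triton.tools.experimental_descriptor import TensorDescriptor


@triton.jit
def _copy_tile_kernel(in_desc, out_desc, BLOCK_M: tl.constexpr):
    pid = tl.program_id(0)
    off_m = pid * BLOCK_M
    tile = in_desc.load([off_m, 0])    # tile shape is the descriptor's block_shape
    out_desc.store([off_m, 0], tile)


def copy_2d(x: torch.Tensor, BLOCK_M: int = 64) -> torch.Tensor:
    out = torch.empty_like(x)
    in_desc = TensorDescriptor.from_tensor(x, [BLOCK_M, x.shape[1]])
    out_desc = TensorDescriptor.from_tensor(out, [BLOCK_M, x.shape[1]])
    grid = (triton.cdiv(x.shape[0], BLOCK_M), )
    _copy_tile_kernel[grid](in_desc, out_desc, BLOCK_M)
    return out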
@@ -319,11 +257,21 @@ def _attn_fwd(Q, K, V, sm_scale, M, Out, #
     tl.store(O_block_ptr, acc.to(Out.type.element_ty))
 
 
+def _tma_pre_hook(nargs):
+    BLOCK_M = nargs["BLOCK_M"]
+    BLOCK_N = nargs["BLOCK_N"]
+    HEAD_DIM = nargs["HEAD_DIM"]
+    nargs["desc_q"].block_shape = [BLOCK_M, HEAD_DIM]
+    nargs["desc_v"].block_shape = [BLOCK_N, HEAD_DIM]
+    nargs["desc_k"].block_shape = [BLOCK_N, HEAD_DIM]
+    nargs["desc_o"].block_shape = [BLOCK_M, HEAD_DIM]
+
+
 # We don't run auto-tuning every time to keep the tutorial fast. Keeping
 # the code below and commenting out the equivalent parameters is convenient for
 # re-tuning.
 configs_tma = [
-    triton.Config({'BLOCK_M': BM, 'BLOCK_N': BN}, num_stages=s, num_warps=w) \
+    triton.Config({'BLOCK_M': BM, 'BLOCK_N': BN}, num_stages=s, num_warps=w, pre_hook=_tma_pre_hook) \
    for BM in [64, 128]\
    for BN in [32, 64, 128]\
    for s in [2, 3, 4, 6]\
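Note on this hunk: as the forward() hunk further below shows, the descriptors are created before the autotuner has chosen BLOCK_M/BLOCK_N, so they start with a placeholder block_shape of [1, 1]; the pre_hook attached to each config then rewrites block_shape to that config's tile sizes right before launch.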
@@ -369,7 +317,7 @@ def _attn_fwd_tma(sm_scale, M, #
     qk_scale = sm_scale
     qk_scale *= 1.44269504  # 1/log(2)
     # load q: it will stay in SRAM throughout
-    q = tl._experimental_descriptor_load(desc_q, [qo_offset_y, 0], [BLOCK_M, HEAD_DIM], dtype)
+    q = desc_q.load([qo_offset_y, 0])
     # stage 1: off-band
     # For causal = True, STAGE = 3 and _attn_fwd_inner gets 1 as its STAGE
     # For causal = False, STAGE = 1, and _attn_fwd_inner gets 3 as its STAGE
@@ -395,7 +343,7 @@ def _attn_fwd_tma(sm_scale, M, #
     acc = acc / l_i[:, None]
     m_ptrs = M + off_hz * N_CTX + offs_m
     tl.store(m_ptrs, m_i)
-    tl._experimental_descriptor_store(desc_o, acc.to(dtype), [qo_offset_y, 0])
+    desc_o.store([qo_offset_y, 0], acc.to(dtype))
 
 
 @triton.jit
@@ -670,34 +618,15 @@ def forward(ctx, q, k, v, causal, sm_scale, USE_TMA=True):
         # Note that on Hopper we cannot perform a FP8 dot with a non-transposed second tensor
         y_dim = q.shape[0] * q.shape[1] * q.shape[2]
 
-        desc_helper = TmaAutoTuneHelper()
-        desc_helper.init_tma_descriptor("q")
-        desc_helper.init_tma_descriptor("v")
-        desc_helper.init_tma_descriptor("k")
-        desc_helper.init_tma_descriptor("o")
+        dummy_block = [1, 1]
+        desc_q = TensorDescriptor(q, shape=[y_dim, HEAD_DIM_K], strides=[HEAD_DIM_K, 1], block_shape=dummy_block)
+        desc_v = TensorDescriptor(v, shape=[y_dim, HEAD_DIM_K], strides=[HEAD_DIM_K, 1], block_shape=dummy_block)
+        desc_k = TensorDescriptor(k, shape=[y_dim, HEAD_DIM_K], strides=[HEAD_DIM_K, 1], block_shape=dummy_block)
+        desc_o = TensorDescriptor(o, shape=[y_dim, HEAD_DIM_K], strides=[HEAD_DIM_K, 1], block_shape=dummy_block)
 
         def grid(META):
-            nonlocal desc_helper
-
-            desc_helper.fill_2d_tma_descriptor("q", q.data_ptr(), y_dim, HEAD_DIM_K, META["BLOCK_M"], HEAD_DIM_K,
-                                               q.element_size())
-
-            desc_helper.fill_2d_tma_descriptor("v", v.data_ptr(), y_dim, HEAD_DIM_K, META["BLOCK_N"], HEAD_DIM_K,
-                                               v.element_size())
-
-            desc_helper.fill_2d_tma_descriptor("k", k.data_ptr(), y_dim, HEAD_DIM_K, META["BLOCK_N"], HEAD_DIM_K,
-                                               k.element_size())
-
-            desc_helper.fill_2d_tma_descriptor("o", o.data_ptr(), y_dim, HEAD_DIM_K, META["BLOCK_M"], HEAD_DIM_K,
-                                               o.element_size())
-
             return (triton.cdiv(q.shape[2], META["BLOCK_M"]), q.shape[0] * q.shape[1], 1)
 
-        desc_q = desc_helper.get_tma_descriptor_kernel_param("q")
-        desc_v = desc_helper.get_tma_descriptor_kernel_param("v")
-        desc_k = desc_helper.get_tma_descriptor_kernel_param("k")
-        desc_o = desc_helper.get_tma_descriptor_kernel_param("o")
-
         ctx.grid = grid
         _attn_fwd_tma[grid](
             sm_scale, M, #
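Taken together, the forward() changes drop the TmaAutoTuneHelper byte-buffer plumbing: the q, k, v and o descriptors are now plain TensorDescriptor objects built directly from each tensor's shape and strides and passed to _attn_fwd_tma as ordinary kernel arguments.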

python/tutorials/10-block-scaled-matmul.py

Lines changed: 27 additions & 61 deletions
@@ -72,7 +72,7 @@
 import triton.language as tl
 import triton.tools.experimental_descriptor
 import triton.profiler as proton
-from triton.tools.experimental_descriptor import TmaDescKernelParam
+from triton.tools.experimental_descriptor import TensorDescriptor
 from triton.tools.mxfp import MXFP4Tensor, MXScaleTensor
 
 
@@ -106,7 +106,7 @@ def _matmul_launch_metadata(grid, kernel, args):
 @triton.jit(launch_metadata=_matmul_launch_metadata)
 def block_scaled_matmul_kernel( #
         a_desc, a_scale, #
-        b_desc_or_tensor, b_scale, #
+        b_desc, b_scale, #
         c_desc, #
         M: tl.constexpr, N: tl.constexpr, K: tl.constexpr, #
         stride_sk: tl.constexpr, stride_sb: tl.constexpr, stride_sc: tl.constexpr, stride_sd: tl.constexpr,
@@ -120,16 +120,6 @@ def block_scaled_matmul_kernel( #
         NUM_STAGES: tl.constexpr, #
         USE_2D_SCALE_LOAD: tl.constexpr): #
 
-    if ELEM_PER_BYTE_A == 1:
-        dtype_a = tl.float8e4nv
-    elif ELEM_PER_BYTE_A == 2:
-        dtype_a = tl.dtype("uint8")
-
-    if ELEM_PER_BYTE_B == 1:
-        dtype_b = tl.float8e4nv
-    elif ELEM_PER_BYTE_B == 2:
-        dtype_b = tl.dtype("uint8")
-
     if output_type == 0:
         output_dtype = tl.float32
     elif output_type == 1:
@@ -152,23 +142,6 @@ def block_scaled_matmul_kernel( #
 
     MIXED_PREC: tl.constexpr = ELEM_PER_BYTE_A == 1 and ELEM_PER_BYTE_B == 2
 
-    if MIXED_PREC:
-        b_desc = tl.make_tensor_descriptor(
-            b_desc_or_tensor,
-            shape=[N, K // ELEM_PER_BYTE_B],
-            strides=[K // ELEM_PER_BYTE_B, 1],
-            block_shape=[BLOCK_N, BLOCK_K // ELEM_PER_BYTE_B],
-        )
-    else:
-        b_desc = b_desc_or_tensor
-        tl.inline_asm_elementwise("prefetch.tensormap [$1]; // dummy $0", "=r,l", [b_desc], dtype=tl.int32,
-                                  is_pure=False, pack=1)
-
-    tl.inline_asm_elementwise("prefetch.tensormap [$1]; // dummy $0", "=r,l", [a_desc], dtype=tl.int32, is_pure=False,
-                              pack=1)
-    tl.inline_asm_elementwise("prefetch.tensormap [$1]; // dummy $0", "=r,l", [c_desc], dtype=tl.int32, is_pure=False,
-                              pack=1)
-
     # For now it is recommended to use 2D scale loads for better performance.
     # In the future we will bring additional optimizations to either allow 5D loads,
     # the use of TMAs for scale factors, or both.
@@ -192,15 +165,8 @@ def block_scaled_matmul_kernel( #
 
     accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
     for k in tl.range(0, tl.cdiv(K, BLOCK_K), num_stages=NUM_STAGES):
-        a = tl._experimental_descriptor_load(a_desc, [offs_am, offs_k_a], [BLOCK_M, BLOCK_K // ELEM_PER_BYTE_A],
-                                             dtype_a)
-
-        if MIXED_PREC:
-            b = b_desc.load([offs_bn, offs_k_b])
-        else:
-            b = tl._experimental_descriptor_load(b_desc, [offs_bn, offs_k_b], [BLOCK_N, BLOCK_K // ELEM_PER_BYTE_B],
-                                                 dtype_b)
-
+        a = a_desc.load([offs_am, offs_k_a])
+        b = b_desc.load([offs_bn, offs_k_b])
         scale_a = tl.load(a_scale_ptr)
         scale_b = tl.load(b_scale_ptr)
         if USE_2D_SCALE_LOAD:
@@ -221,10 +187,10 @@ def block_scaled_matmul_kernel( #
         a_scale_ptr += (BLOCK_K // VEC_SIZE // 4) * stride_sb
         b_scale_ptr += (BLOCK_K // VEC_SIZE // 4) * stride_sb
 
-    tl._experimental_descriptor_store(c_desc, accumulator.to(output_dtype), [offs_am, offs_bn])
+    c_desc.store([offs_am, offs_bn], accumulator.to(output_dtype))
 
 
-def block_scaled_matmul(a_desc, a_scale, b_desc_or_tensor, b_scale, dtype_dst, M, N, K, configs):
+def block_scaled_matmul(a_desc, a_scale, b_desc, b_scale, dtype_dst, M, N, K, configs):
     output = torch.empty((M, N), dtype=dtype_dst, device="cuda")
     if dtype_dst == torch.float32:
         dtype_dst = 0
@@ -235,11 +201,12 @@ def block_scaled_matmul(a_desc, a_scale, b_desc_or_tensor, b_scale, dtype_dst, M
     else:
         raise ValueError(f"Unsupported dtype: {dtype_dst}")
 
-    c_desc = TmaDescKernelParam(output.data_ptr(), output.shape, [configs["BLOCK_SIZE_M"], configs["BLOCK_SIZE_N"]],
-                                output.element_size())
+    BLOCK_M = configs["BLOCK_SIZE_M"]
+    BLOCK_N = configs["BLOCK_SIZE_N"]
+    c_desc = TensorDescriptor.from_tensor(output, [BLOCK_M, BLOCK_N])
 
-    grid = (triton.cdiv(M, configs["BLOCK_SIZE_M"]) * triton.cdiv(N, configs["BLOCK_SIZE_N"]), 1)
-    block_scaled_matmul_kernel[grid](a_desc, a_scale, b_desc_or_tensor, b_scale, c_desc, M, N, K, a_scale.stride(0),
+    grid = (triton.cdiv(M, BLOCK_M) * triton.cdiv(N, BLOCK_N), 1)
+    block_scaled_matmul_kernel[grid](a_desc, a_scale, b_desc, b_scale, c_desc, M, N, K, a_scale.stride(0),
                                      a_scale.stride(1), a_scale.stride(2), a_scale.stride(3), dtype_dst,
                                      configs["ELEM_PER_BYTE_A"], configs["ELEM_PER_BYTE_B"], configs["VEC_SIZE"],
                                      configs["BLOCK_SIZE_M"], configs["BLOCK_SIZE_N"], configs["BLOCK_SIZE_K"],
@@ -284,12 +251,17 @@ def initialize_block_scaled(M, N, K, block_scale_type="nvfp4", compute_reference
 
     b_ref = b_ref.to(torch.float32).T
 
-    a_desc = TmaDescKernelParam(a.data_ptr(), a.shape, [BLOCK_M, BLOCK_K // ELEM_PER_BYTE_A], 1)
+    a_desc = TensorDescriptor.from_tensor(a, [BLOCK_M, BLOCK_K // ELEM_PER_BYTE_A])
 
     if block_scale_type == "mixed":
-        b_desc_or_tensor = b
+        b_desc = TensorDescriptor(
+            b,
+            shape=[N, K // ELEM_PER_BYTE_B],
+            strides=[K // ELEM_PER_BYTE_B, 1],
+            block_shape=[BLOCK_N, BLOCK_K // ELEM_PER_BYTE_B],
+        )
     else:
-        b_desc_or_tensor = TmaDescKernelParam(b.data_ptr(), b.shape, [BLOCK_N, BLOCK_K // ELEM_PER_BYTE_B], 1)
+        b_desc = TensorDescriptor.from_tensor(b, [BLOCK_N, BLOCK_K // ELEM_PER_BYTE_B])
 
     epsilon = 1e-8
     a_scale = torch.rand((M // 128, K // VEC_SIZE // 4, 32, 4, 4), device=device) + epsilon
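Note on the mixed-precision branch above: the descriptor for the packed fp4 operand spells out its shape, strides, and block shape explicitly in units of the packed storage (K // ELEM_PER_BYTE_B along the inner dimension) rather than deriving them from the tensor via from_tensor, mirroring the device-side tl.make_tensor_descriptor call this hunk removes from the kernel.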
@@ -327,7 +299,7 @@ def unpack_scale(packed):
         "ELEM_PER_BYTE_B": ELEM_PER_BYTE_B,
         "VEC_SIZE": VEC_SIZE,
     }
-    return a_desc, a_scale, b_desc_or_tensor, b_scale, configs, reference
+    return a_desc, a_scale, b_desc, b_scale, configs, reference
 
 
 def validate_block_scaled(M, N, K, block_scale_type="nvfp4"):
@@ -340,9 +312,9 @@ def alloc_fn(size: int, align: int, _):
     # TMA load for mixed-precision fp4 is supported only by device TMA.
     triton.set_allocator(alloc_fn)
 
-    a_desc, a_scale, b_desc_or_tensor, b_scale, configs, reference = initialize_block_scaled(
-        M, N, K, block_scale_type, compute_reference=True)
-    output = block_scaled_matmul(a_desc, a_scale, b_desc_or_tensor, b_scale, torch.float16, M, N, K, configs)
+    a_desc, a_scale, b_desc, b_scale, configs, reference = initialize_block_scaled(M, N, K, block_scale_type,
+                                                                                   compute_reference=True)
+    output = block_scaled_matmul(a_desc, a_scale, b_desc, b_scale, torch.float16, M, N, K, configs)
     torch.testing.assert_close(reference, output.to(torch.float32), atol=1e-3, rtol=1e-3)
     print(f"✅ (pass {block_scale_type})")
 
@@ -353,19 +325,13 @@ def bench_block_scaled(K, block_scale_type="nvfp4", reps=10):
     N = 8192
     print(f"Problem Shape = {M}x{N}x{K}")
 
-    def alloc_fn(size: int, align: int, _):
-        return torch.empty(size, dtype=torch.int8, device="cuda")
-
-    if block_scale_type == "mixed":
-        triton.set_allocator(alloc_fn)
-
-    a_desc, a_scale, b_desc_or_tensor, b_scale, configs, _ = initialize_block_scaled(
-        M, N, K, block_scale_type, compute_reference=False)
-    _ = block_scaled_matmul(a_desc, a_scale, b_desc_or_tensor, b_scale, torch.float16, M, N, K, configs)
+    a_desc, a_scale, b_desc, b_scale, configs, _ = initialize_block_scaled(M, N, K, block_scale_type,
+                                                                           compute_reference=False)
+    _ = block_scaled_matmul(a_desc, a_scale, b_desc, b_scale, torch.float16, M, N, K, configs)
 
     proton.activate(0)
     for _ in range(reps):
-        _ = block_scaled_matmul(a_desc, a_scale, b_desc_or_tensor, b_scale, torch.float16, M, N, K, configs)
+        _ = block_scaled_matmul(a_desc, a_scale, b_desc, b_scale, torch.float16, M, N, K, configs)
     proton.deactivate(0)
     print("Done benchmarking")