@@ -1494,8 +1494,6 @@ def test_tensor_descriptor_reduce(kind, descriptor, dtype_str, num_ctas, M_BLOCK
     if not is_native:
         if num_ctas != 1:
             pytest.skip("Multi-CTA not supported")
-        if descriptor == "host":
-            pytest.skip("NYI: Host side tensor descriptor fallback")
     if is_hip_cdna3() and (kind, dtype_str, M_BLOCK, N_BLOCK) in REDUCE_SKIP_HIP_CDNA3:
         pytest.skip("Broken on rocm")

@@ -1573,3 +1571,105 @@ def alloc_fn(size: int, align: int, stream: Optional[int]):
     expect = REDUCE_OP[kind](inp, out)
     kernel[(grid_m, grid_n)](out_desc, out, inp, M, N, M_BLOCK, N_BLOCK, kind, num_ctas=num_ctas)
     torch.testing.assert_close(expect, unwrap_tensor(out), check_dtype=False)
+
+
+@pytest.mark.interpreter()
+@pytest.mark.parametrize("dtype_str", tma_dtypes)
+@pytest.mark.parametrize("num_ctas", [1, 2])
+@pytest.mark.parametrize("M_BLOCK,N_BLOCK", [(2, 16), (8, 16), (8, 32), (8, 128)])
+def test_host_tensor_descriptor_load(dtype_str, num_ctas, M_BLOCK, N_BLOCK, device):
+    if num_ctas == 2 and (not is_cuda() or torch.cuda.get_device_capability(0)[0] not in (9, 10)):
+        pytest.skip("CTAs is unsupported for these cards")
+
+    @triton.jit(debug=True)
+    def kernel(out_ptr, desc, M, N, M_BLOCK: tl.constexpr, N_BLOCK: tl.constexpr):
+        assert desc.shape[0] == M
+        assert desc.shape[1] == N
+        assert desc.strides[0] == N
+        assert desc.strides[1] == 1
+        assert desc.block_shape == [M_BLOCK, N_BLOCK]
+        block = desc.load([M_BLOCK, 2 * N_BLOCK])
+        idx = tl.arange(0, M_BLOCK)[:, None] * N_BLOCK + tl.arange(0, N_BLOCK)[None, :]
+        tl.store(out_ptr + idx, block)
+
+    M, N = M_BLOCK * 3, N_BLOCK * 4
+    inp = to_triton(numpy_random((M, N), dtype_str), device=device, dst_type=dtype_str)
+    out = inp.new_empty((M_BLOCK, N_BLOCK))
+
+    inp_desc = TensorDescriptor(inp, shape=inp.shape, strides=inp.stride(), block_shape=[M_BLOCK, N_BLOCK])
+    kernel[(1, )](out, inp_desc, M, N, M_BLOCK, N_BLOCK, num_ctas=num_ctas)
+
+    expect = unwrap_tensor(inp)[1 * M_BLOCK:2 * M_BLOCK, 2 * N_BLOCK:3 * N_BLOCK]
+    torch.testing.assert_close(expect, unwrap_tensor(out))
+
+
+@triton.jit
+def matmul_kernel_host_tensor_descriptor(a_desc, b_desc, c_desc):
+    K = a_desc.shape[1]
+    BLOCK_M: tl.constexpr = a_desc.block_shape[0]
+    BLOCK_K: tl.constexpr = a_desc.block_shape[1]
+    BLOCK_N: tl.constexpr = b_desc.block_shape[1]
+
+    pid_m = tl.program_id(axis=0)
+    pid_n = tl.program_id(axis=1)
+    offs_am = pid_m * BLOCK_M
+    offs_bn = pid_n * BLOCK_N
+    offs_k = 0
+
+    accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+    for k in range(0, tl.cdiv(K, BLOCK_K)):
+        a = a_desc.load([offs_am, offs_k])
+        b = b_desc.load([offs_k, offs_bn])
+        accumulator = tl.dot(a, b, acc=accumulator)
+        offs_k += BLOCK_K
+    accumulator = accumulator.to(a_desc.dtype)
+    c_desc.store([offs_am, offs_bn], accumulator)
+
+
+@pytest.mark.interpreter()
+@pytest.mark.parametrize("num_ctas", [1, 2])
+@pytest.mark.parametrize("BLOCK_M, BLOCK_N, BLOCK_K, num_stages", [
+    (128, 128, 16, 1),
+    (256, 64, 32, 2),
+    (64, 512, 32, 2),
+    (128, 128, 16, 4),
+    (64, 128, 32, 4),
+    (32, 32, 32, 4),
+    (256, 128, 32, 4),
+])
+def test_host_tensor_descriptor_matmul(num_stages, num_ctas, BLOCK_M, BLOCK_N, BLOCK_K, device):
+    if num_ctas == 2 and (not is_cuda() or torch.cuda.get_device_capability(0)[0] not in (9, 10)):
+        pytest.skip("CTAs is unsupported for these cards")
+
+    if is_hip() and (BLOCK_M, BLOCK_N, BLOCK_K, num_stages) == (256, 128, 32, 4):
+        pytest.skip("Insufficient shared memory on HIP devices")
+
+    if is_interpreter():
+        M, N, K = BLOCK_M, BLOCK_N, BLOCK_K
+    else:
+        M, N, K = 1024, 512, 256
+    torch.manual_seed(42)
+    A = torch.randn((M, K), dtype=torch.float16, device=device)
+    B = torch.randn((K, N), dtype=torch.float16, device=device)
+    C = torch.empty((M, N), dtype=torch.float16, device=device)
+    grid = (triton.cdiv(M, BLOCK_M), triton.cdiv(N, BLOCK_N), 1)
+
+    A_desc = TensorDescriptor(A, A.shape, A.stride(), [BLOCK_M, BLOCK_K])
+    B_desc = TensorDescriptor(B, B.shape, B.stride(), [BLOCK_K, BLOCK_N])
+    C_desc = TensorDescriptor(C, C.shape, C.stride(), [BLOCK_M, BLOCK_N])
+
+    kernel = matmul_kernel_host_tensor_descriptor[grid](
+        A_desc,
+        B_desc,
+        C_desc,  #
+        num_warps=8,
+        num_stages=num_stages,
+        num_ctas=num_ctas,
+    )
+    ref_out = torch.matmul(A.to(torch.float32), B.to(torch.float32)).to(torch.float16)
+    torch.testing.assert_close(ref_out, C, rtol=1e-3, atol=1e-3)
+
+    if BLOCK_M >= 64 * num_ctas and BLOCK_N >= 64 and is_cuda() and torch.cuda.get_device_capability()[0] == 9:
+        # TODO: The use of stmatrix for Blackwell is currently not supported.
+        # Only a subset of TMEM and stmatrix layout pairs are compatible, for example 16x256bx2 and m8n8x4.
+        assert "stmatrix.sync.aligned.m8n8.x4.shared.b16" in kernel.asm["ptx"]