[TensorDescriptor] Improve error from creating invalid descriptor (#7028)

peterbell10 · web-flow · commit 1f53afc2c9b3 · 2025-06-03T17:07:27.000+01:00
This adds validation when creating a `TensorDescriptor` object in
Python, before it has been passed to the kernel. This not only improves
the error message over the generic "invalid argument", but also means the
stack trace will point to the code causing the error rather than the
bowels of the kernel launch function.
diff --git a/python/test/unit/cuda/test_tma_descriptor.py b/python/test/unit/cuda/test_tma_descriptor.py
@@ -1,15 +1,9 @@
 from contextlib import nullcontext
 import pytest
 import torch
-import triton
 from triton.tools.tensor_descriptor import TensorDescriptor
 
 
-@triton.jit
-def dummy_kernel(desc):
-    pass
-
-
 @pytest.mark.parametrize("M, BLOCK_M, expect_error", [(128, 32, False), (127, 32, False), (128, 31, True)])
 def test_1d_tma_descriptor_exception(M, BLOCK_M, expect_error):
     if not torch.cuda.is_available() or not torch.cuda.get_device_capability()[0] >= 9:
@@ -22,10 +16,9 @@ def test_1d_tma_descriptor_exception(M, BLOCK_M, expect_error):
     # https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html#group__CUDA__TENSOR__MEMORY
     assert x.data_ptr() % 16 == 0
 
-    desc = TensorDescriptor.from_tensor(x, [BLOCK_M])
     ctx = pytest.raises(ValueError, match="Shape element 0 must be a power of 2") if expect_error else nullcontext()
     with ctx:
-        dummy_kernel[(1, )](desc)
+        _ = TensorDescriptor.from_tensor(x, [BLOCK_M])
 
 
 @pytest.mark.parametrize("M, BLOCK_M, expect_error_m", [(128, 32, False), (125, 33, True)])
@@ -42,14 +35,12 @@ def test_2d_tma_descriptor_exception(M, N, BLOCK_M, BLOCK_N, expect_error_n, exp
     # https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html#group__CUDA__TENSOR__MEMORY
     assert A.data_ptr() % 16 == 0
 
-    desc = TensorDescriptor.from_tensor(A, [BLOCK_M, BLOCK_N])
-
     shape_error = expect_error_n or expect_error_m
     error_alignment = (N % 16) != 0
     expect_error = shape_error or error_alignment
 
-    exc_type = ValueError if shape_error else RuntimeError
-    match = "Shape element . must be a power of 2" if shape_error else "Triton Error \\[CUDA\\]: invalid argument"
+    exc_type = ValueError if shape_error else AssertionError
+    match = "Shape element . must be a power of 2" if shape_error else "strides must be 16-byte aligned"
     ctx = pytest.raises(exc_type, match=match) if expect_error else nullcontext()
     with ctx:
-        dummy_kernel[(1, )](desc)
+        _ = TensorDescriptor.from_tensor(A, [BLOCK_M, BLOCK_N])
diff --git a/python/triton/_utils.py b/python/triton/_utils.py
@@ -1,13 +1,15 @@
 from __future__ import annotations
 
 from functools import reduce
-from typing import Any, Callable, TYPE_CHECKING, Union
+from typing import Any, Callable, TYPE_CHECKING, Union, List, Dict
 
 if TYPE_CHECKING:
     from .language import core
     IterableType = Union[list[Any], tuple[Any, ...], core.tuple, core.tuple_type]
     ObjPath = tuple[int, ...]
 
+TRITON_MAX_TENSOR_NUMEL = 1048576
+
 
 def get_iterable_path(iterable: IterableType, path: ObjPath) -> Any:
     return reduce(lambda a, idx: a[idx], path, iterable)  # type: ignore[index]
@@ -35,3 +37,88 @@ def _impl(path: tuple[int, ...], current: Any):
     _impl((), iterable)
 
     return list(ret.keys())
+
+
+def is_power_of_two(x):
+    return (x & (x - 1)) == 0
+
+
+def validate_block_shape(shape: List[int]):
+    numel = 1
+    for i, d in enumerate(shape):
+        if not isinstance(d, int):
+            raise TypeError(f"Shape element {i} must have type `constexpr[int]`, got `constexpr[{type(d)}]")
+        if not is_power_of_two(d):
+            raise ValueError(f"Shape element {i} must be a power of 2")
+        numel *= d
+
+    if numel > TRITON_MAX_TENSOR_NUMEL:
+        raise ValueError(f"numel ({numel}) exceeds triton maximum tensor numel ({TRITON_MAX_TENSOR_NUMEL})")
+    return numel
+
+
+type_canonicalisation_dict = {
+    # we canonicalise all bools to be unsigned:
+    "bool": "u1",
+    "int1": "u1",
+    "uint1": "u1",
+    "i1": "u1",
+    # floating-point dtypes:
+    "float8e4nv": "fp8e4nv",
+    "float8e5": "fp8e5",
+    "float8e4b15": "fp8e4b15",
+    "float8_e4m3fn": "fp8e4nv",
+    "float8e4b8": "fp8e4b8",
+    "float8_e4m3fnuz": "fp8e4b8",
+    "float8_e5m2": "fp8e5",
+    "float8e5b16": "fp8e5b16",
+    "float8_e5m2fnuz": "fp8e5b16",
+    "half": "fp16",
+    "float16": "fp16",
+    "bfloat16": "bf16",
+    "float": "fp32",
+    "float32": "fp32",
+    "double": "fp64",
+    "float64": "fp64",
+    # signed integers:
+    "int8": "i8",
+    "int16": "i16",
+    "int": "i32",
+    "int32": "i32",
+    "int64": "i64",
+    # unsigned integers:
+    "uint8": "u8",
+    "uint16": "u16",
+    "uint32": "u32",
+    "uint64": "u64",
+    "void": "void",
+}
+
+for v in list(type_canonicalisation_dict.values()):
+    type_canonicalisation_dict[v] = v
+
+
+def canonicalize_dtype(dtype):
+    dtype_str = str(dtype).split(".")[-1]
+    return type_canonicalisation_dict[dtype_str]
+
+
+BITWIDTH_DICT: Dict[str, int] = {
+    **{f"u{n}": n
+       for n in (1, 8, 16, 32, 64)},
+    **{f"i{n}": n
+       for n in (1, 8, 16, 32, 64)},
+    **{f"fp{n}": n
+       for n in (16, 32, 64)},
+    **{f"fp8{suffix}": 8
+       for suffix in ("e4nv", "e4b15", "e4b8", "e5", "e5b16")},
+    "bf16": 16,
+    "void": 0,
+}
+
+for k, v in type_canonicalisation_dict.items():
+    BITWIDTH_DICT[k] = BITWIDTH_DICT[v]
+
+
+def get_primitive_bitwidth(dtype: str) -> int:
+    return BITWIDTH_DICT[dtype]
diff --git a/python/triton/language/_utils.py b/python/triton/language/_utils.py
diff --git a/python/triton/language/core.py b/python/triton/language/core.py
@@ -14,7 +14,7 @@
 
 from .._C.libtriton import ir
 from . import semantic
-from ._utils import TRITON_MAX_TENSOR_NUMEL, validate_block_shape
+from .._utils import TRITON_MAX_TENSOR_NUMEL, validate_block_shape, get_primitive_bitwidth
 
 T = TypeVar('T')
 
@@ -402,55 +402,43 @@ def __init__(self, name):
         name = _unwrap_if_constexpr(name)
         self.name = name
         assert name in dtype.SINT_TYPES + dtype.UINT_TYPES + dtype.FP_TYPES + dtype.OTHER_TYPES, name
+        self.primitive_bitwidth = get_primitive_bitwidth(name)
         if name in dtype.SINT_TYPES:
             self.int_signedness = dtype.SIGNEDNESS.SIGNED
-            self.int_bitwidth = int(name.split('int')[-1])
-            self.primitive_bitwidth = self.int_bitwidth
+            self.int_bitwidth = self.primitive_bitwidth
         elif name in dtype.UINT_TYPES:
             self.int_signedness = dtype.SIGNEDNESS.UNSIGNED
-            self.int_bitwidth = int(name.split('int')[-1])
-            self.primitive_bitwidth = self.int_bitwidth
+            self.int_bitwidth = self.primitive_bitwidth
         elif name in dtype.FP_TYPES:
             if name == 'fp8e4b15':
                 self.fp_mantissa_width = 3
-                self.primitive_bitwidth = 8
                 self.exponent_bias = 15
             elif name == 'fp8e4nv':
                 self.fp_mantissa_width = 3
-                self.primitive_bitwidth = 8
                 self.exponent_bias = 7
             elif name == 'fp8e4b8':
                 self.fp_mantissa_width = 3
-                self.primitive_bitwidth = 8
                 self.exponent_bias = 8
             elif name == 'fp8e5':
                 self.fp_mantissa_width = 2
-                self.primitive_bitwidth = 8
                 self.exponent_bias = 15
             elif name == 'fp8e5b16':
                 self.fp_mantissa_width = 2
-                self.primitive_bitwidth = 8
                 self.exponent_bias = 16
             elif name == 'fp16':
                 self.fp_mantissa_width = 10
-                self.primitive_bitwidth = 16
                 self.exponent_bias = 15
             elif name == 'bf16':
                 self.fp_mantissa_width = 7
-                self.primitive_bitwidth = 16
                 self.exponent_bias = 127
             elif name == 'fp32':
                 self.fp_mantissa_width = 23
-                self.primitive_bitwidth = 32
                 self.exponent_bias = 127
             elif name == 'fp64':
                 self.fp_mantissa_width = 52
-                self.primitive_bitwidth = 64
                 self.exponent_bias = 1023
             else:
                 raise RuntimeError(f'Unsupported floating-point type {name}')
-        elif name == 'void':
-            self.primitive_bitwidth = 0
 
     def is_fp8(self):
         return 'fp8' in self.name
diff --git a/python/triton/runtime/jit.py b/python/triton/runtime/jit.py
@@ -14,7 +14,7 @@
 from types import ModuleType
 from .. import knobs
 from ..runtime.driver import driver
-from .._utils import find_paths_if, get_iterable_path
+from .._utils import find_paths_if, get_iterable_path, type_canonicalisation_dict, canonicalize_dtype
 
 TRITON_MODULE = __name__[:-len(".runtime.jit")]
 
@@ -329,7 +329,7 @@ def specialize_impl(arg, is_const=False, specialize_value=True, align=True):
             dsk = (arg.dtype, is_const)
             res = dtype2str.get(dsk, None)
             if res is None:
-                res = ("*k" if dsk[1] else "*") + type_canonicalisation_dict[str(dsk[0]).split('.')[-1]]
+                res = ("*k" if dsk[1] else "*") + canonicalize_dtype(dsk[0])
                 dtype2str[dsk] = res
             key = specialize_extra(arg, "tensor", align=align) if specialize_value else None
             return (res, key)
@@ -347,7 +347,7 @@ def specialize_impl(arg, is_const=False, specialize_value=True, align=True):
             return (tys, keys)
         elif isinstance(arg, TensorDescriptor):
             assert hasattr(arg.base, "data_ptr")
-            inner = type_canonicalisation_dict[str(arg.base.dtype).split('.')[-1]]
+            inner = canonicalize_dtype(arg.base.dtype)
             return (f"tensordesc<{inner}{list(arg.block_shape)}>", None)
         else:
             raise TypeError("Unsupported type: %s" % type(arg))
@@ -445,46 +445,6 @@ def dynamic_func({", ".join(list(map(arg, sig.parameters.items())) + ["**options
     return func_namespace['dynamic_func']
 
 
-type_canonicalisation_dict = {
-    # we canonicalise all bools to be unsigned:
-    "bool": "u1",
-    "int1": "u1",
-    "uint1": "u1",
-    "i1": "u1",
-    # floating-point dtypes:
-    "float8e4nv": "fp8e4nv",
-    "float8e5": "fp8e5",
-    "float8e4b15": "fp8e4b15",
-    "float8_e4m3fn": "fp8e4nv",
-    "float8e4b8": "fp8e4b8",
-    "float8_e4m3fnuz": "fp8e4b8",
-    "float8_e5m2": "fp8e5",
-    "float8e5b16": "fp8e5b16",
-    "float8_e5m2fnuz": "fp8e5b16",
-    "half": "fp16",
-    "float16": "fp16",
-    "bfloat16": "bf16",
-    "float": "fp32",
-    "float32": "fp32",
-    "double": "fp64",
-    "float64": "fp64",
-    # signed integers:
-    "int8": "i8",
-    "int16": "i16",
-    "int": "i32",
-    "int32": "i32",
-    "int64": "i64",
-    # unsigned integers:
-    "uint8": "u8",
-    "uint16": "u16",
-    "uint32": "u32",
-    "uint64": "u64",
-}
-
-for v in list(type_canonicalisation_dict.values()):
-    type_canonicalisation_dict[v] = v
-
-
 def get_full_name(fn):
     return f"{fn.__module__}.{fn.__qualname__}"
 
diff --git a/python/triton/tools/tensor_descriptor.py b/python/triton/tools/tensor_descriptor.py
@@ -1,5 +1,6 @@
 from dataclasses import dataclass
 from typing import List, Any
+from triton._utils import validate_block_shape, canonicalize_dtype, get_primitive_bitwidth
 
 
 @dataclass
@@ -13,6 +14,15 @@ def __post_init__(self):
         rank = len(self.shape)
         assert len(self.strides) == rank, f"rank mismatch: {self}"
         assert len(self.block_shape) == rank, f"rank mismatch: {self}"
+        assert rank > 0, "rank must not be zero"
+        assert rank <= 5, "rank cannot be more than 5"
+        assert self.base.data_ptr() % 16 == 0, "base must be 16-byte aligned"
+        validate_block_shape(self.block_shape)
+        dtype_str = canonicalize_dtype(self.base.dtype)
+        elem_bytes = get_primitive_bitwidth(dtype_str) // 8
+        for stride in self.strides[:-1]:
+            assert (stride * elem_bytes) % 16 == 0, "strides must be 16-byte aligned"
+        assert self.strides[-1] == 1, "Last dimension must be contiguous"
 
     @staticmethod
     def from_tensor(tensor: Any, block_shape: List[int]):