Commit e196446
[FRONTEND] Support tensordesc in function call arguments (#6262)
This also does a few refactorings:

1. Refactor `mangle_ty(type)` -> `type.mangle()`, and implement it for `tensor_descriptor_type`.
2. Refactor `type.to_ir` -> `type._flatten_ir_types`, which matches the `value._flatten_ir` method, but for types.
3. Update function signature serialization and deserialization to use the new interfaces.

Also, as part of debugging, I updated `TRITON_FRONT_END_DEBUGGING=1` to disable wrapping exceptions in `CompilationError`, which makes the stack trace point directly at the guts of the frontend, making it far easier to debug.
1 parent dd6a540 commit e196446
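For orientation, here is a minimal before/after sketch of the call-site pattern these refactorings describe. It is schematic rather than code lifted from the repository: `arg_tys` stands in for a list of frontend types and `builder` for an `ir.builder`.

```python
# Before: a free function dispatched on the type kind, and to_ir() could
# return either a single IR type or a list of IR types.
mangled = '_'.join(mangle_ty(ty) for ty in arg_tys)
ir_types = []
for ty in arg_tys:
    t = ty.to_ir(builder)
    if isinstance(t, list):
        ir_types.extend(t)
    else:
        ir_types.append(t)

# After: each type mangles itself and appends its flattened IR types to an
# output list, mirroring the existing value._flatten_ir protocol.
mangled = '_'.join(ty.mangle() for ty in arg_tys)
ir_types = []
for ty in arg_tys:
    ty._flatten_ir_types(builder, ir_types)
```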

File tree

6 files changed: +117 −54 lines changed


README.md

Lines changed: 1 addition & 0 deletions

```diff
@@ -245,6 +245,7 @@ For detailed instructions on how to debug Triton's frontend, please refer to thi
 - `TRITON_KERNEL_OVERRIDE` enables the override of the compiled kernel with a user-specified IR/ptx/amdgcn at the beginning of each compilation stage.
 - `TRITON_OVERRIDE_DIR` specifies the directory from which to load the IR/ptx/amdgcn files when `TRITON_KERNEL_OVERRIDE` is set to 1.
 - `TRITON_F32_DEFAULT` sets the default input precision of `tl.dot` when using 32-bit floats, which can be either `ieee`, `tf32`, or `tf32x3`.
+- `TRITON_FRONT_END_DEBUGGING=1` disables exception wrapping when an error occurs in the compiler frontend, allowing the full stack trace to be seen.
 
 **Kernel Override Steps**
```
python/test/unit/cuda/test_tensor_descriptor.py

Lines changed: 35 additions & 0 deletions

```diff
@@ -421,6 +421,41 @@ def alloc_fn(size: int, align: int, stream: Optional[int]) -> torch.Tensor:
     torch.testing.assert_close(expect, out)
 
 
+@triton.jit(noinline=True)
+def tensor_descriptor_arg_helper(in_desc, out_desc, M_BLOCK: tl.constexpr, N_BLOCK: tl.constexpr):
+    moffset = tl.program_id(0) * M_BLOCK
+    noffset = tl.program_id(1) * N_BLOCK
+    value = in_desc.load([moffset, noffset])
+    out_desc.store([moffset, noffset], value.abs())
+
+
+@requires_tma
+@pytest.mark.interpreter
+def test_tensor_descriptor_argument():
+
+    @triton.jit
+    def kernel(out_ptr, a_ptr, M, N, M_BLOCK: tl.constexpr, N_BLOCK: tl.constexpr):
+        out_desc = tl.make_tensor_descriptor(out_ptr, shape=[M, N], strides=[N, 1], block_shape=[M_BLOCK, N_BLOCK])
+        in_desc = tl.make_tensor_descriptor(a_ptr, shape=[M, N], strides=[N, 1], block_shape=[M_BLOCK, N_BLOCK])
+        tensor_descriptor_arg_helper(in_desc, out_desc, M_BLOCK, N_BLOCK)
+
+    M, N = 32, 128
+    inp = torch.randn((M, N), device="cuda")
+
+    M_BLOCK = 8
+    N_BLOCK = 32
+    out = inp.new_zeros((M, N))
+
+    def alloc_fn(size: int, align: int, stream: Optional[int]) -> torch.Tensor:
+        return torch.empty(size, dtype=torch.int8, device="cuda")
+
+    triton.set_allocator(alloc_fn)
+
+    expect = inp.abs()
+    kernel[(M // M_BLOCK, N // N_BLOCK)](out, inp, M, N, M_BLOCK, N_BLOCK)
+    torch.testing.assert_close(expect, out)
+
+
 @triton.jit
 def matmul_kernel_make_tensor_desciptor(a_ptr, b_ptr, c_ptr,  #
                                         M, N, K,  #
```
python/triton/compiler/code_generator.py

Lines changed: 30 additions & 40 deletions

```diff
@@ -16,6 +16,7 @@
 # ideally we wouldn't need any runtime component
 from ..runtime import JITFunction
 from .._utils import find_paths_if, get_iterable_path, set_iterable_path
+from . import config
 
 from .errors import (CompilationError, CompileTimeAssertionFailure, UnsupportedLanguageConstruct)
 
@@ -27,29 +28,9 @@ def check_identifier_legality(name, type):
     return name
 
 
-def mangle_ty(ty):
-    if ty.is_tuple():
-        return 'T' + '_'.join(map(mangle_ty, ty.types)) + 'T'
-    if ty.is_ptr():
-        return 'P' + mangle_ty(ty.element_ty)
-    if ty.is_int():
-        SIGNED = language.dtype.SIGNEDNESS.SIGNED
-        prefix = 'i' if ty.int_signedness == SIGNED else 'u'
-        return prefix + str(ty.int_bitwidth)
-    if ty.is_floating():
-        return str(ty)
-    if ty.is_block():
-        elt = mangle_ty(ty.scalar)
-        shape = '_'.join(map(str, ty.shape))
-        return f'{elt}S{shape}S'
-    if ty.is_void():
-        return 'V'
-    raise TypeError(f'Unsupported type {ty}')
-
-
 def mangle_fn(name, arg_tys, constants):
     # doesn't mangle ret type, which must be a function of arg tys
-    mangled_arg_names = '_'.join([mangle_ty(ty) for ty in arg_tys])
+    mangled_arg_names = '_'.join([ty.mangle() for ty in arg_tys])
     mangled_constants = '_'.join([f'{i}c{repr(constants[i])}' for i in sorted(constants)])
     mangled_constants = mangled_constants.replace('.', '_d_')
     mangled_constants = mangled_constants.replace("'", '_sq_')
@@ -71,8 +52,8 @@ def _is_constexpr(o: Any) -> bool:
     return o is None or isinstance(o, (constexpr, language.core.dtype))
 
 
-def _is_triton_scalar(o: Any) -> bool:
-    return _is_triton_tensor(o) and (not o.type.is_block() or o.type.numel == 1)
+def _is_non_scalar_tensor(o: Any) -> bool:
+    return _is_triton_tensor(o) and (o.type.is_block() and o.type.numel != 1)
 
 
 def _is_list_like(o: Any) -> bool:
@@ -82,7 +63,7 @@ def _is_list_like(o: Any) -> bool:
 def _check_fn_args(node, fn, args):
     if fn.noinline:
         for idx, arg in enumerate(args):
-            if not _is_constexpr(arg) and not _is_triton_scalar(arg):
+            if not _is_constexpr(arg) and _is_non_scalar_tensor(arg):
                 raise UnsupportedLanguageConstruct(
                     fn.src, node,
                     f'Function {fn.__name__} is marked noinline, but was called with non-scalar argument {fn.arg_names[idx]}:{arg}'
@@ -241,26 +222,26 @@ def __init__(self, ret_types, arg_types, constants, attrs):
         self.constants = constants
         self.attrs = attrs
 
-    def return_types_ir(self, builder: ir.builder):
-        ret_types = []
-        for ret_ty in self.ret_types:
-            if ret_ty is None:
+    def flatten_ir_types(self, builder: ir.builder, types: List[base_type]) -> List[ir.type]:
+        ir_types = []
+        for ty in types:
+            if ty is None:
                 continue
-            ir_ty = ret_ty.to_ir(builder)
-            if isinstance(ir_ty, list):
-                ret_types.extend(ir_ty)
-            else:
-                ret_types.append(ir_ty)
-        return ret_types
+            ty._flatten_ir_types(builder, ir_types)
+        return ir_types
+
+    def return_types_ir(self, builder: ir.builder) -> List[ir.type]:
+        return self.flatten_ir_types(builder, self.ret_types)
 
     def serialize(self, builder: ir.builder):
         # fill up IR values in template
         # > build function
         is_val = lambda path, _: path not in self.constants and _ is not None
         val_paths = list(find_paths_if(self.arg_types, is_val))
-        arg_types = [get_iterable_path(self.arg_types, path).to_ir(builder) for path in val_paths]
-        ret_types = self.return_types_ir(builder)
-        return builder.get_function_ty(arg_types, ret_types)
+        arg_types = [get_iterable_path(self.arg_types, path) for path in val_paths]
+        arg_types_ir = self.flatten_ir_types(builder, arg_types)
+        ret_types_ir = self.return_types_ir(builder)
+        return builder.get_function_ty(arg_types_ir, ret_types_ir)
 
     def deserialize(self, fn):
         # create "template"
@@ -282,9 +263,12 @@ def make_template(ty):
             if isinstance(ty, nv_tma_desc_type):
                 fn.set_arg_attr(i, "tt.nv_tma_desc", 1)
         # > add IR values to the template
-        for i, path in enumerate(val_paths):
+        cursor = 0
+        handles = [fn.args(i) for i in range(fn.get_num_args())]
+        for path in val_paths:
             ty = get_iterable_path(self.arg_types, path)
-            set_iterable_path(vals, path, language.tensor(fn.args(i), ty))
+            val, cursor = ty._unflatten_ir(handles, cursor)
+            set_iterable_path(vals, path, val)
         # > add constexpr values to the template
         constants = self.constants
         for path, val in constants.items():
@@ -1218,14 +1202,16 @@ def call_JitFunction(self, fn: JITFunction, args, kwargs):
                 generator.visit(fn.parse())
             except Exception as e:
                 # Wrap the error in the callee with the location of the call.
+                if config.front_end_debugging():
+                    raise
                 raise CompilationError(self.jit_fn.src, self.cur_node, None) from e
 
             callee_ret_type = generator.ret_type
             self.function_ret_types[fn_name] = callee_ret_type
         else:
             callee_ret_type = self.function_ret_types[fn_name]
         symbol = self.module.get_function(fn_name)
-        args_val = [arg.handle for arg in args_val]
+        args_val = flatten_values_to_ir(args_val)
         call_op = self.builder.call(symbol, args_val)
         if callee_ret_type == language.void:
             return None
@@ -1256,6 +1242,8 @@ def visit_Call(self, node):
                 ret = language.tuple(ret)
             return ret
         except Exception as e:
+            if config.front_end_debugging():
+                raise
            # Normally when we raise a CompilationError, we raise it as
            # `from None`, because the original fileline from the exception
            # is not relevant (and often points into code_generator.py
@@ -1335,6 +1323,8 @@ def visit(self, node):
        except CompilationError:
            raise
        except Exception as e:
+            if config.front_end_debugging():
+                raise
            # Wrap the error in a CompilationError which contains the source
            # of the @jit function.
            raise CompilationError(self.jit_fn.src, self.cur_node, repr(e)) from None
```
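The `serialize`/`deserialize` changes above pair a flatten-into-list pass over types with a cursor-driven unflatten pass over IR handles. Below is a self-contained sketch of that protocol using stand-in classes and plain integers for handles; `ScalarTy`, `DescriptorTy`, and the underscore-free method names are illustrative, not Triton's.

```python
from typing import List, Tuple

Handle = int  # stand-in for ir.value


class ScalarTy:
    """A type that occupies exactly one IR handle."""

    def flatten_ir_types(self, out: List[str]) -> None:
        out.append("scalar")

    def unflatten_ir(self, handles: List[Handle], cursor: int) -> Tuple[object, int]:
        return handles[cursor], cursor + 1


class DescriptorTy:
    """A type that flattens to a descriptor handle plus 2 shape and 2 stride scalars."""

    def flatten_ir_types(self, out: List[str]) -> None:
        out.extend(["desc", "i32", "i32", "i32", "i32"])

    def unflatten_ir(self, handles: List[Handle], cursor: int) -> Tuple[object, int]:
        desc = handles[cursor]
        shape = handles[cursor + 1:cursor + 3]
        strides = handles[cursor + 3:cursor + 5]
        return (desc, shape, strides), cursor + 5


arg_types = [ScalarTy(), DescriptorTy(), ScalarTy()]

# Serialization: each argument type appends however many IR types it needs.
ir_types: List[str] = []
for ty in arg_types:
    ty.flatten_ir_types(ir_types)
print(ir_types)  # ['scalar', 'desc', 'i32', 'i32', 'i32', 'i32', 'scalar']

# Deserialization: a single cursor walks the flat handle list in the same order.
handles = list(range(len(ir_types)))
cursor = 0
values = []
for ty in arg_types:
    val, cursor = ty.unflatten_ir(handles, cursor)
    values.append(val)
print(values)  # [0, (1, [2, 3], [4, 5]), 6]
```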

python/triton/compiler/compiler.py

Lines changed: 2 additions & 1 deletion

```diff
@@ -11,6 +11,7 @@
 from ..tools.disasm import get_sass
 # TODO: this shouldn't be here
 from .code_generator import ast_to_ttir
+from . import config
 from pathlib import Path
 import re
 import functools
@@ -179,7 +180,7 @@ def filter_traceback(e: BaseException):
 
     These are uninteresting to the user -- "just show me *my* code!"
     """
-    if os.getenv("TRITON_FRONT_END_DEBUGGING", "0") == "1":
+    if config.front_end_debugging():
         return
 
     if e.__cause__ is not None:
```

python/triton/compiler/config.py

Lines changed: 5 additions & 0 deletions

```diff
@@ -0,0 +1,5 @@
+import os
+
+
+def front_end_debugging():
+    return os.getenv("TRITON_FRONT_END_DEBUGGING", "0") == "1"
```

python/triton/language/core.py

Lines changed: 44 additions & 13 deletions

```diff
@@ -307,6 +307,12 @@ def _unflatten_ir(self, handles: List[ir.value], cursor: int) -> Tuple[base_value, int]:
         """
         raise NotImplementedError
 
+    def mangle(self) -> str:
+        raise NotImplementedError(f"NYI: Type mangling for type {self.__class__}")
+
+    def _flatten_ir_types(self, builder: ir.builder, out: List[ir.type]) -> None:
+        raise NotImplementedError
+
 
 # -----------------------
 # dtype
@@ -502,10 +508,6 @@ def is_ptr():
     def is_const():
         return False
 
-    @staticmethod
-    def is_tuple():
-        return False
-
     def __eq__(self, other: dtype):
         if not isinstance(other, dtype):
             return False
@@ -518,6 +520,9 @@ def __hash__(self):
     def scalar(self):
         return self
 
+    def _flatten_ir_types(self, builder: ir.builder, out: List[ir.type]) -> None:
+        out.append(self.to_ir(builder))
+
     def to_ir(self, builder: ir.builder) -> ir.type:
         if self.name.startswith("fp8"):
             if self.name not in builder.options.supported_fp8_dtypes:
@@ -581,6 +586,17 @@ def __repr__(self):
     def _unflatten_ir(self, handles: List[ir.value], cursor: int) -> Tuple[base_value, int]:
         return tensor(handles[cursor], self), cursor + 1
 
+    def mangle(self) -> str:
+        if self.is_int():
+            SIGNED = dtype.SIGNEDNESS.SIGNED
+            prefix = 'i' if self.int_signedness == SIGNED else 'u'
+            return prefix + str(self.int_bitwidth)
+        if self.is_floating():
+            return str(self)
+        if self.is_void():
+            return 'V'
+        return super().mangle()
+
 
 # Some functions have a param named `dtype`, which shadows the `dtype` class.
 # We can't change the param name because it is part of function's public API.
@@ -623,6 +639,9 @@ def __eq__(self, other: pointer_type) -> bool:
     def scalar(self):
         return self
 
+    def mangle(self) -> str:
+        return f"P{self.element_ty.mangle()}"
+
 
 class nv_tma_desc_type(pointer_type):
 
@@ -672,6 +691,11 @@ def __eq__(self, other) -> bool:
     def scalar(self):
         return self.element_ty
 
+    def mangle(self) -> str:
+        elt = self.scalar.mangle()
+        shape = '_'.join(map(str, self.shape))
+        return f'{elt}S{shape}S'
+
 
 class tuple_type(base_type):
 
@@ -686,15 +710,14 @@ def __str__(self):
     def __iter__(self):
         return iter(self.types)
 
-    def to_ir(self, builder: ir.builder):
-        return [ty.to_ir(builder) for ty in self.types]
+    def _flatten_ir_types(self, builder: ir.builder, out: List[ir.type]):
+        for ty in self.types:
+            if not isinstance(ty, constexpr):
+                ty._flatten_ir_types(builder, out)
 
     def __getitem__(self, index: int) -> dtype:
         return self.types[index]
 
-    def is_tuple(self):
-        return True
-
     def __eq__(self, other):
         return type(self) is type(other) and self.types == other.types and self.fields == other.fields
 
@@ -705,6 +728,9 @@ def _unflatten_ir(self, handles: List[ir.value], cursor: int) -> Tuple[tuple, int]:
         values.append(value)
         return tuple(values, self), cursor
 
+    def mangle(self):
+        return 'T' + '_'.join(ty.mangle() for ty in self.types) + 'T'
+
 
 class slice_type(dtype):
 
@@ -1263,8 +1289,8 @@ def _unflatten_ir(self, handles: List[ir.value], cursor: int) -> Tuple[tensor_descriptor_base, int]:
         value = tensor_descriptor_base(handles[cursor], self.block_type)
         return value, cursor + 1
 
-    def to_ir(self, builder: ir.builder):
-        return builder.create_tensor_descriptor_type(self.block_type.to_ir(builder))
+    def _flatten_ir_types(self, builder: ir.builder, out: List[ir.type]) -> None:
+        out.append(builder.create_tensor_descriptor_type(self.block_type.to_ir(builder)))
 
     def __str__(self) -> str:
         # ex. "tensor_descriptor<float32[16, 32]>"
@@ -1278,6 +1304,9 @@ def __eq__(self, other) -> bool:
     def __neq__(self, other) -> bool:
         return not (self == other)
 
+    def mangle(self) -> str:
+        return f"TD{self.block_type.mangle()}"
+
 
 class tensor_descriptor_base(base_value):
     """"
@@ -1363,8 +1392,10 @@ def _unflatten_ir(self, handles: List[ir.value], cursor: int) -> Tuple[tensor_descriptor, int]:
         value = tensor_descriptor(handle, shape, strides, self.block_type)
         return value, cursor
 
-    def to_ir(self, builder: ir.builder):
-        return [super().to_ir(builder), *self.shape_type.to_ir(builder), *self.strides_type.to_ir(builder)]
+    def _flatten_ir_types(self, builder: ir.builder, out: List[ir.type]) -> None:
+        super()._flatten_ir_types(builder, out)
+        self.shape_type._flatten_ir_types(builder, out)
+        self.strides_type._flatten_ir_types(builder, out)
 
     def __eq__(self, other):
         return super().__eq__(other) and (self.shape_type == other.shape_type) and (self.strides_type
```
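To make the new per-type `mangle()` rules concrete, here is a small standalone restatement of them; the helper functions are illustrative, and `"fp32"` is assumed to be what `str()` of a 32-bit float dtype yields.

```python
def mangle_int(bitwidth: int, signed: bool = True) -> str:
    # 'i' / 'u' prefix plus bitwidth, as in dtype.mangle()
    return ('i' if signed else 'u') + str(bitwidth)


def mangle_pointer(element: str) -> str:
    return f"P{element}"


def mangle_block(scalar: str, shape) -> str:
    return f"{scalar}S{'_'.join(map(str, shape))}S"


def mangle_tensor_descriptor(block: str) -> str:
    return f"TD{block}"


print(mangle_pointer("fp32"))                                     # Pfp32
print(mangle_block(mangle_int(32), (16, 16)))                     # i32S16_16S
print(mangle_tensor_descriptor(mangle_block("fp32", (16, 32))))   # TDfp32S16_32S
```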
