Commit b2c89bc

davidberard98 authored and pytorchmergebot committed
[inductor][2/N] triton support post-pytorch#5512, user-defined triton kernels (pytorch#145348)
Triton commit 5220 adds tuple support in Triton (changing the indexing format in AttrsDescriptor), and commit 5512 replaces AttrsDescriptor with raw tuples. This PR fixes user-defined Triton kernel handling (in most cases) for these new Triton commits.

What this PR fixes:
* triton_kernel_wrap.py - AST->TTIR parsing had to be updated for the new Triton API
* ir.py - don't remove None args when using newer Triton versions
* wrapper.py - update signature & constant handling

What this doesn't fix:
* correct None handling - I want to take a closer look at constant handling (including None, equal_to_1, and other constants)
* the cpp wrapper (which needs to be fixed for both user-defined Triton kernels and inductor-generated kernels)

test/inductor/test_triton_kernels.py passed on Triton commit 74de6b46, with the exception of three tests (those shown here: pytorch@1374074).

Pull Request resolved: pytorch#145348
Approved by: https://github.com/jansel
ghstack dependencies: pytorch#145051
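For context, a minimal sketch of the signature split this PR implements across the two Triton generations. The helper triton_version_uses_attrs_dict is real (it lives in torch._inductor.utils, per the diffs below); the standalone function and its type_of callback are illustrative, not inductor's exact code.

# Editor's sketch, assuming the behavior described in the diffs below.
from torch._inductor.utils import triton_version_uses_attrs_dict

def build_signature(ordered_args, constexpr_indices, type_of):
    if triton_version_uses_attrs_dict():
        # Newer Triton (post-triton#5512): constexpr args stay in the
        # signature, typed as the literal string "constexpr".
        return {
            name: "constexpr" if i in constexpr_indices else type_of(arg)
            for i, (name, arg) in enumerate(ordered_args.items())
        }
    # Older Triton: constexpr args are dropped from the signature entirely.
    return {
        name: type_of(arg)
        for i, (name, arg) in enumerate(ordered_args.items())
        if i not in constexpr_indices
    }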
1 parent b963ab5 · commit b2c89bc

File tree

5 files changed: +193, -83 lines

test/inductor/test_triton_kernels.py

Lines changed: 17 additions & 6 deletions

@@ -1231,6 +1231,8 @@ def f(x):
     @requires_gpu
     @common_utils.parametrize("dynamic", [False, True])
     def test_triton_kernel_equal_to_1_arg(self, dynamic):
+        from torch._inductor.utils import triton_version_uses_attrs_dict
+
         @triton.jit
         def add_kernel_half_n_elements(
             in_ptr0,
@@ -1263,17 +1265,25 @@ def f(x, y):
             torch.compile(f, dynamic=dynamic), x, y
         )

-        if dynamic:
-            # when half_n_elements passed to the Triton kernel is
-            # dynamic, equal_to_1 specialization can't be enforced
-            self.assertTrue(_triton_get_ast_equal_to_str(()) in sources[0])
+        if triton_version_uses_attrs_dict():
+            self.assertFalse("equal_to" in sources[0])
         else:
-            self.assertTrue(_triton_get_ast_equal_to_str((3,)) in sources[0])
+            if dynamic:
+                # when half_n_elements passed to the Triton kernel is
+                # dynamic, equal_to_1 specialization can't be enforced
+
+                # also, equal_to_1 specialization doesn't occur (or appear in the
+                # signature) for newer versions of triton (i.e. the ones where
+                # triton_version_uses_attrs_dict() == True)
+                self.assertTrue(_triton_get_ast_equal_to_str(()) in sources[0])
+            else:
+                self.assertTrue(_triton_get_ast_equal_to_str((3,)) in sources[0])
         self.assertEqual(compiled_out, eager_out)

     @requires_gpu
     @common_utils.parametrize("dynamic", [False, True])
     def test_triton_kernel_equal_to_1_float_arg(self, dynamic):
+        from torch._inductor.utils import triton_version_uses_attrs_dict
+
         def f(x, y):
             out = torch.empty_like(x)
             n_elements = x.numel()
@@ -1297,7 +1307,8 @@ def f(x, y):

         # float 1.0 (both literal or symbolic)
         # should not be added to equal_to_1
-        self.assertTrue(_triton_get_ast_equal_to_str(()) in sources[0])
+        if not triton_version_uses_attrs_dict():
+            self.assertTrue(_triton_get_ast_equal_to_str(()) in sources[0])
         self.assertEqual(compiled_out, eager_out)

     @requires_gpu
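
For reference, a minimal sketch of the kind of user-defined kernel these tests exercise, modeled on the test's add_kernel_half_n_elements (not the verbatim test body). When half_n_elements happens to be 1, older Triton records an equal_to_1 specialization, which is what the assertions above probe.

import triton
import triton.language as tl

@triton.jit
def add_kernel_half_n_elements(
    in_ptr0,
    in_ptr1,
    out_ptr,
    half_n_elements,  # candidate for equal_to_1 specialization when it is 1
    BLOCK_SIZE: "tl.constexpr",
):
    pid = tl.program_id(axis=0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < half_n_elements * 2
    x = tl.load(in_ptr0 + offsets, mask=mask)
    y = tl.load(in_ptr1 + offsets, mask=mask)
    tl.store(out_ptr + offsets, x + y, mask=mask)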

torch/_higher_order_ops/triton_kernel_wrap.py

Lines changed: 77 additions & 12 deletions

@@ -172,10 +172,19 @@ def generate_ttir(
     """
     import sympy
     import triton
+    import triton.runtime.jit
     from triton.compiler.compiler import ASTSource
     from triton.runtime.autotuner import Autotuner
     from triton.runtime.jit import JITFunction

+    from torch._inductor.utils import (
+        get_triton_attrs_descriptor_version,
+        triton_version_uses_attrs_dict,
+        TritonAttrsDescriptorVersion,
+    )
+
+    triton_version = get_triton_attrs_descriptor_version()
+
     import torch._inductor.ir
     from torch._subclasses.fake_tensor import FakeTensor
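
The version enum referenced here lives in torch._inductor.utils; a sketch of its shape as implied by this diff (the member names appear verbatim below, while the numeric values and comments are assumptions):

import enum

class TritonAttrsDescriptorVersion(enum.Enum):
    # Member names as used in this diff; concrete values are illustrative.
    V1_COMPILER = 1        # AttrsDescriptor era, built via kernel._get_config
    V2_BACKENDS = 2        # AttrsDescriptor from triton.backends.compiler
    V3_BACKENDS_TUPLE = 3  # tuple support changes indexing (triton#5220)
    V4_DICT = 4            # AttrsDescriptor replaced by raw dicts (triton#5512)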

@@ -225,26 +234,78 @@ def generate_ttir(
     ]

     def _get_specialization(args):  # type: ignore[no-untyped-def]
-        try:
+        # Support multiple triton versions.
+        # This code basically copies JITFunction.run() logic to get the attrs
+        # needed to construct an ASTSource.
+        if triton_version == TritonAttrsDescriptorVersion.V1_COMPILER:
+            return kernel._get_config(*args)
+        elif triton_version in {
+            TritonAttrsDescriptorVersion.V2_BACKENDS,
+            TritonAttrsDescriptorVersion.V3_BACKENDS_TUPLE,
+        }:
             from triton.backends.compiler import AttrsDescriptor  # noqa: F401

             target = triton.runtime.driver.active.get_current_target()
-            backend = triton.compiler.compiler.make_backend(target)
-            return backend.get_attrs_descriptor(args, kernel.params)
-        except ImportError:
-            return kernel._get_config(*args)
+            backend_ = triton.compiler.compiler.make_backend(target)
+            return backend_.get_attrs_descriptor(args, kernel.params)
+        else:
+            assert (
+                get_triton_attrs_descriptor_version()
+                == TritonAttrsDescriptorVersion.V4_DICT
+            )
+            from triton._utils import find_paths_if, get_iterable_path
+            from triton.runtime.jit import specialize_impl
+
+            # logic is copied from: binder = create_function_from_signature(self.signature, self.params, backend)
+            attrvals = []
+            for arg, kp in zip(args, kernel.params):
+                if kp.is_constexpr:
+                    attrvals.append(arg)
+                else:
+                    spec = specialize_impl(
+                        arg,
+                        specialize_extra=backend.get_arg_specialization,
+                        is_const=kp.is_const,
+                        specialize_value=not kp.do_not_specialize,
+                        align=not kp.do_not_specialize_on_alignment,
+                    )
+                    attrvals.append(spec[1])
+
+            attrs = find_paths_if(attrvals, lambda _, x: isinstance(x, str))
+            attrs = {
+                k: backend.parse_attr(get_iterable_path(attrvals, k)) for k in attrs
+            }
+            return attrs

     specialization = _get_specialization(ordered_args.values())
     constants = {
         name: arg for name, arg in ordered_args.items() if not isinstance(arg, Tensor)
     }

-    # Build kernel signature -- doesn't include constexpr arguments.
-    signature = {
-        name: kernel._type_of(kernel._key_of(arg))
-        for i, (name, arg) in enumerate(ordered_args.items())
-        if i not in kernel.constexprs
-    }
+    if (mangle_type := getattr(triton.runtime.jit, "mangle_type", None)) is not None:
+
+        def get_signature_value(idx: int, arg: Any) -> str:
+            if kernel.params[idx].is_constexpr:
+                return "constexpr"
+            return mangle_type(arg)
+
+    else:
+
+        def get_signature_value(idx: int, arg: Any) -> str:
+            return kernel._type_of(kernel._key_of(arg))
+
+    if triton_version_uses_attrs_dict():
+        # In newer versions of Triton, the signature includes constexpr args
+        signature = {
+            name: get_signature_value(i, arg)
+            for i, (name, arg) in enumerate(ordered_args.items())
+        }
+    else:
+        # In older versions of Triton, the signature does not include constexpr args
+        signature = {
+            name: get_signature_value(i, arg)
+            for i, (name, arg) in enumerate(ordered_args.items())
+            if i not in kernel.constexprs
+        }

     triton._C.libtriton.ir.load_dialects(context)
     backend.load_dialects(context)
@@ -254,13 +315,17 @@ def _get_specialization(args):  # type: ignore[no-untyped-def]
     # Triton changes ASTSource.make_ir to take 3/4 arguments. Handle
     # backward compatibility here.
     make_ir_sig_params = len(inspect.signature(src.make_ir).parameters)
+    get_codegen_implementation_sig_params = len(
+        inspect.signature(backend.get_codegen_implementation).parameters
+    )
     if make_ir_sig_params == 2:
         ttir_module = src.make_ir(options, context)
     elif make_ir_sig_params == 3:
         codegen_fns = backend.get_codegen_implementation()
         ttir_module = src.make_ir(options, codegen_fns, context)
     else:
-        codegen_fns = backend.get_codegen_implementation()
+        codegen_args = [options] if get_codegen_implementation_sig_params == 1 else []
+        codegen_fns = backend.get_codegen_implementation(*codegen_args)
         module_map = backend.get_module_map()
         ttir_module = src.make_ir(options, codegen_fns, module_map, context)
     if not ttir_module.verify():
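
The backward-compatibility block above dispatches on callee arity via inspect.signature. The same pattern in isolation (a generic sketch, not inductor's helper):

import inspect

def call_with_compatible_arity(fn, *available_args):
    # Pass only as many leading positional args as fn declares.
    n_params = len(inspect.signature(fn).parameters)
    return fn(*available_args[:n_params])

Inductor spells out each arity explicitly instead of truncating like this, since the make_ir argument lists are not simple prefixes of one another (the 2-argument form is (options, context), not (options, codegen_fns)).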

torch/_inductor/codegen/triton_utils.py

Lines changed: 10 additions & 4 deletions

@@ -7,7 +7,7 @@

 from .. import config
 from ..runtime.hints import AttrsDescriptorWrapper
-from ..utils import _type_of, expr_fits_within_32bit
+from ..utils import _type_of, expr_fits_within_32bit, triton_version_uses_attrs_dict
 from ..virtualized import V
 from .common import (
     ConstexprArg,
@@ -55,9 +55,15 @@ def signature_of(arg: KernelArgType, *, size_dtype: Optional[str]) -> str:
         return tye
     if isinstance(arg, SizeArg):
         if arg.expr is None:
-            # From triton/runtime/jit.py
-            # `None` is nullptr. Implicitly convert to *i8.
-            return "*i8"
+            if triton_version_uses_attrs_dict():
+                # In newer versions of Triton, the signature includes "None" args
+                # and their type is marked as "constexpr"
+                return "constexpr"
+            else:
+                # In older versions of Triton, from triton/runtime/jit.py:
+                # `None` is nullptr. Implicitly convert to *i8.
+                return "*i8"
         elif isinstance(arg.expr, (float, sympy.Float)):
             return "fp32"
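
A minimal expectation sketch of the resulting behavior for a None-valued size arg (the import paths are the real ones from this diff; the assertion is illustrative, not a test in the suite):

from torch._inductor.codegen.common import SizeArg
from torch._inductor.codegen.triton_utils import signature_of
from torch._inductor.utils import triton_version_uses_attrs_dict

# A SizeArg whose expr is None (e.g. an optional argument passed as None).
none_arg = SizeArg("maybe_ptr", None)
expected = "constexpr" if triton_version_uses_attrs_dict() else "*i8"
assert signature_of(none_arg, size_dtype=None) == expected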

torch/_inductor/codegen/wrapper.py

Lines changed: 41 additions & 18 deletions

@@ -42,6 +42,7 @@
     LineContext,
     sympy_product,
     sympy_str,
+    triton_version_uses_attrs_dict,
 )
 from ..virtualized import V
 from .common import (
@@ -1577,63 +1578,85 @@ def define_user_defined_triton_kernel(

         original_name = kernel.__name__

-        from .common import KernelArgType, SizeArg, TensorArg, TMADescriptorArg
+        from .common import (
+            ConstexprArg,
+            KernelArgType,
+            SizeArg,
+            TensorArg,
+            TMADescriptorArg,
+        )

         signature: list[KernelArgType] = []
         constants: dict[str, Any] = {}
         non_constant_indices = []
         equal_to_1_args: list[str] = []
+
+        def add_to_signature(idx, arg):
+            signature.append(arg)
+            non_constant_indices.append(idx)
+
         for idx, key in enumerate(kernel.arg_names):
+            if idx in kernel.constexprs:
+                if key in kwargs:
+                    constants[key] = kwargs[key]
+                if triton_version_uses_attrs_dict():
+                    add_to_signature(idx, ConstexprArg(name=key))
+                continue
+
             if key not in kwargs:
                 continue
+
             arg = kwargs[key]
-            if idx in kernel.constexprs:
-                constants[key] = arg
-            elif kwargs[key] is None:
+
+            if kwargs[key] is None:
                 constants[key] = None
             else:
-                non_constant_indices.append(idx)
                 if isinstance(arg, ir.TMADescriptor):
-                    signature.append(
+                    add_to_signature(
+                        idx,
                         TMADescriptorArg(
                             name=key,
-                        )
+                        ),
                     )
                 elif isinstance(arg, ir.Buffer):
-                    signature.append(
+                    add_to_signature(
+                        idx,
                         TensorArg(
                             name=key,
                             buffer=arg.get_name(),
                             dtype=arg.get_dtype(),
-                        )
+                        ),
                     )
                 elif isinstance(arg, ir.ReinterpretView):
                     # for ReinterpretView we use the underlying
                     # buffer name and note the (possibly non-zero)
                     # offset relative to the underlying buffer
-                    signature.append(
+                    add_to_signature(
+                        idx,
                         TensorArg(
                             name=key,
                             buffer=arg.data.get_name(),
                             dtype=arg.get_dtype(),
                             offset=arg.layout.offset,
-                        )
+                        ),
                     )
                 else:
-                    signature.append(SizeArg(key, arg))
+                    add_to_signature(idx, SizeArg(key, arg))
                     if isinstance(
                         arg, (int, sympy.Integer)
                     ) and V.graph.sizevars.statically_known_equals(
                         arg, 1  # type: ignore[arg-type]
                     ):
                         equal_to_1_args.append(key)
+
+        triton_signature = signature_to_meta(
+            signature,
+            size_dtype=None,  # try to infer based on symints
+            indices=non_constant_indices,
+            argdefs=kernel.arg_names,
+        )
         triton_meta: dict[str, Any] = {
-            "signature": signature_to_meta(
-                signature,
-                size_dtype=None,  # try to infer based on symints
-                indices=non_constant_indices,
-                argdefs=kernel.arg_names,
-            ),
+            "signature": triton_signature,
             "device": DeviceProperties.create(V.graph.get_current_device_or_throw()),
             # Triton compiler includes equal_to_1 args into constants even
             # when they are not constexpr. otherwise there may be a segfault
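
Put together, for a hypothetical kernel def k(x_ptr, n, BLOCK: tl.constexpr) launched with BLOCK=128, the metadata built above would plausibly look like this (the type strings and dict contents are assumptions, not captured output):

from torch._inductor.utils import triton_version_uses_attrs_dict

if triton_version_uses_attrs_dict():
    # Newer Triton: constexpr args appear in the signature as "constexpr".
    expected_signature = {"x_ptr": "*fp32", "n": "i32", "BLOCK": "constexpr"}
else:
    # Older Triton: constexpr args are omitted from the signature.
    expected_signature = {"x_ptr": "*fp32", "n": "i32"}
expected_constants = {"BLOCK": 128}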
