
Commit b963ab5

davidberard98 authored and pytorchmergebot committed
[inductor][1/N] triton support post-pytorch#5512, main components (pytorch#145051)
Triton commit 5220 adds tuple support in Triton (changing the indexing format in AttrsDescriptor) and commit 5512 replaces AttrsDescriptor with raw tuples. This is an initial PR to add support for Triton versions after commit 5512 landed.

The main changes in 5220 and 5512 that need to be supported:
* AttrsDescriptor() gets replaced with a raw dict. The raw dict has the format `{(TUPLES): [["tt.divisibility", 16]]}`, where `(TUPLES)` is a tuple of indices, e.g. `((0,), (1,), (3,))`, indicating that args 0, 1, and 3 are divisible by 16. These indices are themselves represented as tuples to support nested inputs (e.g. an argument that's a tuple), but support for tuples is not implemented right now.
* "signature" changes: the signature now contains _all_ args, including constexpr and constant args.
* ASTSource now takes "constexprs" instead of "constants" - for example, equal-to-1 args are constants but not constexprs, so we don't need to pass these args as "constants".

What this PR supports:
* Triton versions before Dec 9, 2024, and (partial support for) Triton versions after Jan 1, 2025
* (triton Jan 1+) typical Inductor-generated Triton: updated AttrsDescriptor, signatures, constexpr/constant handling

What this PR doesn't support (TODO in follow-up PRs):
* Triton versions between Dec 9, 2024 and Jan 1, 2025
* (triton Jan 1+) user-defined Triton kernel support (this is implemented already in @anmyachev's patch)
* (triton Jan 1+) triton_helper support (failing in Triton codegen - needs investigation)
* (triton Jan 1+) AOTI / cpp wrapper

Thanks to @anmyachev for the patches in https://github.com/intel/intel-xpu-backend-for-triton/blob/main/scripts/pytorch.patch, which already contain most of these changes.

Pull Request resolved: pytorch#145051
Approved by: https://github.com/jansel
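
As a quick illustration of the new attrs format described above (a sketch only, not Triton's actual API surface; it just shows the dict shape Inductor now has to emit):

```python
# Old style (pre triton#5512): hints were carried by an AttrsDescriptor-like
# object, roughly AttrsDescriptor(divisible_by_16=(0, 1, 3), equal_to_1=(5,)).

# New style (post #5512): a raw dict keyed by a tuple of per-argument index
# tuples. The nested tuples leave room for tuple-valued arguments, which
# Inductor does not emit yet.
attrs = {
    ((0,), (1,), (3,)): [["tt.divisibility", 16]],  # args 0, 1, 3 divisible by 16
}

for arg_indices, properties in attrs.items():
    for prop_name, prop_value in properties:
        print(f"{prop_name}={prop_value} for args {[i[0] for i in arg_indices]}")
# -> tt.divisibility=16 for args [0, 1, 3]
```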
1 parent 714f643 commit b963ab5

File tree: 8 files changed, +200 -58 lines changed


test/inductor/test_torchinductor.py

Lines changed: 19 additions & 2 deletions
@@ -9271,14 +9271,23 @@ def run(x):
             self.assertEqual(fw_code.count("halide_helpers.rand"), 2)
             self.assertEqual(bw_code.count("halide_helpers.rand"), 0)
         elif self.device == GPU_TYPE:
-            self.assertEqual(fw_code.count("tl.rand"), 2)
+            # the load_seed_offset arg can be 1 or non-1; depending on whether
+            # the triton signature specializes on 1 vs non-1, you might get 1
+            # or 2 kernels. In newer versions of triton, there's no specialization
+            # so we get only 1 kernel.
+            from torch._inductor.utils import triton_version_uses_attrs_dict
+
+            expected_kernels = 1 if triton_version_uses_attrs_dict() else 2
+            self.assertEqual(fw_code.count("tl.rand"), expected_kernels)
             self.assertEqual(bw_code.count("tl.rand"), 0)
         self.assertEqual(torch._inductor.metrics.generated_kernel_count, 4)

     def test_randint_kernel_count(self):
         if self.device != GPU_TYPE:
             raise unittest.SkipTest("Only valid for GPU!")

+        from torch._inductor.utils import triton_version_uses_attrs_dict
+
         @torch._dynamo.optimize_assert("inductor")
         def fn1():
             random_tensor1 = torch.randint(10, [32], device=self.device)
@@ -9289,7 +9298,15 @@ def fn1():
         _, source_codes = run_and_get_code(fn1)
         # cpp_wrapper does a 2-pass generation on GPU.
         self.assertEqual(len(source_codes), 1)
-        self.assertEqual(source_codes[0].count("async_compile.triton"), 2)
+
+        # the load_seed_offset arg can be 1 or non-1; depending on whether
+        # the triton signature specializes on 1 vs non-1, you might get 1
+        # or 2 kernels. In newer versions of triton, there's no specialization
+        # so we get only 1 kernel.
+        expected_kernels = 1 if triton_version_uses_attrs_dict() else 2
+        self.assertEqual(
+            source_codes[0].count("async_compile.triton"), expected_kernels
+        )

     def test_roll(self):
         def fn(a):

torch/_inductor/codegen/common.py

Lines changed: 5 additions & 0 deletions
@@ -189,6 +189,11 @@ def alias_of(self):
         return None


+@dataclasses.dataclass
+class ConstexprArg:
+    name: str
+
+
 @dataclasses.dataclass
 class TMADescriptorArg:
     name: str

torch/_inductor/codegen/triton.py

Lines changed: 43 additions & 33 deletions
@@ -56,13 +56,15 @@
     sympy_product,
     sympy_subs,
     triton_type,
+    triton_version_uses_attrs_dict,
     upcast_compute_type,
 )
 from ..virtualized import _ops as ops, OpsHandler, ReductionType, StoreMode, V
 from ..wrapper_benchmark import get_kernel_category_by_source_code
 from .block_analysis import BlockPatternMatcher
 from .common import (
     BackendFeature,
+    ConstexprArg,
     CSE,
     CSEVariable,
     DeferredLine,
@@ -85,8 +87,8 @@
 )
 from .triton_utils import (
     config_of,
+    non_constexpr_signature,
     should_unwrap_unspec_arg,
-    signature_of,
     signature_to_meta,
 )

@@ -3357,6 +3359,35 @@ def codegen_kernel(self, name=None):

         mutated_args = sorted(mutated_args)

+        for tree in self.active_range_trees():
+            sizearg = SizeArg(f"{tree.prefix}numel", tree.numel)
+            signature.append(sizearg)
+            argdefs.append(sizearg.name)
+            # constexpr version causes issues, see
+            # https://github.com/pytorch/torchdynamo/pull/1362
+            # triton_meta["constants"][len(argdefs)] = V.graph.sizevars.size_hint(
+            #     tree.numel
+            # )
+            # argdefs.append(f"{tree.prefix}numel: tl.constexpr")
+
+        def add_constexpr_arg(arg_name):
+            # new versions (but not old versions) of Triton need constexprs included in the signature
+            if triton_version_uses_attrs_dict():
+                signature.append(ConstexprArg(arg_name))
+            argdefs.append(f"{arg_name} : tl.constexpr")
+
+        for tree in self.range_trees:
+            if tree.is_reduction and self.persistent_reduction:
+                # Rn_BLOCK for persistent_reduction is defined in codegen_static_numels
+                continue
+            if tree.tensor_dim is None:
+                continue
+
+            add_constexpr_arg(f"{tree.prefix.upper()}BLOCK")
+
+        if self.cooperative_reduction:
+            add_constexpr_arg("RSPLIT")
+
         triton_meta_signature = signature_to_meta(
             signature, size_dtype=self.index_dtype, argdefs=argdefs
         )
@@ -3390,42 +3421,19 @@ def codegen_kernel(self, name=None):
             num_gb = self.estimate_kernel_num_bytes() / 1e9
             inductor_meta["kernel_num_gb"] = num_gb

-        for tree in self.active_range_trees():
-            sizearg = SizeArg(f"{tree.prefix}numel", tree.numel)
-            signature.append(sizearg)
-            triton_meta_signature[sizearg.name] = signature_of(
-                sizearg, size_dtype=self.index_dtype
-            )
-            argdefs.append(f"{tree.prefix}numel")
-            # constexpr version causes issues, see
-            # https://github.com/pytorch/torchdynamo/pull/1362
-            # triton_meta["constants"][len(argdefs)] = V.graph.sizevars.size_hint(
-            #     tree.numel
-            # )
-            # argdefs.append(f"{tree.prefix}numel: tl.constexpr")
         triton_meta["configs"] = [config_of(signature)]

-        # Triton compiler includes equal_to_1 args into constants even
-        # when they are not constexpr. otherwise there may be a segfault
-        # during launching the Inductor-compiled Triton kernel.
-        # https://github.com/pytorch/pytorch/issues/120478#issuecomment-1962822307
-        # https://github.com/openai/triton/blob/231efe9ed2d200be0f69a07c298e4342b08efe3d/python/triton/runtime/jit.py#L384
-        for arg_num in triton_meta["configs"][0].equal_to_1:  # type: ignore[index]
-            triton_meta["constants"][signature[arg_num].name] = 1  # type: ignore[index]
+        if not triton_version_uses_attrs_dict():
+            # Triton compiler includes equal_to_1 args into constants even
+            # when they are not constexpr. otherwise there may be a segfault
+            # during launching the Inductor-compiled Triton kernel.
+            # https://github.com/pytorch/pytorch/issues/120478#issuecomment-1962822307
+            # https://github.com/openai/triton/blob/231efe9ed2d200be0f69a07c298e4342b08efe3d/python/triton/runtime/jit.py#L384
+            for arg_num in triton_meta["configs"][0].equal_to_1:  # type: ignore[index]
+                triton_meta["constants"][signature[arg_num].name] = 1  # type: ignore[index]

         self.triton_meta = triton_meta

-        for tree in self.range_trees:
-            if tree.is_reduction and self.persistent_reduction:
-                # Rn_BLOCK for persistent_reduction is defined in codegen_static_numels
-                continue
-            if tree.tensor_dim is None:
-                continue
-            argdefs.append(f"{tree.prefix.upper()}BLOCK : tl.constexpr")
-
-        if self.cooperative_reduction:
-            argdefs.append("RSPLIT : tl.constexpr")
-
         self.codegen_body()

         for helper in self.helper_functions:
@@ -3457,7 +3465,9 @@ def codegen_kernel(self, name=None):
         else:
             tile_hint = ""
             if len(size_hints) == 2:
-                if len(signature) == 4:  # input, output and 2 args
+                if (
+                    len(non_constexpr_signature(signature)) == 4
+                ):  # input, output and 2 args
                     tile_hint = "tile_hint=TileHint.SQUARE,"
                 else:
                     tile_hint = "tile_hint=TileHint.DEFAULT,"
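
The pattern in the hunks above: block-size constexprs always go into the Python argdefs, but they are only appended to the signature when the installed Triton expects the new attrs-dict format. A simplified, self-contained sketch of that gating (the stand-in `triton_version_uses_attrs_dict` and the tuple-based signature entries are placeholders for the real `torch._inductor.utils` helper and `ConstexprArg` objects):

```python
def triton_version_uses_attrs_dict() -> bool:
    # stand-in: the real helper inspects the installed Triton version
    return True


def add_constexpr_arg(arg_name, signature, argdefs):
    # mirrors the helper added above: newer Triton wants constexprs in the
    # signature as well, older Triton only wants them in the argdef list
    if triton_version_uses_attrs_dict():
        signature.append(("constexpr", arg_name))
    argdefs.append(f"{arg_name} : tl.constexpr")


signature = ["in_ptr0", "out_ptr0", "xnumel"]
argdefs = ["in_ptr0", "out_ptr0", "xnumel"]
add_constexpr_arg("XBLOCK", signature, argdefs)
print(signature)  # ['in_ptr0', 'out_ptr0', 'xnumel', ('constexpr', 'XBLOCK')]
print(argdefs)    # ['in_ptr0', 'out_ptr0', 'xnumel', 'XBLOCK : tl.constexpr']
```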

torch/_inductor/codegen/triton_combo_kernel.py

Lines changed: 15 additions & 8 deletions
@@ -16,9 +16,10 @@
 from ..runtime.runtime_utils import next_power_of_2
 from ..runtime.triton_heuristics import grid_combo_kernels
 from ..scheduler import BaseSchedulerNode
-from ..utils import Placeholder
+from ..utils import Placeholder, triton_version_uses_attrs_dict
 from ..virtualized import V
 from .common import (
+    ConstexprArg,
     DeferredLine,
     IndentedBuffer,
     Kernel,
@@ -727,8 +728,12 @@ def codegen_blocks(self, code: IndentedBuffer) -> None:
             code.splice(f"R0_BLOCK: tl.constexpr = {self.block_size_reduce}")
             code.splice(f"RBLOCK: tl.constexpr = {self.block_size_reduce}")

-    def add_blockd_to_args(self, argdefs: list[str]) -> list[str]:
-        block_args = {}
+    def get_block_args(self) -> list[ConstexprArg]:
+        """
+        Calculate blocks from sub_kernels and range_trees.
+        **Update self.block_args**
+        Return the block args
+        """
         block_names = {}
         for sub_kernel in self.sub_kernels:
             # TODO: we assume all sub_kernels have the same block size
@@ -739,13 +744,10 @@ def add_blockd_to_args(self, argdefs: list[str]) -> list[str]:
                     continue
                 if tree.prefix == "x" and sub_kernel.no_x_dim:
                     continue
-                block_args[f"{tree.prefix.upper()}BLOCK : tl.constexpr"] = tree.prefix
                 block_names[f"{tree.prefix.upper()}BLOCK"] = tree.prefix
-        if self.enable_autotune:
-            argdefs.extend(block_args)
         self.block_args = list(block_names.keys())

-        return argdefs
+        return [ConstexprArg(x) for x in block_names.keys()]

     def add_numel_to_args(self, argdefs: list[str], signature: list[Any]) -> list[str]:
         for num, sub_kernel in enumerate(self.sub_kernels):
@@ -830,7 +832,12 @@ def codegen_kernel(self, name: Optional[str] = None) -> str:

         argdefs, _, signature, _ = self.args.python_argdefs()
         argdefs = self.add_numel_to_args(argdefs, signature)
-        argdefs = self.add_blockd_to_args(argdefs)
+        block_args = self.get_block_args()
+        if self.enable_autotune:
+            argdefs.extend([f"{x.name}: tl.constexpr" for x in block_args])
+        if triton_version_uses_attrs_dict():
+            signature.extend(block_args)
+
         code.splice(
             self.jit_line(
                 heuristics,
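
For combo kernels the same split happens in two independent steps: `enable_autotune` decides whether the block constexprs become kernel parameters, while the Triton version decides whether they are recorded in the signature. A hypothetical, simplified sketch of that wiring (not the real class method):

```python
def triton_version_uses_attrs_dict() -> bool:
    # stand-in for the real check in torch._inductor.utils
    return True


def finalize_combo_args(argdefs, signature, block_args, enable_autotune):
    if enable_autotune:
        # block sizes become kernel parameters only when autotuning is enabled
        argdefs.extend(f"{name}: tl.constexpr" for name in block_args)
    if triton_version_uses_attrs_dict():
        # newer Triton records them in the signature either way
        signature.extend(block_args)
    return argdefs, signature


print(finalize_combo_args(["in_ptr0", "xnumel"], [], ["XBLOCK", "R0_BLOCK"], enable_autotune=False))
# (['in_ptr0', 'xnumel'], ['XBLOCK', 'R0_BLOCK'])
```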

torch/_inductor/codegen/triton_utils.py

Lines changed: 20 additions & 2 deletions
@@ -9,7 +9,14 @@
 from ..runtime.hints import AttrsDescriptorWrapper
 from ..utils import _type_of, expr_fits_within_32bit
 from ..virtualized import V
-from .common import KernelArgType, SizeArg, TensorArg, TMADescriptorArg, WorkspaceArg
+from .common import (
+    ConstexprArg,
+    KernelArgType,
+    SizeArg,
+    TensorArg,
+    TMADescriptorArg,
+    WorkspaceArg,
+)


 def should_unwrap_unspec_arg(name: str):
@@ -73,9 +80,20 @@ def signature_of(arg: KernelArgType, *, size_dtype: Optional[str]) -> str:
         return _type_of(arg.dtype)
     if isinstance(arg, TMADescriptorArg):
         return "nvTmaDesc"
+    if isinstance(arg, ConstexprArg):
+        return "constexpr"
     raise NotImplementedError(f"unhandled {type(arg)}: {arg}")


+def non_constexpr_signature(signature):
+    new_signature = []
+    for arg in signature:
+        if not isinstance(arg, ConstexprArg):
+            new_signature.append(arg)
+
+    return new_signature
+
+
 def signature_to_meta(
     signature: list[KernelArgType],
     *,
@@ -152,7 +170,7 @@ def is_aligned(x: KernelArgType, alignment: int, include_tensor: bool) -> bool:
     if isinstance(x, WorkspaceArg):
         # We allocate the workspace ourselves, so it is always aligned
         return True
-    if isinstance(x, TMADescriptorArg):
+    if isinstance(x, (TMADescriptorArg, ConstexprArg)):
         return False
     raise NotImplementedError(f"unhandled {type(x)}: {x}")
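
A self-contained sketch of the new filtering helper, using stand-in dataclasses instead of Inductor's real arg classes (only `ConstexprArg` matches the definition added in common.py above; `SizeArg` is simplified here):

```python
import dataclasses


@dataclasses.dataclass
class ConstexprArg:
    # mirrors the dataclass added to torch/_inductor/codegen/common.py
    name: str


@dataclasses.dataclass
class SizeArg:
    # simplified stand-in; the real SizeArg holds a sympy expression
    name: str
    expr: int


def non_constexpr_signature(signature):
    # same filtering idea as the helper above: drop constexpr entries, e.g.
    # before counting "real" inputs for the TileHint.SQUARE heuristic
    return [arg for arg in signature if not isinstance(arg, ConstexprArg)]


signature = [SizeArg("xnumel", 1024), SizeArg("r0_numel", 64), ConstexprArg("XBLOCK")]
print([arg.name for arg in non_constexpr_signature(signature)])  # ['xnumel', 'r0_numel']
```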

torch/_inductor/runtime/hints.py

Lines changed: 17 additions & 1 deletion
@@ -47,6 +47,7 @@ def _is_triton_available() -> bool:
     import triton.compiler.compiler

     if hasattr(triton.backends.compiler, "AttrsDescriptor"):
+        # Triton 3.2.0 - the second implementation
        from triton.backends.compiler import AttrsDescriptor

         def AttrsDescriptorWrapper(
@@ -67,7 +68,8 @@ def AttrsDescriptorWrapper(
             assert res.property_values["tt.equal_to"] == 1
             return res

-    else:
+    elif hasattr(triton.compiler.compiler, "AttrsDescriptor"):
+        # Triton 3.0.0 - the original implementation
         from triton.compiler.compiler import AttrsDescriptor

         def AttrsDescriptorWrapper(
@@ -83,6 +85,20 @@ def AttrsDescriptorWrapper(
             # Instantiate AttrsDescriptor with the prepared arguments
             return AttrsDescriptor(**kwargs)

+    else:
+        # Triton in 2025:
+        # note: there's also a range of triton commits not currently supported
+        # from ~Dec 9, 2024 to Jan 1 2025, in which AttrsDescriptors are still
+        # used, but the contents are different.
+
+        def AttrsDescriptorWrapper(
+            divisible_by_16=None,
+            equal_to_1=None,
+        ):
+            return {
+                tuple((x,) for x in divisible_by_16): [["tt.divisibility", 16]],
+            }
+
 else:
     # Define a namedtuple as a fallback when AttrsDescriptor is not available
     AttrsDescriptorWrapper = collections.namedtuple(  # type: ignore[no-redef, name-match]
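
To make the newest branch concrete, here is a standalone copy of the 2025-style wrapper and the dict it produces, assuming flat integer indices as Inductor currently passes (note that `equal_to_1` is accepted but no longer encoded, matching the commit description above):

```python
def attrs_descriptor_wrapper(divisible_by_16=(), equal_to_1=()):
    # same construction as the new branch above: a single key holding every
    # divisible-by-16 argument index, each wrapped in its own tuple;
    # equal_to_1 is kept for API compatibility but not encoded here
    return {
        tuple((x,) for x in divisible_by_16): [["tt.divisibility", 16]],
    }


print(attrs_descriptor_wrapper(divisible_by_16=(0, 1, 3)))
# {((0,), (1,), (3,)): [['tt.divisibility', 16]]}
```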
