
Commit affd071

dolpm authored and pytorchmergebot committed
[export] serialization support for triton_kernel_wrapper_functional (pytorch#161314)
Summary: att

Test Plan: buck2 test mode/opt //caffe2/test:test_export -- test_triton_hop

Rollback Plan:

Differential Revision: D80827767

Pull Request resolved: pytorch#161314
Approved by: https://github.com/angelayi
1 parent dac062f commit affd071
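In brief, the path this commit covers: a triton.jit kernel launched through wrap_triton is captured by torch.export as the triton_kernel_wrapper_functional higher-order op, which the serializer can now handle (deserialization still raises, see below). A minimal sketch adapted from the new test, assuming a CUDA device with triton installed and an add_kernel defined as in that test:

import torch
import triton
from torch.library import wrap_triton
from torch._export.serde.serialize import ExportedProgramSerializer

class M(torch.nn.Module):
    def forward(self, x, y):
        out = torch.empty_like(x)
        n = out.numel()
        grid = lambda meta: (triton.cdiv(n, meta["BLOCK_SIZE"]),)
        # add_kernel is the @triton.jit elementwise-add kernel from the test below
        wrap_triton(add_kernel)[grid](x, y, out, n, 16)
        return out

args = (torch.randn(3, device="cuda"), torch.randn(3, device="cuda"))
ep = torch.export.export(M().to("cuda"), args=args)
# keep the triton higher-order op in the graph instead of decomposing it
ep = ep.run_decompositions(decompose_custom_triton_ops=False)
serialized = ExportedProgramSerializer().serialize(ep)  # supported as of this commit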

2 files changed: +219 −1 lines changed


test/export/test_serialize.py

Lines changed: 123 additions & 0 deletions
@@ -14,13 +14,24 @@
 from pathlib import Path
 from typing import NamedTuple
 
+from torch.testing._internal.inductor_utils import HAS_GPU
+
+
+if HAS_GPU:
+    import triton
+    import triton.language as tl
+
+    from torch.library import wrap_triton
+from torch.utils._triton import has_triton
+
 import torch
 import torch._dynamo as torchdynamo
 import torch._export.serde.schema as schema
 import torch.export._trace
 import torch.utils._pytree as pytree
 from torch._export.db.case import ExportCase, SupportLevel
 from torch._export.db.examples import all_examples
+from torch._export.serde.schema import ArgumentKind
 from torch._export.serde.serialize import (
     _dict_to_dataclass,
     _to_json_bytes,
@@ -582,6 +593,118 @@ def forward(self, x):
             serialized.exported_program.range_constraints[symint.name].max_val, 3
         )
 
+    @unittest.skipIf(
+        not torch.cuda.is_available() or not has_triton(), "requires cuda and triton"
+    )
+    def test_triton_hop(self) -> None:
+        @triton.jit
+        def add_kernel(
+            in_ptr0,
+            in_ptr1,
+            out_ptr,
+            n_elements,
+            BLOCK_SIZE: "tl.constexpr",
+        ):
+            pid = tl.program_id(axis=0)
+            block_start = pid * BLOCK_SIZE
+            offsets = block_start + tl.arange(0, BLOCK_SIZE)
+            mask = offsets < n_elements
+            x = tl.load(in_ptr0 + offsets, mask=mask)
+            y = tl.load(in_ptr1 + offsets, mask=mask)
+            output = x + y
+            tl.store(out_ptr + offsets, output, mask=mask)
+
+        def custom_add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+            output = torch.empty_like(x)
+            n_elements = output.numel()
+
+            def grid(meta):
+                return (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)
+
+            wrap_triton(add_kernel)[grid](x, y, output, n_elements, 16)
+
+            return output
+
+        class MyModel(torch.nn.Module):
+            def forward(self, x, y):
+                return custom_add(x, y)
+
+        def custom_add_autotune(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+            output = torch.empty_like(x)
+            n_elements = output.numel()
+
+            def grid(meta):
+                return (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)
+
+            wrap_triton(add_kernel)[grid](x, y, output, n_elements, 16, num_warps=8)
+
+            return output
+
+        class MyModelAutotune(torch.nn.Module):
+            def forward(self, x, y):
+                return custom_add_autotune(x, y)
+
+        device = "cuda"
+
+        for m in [MyModel().to(device), MyModelAutotune().to(device)]:
+            args = (torch.randn(3, device=device), torch.randn(3, device=device))
+            ep = torch.export.export(m, args=args)
+            ep = ep.run_decompositions(decompose_custom_triton_ops=False)
+            assert torch.allclose(m(*args), ep.module()(*args))
+
+            serialized = ExportedProgramSerializer().serialize(ep)
+
+            for node in serialized.exported_program.graph_module.graph.nodes:
+                if (
+                    node.target
+                    == "torch.ops.higher_order.triton_kernel_wrapper_functional"
+                ):
+                    triton_node = node
+
+            self.assertIsNotNone(triton_node)
+
+            args = []
+            kwargs = []
+
+            for arg in triton_node.inputs:
+                if arg.kind == ArgumentKind.POSITIONAL:
+                    args.append(arg.arg)
+                elif arg.kind == ArgumentKind.KEYWORD:
+                    kwargs.append(arg.arg)
+
+            self.assertEqual(len(args), 4)
+            self.assertEqual(len(kwargs), 4)
+
+            for i in range(3):
+                self.assertIsNotNone(args[i].as_tensor)
+
+            self.assertEqual(args[3].as_int, 3)
+
+            self.assertEqual(kwargs[0].as_string, "add_kernel")  # name
+            self.assertEqual(kwargs[1].as_ints, [1, 1, 1])  # grid
+            self.assertEqual(kwargs[2].as_ints, [2])  # output indices
+            self.assertEqual(
+                kwargs[3].as_int, 8 if isinstance(m, MyModelAutotune) else 4
+            )  # num warps
+
+            self.assertEqual(len(triton_node.outputs), 1)
+            self.assertIsNotNone(triton_node.outputs[0].as_tensors)
+            self.assertEqual(
+                len(triton_node.outputs[0].as_tensors), len(kwargs[2].as_ints)
+            )
+            self.assertEqual(triton_node.outputs[0].as_tensors[0].name, "getitem")
+
+        with self.assertRaisesRegex(
+            SerializeError,
+            "deserialize nyi for torch._higher_order_ops.triton_kernel_wrap.triton_kernel_wrapper_functional",
+        ):
+            ExportedProgramDeserializer().deserialize(
+                serialized.exported_program,
+                serialized.state_dict,
+                serialized.constants,
+                serialized.example_inputs,
+            )
+
     def test_kwargs_default(self) -> None:
         """
         Tests that the kwargs default values are serialized even if they are not
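The asserts above pin down the wire layout: runtime kernel arguments travel as POSITIONAL inputs (three tensors plus n_elements) and kernel metadata as KEYWORD inputs (name, grid, output_indices, num_warps). A compact sketch of splitting a serialized node's inputs back apart, assuming a triton_node found the way the test finds it:

from torch._export.serde.schema import ArgumentKind

pos = [a.arg for a in triton_node.inputs if a.kind == ArgumentKind.POSITIONAL]
kw = [a.arg for a in triton_node.inputs if a.kind == ArgumentKind.KEYWORD]
# pos -> [x, y, output tensors, n_elements == 3]
# kw  -> [name "add_kernel", grid [1, 1, 1], output_indices [2], num_warps 4 or 8]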

torch/_export/serde/serialize.py

Lines changed: 96 additions & 1 deletion
@@ -35,6 +35,7 @@
 from torch.utils._sympy.symbol import prefix_str, SymT
 from torch.utils._sympy.value_ranges import ValueRanges
 from torch.utils._traceback import CapturedTraceback
+from torch.utils._triton import has_triton
 
 from ..utils import remove_proxy_from_state_dict
 from .schema import (  # type: ignore[attr-defined]
@@ -93,6 +94,14 @@
 from .union import _Union
 
 
+if has_triton():
+    from triton.runtime.autotuner import Autotuner
+else:
+
+    class Autotuner:  # type: ignore[no-redef]
+        pass
+
+
 __all__ = [
     "serialize",
     "GraphModuleSerializer",
@@ -670,6 +679,75 @@ def serialize_tensor_list_output(node):
                 metadata=self.serialize_metadata(node),
                 is_hop_single_tensor_return=False,
             )
+        elif (
+            node.target
+            is torch._higher_order_ops.triton_kernel_wrap.triton_kernel_wrapper_functional
+        ):
+            assert has_triton(), "triton required to serialize triton kernels"
+
+            meta_val = node.meta["val"]
+            assert isinstance(meta_val, dict)
+
+            output_keys = meta_val.keys()
+            output_indices = []
+
+            assert isinstance(node.kwargs["kernel_idx"], int)
+            kernel = torch._higher_order_ops.triton_kernel_wrap.kernel_side_table.get_kernel(
+                node.kwargs["kernel_idx"]
+            )
+
+            if isinstance(kernel, Autotuner):
+                assert len(kernel.configs) == 1
+                num_warps = kernel.configs[0].num_warps
+                assert kernel.configs[0].num_ctas == 1, (
+                    "serialization only supports num_ctas == 1"
+                )
+                kernel = kernel.fn
+            else:
+                num_warps = 4
+
+            constexpr_keys = set()
+            for p in kernel.params:
+                if p.is_constexpr:
+                    constexpr_keys.add(p.name)
+
+            found_constexpr = False
+            args_new = ()
+            i = 0
+
+            assert isinstance(node.kwargs["kwargs"], dict)
+            for k, v in node.kwargs["kwargs"].items():
+                # don't serialize constexpr since they will
+                # be embedded into the binary and don't
+                # need to be passed around as attributes
+                if k in constexpr_keys:
+                    found_constexpr = True
+                    continue
+
+                assert not found_constexpr, (
+                    "non-constexpr args found after constexpr arg(s)"
+                )
+
+                if k in output_keys:
+                    output_indices.append(i)
+                args_new += (v,)  # type: ignore[assignment]
+                i += 1
+
+            assert isinstance(node.kwargs["grid"], list)
+            kwargs_new = {
+                "name": kernel.fn.__name__,
+                "grid": node.kwargs["grid"][0],
+                "output_indices": output_indices,
+                "num_warps": num_warps,
+            }
+
+            ex_node = Node(
+                target=self.serialize_operator(node.target),
+                inputs=self.serialize_hoo_inputs(args_new, kwargs_new),
+                outputs=self.serialize_hoo_outputs(node),
+                metadata=self.serialize_metadata(node),
+                is_hop_single_tensor_return=_is_hop_single_tensor_return(node),
+            )
         else:
             ex_node = Node(
                 target=self.serialize_operator(node.target),
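Traced through this branch for the test's launch wrap_triton(add_kernel)[grid](x, y, output, n_elements, 16), a worked illustration of what gets serialized: BLOCK_SIZE=16 is a tl.constexpr parameter, so it is baked into the compiled kernel and filtered out, leaving four runtime arguments; output sits at index 2; a plain (non-Autotuner) kernel falls back to num_warps=4. The values below match the asserts in test_triton_hop:

import torch

x, y = torch.randn(3), torch.randn(3)
output = torch.empty_like(x)
n_elements = output.numel()  # 3

args_new = (x, y, output, n_elements)  # BLOCK_SIZE=16 dropped (constexpr)
kwargs_new = {
    "name": "add_kernel",    # kernel.fn.__name__
    "grid": [1, 1, 1],       # node.kwargs["grid"][0]
    "output_indices": [2],   # `output` is at index 2 of args_new
    "num_warps": 4,          # default when the kernel is not an Autotuner
}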
@@ -1541,6 +1619,17 @@ def serialize_hoo_outputs(self, node: torch.fx.Node) -> list[Argument]:
                 outputs.append(self.serialize_output(name, element_meta_val))
 
             return outputs
+        elif isinstance(meta_val, dict):
+            tensor_args = []
+            # use the dict key as the idx
+            for idx, meta in meta_val.items():
+                if not isinstance(meta, torch.Tensor):
+                    raise SerializeError(
+                        f"Serialize list output with type {type(meta)} nyi"
+                    )
+                name = self._output_node_name_at_index(node, idx)
+                tensor_args.append(self.serialize_tensor_output(name, meta))
+            return [Argument.create(as_tensors=tensor_args)]
         else:
             return [self.serialize_output(node.name, meta_val)]

@@ -2067,7 +2156,13 @@ def _is_single_tensor_return(target) -> bool:
 
             fx_node = self.graph.create_node("call_function", target, args, {}, name)
             self.deserialize_sym_op_outputs(serialized_node, fx_node)
-
+        elif (
+            target
+            is torch._higher_order_ops.triton_kernel_wrap.triton_kernel_wrapper_functional
+        ):
+            raise SerializeError(
+                "deserialize nyi for torch._higher_order_ops.triton_kernel_wrap.triton_kernel_wrapper_functional"
+            )
         elif isinstance(target, torch._ops.HigherOrderOperator):
             args, kwargs = self.deserialize_hoo_inputs(serialized_node.inputs)
             metadata = self.deserialize_metadata(serialized_node.metadata)
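Deserialization of the op is deliberately left unimplemented here; round-tripping raises SerializeError, which the tail of the new test checks. A minimal sketch, reusing serialized from the sketch at the top:

from torch._export.serde.serialize import (
    ExportedProgramDeserializer,
    SerializeError,
)

try:
    ExportedProgramDeserializer().deserialize(
        serialized.exported_program,
        serialized.state_dict,
        serialized.constants,
        serialized.example_inputs,
    )
except SerializeError as e:
    print(e)  # deserialize nyi for ...triton_kernel_wrapper_functional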
