
Commit 6569576

pytorchbot and eellison authored
Don't exclude constant_pad_nd in prologue fusion (pytorch#150145)
Don't exclude constant_pad_nd in prologue fusion (pytorch#149947)

Originally, I excluded constant_pad_nd from fusing to be conservative on compilation time. But, on benchmarking, you do occasionally get speedups by fusing it. Also includes a fix for making a single, contiguous dep for prologues.

For instance, the following benchmark gets a 7% speedup by fusing in the constant_pad_nd:

```
import torch
import torch.nn.functional as F

torch._inductor.config.force_disable_caches = True

padded_N = 2048
n_pad_rows = 100
K, N = 2048, 4096

tensor1 = torch.randn(padded_N - n_pad_rows, 4096, device="cuda").to(torch.bfloat16)
tensor2 = torch.randn(4096, 4096, device="cuda").to(torch.bfloat16)

@torch.compile(mode='max-autotune-no-cudagraphs')
def masked_linear(input, weight, n_pad_input_rows):
    """
    Linear layer with input padded by `n_pad_input_rows` rows
    """
    # Use constant_pad_nd to pad with zeros for the invalid rows
    padded_input = F.pad(input, (0, 0, 0, n_pad_input_rows), "constant", 0)
    return F.linear(padded_input, weight)

# Invoke the function
masked_linear(tensor1, tensor2, n_pad_rows)
```

Pull Request resolved: pytorch#149947
Approved by: https://github.com/drisspg

(cherry picked from commit 4c57aec)

Co-authored-by: eellison <[email protected]>
1 parent 5416dff commit 6569576
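
As a usage note, here is a minimal sketch of how the quoted speedup could be timed locally. It assumes the `masked_linear`, `tensor1`, `tensor2`, and `n_pad_rows` definitions from the benchmark above; the `torch.utils.benchmark` harness and the iteration count are illustrative choices, not part of the commit.

```python
# Hypothetical timing harness (not part of the commit): measures the compiled
# masked_linear defined in the benchmark above using torch.utils.benchmark.
from torch.utils import benchmark

# Assumes masked_linear, tensor1, tensor2, and n_pad_rows already exist in scope,
# exactly as defined in the commit message's benchmark.
timer = benchmark.Timer(
    stmt="masked_linear(tensor1, tensor2, n_pad_rows)",
    globals={
        "masked_linear": masked_linear,
        "tensor1": tensor1,
        "tensor2": tensor2,
        "n_pad_rows": n_pad_rows,
    },
)
# Runs the statement 100 times and prints a Measurement summarizing time per call;
# compare against a run with prologue fusion disabled to reproduce the delta.
print(timer.timeit(100))
```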

File tree

4 files changed: +57 -30 lines


test/inductor/test_max_autotune.py

Lines changed: 18 additions & 7 deletions

```diff
@@ -1646,21 +1646,32 @@ def foo(x, y, z):
     @skipIfXpu
     @config.patch(shape_padding=True)
     @config.patch(force_shape_pad=True)
-    @parametrize("sizes", ((250, 245, 128), (250, 256, 128), (256, 128, 62)))
-    def test_prologue_masked_load(self, sizes):
-        M, K, N = sizes
-
+    def test_prologue_masked_load(self):
         def foo(x, y):
-            return x @ y
+            return x @ y.T

         x = torch.rand([250, 245], device=GPU_TYPE)
-        y = torch.rand([245, 128], device=GPU_TYPE)
+        y = torch.rand([245, 128], device=GPU_TYPE).T.contiguous()

         # we should not attempt prologue fusion if it turns an aligned load
         # into an unaligned load
         out, code = run_and_get_code(torch.compile(foo), x, y)
         self.assertEqual(out, foo(x, y), atol=0.05, rtol=0.05)
-        self.check_code(code[0], num_kernels=3, num_allocs=3, num_deallocs=4)
+        self.check_code(code[0], num_kernels=1, num_allocs=1, num_deallocs=2)
+
+    def test_masked_numeric(self):
+        # correctly detect upcast inside the cat mask, dont fuse
+        def foo(a, b, y):
+            return torch.cat([a, (b * 4)]) @ y.T
+
+        a = torch.rand([220, 245], device=GPU_TYPE, dtype=torch.float16)
+        b = torch.rand([20, 245], device=GPU_TYPE, dtype=torch.float16)
+        y = torch.rand([245, 128], device=GPU_TYPE, dtype=torch.float16).T.contiguous()
+
+        out, code = run_and_get_code(torch.compile(foo), a, b, y)
+
+        self.check_code(code[0], num_kernels=2, num_allocs=2, num_deallocs=4)
+        self.assertEqual(out, foo(a, b, y), atol=0.05, rtol=0.05)


 if __name__ == "__main__":
```
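
Outside the test harness, a rough way to see the same fusion is to inspect the wrapper source that `run_and_get_code` returns. The snippet below is a sketch under two assumptions not guaranteed by the commit: that setting the test's padding configs is what forces the `constant_pad_nd` prologue, and that every emitted kernel shows up as an `@triton.jit` definition in the generated source (the test's `check_code` helper does the real bookkeeping).

```python
# Rough, assumption-laden fusion check (not the test's check_code helper).
import torch
from torch._inductor import config as inductor_config
from torch._inductor.utils import run_and_get_code

# Mirror the test's config so the matmul inputs get shape-padded, producing a
# constant_pad_nd prologue that the template can now fuse.
inductor_config.shape_padding = True
inductor_config.force_shape_pad = True

@torch.compile(mode="max-autotune-no-cudagraphs")
def foo(x, y):
    return x @ y.T

x = torch.rand([250, 245], device="cuda")
y = torch.rand([245, 128], device="cuda").T.contiguous()

out, code = run_and_get_code(foo, x, y)
# Assumes each emitted kernel appears as an "@triton.jit" definition in the
# generated source; with the pad prologue fused, this should ideally print 1.
print(code[0].count("@triton.jit"))
```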

torch/_inductor/analyze_preserves_zero_mask.py

Lines changed: 26 additions & 4 deletions

```diff
@@ -1,13 +1,14 @@
 import dataclasses
 import itertools
-from typing import Any, Optional, TYPE_CHECKING
+from typing import Any, Callable, Optional, TYPE_CHECKING, Union

 import sympy

 import torch
 from torch._inductor import config
 from torch._inductor.dtype_propagation import DtypePropagationOpsHandler
 from torch._inductor.index_propagation import SymPyOps, TypedExpr
+from torch._prims_common import type_to_dtype

 from .ops_handler import DefaultHandler
 from .virtualized import StoreMode, V
@@ -109,11 +110,32 @@ def check_bounds(
     def indirect_indexing(*args: Any, **kwargs: Any) -> sympy.Expr:
         return sympy.S.Zero

+    def masked(
+        self,
+        mask: DTypeContainer,
+        body: Callable[[], DTypeContainer],
+        other: DTypeContainer,
+    ) -> DTypeContainer:
+        return self.where(mask, other, body())
+
     def _default(self, name: str, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any:
+        def to_constant(c: Union[int, float]) -> DTypeContainer:
+            return DTypeContainer(type_to_dtype(type(c)), is_scalar=True)
+
+        args = tuple(
+            a if not isinstance(a, (int, float)) else to_constant(a) for a in args
+        )
+        kwargs = {
+            k: v if not isinstance(v, (int, float)) else to_constant(v)
+            for k, v in kwargs.items()
+        }
+
         out_dtype = getattr(self.dtype_prop, name)(*args, **kwargs)
-        out = DTypeContainer(out_dtype, is_scalar=(name == "constant"))
-        if name == "constant":
-            return DTypeContainer(torch.float, is_scalar=True)
+        is_scalar = all(
+            not isinstance(v, DTypeContainer) or v.is_scalar
+            for v in itertools.chain(args, kwargs.values())
+        )
+        out = DTypeContainer(out_dtype, is_scalar=is_scalar)

         uses_low_prec = any(
             isinstance(dtype_cont, DTypeContainer)
```
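
For intuition, the scalar handling added to `_default` boils down to two rules: Python ints and floats get wrapped as scalar containers, and an op's output is treated as scalar only if every container input is itself scalar. The toy below restates that logic with a stand-in dataclass; `ToyDTypeContainer`, `wrap_constant`, and `propagate` are illustrative names, not Inductor's API.

```python
# Toy model (not Inductor's code) of the new is_scalar propagation rule.
import dataclasses
import torch

@dataclasses.dataclass
class ToyDTypeContainer:
    dtype: torch.dtype
    is_scalar: bool = False

def wrap_constant(c):
    # Python ints/floats become scalar containers, mirroring to_constant in the patch.
    dtype = torch.int64 if isinstance(c, int) else torch.get_default_dtype()
    return ToyDTypeContainer(dtype, is_scalar=True)

def propagate(out_dtype, *inputs):
    wrapped = [i if isinstance(i, ToyDTypeContainer) else wrap_constant(i) for i in inputs]
    # The result is a scalar only if every input is a scalar.
    return ToyDTypeContainer(out_dtype, is_scalar=all(v.is_scalar for v in wrapped))

tensor_like = ToyDTypeContainer(torch.float16, is_scalar=False)
print(propagate(torch.float16, tensor_like, 4).is_scalar)  # False: one input is tensor-like
print(propagate(torch.float32, 2, 3.0).is_scalar)          # True: all inputs are scalars
```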

torch/_inductor/ir.py

Lines changed: 13 additions & 1 deletion

```diff
@@ -4365,7 +4365,19 @@ def dummy(index, rindex):  # type: ignore[no-untyped-def]
         )

         for inp in self.inputs:
-            indexer = inp.layout.make_indexer()
+            layout = inp.layout
+
+            # we dont know what the iteration order is of the template,
+            # so we just want to make a single, contiguous dependency
+            if not layout.is_contiguous():
+                layout = FixedLayout(
+                    device=layout.device,
+                    dtype=layout.dtype,
+                    size=layout.size,
+                    stride=FlexibleLayout.contiguous_strides(layout.size),
+                    offset=layout.offset,
+                )
+            indexer = layout.make_indexer()

             def dummy(index, rindex):  # type: ignore[no-untyped-def]
                 assert len(rindex) == 0
```
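
For reference, `FlexibleLayout.contiguous_strides` yields standard row-major strides, which is what makes the replacement dependency a single, contiguous one. The helper below is a from-scratch restatement of that formula for illustration, not the Inductor implementation.

```python
# Row-major stride computation: stride[i] is the product of all sizes to the right of i.
# For size [2048, 4096] this yields [4096, 1], i.e. one dense, contiguous region.
def contiguous_strides(size):
    strides = []
    running = 1
    for s in reversed(size):
        strides.append(running)
        running *= s
    return list(reversed(strides))

print(contiguous_strides([2048, 4096]))  # [4096, 1]
print(contiguous_strides([4, 3, 2]))     # [6, 2, 1]
```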

torch/_inductor/scheduler.py

Lines changed: 0 additions & 18 deletions

```diff
@@ -3460,24 +3460,6 @@ def check_prologue_fusion_heuristics_fusable(
             why("prologue fusion will not increase amount of bytes read in kernel")
             return False

-        # we want to avoid attempting to fuse predictably unprofitable prologues
-        # such as increasing the unaligned reads or writes.
-        # TODO - would be nice to generalize this, however, we would need more explicit
-        # knowledge of memory access patterns in the TritonTemplate in order to know
-        # the stride order to check alignment.
-        origins = tuple(
-            e.target
-            for n in prologue_node.get_nodes()
-            if n.node is not None
-            for e in n.node.get_origins()
-            if e.op == "call_function"
-        )
-        if origins == (torch.ops.aten.constant_pad_nd.default,):
-            why(
-                "prologue fusion will not increase attempt to fuse in padding bc it increases unaligned reads"
-            )
-            return False
-
         def low_prec_fp(dtype: torch.dtype) -> bool:
             return dtype.itemsize <= 2 and dtype.is_floating_point
```
