Commit a6321d6

Revert "Dont exclude constant_pad_nd in prologue fusion" (pytorch#150699)
Revert "Dont exclude constant_pad_nd in prologue fusion (pytorch#150145)" This reverts commit 6569576.
1 parent: 1cc51c6
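
For orientation, the restored heuristic below concerns prologues of this shape: a constant pad feeding a matmul that inductor may lower to a Triton template. A minimal sketch of the pattern (assuming a CUDA device and max-autotune; this snippet is illustrative and not part of the commit):

import torch
import torch.nn.functional as F

@torch.compile(mode="max-autotune")
def padded_mm(x, y):
    # F.pad with the default constant mode dispatches to aten.constant_pad_nd;
    # after this revert, inductor declines to fuse the pad into the matmul
    # template's prologue because it can turn aligned loads into unaligned ones.
    return F.pad(x, (0, 11)) @ y

x = torch.rand(250, 245, device="cuda")
y = torch.rand(256, 128, device="cuda")
out = padded_mm(x, y)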

File tree: 4 files changed (+30, -57 lines)


test/inductor/test_max_autotune.py

Lines changed: 7 additions & 18 deletions
@@ -1646,32 +1646,21 @@ def foo(x, y, z):
     @skipIfXpu
     @config.patch(shape_padding=True)
     @config.patch(force_shape_pad=True)
-    def test_prologue_masked_load(self):
+    @parametrize("sizes", ((250, 245, 128), (250, 256, 128), (256, 128, 62)))
+    def test_prologue_masked_load(self, sizes):
+        M, K, N = sizes
+
         def foo(x, y):
-            return x @ y.T
+            return x @ y

         x = torch.rand([250, 245], device=GPU_TYPE)
-        y = torch.rand([245, 128], device=GPU_TYPE).T.contiguous()
+        y = torch.rand([245, 128], device=GPU_TYPE)

         # we should not attempt prologue fusion if it turns an aligned load
         # into an unaligned load
         out, code = run_and_get_code(torch.compile(foo), x, y)
         self.assertEqual(out, foo(x, y), atol=0.05, rtol=0.05)
-        self.check_code(code[0], num_kernels=1, num_allocs=1, num_deallocs=2)
-
-    def test_masked_numeric(self):
-        # correctly detect upcast inside the cat mask, dont fuse
-        def foo(a, b, y):
-            return torch.cat([a, (b * 4)]) @ y.T
-
-        a = torch.rand([220, 245], device=GPU_TYPE, dtype=torch.float16)
-        b = torch.rand([20, 245], device=GPU_TYPE, dtype=torch.float16)
-        y = torch.rand([245, 128], device=GPU_TYPE, dtype=torch.float16).T.contiguous()
-
-        out, code = run_and_get_code(torch.compile(foo), a, b, y)
-
-        self.check_code(code[0], num_kernels=2, num_allocs=2, num_deallocs=4)
-        self.assertEqual(out, foo(a, b, y), atol=0.05, rtol=0.05)
+        self.check_code(code[0], num_kernels=3, num_allocs=3, num_deallocs=4)


 if __name__ == "__main__":
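
For context, run_and_get_code compiles and runs the function while also capturing the source inductor generates, which is what check_code inspects for kernel and allocation counts. A rough standalone sketch of that flow (assuming a CUDA device; counting "@triton.jit" is only an illustrative proxy for the kernel count):

import torch
from torch._inductor.utils import run_and_get_code

def foo(x, y):
    return x @ y

x = torch.rand(250, 245, device="cuda")
y = torch.rand(245, 128, device="cuda")

out, code = run_and_get_code(torch.compile(foo), x, y)
# code is a list of generated source strings; count emitted Triton kernels
print(code[0].count("@triton.jit"))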

torch/_inductor/analyze_preserves_zero_mask.py

Lines changed: 4 additions & 26 deletions
@@ -1,14 +1,13 @@
 import dataclasses
 import itertools
-from typing import Any, Callable, Optional, TYPE_CHECKING, Union
+from typing import Any, Optional, TYPE_CHECKING

 import sympy

 import torch
 from torch._inductor import config
 from torch._inductor.dtype_propagation import DtypePropagationOpsHandler
 from torch._inductor.index_propagation import SymPyOps, TypedExpr
-from torch._prims_common import type_to_dtype

 from .ops_handler import DefaultHandler
 from .virtualized import StoreMode, V
@@ -110,32 +109,11 @@ def check_bounds(
     def indirect_indexing(*args: Any, **kwargs: Any) -> sympy.Expr:
         return sympy.S.Zero

-    def masked(
-        self,
-        mask: DTypeContainer,
-        body: Callable[[], DTypeContainer],
-        other: DTypeContainer,
-    ) -> DTypeContainer:
-        return self.where(mask, other, body())
-
     def _default(self, name: str, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any:
-        def to_constant(c: Union[int, float]) -> DTypeContainer:
-            return DTypeContainer(type_to_dtype(type(c)), is_scalar=True)
-
-        args = tuple(
-            a if not isinstance(a, (int, float)) else to_constant(a) for a in args
-        )
-        kwargs = {
-            k: v if not isinstance(v, (int, float)) else to_constant(v)
-            for k, v in kwargs.items()
-        }
-
         out_dtype = getattr(self.dtype_prop, name)(*args, **kwargs)
-        is_scalar = all(
-            not isinstance(v, DTypeContainer) or v.is_scalar
-            for v in itertools.chain(args, kwargs.values())
-        )
-        out = DTypeContainer(out_dtype, is_scalar=is_scalar)
+        out = DTypeContainer(out_dtype, is_scalar=(name == "constant"))
+        if name == "constant":
+            return DTypeContainer(torch.float, is_scalar=True)

         uses_low_prec = any(
             isinstance(dtype_cont, DTypeContainer)
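
Loosely, this analysis asks whether the ops fused into a prologue map zero to zero, so that lanes a masked load fills with zero stay zero afterwards. A hedged illustration of that property in plain PyTorch (showing the idea being analyzed, not the handler's API):

import torch

x = torch.tensor([0.0, 1.0, 2.0])
zero_lanes = x == 0

# multiplication and relu preserve the zero mask: zero lanes stay zero
assert torch.equal((x * 4) == 0, zero_lanes)
assert torch.equal(torch.relu(x) == 0, zero_lanes)

# adding a nonzero constant does not: the zero lane becomes 1.0
assert not torch.equal((x + 1.0) == 0, zero_lanes)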

torch/_inductor/ir.py

Lines changed: 1 addition & 13 deletions
@@ -4365,19 +4365,7 @@ def dummy(index, rindex):  # type: ignore[no-untyped-def]
             )

         for inp in self.inputs:
-            layout = inp.layout
-
-            # we dont know what the iteration order is of the template,
-            # so we just want to make a single, contiguous dependency
-            if not layout.is_contiguous():
-                layout = FixedLayout(
-                    device=layout.device,
-                    dtype=layout.dtype,
-                    size=layout.size,
-                    stride=FlexibleLayout.contiguous_strides(layout.size),
-                    offset=layout.offset,
-                )
-            indexer = layout.make_indexer()
+            indexer = inp.layout.make_indexer()

             def dummy(index, rindex):  # type: ignore[no-untyped-def]
                 assert len(rindex) == 0
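
For reference, the removed branch above rebuilt the dependency with contiguous (row-major) strides, which for a given size are running products taken from the right. A small sketch of that computation (an illustration, not inductor's FlexibleLayout.contiguous_strides itself):

def contiguous_strides(size):
    # row-major: the last dim has stride 1; each earlier dim strides over
    # the product of all later dims
    strides = []
    running = 1
    for dim in reversed(size):
        strides.append(running)
        running *= dim
    return list(reversed(strides))

assert contiguous_strides([250, 245]) == [245, 1]
assert contiguous_strides([4, 3, 2]) == [6, 2, 1]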

torch/_inductor/scheduler.py

Lines changed: 18 additions & 0 deletions
@@ -3460,6 +3460,24 @@ def check_prologue_fusion_heuristics_fusable(
             why("prologue fusion will not increase amount of bytes read in kernel")
             return False

+        # we want to avoid attempting to fuse predictably unprofitable prologues
+        # such as increasing the unaligned reads or writes.
+        # TODO - would be nice to generalize this, however, we would need more explicit
+        # knowledge of memory access patterns in the TritonTemplate in order to know
+        # the stride order to check alignment.
+        origins = tuple(
+            e.target
+            for n in prologue_node.get_nodes()
+            if n.node is not None
+            for e in n.node.get_origins()
+            if e.op == "call_function"
+        )
+        if origins == (torch.ops.aten.constant_pad_nd.default,):
+            why(
+                "prologue fusion will not increase attempt to fuse in padding bc it increases unaligned reads"
+            )
+            return False
+
         def low_prec_fp(dtype: torch.dtype) -> bool:
             return dtype.itemsize <= 2 and dtype.is_floating_point

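
As a point of reference, torch.nn.functional.pad in its default constant mode dispatches to aten.constant_pad_nd, which is the single call_function origin the restored check above matches. One way to see that mapping (a sketch, assuming make_fx traces down to aten ops as in recent PyTorch releases):

import torch
import torch.nn.functional as F
from torch.fx.experimental.proxy_tensor import make_fx

gm = make_fx(lambda t: F.pad(t, (0, 11)))(torch.randn(250, 245))
targets = [n.target for n in gm.graph.nodes if n.op == "call_function"]
print(targets)  # expected to include torch.ops.aten.constant_pad_nd.default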