Extend reinplace pass to select_copy.int (pytorch#15136)

larryliu0820 · web-flow · commit e159e6512ee1 · 2025-10-15T09:33:10.000-07:00
This pull request refactors and centralizes the logic for replacing
"view_copy" operations with "view" operations in the graph
transformation passes for the AOTInductor-based Metal and CUDA backends.
The main change is the creation of a unified pass in
`backends/aoti/passes/replace_view_copy_with_view.py`, which replaces
the previous backend-specific implementations and expands support to
additional ops. The backend code is updated to use this new shared pass,
and redundant files are removed.

**Pass refactoring and centralization:**

* Created a new unified pass `replace_view_copy_with_view.py` in
`backends/aoti/passes` that replaces "view_copy" type ops (including
`slice_copy` and `select_copy`) with their corresponding "view" ops for
use in the AOTInductor-based Metal and CUDA backends.
[[1]](diffhunk://#diff-725a4a1f4634a11f716ae6f649894f6eea64edb21f56ad56cde92f18fdd2f713L7-R12)
[[2]](diffhunk://#diff-725a4a1f4634a11f716ae6f649894f6eea64edb21f56ad56cde92f18fdd2f713L18-R44)
[[3]](diffhunk://#diff-374a8b362bdad92dce92e7c3bb474dd6106fc80d7253e6b5d5a1c9fb971dc76eR1-R17)
* Removed the old backend-specific pass files
(`replace_slice_copy_with_slice.py`) from both `backends/apple/metal`
and `backends/cuda`.
[[1]](diffhunk://#diff-c4a228b182f50f778545991d472609ad705d2325994342174093ff374738851dL1-L118)
[[2]](diffhunk://#diff-f0e6cbb7940752204a85a43708b5424de89eb4556698043d6cc652c07eabd624L9-R15)

**Backend integration and API updates:**

* Updated both `metal_backend.py` and `cuda_backend.py` to import and
use the new `ReplaceViewCopyWithViewPass` instead of the previous
backend-specific implementations.
[[1]](diffhunk://#diff-20452c18c868bce8db75555905fdbc3a6347536697bdfea9b7187bd6c765a24eL15-R16)
[[2]](diffhunk://#diff-5b5ea2257772b3aba04b2534f5ea1429a0c631bfd25a7ef531f526e76c471d7aL15-R16)
* Modified the preprocessing step in both backends to apply the new
pass, which now handles both `slice_copy` and `select_copy` ops.
[[1]](diffhunk://#diff-20452c18c868bce8db75555905fdbc3a6347536697bdfea9b7187bd6c765a24eL96-R96)
[[2]](diffhunk://#diff-5b5ea2257772b3aba04b2534f5ea1429a0c631bfd25a7ef531f526e76c471d7aL126-R127)
diff --git a/backends/aoti/passes/TARGETS b/backends/aoti/passes/TARGETS
@@ -0,0 +1,17 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+oncall("executorch")
+
+runtime.python_library(
+    name = "passes",
+    srcs = [
+      "replace_view_copy_with_view.py",
+    ],
+    visibility = [
+        "//executorch/...",
+    ],
+    deps = [
+        "//caffe2:torch",
+        "//executorch/exir:pass_base",
+    ],
+)
diff --git a/backends/aoti/passes/replace_view_copy_with_view.py b/backends/aoti/passes/replace_view_copy_with_view.py
@@ -4,9 +4,12 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-# pyre-strict
+# This pass replaces view_copy ops with view ops. It differs from
+# exir/passes/replace_view_copy_with_view.py and exir/passes/reinplace.py
+# in that it should only be used in the AOTInductor backend, which has
+# fewer restrictions on whether the tensor memory is densely packed.
 
-from typing import Dict, Iterable, Tuple
+from typing import Dict, Iterable
 
 import torch
 from executorch.exir.dialects._ops import ops
@@ -15,33 +18,30 @@
 from torch import fx
 
 
-_SLICE_COPY_TARGETS: Tuple[torch._ops.OpOverload | EdgeOpOverload] = (
-    torch.ops.aten.slice_copy.Tensor,
-    ops.edge.aten.slice_copy.Tensor,
-)
-
-_SLICE_TARGETS: Dict[
+_VIEW_TARGETS: Dict[
     torch._ops.OpOverload | EdgeOpOverload, torch._ops.OpOverload | EdgeOpOverload
 ] = {
     torch.ops.aten.slice_copy.Tensor: torch.ops.aten.slice.Tensor,
     ops.edge.aten.slice_copy.Tensor: ops.edge.aten.slice.Tensor,
+    torch.ops.aten.select_copy.int: torch.ops.aten.select.int,
+    ops.edge.aten.select_copy.int: ops.edge.aten.select.int,
 }
 
 
-class ReplaceSliceCopyWithSlicePass(ExportPass):
-    """Replace non-mutated ``slice_copy`` results with ``slice`` views."""
+class ReplaceViewCopyWithViewPass(ExportPass):
+    """Replace non-mutated ``view_copy`` type of ops with ``view`` ops."""
 
     def call(self, graph_module: fx.GraphModule) -> PassResult:
         graph_changed = False
 
         for node in graph_module.graph.nodes:
-            if node.op != "call_function" or node.target not in _SLICE_COPY_TARGETS:
+            if node.op != "call_function" or node.target not in _VIEW_TARGETS:
                 continue
 
             if self._has_blocking_user(node, node.users.keys()):
                 continue
 
-            node.target = _SLICE_TARGETS[node.target]
+            node.target = _VIEW_TARGETS[node.target]
             graph_changed = True
 
         if graph_changed:
diff --git a/backends/apple/metal/metal_backend.py b/backends/apple/metal/metal_backend.py
@@ -12,8 +12,8 @@
 from typing import Any, Dict, final, List, Optional, Set
 
 import torch
-from executorch.backends.apple.metal.replace_slice_copy_with_slice import (
-    ReplaceSliceCopyWithSlicePass,
+from executorch.backends.aoti.passes.replace_view_copy_with_view import (
+    ReplaceViewCopyWithViewPass,
 )
 from executorch.exir._serialize._named_data_store import NamedDataStore
 from executorch.exir._warnings import experimental
@@ -93,7 +93,7 @@ def preprocess(
         mps_edge_program = move_to_device_pass(edge_program, "mps")
 
         # replace slice_copy with slice
-        ReplaceSliceCopyWithSlicePass()(mps_edge_program.graph_module)
+        ReplaceViewCopyWithViewPass()(mps_edge_program.graph_module)
 
         edge_program_module = mps_edge_program.module()
 
diff --git a/backends/cuda/TARGETS b/backends/cuda/TARGETS
@@ -6,13 +6,13 @@ runtime.python_library(
     name = "cuda_backend",
     srcs = [
         "cuda_backend.py",
-        "replace_slice_copy_with_slice.py",
     ],
     visibility = [
         "//executorch/...",
     ],
     deps = [
         "//caffe2:torch",
+        "//executorch/backends/aoti/passes:passes",
         "//executorch/exir/_serialize:lib",
         "//executorch/exir/backend:backend_details",
         "//executorch/exir/backend:compile_spec_schema",
diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
@@ -12,8 +12,8 @@
 from typing import Any, Dict, final, List, Optional, Set
 
 import torch
-from executorch.backends.cuda.replace_slice_copy_with_slice import (
-    ReplaceSliceCopyWithSlicePass,
+from executorch.backends.aoti.passes.replace_view_copy_with_view import (
+    ReplaceViewCopyWithViewPass,
 )
 from executorch.exir._serialize._named_data_store import NamedDataStore
 from executorch.exir._warnings import experimental
@@ -123,8 +123,8 @@ def preprocess(
         # Move the edge_program from CPU to CUDA for aoti compile
         cuda_edge_program = move_to_device_pass(edge_program, "cuda")
 
-        # replace slice_copy with slice
-        ReplaceSliceCopyWithSlicePass()(cuda_edge_program.graph_module)
+        # replace slice_copy.Tensor with slice.Tensor, select_copy.int with select.int
+        ReplaceViewCopyWithViewPass()(cuda_edge_program.graph_module)
 
         cuda_edge_program = cuda_edge_program.run_decompositions(
             cuda_decomposition_table
diff --git a/backends/cuda/replace_slice_copy_with_slice.py b/backends/cuda/replace_slice_copy_with_slice.py