
Commit c1f05f3

Merge branch 'main' into improve-vgf-runtime-and-update-mlsdk-url
2 parents: a22813f + 414fc32

File tree (4 files changed: +183, -16 lines):

    backends/transforms/remove_clone_ops.py
    backends/transforms/targets.bzl
    backends/transforms/test/test_remove_clone_ops.py
    runtime/executor/method.cpp

backends/transforms/remove_clone_ops.py

Lines changed: 30 additions & 11 deletions
@@ -6,26 +6,45 @@
 
 # pyre-strict
 
+from typing import Set
+
 import torch
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
+from executorch.exir.passes import dead_code_elimination_pass
+from executorch.exir.passes.remove_noop_pass import _DEQUANT_OPS, eliminate_dq_q
 
 
-def remove_clone_ops(graph: torch.fx.Graph) -> torch.fx.Graph:
+class RemoveCloneOpsTransform(ExportPass):
     """
-    Remove clone op nodes and replace uses with parent node.
+    Trim 'identity' operators to reduce unnecessary copy overhead.
     """
-    clone_op = exir_ops.edge.aten.clone.default
-    for node in graph.nodes:
-        if node.op == "call_function" and node.target == clone_op:
-            with graph.inserting_after(node):
-                node.replace_all_uses_with(node.args[0])
 
-    graph.eliminate_dead_code()
-    return graph
+    clone_ops: Set[torch._ops.OpOverload] = {
+        exir_ops.edge.aten.clone.default,
+    }
 
+    def __init__(self) -> None:
+        super().__init__()
+
+    def _remove(self, graph_module: torch.fx.GraphModule) -> None:
+        dequant_nodes = []
+
+        for n in graph_module.graph.nodes:
+            if n.target not in self.clone_ops:
+                continue
+
+            to_be_removed = n
+            for user_n in list(n.users.keys()):
+                user_n.replace_input_with(n, n.args[0])
+            if n.args[0].target in _DEQUANT_OPS:
+                dequant_nodes += [n.args[0]]
+            graph_module.graph.erase_node(to_be_removed)
+
+        eliminate_dq_q(graph_module, dequant_nodes)
 
-class RemoveCloneOpsTransform(ExportPass):
     def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
-        graph_module.graph = remove_clone_ops(graph_module.graph)
+        self._remove(graph_module)
+        graph_module.recompile()
+        dead_code_elimination_pass(graph_module)
         return PassResult(graph_module, True)
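For reference, the rewritten pass is invoked like any other ExportPass: calling an instance on a torch.fx.GraphModule returns a PassResult whose graph_module field holds the updated graph (this is exactly how the new test applies it). A minimal sketch, assuming gm is an edge-dialect GraphModule that may contain clone nodes:

    from executorch.backends.transforms.remove_clone_ops import RemoveCloneOpsTransform

    # Assumption: `gm` is a torch.fx.GraphModule in the edge dialect, e.g.
    # taken from to_edge(...).exported_program().graph_module.
    result = RemoveCloneOpsTransform()(gm)
    gm = result.graph_module  # clones removed; adjacent dq/q pairs folded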

backends/transforms/targets.bzl

Lines changed: 13 additions & 0 deletions
@@ -109,6 +109,7 @@ def define_common_targets():
         srcs = ["remove_clone_ops.py"],
         visibility = [
             "//executorch/backends/...",
+            "@EXECUTORCH_CLIENTS",
         ],
         deps = [
             "//caffe2:torch",
@@ -242,3 +243,15 @@ def define_common_targets():
             ":rank_0_to_rank_1",
         ],
     )
+
+    runtime.python_test(
+        name = "test_remove_clone_ops",
+        srcs = [
+            "test/test_remove_clone_ops.py",
+        ],
+        deps = [
+            "//caffe2:torch",
+            "//executorch/exir:lib",
+            ":remove_clone_ops",
+        ],
+    )
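Assuming a Buck-based checkout, the new test target should then be runnable with something like buck test //executorch/backends/transforms:test_remove_clone_ops; the exact invocation depends on the local Buck setup.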
backends/transforms/test/test_remove_clone_ops.py

Lines changed: 128 additions & 0 deletions
@@ -0,0 +1,128 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import torch
+from executorch.backends.transforms.remove_clone_ops import RemoveCloneOpsTransform
+from executorch.exir.dialects._ops import ops as exir_ops
+from torch.fx import GraphModule
+from torch.testing import FileCheck
+from torch.testing._internal.common_utils import TestCase
+
+
+class TestRemoveCloneOpsTransform(TestCase):
+    def test_dq_clone_q_linear(self):
+        """
+        Test RemoveCloneOpsTransform on a graph with a dq -> clone -> q -> linear pattern.
+
+        Before: should contain all of these nodes.
+        After: should only have the linear operation.
+        """
+
+        # Create a graph module directly with the pattern: dequant -> clone -> quant -> fp linear
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = torch.nn.Linear(10, 5)
+
+            def forward(self, x):
+                # This will be replaced with our custom graph
+                return self.linear(x)
+
+        # Create a module instance
+        module = TestModule()
+
+        # Create a new graph with our desired pattern
+        graph = torch.fx.Graph()
+
+        # Add placeholders
+        input_node = graph.placeholder("x")
+
+        # Create nodes for our pattern: dequant -> clone -> quant -> fp linear
+        # Constants for quantization parameters
+        scale = graph.create_node(
+            "call_function", torch.tensor, args=([0.1],), kwargs={}
+        )
+        zero_point = graph.create_node(
+            "call_function", torch.tensor, args=([0],), kwargs={}
+        )
+
+        # Dequantize node
+        dequant_node = graph.create_node(
+            "call_function",
+            torch.ops.quantized_decomposed.dequantize_per_tensor.default,
+            args=(input_node, scale, zero_point, torch.int8),
+            kwargs={},
+        )
+
+        # Clone node.
+        # Use the edge op, as this is an ExecuTorch pass
+        clone_node = graph.create_node(
+            "call_function",
+            exir_ops.edge.aten.clone.default,
+            args=(dequant_node,),
+            kwargs={},
+        )
+
+        # Quantize node
+        quant_node = graph.create_node(
+            "call_function",
+            torch.ops.quantized_decomposed.quantize_per_tensor.default,
+            args=(clone_node, scale, zero_point, torch.int8),
+            kwargs={},
+        )
+
+        # Linear node (using the module's linear layer).
+        # Technically, this should use a quantized weight and bias,
+        # but we are just inspecting graph patterns in this test.
+        weight = graph.create_node("get_attr", "linear.weight")
+        bias = graph.create_node("get_attr", "linear.bias")
+        linear_node = graph.create_node(
+            "call_function",
+            torch.nn.functional.linear,
+            args=(quant_node, weight, bias),
+            kwargs={},
+        )
+
+        # Output
+        graph.output(linear_node)
+
+        # Create a GraphModule with our custom graph
+        gm = GraphModule(module, graph)
+
+        # Verify we have the expected nodes before transformation using FileCheck
+        FileCheck().check(
+            "torch.ops.quantized_decomposed.dequantize_per_tensor.default",
+        ).check(
+            "executorch_exir_dialects_edge__ops_aten_clone_default",
+        ).check(
+            "torch.ops.quantized_decomposed.quantize_per_tensor.default",
+        ).check(
+            "torch._C._nn.linear",
+        ).run(
+            gm.code
+        )
+
+        # Apply the transform
+        transformed_gm = RemoveCloneOpsTransform()(gm).graph_module
+
+        # Verify the dq -> clone -> q pattern is removed and the linear op is still present
+        FileCheck().check_not(
+            "executorch_exir_dialects_edge__ops_aten_clone_default"
+        ).check_not("quantized_decomposed.dequantize_per_tensor.default").check_not(
+            "quantized_decomposed.quantize_per_tensor.default"
+        ).check_count(
+            "torch._C._nn.linear",
+            1,
+            exactly=True,
+        ).run(
+            transformed_gm.code
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
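A note on why both the quantize and dequantize nodes vanish rather than just the clone: _remove rewires the clone's users onto the clone's input and, since that input is a _DEQUANT_OPS node, records it in dequant_nodes; eliminate_dq_q then folds the now-adjacent dq/q pair. In informal shorthand (not actual pass output), the test graph evolves as:

    before the pass:        x -> dq -> clone -> q -> linear
    after _remove:          x -> dq -> q -> linear
    after eliminate_dq_q:   x -> linear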

runtime/executor/method.cpp

Lines changed: 12 additions & 5 deletions
@@ -670,7 +670,7 @@ Error Method::resolve_operator(
     size_t kernel_index,
     InstructionArgs args,
     size_t n_args) {
-  // TODO(T153505381, T153506819) Investigate optimizing this function for both
+  // TODO(T153506819) Investigate optimizing this function for both
   // space and time.
 
   // resolve name
@@ -691,8 +691,16 @@ Error Method::resolve_operator(
   }
 
   // resolve tensor meta
-  auto method_allocator = memory_manager_->method_allocator();
-  TensorMeta* meta = method_allocator->allocateList<TensorMeta>(n_args);
+  // Since the temp allocator can be freed, we optimistically
+  // try to use that allocator first.
+  auto allocator = memory_manager_->temp_allocator();
+  // However, it does not have to be provided, so if it
+  // is not provided (or an empty one is provided), we
+  // fall back to the method allocator.
+  if (allocator == nullptr || allocator->size() == 0) {
+    allocator = memory_manager_->method_allocator();
+  }
+  TensorMeta* meta = allocator->allocateList<TensorMeta>(n_args);
   if (meta == nullptr) {
     return Error::MemoryAllocationFailed;
   }
@@ -705,8 +713,7 @@ Error Method::resolve_operator(
     auto tensor = eval->toTensor();
     meta[count].dtype_ = tensor.scalar_type();
     executorch::aten::DimOrderType* dim_order_ptr =
-        method_allocator->allocateList<executorch::aten::DimOrderType>(
-            tensor.dim());
+        allocator->allocateList<executorch::aten::DimOrderType>(tensor.dim());
     if (dim_order_ptr == nullptr) {
       return Error::MemoryAllocationFailed;
     }
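The design choice behind the allocator change: temp-allocator memory can be reclaimed once execution finishes, so it is preferred for this scratch metadata, and the method allocator (whose allocations live as long as the Method) is only a fallback. A minimal sketch of the same selection logic, in Python with hypothetical names purely for illustration:

    def pick_allocator(temp_allocator, method_allocator):
        # Mirrors the fallback added in Method::resolve_operator: prefer the
        # temp allocator, whose memory may be reclaimed after execution.
        if temp_allocator is None or temp_allocator.size() == 0:
            # The temp allocator is optional; if absent or empty, fall back
            # to the method allocator, which lives for the Method's lifetime.
            return method_allocator
        return temp_allocator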
