Commit 94967e7

Update base for Update on "[ET-VK][ez] Support exporting of custom operator calls via higher_order_auto_functionalized, checkpoint"

As title. This diff adds the ability to partition custom op calls to the Vulkan delegate.

Differential Revision: D63913434 (https://our.internmc.facebook.com/intern/diff/D63913434/)

[ghstack-poisoned]
1 parent 20a157f commit 94967e7
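
For context, a minimal sketch of the export flow this commit targets, assuming the standard ExecuTorch to_edge/to_backend APIs; the module and inputs are hypothetical:

    import torch

    from executorch.backends.vulkan.partitioner.vulkan_partitioner import (
        VulkanPartitioner,
    )
    from executorch.exir import to_edge


    class ScaleByScalarTensor(torch.nn.Module):
        # Hypothetical module: `s[0].item()` lowers to a chain of
        # select_copy.int followed by _local_scalar_dense in the edge graph.
        def forward(self, x: torch.Tensor, s: torch.Tensor) -> torch.Tensor:
            return x * s[0].item()


    model = ScaleByScalarTensor()
    sample_inputs = (torch.randn(4), torch.tensor([2.0]))

    # Export to the Edge dialect, then let the Vulkan partitioner claim the
    # nodes it supports (after this commit, that includes the
    # select_copy -> _local_scalar_dense chain).
    edge = to_edge(torch.export.export(model, sample_inputs))
    edge = edge.to_backend(VulkanPartitioner())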

File tree

6 files changed: +115 -59 lines changed

  backends/vulkan/TARGETS
  backends/vulkan/partitioner/supported_ops.py
  backends/vulkan/partitioner/vulkan_partitioner.py
  backends/vulkan/passes/TARGETS
  backends/vulkan/passes/remove_local_scalar_dense_ops.py
  backends/vulkan/vulkan_preprocess.py

backends/vulkan/TARGETS

Lines changed: 1 addition & 0 deletions
@@ -28,6 +28,7 @@ runtime.python_library(
         "//executorch/backends/transforms:fuse_view_copy",
         "//executorch/backends/transforms:mean_to_sum_div",
         "//executorch/backends/transforms:remove_clone_ops",
+        "//executorch/backends/vulkan/passes:remove_local_scalar_dense",
         "//executorch/exir:graph_module",
         "//executorch/exir/_serialize:_bindings",
         "//executorch/exir/_serialize:lib",

backends/vulkan/partitioner/supported_ops.py

Lines changed: 25 additions & 59 deletions
@@ -47,17 +47,16 @@ def __contains__(self, op):
     operator.getitem,
 ]

-BINARY_OPS = [
+SUPPORTS_DYNAMIC_SHAPE = [
+    # Binary broadcasting operators
     exir_ops.edge.aten.add.Tensor,
     exir_ops.edge.aten.sub.Tensor,
     exir_ops.edge.aten.minimum.default,
     exir_ops.edge.aten.mul.Tensor,
     exir_ops.edge.aten.div.Tensor,
     exir_ops.edge.aten.div.Tensor_mode,
     exir_ops.edge.aten.pow.Tensor_Tensor,
-]
-
-UNARY_OPS = [
+    # Unary elementwise operators
     exir_ops.edge.aten.abs.default,
     exir_ops.edge.aten.clamp.default,
     exir_ops.edge.aten.cos.default,
@@ -71,60 +70,46 @@ def __contains__(self, op):
     exir_ops.edge.aten.sin.default,
     exir_ops.edge.aten.sqrt.default,
     exir_ops.edge.aten.tanh.default,
-]
-
-MATMUL_OPS = [
+    # Matrix Multiplication Operators
     exir_ops.edge.aten.bmm.default,
     exir_ops.edge.aten.mm.default,
     exir_ops.edge.aten.addmm.default,
     exir_ops.edge.aten.linear.default,
-]
-
-POOLING_OPS = [
+    # Reduction operators
+    exir_ops.edge.aten._log_softmax.default,
+    exir_ops.edge.aten._softmax.default,
+    # 2D Pooling ops
     exir_ops.edge.aten.avg_pool2d.default,
     exir_ops.edge.aten.max_pool2d_with_indices.default,
-]
-
-CONVOLUTION_OPS = [
+    # Convolution ops
     exir_ops.edge.aten.convolution.default,
     exir_ops.edge.et_vk.conv_with_clamp.default,
 ]

-REDUCTION_OPS = [
+NO_DYNAMIC_SHAPE = [
+    # Reduction operators
     exir_ops.edge.aten.mean.dim,
     exir_ops.edge.aten.sum.dim_IntList,
-    exir_ops.edge.aten._log_softmax.default,
-    exir_ops.edge.aten._softmax.default,
-]
-
-NORMALIZATION_OPS = [
+    # Normalization operators
     exir_ops.edge.aten._native_batch_norm_legit_no_training.default,
     exir_ops.edge.aten.native_layer_norm.default,
-]
-
-SHAPE_MANIPULATION_OPS = [
+    # Shape Manipulation operators
     exir_ops.edge.aten.squeeze_copy.dims,
     exir_ops.edge.aten.unsqueeze_copy.default,
     exir_ops.edge.aten.view_copy.default,
     exir_ops.edge.aten.permute_copy.default,
     exir_ops.edge.aten.t_copy.default,
-]
-
-INDEXING_OPS = [
+    # Indexing and lookup operators
     exir_ops.edge.aten.embedding.default,
     exir_ops.edge.aten.index_select.default,
     exir_ops.edge.aten.select_copy.int,
     exir_ops.edge.aten.slice_copy.Tensor,
-]
-
-ORCHESTRATION_OPS = [
+    # Tensor combination operators
     exir_ops.edge.aten.cat.default,
     exir_ops.edge.aten.split_with_sizes_copy.default,
     exir_ops.edge.aten.split.Tensor,
     exir_ops.edge.aten.repeat.default,
-]
-
-CREATION_OPS = [
+    # Tensor creation operators
     exir_ops.edge.aten.arange.start_step,
     exir_ops.edge.aten.clone.default,
     exir_ops.edge.aten.constant_pad_nd.default,
@@ -139,39 +124,20 @@ def __contains__(self, op):
 ]


-def register_prim_ops(ops: OpList):
-    for op in PRIM_OPS:
-        ops[op].supports_texture = True
-        ops[op].supports_buffer = True
-        ops[op].supports_dynamic_shape = True
+def enumerate_supported_ops():
+    ops = OpList()

+    # Register in order of least to most capabilities

-def register_no_dynamic_shape_ops(ops: OpList):
-    for op in [
-        *REDUCTION_OPS,
-        *NORMALIZATION_OPS,
-        *SHAPE_MANIPULATION_OPS,
-        *INDEXING_OPS,
-        *ORCHESTRATION_OPS,
-        *CREATION_OPS,
-    ]:
+    for op in NO_DYNAMIC_SHAPE:
         ops[op].supports_dynamic_shape = False

-
-def register_dynamic_shape_ops(ops: OpList):
-    for op in [
-        *BINARY_OPS,
-        *UNARY_OPS,
-        *MATMUL_OPS,
-        *POOLING_OPS,
-        *CONVOLUTION_OPS,
-    ]:
+    for op in SUPPORTS_DYNAMIC_SHAPE:
         ops[op].supports_dynamic_shape = True

+    for op in PRIM_OPS:
+        ops[op].supports_texture = True
+        ops[op].supports_buffer = True
+        ops[op].supports_dynamic_shape = True

-def enumerate_supported_ops():
-    ops = OpList()
-    register_prim_ops(ops)
-    register_no_dynamic_shape_ops(ops)
-    register_dynamic_shape_ops(ops)
     return ops
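
The flattened registry above is consumed as a capability lookup. A hedged sketch of how a caller might query it, assuming OpList indexing and the flag names shown in the diff:

    from executorch.backends.vulkan.partitioner.supported_ops import (
        enumerate_supported_ops,
    )
    from executorch.exir.dialects._ops import ops as exir_ops

    ops = enumerate_supported_ops()

    # Ops listed in SUPPORTS_DYNAMIC_SHAPE were registered with the flag on,
    # while ops in NO_DYNAMIC_SHAPE remain static-shape only.
    assert ops[exir_ops.edge.aten.add.Tensor].supports_dynamic_shape
    assert not ops[exir_ops.edge.aten.mean.dim].supports_dynamic_shape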

backends/vulkan/partitioner/vulkan_partitioner.py

Lines changed: 27 additions & 0 deletions
@@ -108,6 +108,30 @@ def is_linear_permute(self, node: torch.fx.Node) -> bool:

         return False

+    def is_in_local_scalar_dense_chain(self, node: torch.fx.Node) -> bool:
+        """
+        Scalar tensors are usually converted to scalar values in the graph via
+        `scalar_tensor[0].item()` in Python, which translates to a chain of
+        `local_scalar_dense(torch.select.int(scalar_tensor, 0, 0))` in the graph.
+        This function marks the entire chain as supported by the Vulkan delegate.
+
+        Later, within vulkan_preprocess there will be a graph transform which
+        replaces the chain with passing in the scalar tensor directly.
+        """
+        if node.target == exir_ops.edge.aten.select_copy.int:
+            if len(node.users) != 1:
+                return False
+            if node.args[0].meta["val"].numel() != 1:
+                return False
+
+            user = list(node.users.keys())[0]
+            return user.target == torch.ops.aten._local_scalar_dense.default
+
+        if node.target == torch.ops.aten._local_scalar_dense.default:
+            return True
+
+        return False
+
     def is_node_supported(
         self, submodules: Mapping[str, torch.nn.Module], node: torch.fx.Node
     ) -> bool:
@@ -122,6 +146,9 @@ def _is_node_supported(
         if self.is_linear_permute(node):
             return True

+        if self.is_in_local_scalar_dense_chain(node):
+            return True
+
         if node.target not in VulkanSupportedOperators._ops:
             return False
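To see the chain that is_in_local_scalar_dense_chain matches, a small sketch with a hypothetical module; the traced graph is indicative, not verbatim:

    import torch


    class ItemChain(torch.nn.Module):
        def forward(self, x: torch.Tensor, s: torch.Tensor) -> torch.Tensor:
            # `s[0].item()` is traced as a select node followed by
            # _local_scalar_dense, the exact pattern the partitioner
            # now accepts.
            return x + s[0].item()


    ep = torch.export.export(ItemChain(), (torch.randn(2), torch.tensor([1.5])))
    # Expect roughly: select.int -> _local_scalar_dense -> add.Tensor
    print(ep.graph)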

backends/vulkan/passes/TARGETS

Lines changed: 13 additions & 0 deletions
@@ -27,3 +27,16 @@ python_unittest(
         "//caffe2:torch",
     ],
 )
+
+runtime.python_library(
+    name = "remove_local_scalar_dense",
+    srcs = ["remove_local_scalar_dense_ops.py"],
+    visibility = [
+        "//executorch/backends/...",
+    ],
+    deps = [
+        "//caffe2:torch",
+        "//executorch/exir:pass_base",
+        "//executorch/exir/dialects:lib",
+    ],
+)
backends/vulkan/passes/remove_local_scalar_dense_ops.py

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+import torch
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass, PassResult
+
+
+def remove_local_scalar_dense_ops(graph: torch.fx.Graph) -> torch.fx.Graph:
+    """
+    Remove local_scalar_dense op nodes and replace uses with parent node, or the
+    original scalar tensor.
+    """
+    target_op = torch.ops.aten._local_scalar_dense.default
+    for node in graph.nodes:
+        if node.op == "call_function" and node.target == target_op:
+            replace_node = node.args[0]
+            # If the argument to the local_scalar_dense op is a select op with only
+            # one user, and the argument to the select op is a tensor with only one
+            # element (i.e. a scalar tensor), then replace the entire pattern with the
+            # scalar tensor.
+            if (
+                replace_node.op == "call_function"
+                and replace_node.target == exir_ops.edge.aten.select_copy.int
+            ):
+                if replace_node.args[0].meta["val"].numel() == 1:
+                    replace_node = replace_node.args[0]
+
+            with graph.inserting_after(node):
+                node.replace_all_uses_with(replace_node)
+
+    graph.eliminate_dead_code()
+    return graph
+
+
+class RemoveLocalScalarDenseOpsTransform(ExportPass):
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        graph_module.graph = remove_local_scalar_dense_ops(graph_module.graph)
+        return PassResult(graph_module, True)
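
A hedged usage sketch of the new pass in isolation, assuming it can be invoked directly like other PassBase subclasses (graph_module is a hypothetical torch.fx.GraphModule containing the pattern):

    # Pass instances are callable and return a PassResult wrapping the
    # (mutated) graph module.
    result = RemoveLocalScalarDenseOpsTransform()(graph_module)
    transformed_gm = result.graph_module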

backends/vulkan/vulkan_preprocess.py

Lines changed: 5 additions & 0 deletions
@@ -17,6 +17,10 @@
 from executorch.backends.transforms.mean_to_sum_div import MeanToSumDiv
 from executorch.backends.transforms.remove_clone_ops import RemoveCloneOpsTransform

+from executorch.backends.vulkan.passes.remove_local_scalar_dense_ops import (
+    RemoveLocalScalarDenseOpsTransform,
+)
+
 from executorch.backends.vulkan.serialization.vulkan_graph_builder import VkGraphBuilder
 from executorch.backends.vulkan.serialization.vulkan_graph_serialize import (
     serialize_vulkan_graph,
@@ -57,6 +61,7 @@ def preprocess(  # noqa: C901
         MeanToSumDiv(),
         SpecPropPass(),
         ConstraintBasedSymShapeEvalPass(),
+        RemoveLocalScalarDenseOpsTransform(),
         MemoryPlanningPass(),
     ]