Conditionally support expand_copy in XNNPACK delegate

GregoryComer · GregoryComer · commit 144b5501f924 · 2025-12-05T19:50:03.000-08:00
diff --git a/backends/xnnpack/operators/__init__.py b/backends/xnnpack/operators/__init__.py
@@ -23,6 +23,7 @@
     op_dynamic_quantize_ops,
     op_elu,
     op_exp,
+    op_expand_copy,
     op_floor,
     op_gelu,
     op_hardswish,
diff --git a/backends/xnnpack/operators/op_expand_copy.py b/backends/xnnpack/operators/op_expand_copy.py
@@ -0,0 +1,112 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict
+
+import torch
+from executorch.backends.xnnpack.operators.node_visitor import (
+    NodeVisitor,
+    register_node_visitor,
+)
+from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import (
+    XNNExpandDims,
+    XNNGraph,
+    XNode,
+)
+from executorch.backends.xnnpack.utils.utils import get_input_node
+
+
+def check_expand_copy_constraints(node: torch.fx.Node) -> bool:
+    """
+    Checks whether the given expand_copy node is delegatable to XNNPACK.
+    XNNPACK only allows insertion of size-1 dimensions, not expanding existing
+    dims.
+    """
+    in_shape = get_input_node(node, 0).meta["val"].shape
+    new_shape = list(node.args[1])
+
+    assert len(new_shape) >= len(
+        in_shape
+    ), "Expanded shape must have rank >= input rank."
+
+    # Check new leading dims (if any). They must be of size 1.
+    new_leading_dims_count = len(new_shape) - len(in_shape)
+    for i in range(new_leading_dims_count):
+        if new_shape[i] != 1:
+            return False
+
+    # Check existing dims. PyTorch expand semantics don't allow for dim insertion other
+    # than at the front, so we just need to make sure none of the dims are expanded.
+    for i in range(len(new_shape) - new_leading_dims_count):
+        new_shape_at_dim = new_shape[new_leading_dims_count + i]
+        # -1 means preserve dim.
+        if new_shape_at_dim != -1 and new_shape_at_dim != in_shape[i]:
+            return False
+
+    return True
+
+
+def get_inserted_dim_indices(
+    node: torch.fx.Node,
+) -> list[int]:
+    """
+    Returns the indices of the inserted dimensions in the expanded shape. Assumes that
+    the node meets the conditions checked in check_expand_copy_constraints.
+    """
+    in_shape = get_input_node(node, 0).meta["val"].shape
+    new_shape = list(node.args[1])
+    new_dim_indices = []
+
+    assert len(new_shape) >= len(
+        in_shape
+    ), "Expanded shape must have rank >= input rank."
+
+    # PyTorch expand semantics enforce new dim insertion only at the front.
+    new_leading_dims_count = len(new_shape) - len(in_shape)
+    for i in range(new_leading_dims_count):
+        if new_shape[i] != 1:
+            return False
+        else:
+            new_dim_indices.append(i)
+
+    return new_dim_indices
+
+
+@register_node_visitor
+class ExpandCopyVisitor(NodeVisitor):
+    target = "aten.expand_copy.default"
+
+    def __init__(self, *args) -> None:
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        xnn_graph: XNNGraph,
+        vals_to_ids: Dict[torch.fx.Node, int],
+        debug_handle: int,
+    ) -> None:
+        self.define_nodes_tensor_inputs_outputs(node, xnn_graph, vals_to_ids)
+
+        # input
+        input_id = vals_to_ids[get_input_node(node, 0)]
+
+        # output
+        output_id = vals_to_ids[node]
+
+        new_dim_indices = get_inserted_dim_indices(node)
+
+        ser_node = XNode(
+            xnode_union=XNNExpandDims(
+                num_new_dims=len(new_dim_indices),
+                new_dim_indices=new_dim_indices,
+                input_id=input_id,
+                output_id=output_id,
+                flags=0,
+            ),
+            debug_handle=debug_handle,
+        )
+        xnn_graph.xnodes.append(ser_node)
diff --git a/backends/xnnpack/partition/config/__init__.py b/backends/xnnpack/partition/config/__init__.py
@@ -27,6 +27,7 @@
     ConstantPadConfig,
     DeQuantizedPerTensorConfig,
     DivConfig,
+    ExpandCopyConfig,
     # EluConfig,
     ExpConfig,
     FloorConfig,
@@ -87,6 +88,7 @@
     DivConfig,
     # EluConfig, # Waiting for PyTorch Pin Update
     ExpConfig,
+    ExpandCopyConfig,
     FloorConfig,
     GeluConfig,
     HardtanhConfig,
diff --git a/backends/xnnpack/partition/config/generic_node_configs.py b/backends/xnnpack/partition/config/generic_node_configs.py
@@ -9,6 +9,8 @@
 import logging
 from typing import cast, List, Optional
 
+import executorch.backends.xnnpack.operators.op_expand_copy as op_expand_copy
+
 import numpy as np
 import torch
 from executorch.backends.xnnpack.partition.config.xnnpack_config import (
@@ -262,6 +264,30 @@ def get_original_aten(self) -> Optional[torch._ops.OpOverload]:
         return torch.ops.aten.elu.default
 
 
+class ExpandCopyConfig(GenericNodePartitionerConfig):
+    target_name = "expand_copy.default"
+
+    def supported_precision_types(self) -> List[ConfigPrecisionType]:
+        return [ConfigPrecisionType.FP32]
+
+    def get_original_aten(self) -> Optional[torch._ops.OpOverload]:
+        return torch.ops.aten.expand_copy.default
+
+    def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool:
+        """
+        Only partition expand_copy nodes that can be converted to view_copy (insertion of
+        singleton dims).
+        """
+        if not self.check_common_constraints(node, ep):
+            return False
+
+        # Explicit false check here avoids non partitioning identity expand_copy.
+        if not op_expand_copy.check_expand_copy_constraints(node):
+            why(node, reason="only insertion of singleton dims is supported")
+            return False
+        return True
+
+
 class SoftmaxConfig(GenericNodePartitionerConfig):
     target_name = "_softmax.default"
 
diff --git a/backends/xnnpack/runtime/XNNCompiler.cpp b/backends/xnnpack/runtime/XNNCompiler.cpp
@@ -1130,6 +1130,35 @@ Error defineStaticReshapeNode(
   return Error::Ok;
 }
 
+/*
+Define a serialized expand_dims node in the subgraph. The serialized input and
+output ids are looked up in remapped_ids before being handed to XNNPACK.
+Returns Error::Ok on success, InvalidProgram if the serialized dim count does
+not match the serialized index list, and Internal if XNNPACK rejects the node.
+*/
+Error defineExpandDimsNode(
+    xnn_subgraph_t subgraph_ptr,
+    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
+    const NodePtr node,
+    const fb_xnnpack::XNNGraph* graph) noexcept {
+  MAYBE_UNUSED(graph);
+
+  auto graph_node = node->xnode_union_as_XNNExpandDims();
+
+  // Get the new dim indices, we need to convert the uint32_t* to size_t*
+  std::vector<size_t> dims_data =
+      flatbufferDimsToVector(graph_node->new_dim_indices());
+
+  // num_new_dims is serialized separately from the index list, and XNNPACK
+  // only receives a raw pointer plus that count. Reject mismatched values so
+  // a bad count can never cause a read past the end of dims_data.
+  ET_CHECK_OR_RETURN_ERROR(
+      graph_node->num_new_dims() == dims_data.size(),
+      InvalidProgram,
+      "expand_dims node %i has num_new_dims %u but %zu new_dim_indices",
+      node->debug_handle(),
+      graph_node->num_new_dims(),
+      dims_data.size());
+
+  xnn_status status = xnn_define_static_expand_dims(
+      subgraph_ptr,
+      graph_node->num_new_dims(),
+      dims_data.data(),
+      remapped_ids.at(graph_node->input_id()),
+      remapped_ids.at(graph_node->output_id()),
+      graph_node->flags());
+  ET_CHECK_OR_RETURN_ERROR(
+      status == xnn_status_success,
+      Internal,
+      "Failed to create expand_dims node %i with code: %s",
+      node->debug_handle(),
+      xnn_status_to_string(status));
+
+  return Error::Ok;
+}
+
 /*
 Define serialized maxpool2d node into the subgraph, using the remapped ids
 to map the serialized ids, to the new ids generated when defining the
@@ -1784,6 +1813,7 @@ DefineNodeFunc getDefineNodeFunc(fb_xnnpack::XNodeUnion nodeType) {
     _DEFINE(Convert)
     _DEFINE(GlobalAvgPooling2d)
     _DEFINE(StaticReshape)
+    _DEFINE(ExpandDims)
     _DEFINE(ArgMaxPooling2d)
     _DEFINE(Concatenate2)
     _DEFINE(Concatenate3)
diff --git a/backends/xnnpack/serialization/runtime_schema.fbs b/backends/xnnpack/serialization/runtime_schema.fbs
@@ -158,6 +158,7 @@ union XNodeUnion {
   XNNExp: _XNNNode1x1,
   XNNSin: _XNNNode1x1,
   XNNCopy: _XNNNode1x1,
+  XNNExpandDims,
 }
 
 union XValueUnion {
@@ -296,6 +297,14 @@ table XNNStaticReshape {
   flags: uint;
 }
 
+// Serialized expand_dims node: inserts new dims into the input tensor's shape.
+table XNNExpandDims {
+  // Number of new dims; the serializer sets this to the length of
+  // new_dim_indices.
+  num_new_dims:uint;
+  // Indices of the inserted dims in the expanded shape.
+  new_dim_indices:[uint];
+  // Serialized id of the input tensor.
+  input_id: uint;
+  // Serialized id of the output tensor.
+  output_id: uint;
+  // Flags forwarded to XNNPACK when the node is defined.
+  flags: uint;
+}
+
 table XNNStaticSlice {
   num_dims:uint;
   offsets:[uint];
diff --git a/backends/xnnpack/serialization/schema.fbs b/backends/xnnpack/serialization/schema.fbs
@@ -154,6 +154,7 @@ union XNodeUnion {
   XNNExp: _XNNNode1x1,
   XNNSin: _XNNNode1x1,
   XNNCopy: _XNNNode1x1,
+  XNNExpandDims,
 }
 
 union XValueUnion {
@@ -292,6 +293,14 @@ table XNNStaticReshape {
   flags: uint;
 }
 
+// Serialized expand_dims node: inserts new dims into the input tensor's shape.
+table XNNExpandDims {
+  // Number of new dims; the serializer sets this to the length of
+  // new_dim_indices.
+  num_new_dims:uint;
+  // Indices of the inserted dims in the expanded shape.
+  new_dim_indices:[uint];
+  // Serialized id of the input tensor.
+  input_id: uint;
+  // Serialized id of the output tensor.
+  output_id: uint;
+  // Flags forwarded to XNNPACK when the node is defined.
+  flags: uint;
+}
+
 table XNNStaticSlice {
   num_dims:uint;
   offsets:[uint];
diff --git a/backends/xnnpack/serialization/xnnpack_graph_schema.py b/backends/xnnpack/serialization/xnnpack_graph_schema.py
@@ -368,6 +368,15 @@ class XNNScaledDotProductAttention:
     flags: int
 
 
+@dataclass
+class XNNExpandDims:
+    """Serialized expand_dims node; mirrors the XNNExpandDims schema table."""
+
+    # Number of new dims; expected to equal len(new_dim_indices).
+    num_new_dims: int
+    # Indices of the inserted dims in the expanded shape.
+    new_dim_indices: List[int]
+    # Serialized id of the input tensor.
+    input_id: int
+    # Serialized id of the output tensor.
+    output_id: int
+    # Flags forwarded to XNNPACK when the node is defined.
+    flags: int
+
+
 XNodeUnion = Union[
     XNNAdd,
     XNNFullyConnected,
diff --git a/backends/xnnpack/test/ops/test_expand.py b/backends/xnnpack/test/ops/test_expand.py
@@ -0,0 +1,73 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import torch
+from executorch.backends.xnnpack.test.tester import Tester
+from executorch.exir.dialects._ops import ops as exir_ops
+
+
+class TestExpand(unittest.TestCase):
+    class Expand(torch.nn.Module):
+        def __init__(self, out_shape):
+            super().__init__()
+            self.out_shape = out_shape
+
+        def forward(self, x):
+            return x.expand(self.out_shape)
+
+    def test_fp32_insert_dim(self):
+        inputs = (torch.randn(8, 12),)
+        new_shapes = (
+            (1, 8, 12),
+            (1, 1, 8, 12),
+            (8, -1),
+            (-1, 12),
+            (1, -1, -1),
+            (1, 1, 8, -1),
+        )
+
+        for new_shape in new_shapes:
+            (
+                Tester(self.Expand(new_shape), tuple(inputs))
+                .export()
+                .check_node_count({torch.ops.aten.expand.default: 1})
+                .to_edge_transform_and_lower()
+                .check_node_count(
+                    {
+                        exir_ops.edge.aten.expand_copy.default: 0,
+                        exir_ops.edge.aten.view_copy.default: 0,
+                        torch.ops.higher_order.executorch_call_delegate: 1,
+                    }
+                )
+                .to_executorch()
+                .run_method_and_compare_outputs()
+            )
+
+    def test_fp32_unsupported_expand(self):
+        inputs = (torch.randn(1, 8, 12),)
+        new_shapes = (
+            (2, 8, 12),
+            (1, 2, 8, 12),
+            (2, 1, 8, 12),
+        )
+
+        for new_shape in new_shapes:
+            (
+                Tester(self.Expand(new_shape), tuple(inputs))
+                .export()
+                .check_node_count({torch.ops.aten.expand.default: 1})
+                .to_edge_transform_and_lower()
+                .check_node_count(
+                    {
+                        exir_ops.edge.aten.expand_copy.default: 1,
+                        exir_ops.edge.aten.view_copy.default: 0,
+                    }
+                )
+                .to_executorch()
+                .run_method_and_compare_outputs()
+            )