7 changes: 6 additions & 1 deletion backends/vulkan/op_registry.py
@@ -490,7 +490,6 @@ def register_rotary_emb_op():
@update_features(
[
exir_ops.edge.aten.permute.default,
exir_ops.edge.aten.permute_copy.default,
]
)
def register_view_ops():
@@ -506,6 +505,7 @@ def register_view_ops():
exir_ops.edge.aten.squeeze_copy.dims,
exir_ops.edge.aten.unsqueeze_copy.default,
exir_ops.edge.aten.clone.default,
exir_ops.edge.aten.permute_copy.default,
]
)
def register_view_ops_with_buffer_meta():
@@ -515,6 +515,11 @@ def register_view_ops_with_buffer_meta():
)


@update_features(exir_ops.edge.aten.expand_copy.default)
def register_expand():
return OpFeatures(inputs_storage=utils.ANY_BUFFER, supports_resize=False)


# Fully featured transfer operators (i.e. operators that copy data from the input
# tensor(s) to the output tensor(s)), which have memory layout agnostic implementations
# for both texture and buffer storage types.
1 change: 1 addition & 0 deletions backends/vulkan/partitioner/TARGETS
@@ -22,4 +22,5 @@ runtime.python_library(
"//executorch/exir/backend:utils",
"//executorch/exir/backend/canonical_partitioners:canonical_partitioner_lib",
],
typing = True,
)
47 changes: 43 additions & 4 deletions backends/vulkan/partitioner/vulkan_partitioner.py
@@ -61,6 +61,8 @@ def __init__(
operator_blocklist: Optional[Set[OpKey]] = None,
operator_allowlist: Optional[Set[OpKey]] = None,
fusable_subgraphs: Optional[List[InternalMatch]] = None,
nn_module_blocklist: Optional[Set[str]] = None,
nn_module_allowlist: Optional[Set[str]] = None,
) -> None:
super().__init__()
self.texture_limits: utils.ImageExtents = texture_limits
@@ -78,6 +80,9 @@ def __init__(
for match in self.fusable_subgraphs:
self.fusable_nodes.update(match.nodes_map.values())

self.nn_module_blocklist = nn_module_blocklist
self.nn_module_allowlist = nn_module_allowlist

def op_node_is_compatible( # noqa: C901: Function is too complex
self, node: torch.fx.Node, features: Optional[OpFeatures] = None
) -> Tuple[bool, str]:
@@ -213,10 +218,26 @@ def is_node_supported(
r = self._is_node_supported(node)
return r

def _is_node_supported(self, node: torch.fx.Node) -> bool:
# Check if this node is part of a fusable subgraph
if node.op == "call_function" and node in self.fusable_nodes:
return True
def _is_node_supported(self, node: torch.fx.Node) -> bool: # noqa: C901
if node.op == "call_function":
# Apply nn module allowlist and blocklist
if self.nn_module_allowlist is not None:
if not utils.node_comes_from_any_nn_module_in_set(
node, self.nn_module_allowlist
):
self.log_skip(node, "source nn.Module is not in allowlist")
return False

if self.nn_module_blocklist is not None:
if utils.node_comes_from_any_nn_module_in_set(
node, self.nn_module_blocklist
):
self.log_skip(node, "source nn.Module is in blocklist")
return False

# Check if this node is part of a fusable subgraph
if node in self.fusable_nodes:
return True

target = node.target
if node.target == torch.ops.higher_order.auto_functionalized:
@@ -311,6 +332,8 @@ def __init__(
compile_options: Optional[Dict[str, Any]] = None,
operator_blocklist: Optional[List[OpKey]] = None,
operator_allowlist: Optional[List[OpKey]] = None,
nn_module_blocklist: Optional[List[str]] = None,
nn_module_allowlist: Optional[List[str]] = None,
) -> None:
self.options: Dict[str, Any] = {}
if compile_options is not None:
@@ -331,6 +354,20 @@ def __init__(
assert self.operator_allowlist is not None
self.operator_allowlist.add(entry)

self.nn_module_blocklist: Optional[Set[str]] = None
if nn_module_blocklist is not None:
self.nn_module_blocklist = set()
for entry in nn_module_blocklist or []:
assert self.nn_module_blocklist is not None
self.nn_module_blocklist.add(entry)

self.nn_module_allowlist: Optional[Set[str]] = None
if nn_module_allowlist is not None:
self.nn_module_allowlist = set()
for entry in nn_module_allowlist:
assert self.nn_module_allowlist is not None
self.nn_module_allowlist.add(entry)

def ops_to_not_decompose(
self, ep: ExportedProgram
) -> Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]:
@@ -362,6 +399,8 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
operator_blocklist=self.operator_blocklist,
operator_allowlist=self.operator_allowlist,
fusable_subgraphs=fusable_subgraphs,
nn_module_blocklist=self.nn_module_blocklist,
nn_module_allowlist=self.nn_module_allowlist,
),
allows_single_node_partition=True,
)
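A minimal usage sketch (not part of this diff) of the new nn.Module-level filters. It assumes the public class in this file is `VulkanPartitioner` and that `utils.node_comes_from_any_nn_module_in_set` matches entries against module names; both the import path and the entry string format are assumptions here.

```python
# Hypothetical example: restrict which nodes the Vulkan delegate claims based on
# the nn.Module they were traced from. Entry names below are placeholders.
from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner

# Skip nodes that originate from a blocklisted module...
partitioner = VulkanPartitioner(
    compile_options={},
    nn_module_blocklist={"MyCustomAttention"},
)

# ...or only accept nodes that originate from allowlisted modules.
partitioner = VulkanPartitioner(
    compile_options={},
    nn_module_allowlist={"TransformerBlock"},
)
```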
51 changes: 51 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/expand_buffer.glsl
@@ -0,0 +1,51 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#version 450 core

#define PRECISION ${PRECISION}

#define VEC4_T ${texel_type(DTYPE)}
#define T ${buffer_scalar_type(DTYPE)}

${define_required_extensions(DTYPE)}

layout(std430) buffer;

#include "indexing.glslh"

${layout_declare_tensor(B, "w", "t_outp", DTYPE, "buffer")}
${layout_declare_tensor(B, "r", "t_inp", DTYPE, "buffer")}

${layout_declare_ubo(B, "BufferMetadata", "outp")}
${layout_declare_ubo(B, "BufferMetadata", "inp")}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

void main() {
const uint outp_bufi = gl_GlobalInvocationID.x;
if (outp_bufi >= numel(outp)) {
return;
}

TensorIndex outp_tidx;
linear_idx_to_tensor_idx(outp, outp_bufi, outp_tidx);

// Map output tensor index to input tensor index by taking modulo
// with input tensor sizes for each dimension
TensorIndex inp_tidx = outp_tidx;
for (int d = 0; d < ndim(inp); ++d) {
uint inp_size = size_at(inp, d);
uint outp_idx = idx_at(outp_tidx, d);
inp_tidx.data[div_4(d)][mod_4(d)] = outp_idx % inp_size;
}

const uint inp_bufi = tensor_idx_to_linear_idx(inp, inp_tidx);
// Copy data from input to output
t_outp[outp_bufi] = t_inp[inp_bufi];
}
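The modulo mapping in the loop above is the standard broadcast rule. A minimal NumPy sketch of the same mapping, for illustration only: it assumes input and output have the same rank and uses ordinary row-major indexing rather than the shader's TensorIndex layout.

```python
import numpy as np

def expand_copy_reference(inp: np.ndarray, out_sizes: list) -> np.ndarray:
    """Reference for the index mapping in expand_buffer.glsl."""
    out = np.empty(out_sizes, dtype=inp.dtype)
    for out_idx in np.ndindex(*out_sizes):
        # Mirrors the shader: inp_tidx[d] = outp_tidx[d] % size_at(inp, d)
        in_idx = tuple(i % s for i, s in zip(out_idx, inp.shape))
        out[out_idx] = inp[in_idx]
    return out

x = np.arange(3).reshape(1, 3)
assert np.array_equal(expand_copy_reference(x, [4, 3]), np.broadcast_to(x, (4, 3)))
```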
10 changes: 10 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/expand_buffer.yaml
@@ -0,0 +1,10 @@
expand_buffer:
parameter_names_with_default_values:
DTYPE: float
generate_variant_forall:
DTYPE:
- VALUE: half
- VALUE: float
- VALUE: int32
shader_variants:
- NAME: expand_buffer
9 changes: 9 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/indexing.glslh
@@ -98,6 +98,15 @@ uint idx_at(const TensorIndex tidx, const int dim) {
return tidx.data[div_4(dim)][mod_4(dim)];
}

void permute(inout TensorIndex tidx, const ivec4 permute_order[DIMLIMIT_DIV4]) {
TensorIndex new_tidx = tidx;
for (int d = 0; d < DIMLIMIT; ++d) {
int src_dim = permute_order[div_4(d)][mod_4(d)];
new_tidx.data[div_4(d)][mod_4(d)] = idx_at(tidx, src_dim);
}
tidx = new_tidx;
}

//
// Index Conversions
//
52 changes: 14 additions & 38 deletions backends/vulkan/runtime/graph/ops/glsl/permute_buffer.glsl
@@ -18,55 +18,31 @@ ${define_required_extensions(DTYPE)}

layout(std430) buffer;

#include "indexing_utils.h"
#include "indexing.glslh"

${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")}
${layout_declare_tensor(B, "r", "t_in", DTYPE, "buffer")}
${layout_declare_tensor(B, "w", "t_outp", DTYPE, "buffer")}
${layout_declare_tensor(B, "r", "t_inp", DTYPE, "buffer")}

${layout_declare_ubo(B, "ivec4", "in_sizes")}
${layout_declare_ubo(B, "ivec4", "out_strides")}
${layout_declare_ubo(B, "int", "out_numel")}
${layout_declare_ubo(B, "BufferMetadata", "outp")}
${layout_declare_ubo(B, "BufferMetadata", "inp")}

layout(push_constant) uniform restrict Block {
ivec4 in_strides;
ivec4 permute_dims; // Permutation mapping: permute_dims[i] = j means output dim i comes from input dim j
};

${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")}
${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}

const lowp ivec4 out_dim_order = unhash_dim_order(out_layout);
${layout_declare_ubo(B, "ivec4[DIMLIMIT_DIV4]", "permute_order")}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

// Convert output tensor index to input tensor index based on permutation
ivec4 out_tidx_to_in_tidx(const ivec4 out_tidx) {
ivec4 in_tidx;

// Apply the permutation mapping: in_tidx[permute_dims[i]] = out_tidx[i]
in_tidx[permute_dims.x] = out_tidx.x;
in_tidx[permute_dims.y] = out_tidx.y;
in_tidx[permute_dims.z] = out_tidx.z;
in_tidx[permute_dims.w] = out_tidx.w;

return in_tidx;
}

void main() {
const int out_bufi = ivec3(gl_GlobalInvocationID).x;
if (out_bufi >= out_numel) {
const uint inp_bufi = gl_GlobalInvocationID.x;
if (inp_bufi >= numel(inp)) {
return;
}

// Convert buffer index to tensor index for output
const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order);

// Convert output tensor index to input tensor index using permutation
const ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx);
TensorIndex inp_tidx;
linear_idx_to_tensor_idx(inp, inp_bufi, inp_tidx);

// Convert input tensor index back to buffer index
const int in_bufi = tidx_to_bufi(in_tidx, in_strides);
TensorIndex outp_tidx = inp_tidx;
permute(outp_tidx, permute_order);

const uint outp_bufi = tensor_idx_to_linear_idx(outp, outp_tidx);
// Copy data from input to output
t_out[out_bufi] = t_in[in_bufi];
t_outp[outp_bufi] = t_inp[inp_bufi];
}
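For reference, a small Python sketch (illustration only, not ExecuTorch code) of the mapping this rewritten shader applies: each input element lands at the output index obtained by reading the input index through `permute_order`, matching the `permute()` helper added to `indexing.glslh`. Dimension ordering here is plain row-major rather than the shader's TensorIndex layout, which is an assumption made for readability.

```python
import numpy as np

def permute_copy_reference(inp: np.ndarray, permute_order: list) -> np.ndarray:
    out = np.empty([inp.shape[d] for d in permute_order], dtype=inp.dtype)
    for flat in range(inp.size):
        in_idx = np.unravel_index(flat, inp.shape)
        # Mirrors permute() in indexing.glslh: out_tidx[d] = in_tidx[permute_order[d]]
        out_idx = tuple(in_idx[permute_order[d]] for d in range(inp.ndim))
        out[out_idx] = inp[in_idx]
    return out

x = np.arange(6).reshape(2, 3)
assert np.array_equal(permute_copy_reference(x, [1, 0]), x.T)
```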
71 changes: 71 additions & 0 deletions backends/vulkan/runtime/graph/ops/impl/Expand.cpp
@@ -0,0 +1,71 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>

namespace vkcompute {

void add_expand_buffer_node(
ComputeGraph& graph,
const ValueRef in,
const ValueRef size,
const ValueRef out) {
std::string kernel_name = "expand";
kernel_name.reserve(kShaderNameReserve);
add_storage_type_suffix(kernel_name, graph.storage_type_of(out));
add_dtype_suffix(kernel_name, graph.dtype_of(out));

vkapi::ParamsBindList param_buffers = {
graph.buffer_meta_ubo(out),
graph.buffer_meta_ubo(in),
};

graph.execute_nodes().emplace_back(new DynamicDispatchNode(
graph,
VK_KERNEL_FROM_STR(kernel_name),
default_pick_global_wg_size,
default_pick_local_wg_size,
{{out, vkapi::kWrite}, {in, vkapi::kRead}},
// Parameter buffers
param_buffers,
// Push Constants
{},
// Specialization Constants
{},
// Resize Args
{size},
// Resizing Logic
nullptr));
}

void expand(ComputeGraph& graph, const std::vector<ValueRef>& args) {
int idx = 0;
const ValueRef in = args.at(idx++);
const ValueRef size = args.at(idx++);
const ValueRef implicit = args.at(idx++);
(void)implicit;
const ValueRef out = args.at(idx++);

if (graph.is_buffer_storage(out)) {
return add_expand_buffer_node(graph, in, size, out);
}

VK_THROW("Expand operator only supports buffer storage");
}

REGISTER_OPERATORS {
VK_REGISTER_OP(aten.expand_copy.default, expand);
}

} // namespace vkcompute
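For context, a short PyTorch snippet (illustrative only) of the op this node implements; `aten.expand_copy` materializes the broadcasted result rather than returning a view, which is what the buffer shader computes element by element:

```python
import torch

x = torch.arange(3.0).reshape(1, 3)
y = torch.ops.aten.expand_copy.default(x, [4, 3])
assert y.shape == (4, 3)
assert torch.equal(y, x.expand(4, 3))
```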