7 changes: 6 additions & 1 deletion backends/vulkan/op_registry.py
@@ -490,7 +490,6 @@ def register_rotary_emb_op():
@update_features(
[
exir_ops.edge.aten.permute.default,
exir_ops.edge.aten.permute_copy.default,
]
)
def register_view_ops():
@@ -506,6 +505,7 @@ def register_view_ops():
exir_ops.edge.aten.squeeze_copy.dims,
exir_ops.edge.aten.unsqueeze_copy.default,
exir_ops.edge.aten.clone.default,
exir_ops.edge.aten.permute_copy.default,
]
)
def register_view_ops_with_buffer_meta():
@@ -515,6 +515,11 @@ def register_view_ops_with_buffer_meta():
)


@update_features(exir_ops.edge.aten.expand_copy.default)
def register_expand():
return OpFeatures(inputs_storage=utils.ANY_BUFFER, supports_resize=False)


# Fully featured transfer operators (i.e. operators that copy data from the input
# tensor(s) to the output tensor(s)), which have memory layout agnostic implementations
# for both texture and buffer storage types.
1 change: 1 addition & 0 deletions backends/vulkan/partitioner/TARGETS
@@ -22,4 +22,5 @@ runtime.python_library(
"//executorch/exir/backend:utils",
"//executorch/exir/backend/canonical_partitioners:canonical_partitioner_lib",
],
typing = True,
)
47 changes: 43 additions & 4 deletions backends/vulkan/partitioner/vulkan_partitioner.py
@@ -61,6 +61,8 @@ def __init__(
operator_blocklist: Optional[Set[OpKey]] = None,
operator_allowlist: Optional[Set[OpKey]] = None,
fusable_subgraphs: Optional[List[InternalMatch]] = None,
nn_module_blocklist: Optional[Set[str]] = None,
nn_module_allowlist: Optional[Set[str]] = None,
) -> None:
super().__init__()
self.texture_limits: utils.ImageExtents = texture_limits
@@ -78,6 +80,9 @@ def __init__(
for match in self.fusable_subgraphs:
self.fusable_nodes.update(match.nodes_map.values())

self.nn_module_blocklist = nn_module_blocklist
self.nn_module_allowlist = nn_module_allowlist

def op_node_is_compatible( # noqa: C901: Function is too complex
self, node: torch.fx.Node, features: Optional[OpFeatures] = None
) -> Tuple[bool, str]:
@@ -213,10 +218,26 @@ def is_node_supported(
r = self._is_node_supported(node)
return r

def _is_node_supported(self, node: torch.fx.Node) -> bool:
# Check if this node is part of a fusable subgraph
if node.op == "call_function" and node in self.fusable_nodes:
return True
def _is_node_supported(self, node: torch.fx.Node) -> bool: # noqa: C901
if node.op == "call_function":
# Apply nn module allowlist and blocklist
if self.nn_module_allowlist is not None:
if not utils.node_comes_from_any_nn_module_in_set(
node, self.nn_module_allowlist
):
self.log_skip(node, "source nn.Module is not in allowlist")
return False

if self.nn_module_blocklist is not None:
if utils.node_comes_from_any_nn_module_in_set(
node, self.nn_module_blocklist
):
self.log_skip(node, "source nn.Module is in blocklist")
return False

# Check if this node is part of a fusable subgraph
if node in self.fusable_nodes:
return True

target = node.target
if node.target == torch.ops.higher_order.auto_functionalized:
@@ -311,6 +332,8 @@ def __init__(
compile_options: Optional[Dict[str, Any]] = None,
operator_blocklist: Optional[List[OpKey]] = None,
operator_allowlist: Optional[List[OpKey]] = None,
nn_module_blocklist: Optional[List[str]] = None,
nn_module_allowlist: Optional[List[str]] = None,
) -> None:
self.options: Dict[str, Any] = {}
if compile_options is not None:
@@ -331,6 +354,20 @@ def __init__(
assert self.operator_allowlist is not None
self.operator_allowlist.add(entry)

self.nn_module_blocklist: Optional[Set[str]] = None
if nn_module_blocklist is not None:
self.nn_module_blocklist = set()
for entry in nn_module_blocklist or []:
assert self.nn_module_blocklist is not None
self.nn_module_blocklist.add(entry)

self.nn_module_allowlist: Optional[Set[str]] = None
if nn_module_allowlist is not None:
self.nn_module_allowlist = set()
for entry in nn_module_allowlist:
assert self.nn_module_allowlist is not None
self.nn_module_allowlist.add(entry)

def ops_to_not_decompose(
self, ep: ExportedProgram
) -> Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]:
@@ -362,6 +399,8 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
operator_blocklist=self.operator_blocklist,
operator_allowlist=self.operator_allowlist,
fusable_subgraphs=fusable_subgraphs,
nn_module_blocklist=self.nn_module_blocklist,
nn_module_allowlist=self.nn_module_allowlist,
),
allows_single_node_partition=True,
)
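A minimal usage sketch (not part of this diff) of the new nn.Module-level filters. It assumes the public class in this file is `VulkanPartitioner` and that `utils.node_comes_from_any_nn_module_in_set` matches entries against module names; both the import path and the entry string format are assumptions here.

```python
# Hypothetical example: restrict which nodes the Vulkan delegate claims based on
# the nn.Module they were traced from. Entry names below are placeholders.
from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner

# Skip nodes that originate from a blocklisted module...
partitioner = VulkanPartitioner(
    compile_options={},
    nn_module_blocklist={"MyCustomAttention"},
)

# ...or only accept nodes that originate from allowlisted modules.
partitioner = VulkanPartitioner(
    compile_options={},
    nn_module_allowlist={"TransformerBlock"},
)
```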
51 changes: 51 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/expand_buffer.glsl
@@ -0,0 +1,51 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#version 450 core

#define PRECISION ${PRECISION}

#define VEC4_T ${texel_type(DTYPE)}
#define T ${buffer_scalar_type(DTYPE)}

${define_required_extensions(DTYPE)}

layout(std430) buffer;

#include "indexing.glslh"

${layout_declare_tensor(B, "w", "t_outp", DTYPE, "buffer")}
${layout_declare_tensor(B, "r", "t_inp", DTYPE, "buffer")}

${layout_declare_ubo(B, "BufferMetadata", "outp")}
${layout_declare_ubo(B, "BufferMetadata", "inp")}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

void main() {
const uint outp_bufi = gl_GlobalInvocationID.x;
if (outp_bufi >= numel(outp)) {
return;
}

TensorIndex outp_tidx;
linear_idx_to_tensor_idx(outp, outp_bufi, outp_tidx);

// Map output tensor index to input tensor index by taking modulo
// with input tensor sizes for each dimension
TensorIndex inp_tidx = outp_tidx;
for (int d = 0; d < ndim(inp); ++d) {
uint inp_size = size_at(inp, d);
uint outp_idx = idx_at(outp_tidx, d);
inp_tidx.data[div_4(d)][mod_4(d)] = outp_idx % inp_size;
}

const uint inp_bufi = tensor_idx_to_linear_idx(inp, inp_tidx);
// Copy data from input to output
t_outp[outp_bufi] = t_inp[inp_bufi];
}
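The modulo mapping in the loop above is the standard broadcast rule. A minimal NumPy sketch of the same mapping, for illustration only: it assumes input and output have the same rank and uses ordinary row-major indexing rather than the shader's TensorIndex layout.

```python
import numpy as np

def expand_copy_reference(inp: np.ndarray, out_sizes: list) -> np.ndarray:
    """Reference for the index mapping in expand_buffer.glsl."""
    out = np.empty(out_sizes, dtype=inp.dtype)
    for out_idx in np.ndindex(*out_sizes):
        # Mirrors the shader: inp_tidx[d] = outp_tidx[d] % size_at(inp, d)
        in_idx = tuple(i % s for i, s in zip(out_idx, inp.shape))
        out[out_idx] = inp[in_idx]
    return out

x = np.arange(3).reshape(1, 3)
assert np.array_equal(expand_copy_reference(x, [4, 3]), np.broadcast_to(x, (4, 3)))
```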
10 changes: 10 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/expand_buffer.yaml
@@ -0,0 +1,10 @@
expand_buffer:
parameter_names_with_default_values:
DTYPE: float
generate_variant_forall:
DTYPE:
- VALUE: half
- VALUE: float
- VALUE: int32
shader_variants:
- NAME: expand_buffer
9 changes: 9 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/indexing.glslh
@@ -98,6 +98,15 @@ uint idx_at(const TensorIndex tidx, const int dim) {
return tidx.data[div_4(dim)][mod_4(dim)];
}

void permute(inout TensorIndex tidx, const ivec4 permute_order[DIMLIMIT_DIV4]) {
TensorIndex new_tidx = tidx;
for (int d = 0; d < DIMLIMIT; ++d) {
int src_dim = permute_order[div_4(d)][mod_4(d)];
new_tidx.data[div_4(d)][mod_4(d)] = idx_at(tidx, src_dim);
}
tidx = new_tidx;
}

//
// Index Conversions
//
52 changes: 14 additions & 38 deletions backends/vulkan/runtime/graph/ops/glsl/permute_buffer.glsl
@@ -18,55 +18,31 @@ ${define_required_extensions(DTYPE)}

layout(std430) buffer;

#include "indexing_utils.h"
#include "indexing.glslh"

${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")}
${layout_declare_tensor(B, "r", "t_in", DTYPE, "buffer")}
${layout_declare_tensor(B, "w", "t_outp", DTYPE, "buffer")}
${layout_declare_tensor(B, "r", "t_inp", DTYPE, "buffer")}

${layout_declare_ubo(B, "ivec4", "in_sizes")}
${layout_declare_ubo(B, "ivec4", "out_strides")}
${layout_declare_ubo(B, "int", "out_numel")}
${layout_declare_ubo(B, "BufferMetadata", "outp")}
${layout_declare_ubo(B, "BufferMetadata", "inp")}

layout(push_constant) uniform restrict Block {
ivec4 in_strides;
ivec4 permute_dims; // Permutation mapping: permute_dims[i] = j means output dim i comes from input dim j
};

${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")}
${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}

const lowp ivec4 out_dim_order = unhash_dim_order(out_layout);
${layout_declare_ubo(B, "ivec4[DIMLIMIT_DIV4]", "permute_order")}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

// Convert output tensor index to input tensor index based on permutation
ivec4 out_tidx_to_in_tidx(const ivec4 out_tidx) {
ivec4 in_tidx;

// Apply the permutation mapping: in_tidx[permute_dims[i]] = out_tidx[i]
in_tidx[permute_dims.x] = out_tidx.x;
in_tidx[permute_dims.y] = out_tidx.y;
in_tidx[permute_dims.z] = out_tidx.z;
in_tidx[permute_dims.w] = out_tidx.w;

return in_tidx;
}

void main() {
const int out_bufi = ivec3(gl_GlobalInvocationID).x;
if (out_bufi >= out_numel) {
const uint inp_bufi = gl_GlobalInvocationID.x;
if (inp_bufi >= numel(inp)) {
return;
}

// Convert buffer index to tensor index for output
const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order);

// Convert output tensor index to input tensor index using permutation
const ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx);
TensorIndex inp_tidx;
linear_idx_to_tensor_idx(inp, inp_bufi, inp_tidx);

// Convert input tensor index back to buffer index
const int in_bufi = tidx_to_bufi(in_tidx, in_strides);
TensorIndex outp_tidx = inp_tidx;
permute(outp_tidx, permute_order);

const uint outp_bufi = tensor_idx_to_linear_idx(outp, outp_tidx);
// Copy data from input to output
t_out[out_bufi] = t_in[in_bufi];
t_outp[outp_bufi] = t_inp[inp_bufi];
}
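For reference, a small Python sketch (illustration only, not ExecuTorch code) of the mapping this rewritten shader applies: each input element lands at the output index obtained by reading the input index through `permute_order`, matching the `permute()` helper added to `indexing.glslh`. Dimension ordering here is plain row-major rather than the shader's TensorIndex layout, which is an assumption made for readability.

```python
import numpy as np

def permute_copy_reference(inp: np.ndarray, permute_order: list) -> np.ndarray:
    out = np.empty([inp.shape[d] for d in permute_order], dtype=inp.dtype)
    for flat in range(inp.size):
        in_idx = np.unravel_index(flat, inp.shape)
        # Mirrors permute() in indexing.glslh: out_tidx[d] = in_tidx[permute_order[d]]
        out_idx = tuple(in_idx[permute_order[d]] for d in range(inp.ndim))
        out[out_idx] = inp[in_idx]
    return out

x = np.arange(6).reshape(2, 3)
assert np.array_equal(permute_copy_reference(x, [1, 0]), x.T)
```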
71 changes: 71 additions & 0 deletions backends/vulkan/runtime/graph/ops/impl/Expand.cpp
@@ -0,0 +1,71 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>

namespace vkcompute {

void add_expand_buffer_node(
ComputeGraph& graph,
const ValueRef in,
const ValueRef size,
const ValueRef out) {
std::string kernel_name = "expand";
kernel_name.reserve(kShaderNameReserve);
add_storage_type_suffix(kernel_name, graph.storage_type_of(out));
add_dtype_suffix(kernel_name, graph.dtype_of(out));

vkapi::ParamsBindList param_buffers = {
graph.buffer_meta_ubo(out),
graph.buffer_meta_ubo(in),
};

graph.execute_nodes().emplace_back(new DynamicDispatchNode(
graph,
VK_KERNEL_FROM_STR(kernel_name),
default_pick_global_wg_size,
default_pick_local_wg_size,
{{out, vkapi::kWrite}, {in, vkapi::kRead}},
// Parameter buffers
param_buffers,
// Push Constants
{},
// Specialization Constants
{},
// Resize Args
{size},
// Resizing Logic
nullptr));
}

void expand(ComputeGraph& graph, const std::vector<ValueRef>& args) {
int idx = 0;
const ValueRef in = args.at(idx++);
const ValueRef size = args.at(idx++);
const ValueRef implicit = args.at(idx++);
(void)implicit;
const ValueRef out = args.at(idx++);

if (graph.is_buffer_storage(out)) {
return add_expand_buffer_node(graph, in, size, out);
}

VK_THROW("Expand operator only supports buffer storage");
}

REGISTER_OPERATORS {
VK_REGISTER_OP(aten.expand_copy.default, expand);
}

} // namespace vkcompute
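For context, a short PyTorch snippet (illustrative only) of the op this node implements; `aten.expand_copy` materializes the broadcasted result rather than returning a view, which is what the buffer shader computes element by element:

```python
import torch

x = torch.arange(3.0).reshape(1, 3)
y = torch.ops.aten.expand_copy.default(x, [4, 3])
assert y.shape == (4, 3)
assert torch.equal(y, x.expand(4, 3))
```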