diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
index a6cc59e26f0..a711f81b738 100644
--- a/backends/vulkan/op_registry.py
+++ b/backends/vulkan/op_registry.py
@@ -489,10 +489,8 @@ def register_rotary_emb_op():
 
 @update_features(
     [
-        exir_ops.edge.aten.clone.default,
         exir_ops.edge.aten.permute.default,
         exir_ops.edge.aten.permute_copy.default,
-        exir_ops.edge.aten.view_copy.default,
     ]
 )
 def register_view_ops():
@@ -502,6 +500,21 @@ def register_view_ops():
     )
 
 
+@update_features(
+    [
+        exir_ops.edge.aten.view_copy.default,
+        exir_ops.edge.aten.squeeze_copy.dims,
+        exir_ops.edge.aten.unsqueeze_copy.default,
+        exir_ops.edge.aten.clone.default,
+    ]
+)
+def register_view_ops_with_buffer_meta():
+    return OpFeatures(
+        inputs_storage=utils.ANY_STORAGE,
+        supports_resize=True,
+    )
+
+
 # Fully featured transfer operators (i.e. operators that copy data from the input
 # tensor(s) to the output tensor(s)), which have memory layout agnostic implementations
 # for both texture and buffer storage types.
@@ -562,9 +575,6 @@ def register_ported_op():
 # Ops ported from PyTorch Vulkan backend. These ops are in a separate registry because they support all packed dimensions
 @update_features(
     [
-        # Shape Manipulation
-        exir_ops.edge.aten.squeeze_copy.dims,
-        exir_ops.edge.aten.unsqueeze_copy.default,
         # Tensor combination
         exir_ops.edge.aten.repeat.default,
         exir_ops.edge.aten.split_with_sizes_copy.default,
diff --git a/backends/vulkan/runtime/graph/ops/glsl/view_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/view_buffer.glsl
new file mode 100644
index 00000000000..2c02803a9b1
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/view_buffer.glsl
@@ -0,0 +1,44 @@
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define T ${buffer_scalar_type(DTYPE)}
+
+${define_required_extensions(DTYPE)}
+
+layout(std430) buffer;
+
+#include "indexing.glslh"
+
+${layout_declare_tensor(B, "w", "t_outp", DTYPE, STORAGE)}
+${layout_declare_tensor(B, "r", "t_inp", DTYPE, STORAGE)}
+
+${layout_declare_ubo(B, "BufferMetadata", "outp")}
+${layout_declare_ubo(B, "BufferMetadata", "inp")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+/*
+ * The insight behind the view operation is that the contiguous index of each
+ * tensor element is the same in the input and output tensors.
+ */
+void main() {
+  const uint outp_bufi = gl_GlobalInvocationID.x;
+  if (outp_bufi >= numel(outp)) {
+    return;
+  }
+
+  TensorIndex outp_tidx;
+  linear_idx_to_tensor_idx(outp, outp_bufi, outp_tidx);
+
+  // To map the output to the input, find the input element that has the same
+  // contiguous index as the output element.
+  const uint contig_idx = tensor_idx_to_contiguous_idx(outp, outp_tidx);
+
+  TensorIndex inp_tidx;
+  contiguous_idx_to_tensor_idx(inp, contig_idx, inp_tidx);
+
+  const uint inp_bufi = tensor_idx_to_linear_idx(inp, inp_tidx);
+
+  t_outp[outp_bufi] = t_inp[inp_bufi];
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/view_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/view_buffer.yaml
new file mode 100644
index 00000000000..ec92bf483c8
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/view_buffer.yaml
@@ -0,0 +1,20 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+view_buffer:
+  parameter_names_with_default_values:
+    DTYPE: float
+    STORAGE: buffer
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+      - VALUE: float
+      - VALUE: double
+      - VALUE: int8
+      - VALUE: uint8
+      - VALUE: int32
+  shader_variants:
+    - NAME: view_buffer
diff --git a/backends/vulkan/runtime/graph/ops/impl/Clone.cpp b/backends/vulkan/runtime/graph/ops/impl/Clone.cpp
index 04e74af4e0c..0ae9d53a481 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Clone.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Clone.cpp
@@ -143,7 +143,11 @@ void clone(ComputeGraph& graph, const std::vector<ValueRef>& args) {
   if (src_storage == utils::kBuffer && dst_storage == utils::kTexture3D) {
     return add_buffer_to_image_node(graph, src, dst);
   }
-  VK_THROW("Buffer to buffer memory layout transition not supported yet!");
+
+  std::vector<ValueRef> extra_args = {};
+  // Buffer to buffer copy
+  return add_view_copy_buffer_node(
+      graph, src, dst, extra_args, resize_clone_node);
 }
 
 // Clone node is not the most efficient implementation for the aten.clone
diff --git a/backends/vulkan/runtime/graph/ops/impl/Squeeze.cpp b/backends/vulkan/runtime/graph/ops/impl/Squeeze.cpp
index 249f5e7fa6b..13801b45cc7 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Squeeze.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Squeeze.cpp
@@ -10,6 +10,7 @@
 
 #include
 #include
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/View.h>
 #include
 #include
 
@@ -55,8 +56,52 @@ void add_squeeze_copy_dims_node(
   }
 }
 
+void resize_squeeze_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& extra_args) {
+  const ValueRef out = args.at(0).refs.at(0);
+  const ValueRef in = args.at(1).refs.at(0);
+  const ValueRef dims_ref = extra_args.at(0);
+
+  const IntListPtr dims_list = graph->get_int_list(dims_ref);
+  // Work on a copy so the adjustments below do not mutate the dims list
+  // stored in the graph across resize calls.
+  std::vector<int64_t> dims = *dims_list;
+
+  std::vector<int64_t> out_sizes = graph->sizes_of(in);
+
+  // Remove the dimensions specified in dims if their size is 1
+  for (int64_t dim : dims) {
+    if (dim >= 0 && dim < static_cast<int64_t>(out_sizes.size()) &&
+        out_sizes[dim] == 1) {
+      out_sizes.erase(out_sizes.begin() + dim);
+      // After erasing, all subsequent dims shift left by one,
+      // so decrement all subsequent entries in dims
+      for (auto& d : dims) {
+        if (d > dim) {
+          --d;
+        }
+      }
+    }
+  }
+
+  graph->virtual_resize(out, out_sizes);
+}
+
 void squeeze_copy_dims(ComputeGraph& graph, const std::vector<ValueRef>& args) {
-  return add_squeeze_copy_dims_node(graph, args[0], args[1], args[2]);
+  int idx = 0;
+  const ValueRef in = args.at(idx++);
+  const ValueRef dims = args.at(idx++);
+  const ValueRef out = args.at(idx++);
+
+  std::vector<ValueRef> resize_args = {dims};
+
+  if (graph.is_buffer_storage(in)) {
+    return add_view_copy_buffer_node(
+        graph, in, out, resize_args, resize_squeeze_node);
+  }
+  return add_squeeze_copy_dims_node(graph, in, dims, out);
 }
 
 REGISTER_OPERATORS {
diff --git a/backends/vulkan/runtime/graph/ops/impl/Unsqueeze.cpp b/backends/vulkan/runtime/graph/ops/impl/Unsqueeze.cpp
index c4de5d88f30..0a98f6d8f43 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Unsqueeze.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Unsqueeze.cpp
@@ -9,6 +9,7 @@
 
 #include
 #include
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/View.h>
 #include
 #include
 
@@ -45,8 +46,42 @@ void add_unsqueeze_node(
   add_permute_node(graph, in, permute_dims_ref, out);
 }
 
+void resize_unsqueeze_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& extra_args) {
+  const ValueRef out = args.at(0).refs.at(0);
+  const ValueRef in = args.at(1).refs.at(0);
+  const ValueRef dims_ref = extra_args.at(0);
+
+  const IntListPtr dims = graph->get_int_list(dims_ref);
+
+  std::vector<int64_t> out_sizes = graph->sizes_of(in);
+
+  // Insert singleton dimensions at the specified positions
+  for (auto dim : *dims) {
+    int64_t d = dim;
+    if (d < 0) {
+      d += static_cast<int64_t>(out_sizes.size()) + 1;
+    }
+    out_sizes.insert(out_sizes.begin() + d, 1);
+  }
+
+  graph->virtual_resize(out, out_sizes);
+}
+
 void unsqueeze(ComputeGraph& graph, const std::vector<ValueRef>& args) {
-  return add_unsqueeze_node(graph, args[0], args[1], args[2]);
+  int idx = 0;
+  const ValueRef in = args.at(idx++);
+  const ValueRef dims = args.at(idx++);
+  const ValueRef out = args.at(idx++);
+
+  std::vector<ValueRef> resize_args = {dims};
+  if (graph.is_buffer_storage(in)) {
+    return add_view_copy_buffer_node(
+        graph, in, out, resize_args, resize_unsqueeze_node);
+  }
+  return add_unsqueeze_node(graph, in, dims, out);
 }
 
 REGISTER_OPERATORS {
diff --git a/backends/vulkan/runtime/graph/ops/impl/View.cpp b/backends/vulkan/runtime/graph/ops/impl/View.cpp
index cb868acf7e9..8701a6246b0 100644
--- a/backends/vulkan/runtime/graph/ops/impl/View.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/View.cpp
@@ -89,8 +89,47 @@ void add_view_node(
       resize_view_node));
 }
 
+void add_view_copy_buffer_node(
+    ComputeGraph& graph,
+    ValueRef in,
+    ValueRef out,
+    const std::vector<ValueRef>& resize_args,
+    const ExecuteNode::ResizeFunction& resize_fn) {
+  std::string kernel_name = "view_buffer";
+  add_dtype_suffix(kernel_name, graph.dtype_of(out));
+
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
+      graph,
+      VK_KERNEL_FROM_STR(kernel_name),
+      default_pick_global_wg_size,
+      default_pick_local_wg_size,
+      // Inputs and Outputs
+      {{out, vkapi::kWrite}, {in, vkapi::kRead}},
+      // Parameter Buffers
+      {graph.buffer_meta_ubo(out), graph.buffer_meta_ubo(in)},
+      // Push Constants
+      {},
+      // Specialization Constants
+      {},
+      // Resize Args
+      resize_args,
+      // Resizing Logic
+      resize_fn));
+}
+
 void view(ComputeGraph& graph, const std::vector<ValueRef>& args) {
-  return add_view_node(graph, args[0], args[1], args[2]);
+  int idx = 0;
+  const ValueRef in = args.at(idx++);
+  const ValueRef sizes = args.at(idx++);
+  const ValueRef out = args.at(idx++);
+
+  std::vector<ValueRef> resize_args = {sizes};
+
+  if (graph.is_buffer_storage(out)) {
+    return add_view_copy_buffer_node(
+        graph, in, out, resize_args, resize_view_node);
+  }
+  return add_view_node(graph, in, sizes, out);
 }
 
 REGISTER_OPERATORS {
diff --git a/backends/vulkan/runtime/graph/ops/impl/View.h b/backends/vulkan/runtime/graph/ops/impl/View.h
index a2038d184c3..7a7a8d57742 100644
--- a/backends/vulkan/runtime/graph/ops/impl/View.h
+++ b/backends/vulkan/runtime/graph/ops/impl/View.h
@@ -12,6 +12,18 @@
 
 namespace vkcompute {
 
+/*
+ * Dispatches the view_copy compute shader. This can be used to implement ops
+ * that preserve the "contiguous" indices of elements between the input and
+ * output, such as view_copy, squeeze_copy, unsqueeze_copy, etc.
+ */
+void add_view_copy_buffer_node(
+    ComputeGraph& graph,
+    ValueRef in,
+    ValueRef out,
+    const std::vector<ValueRef>& resize_args,
+    const ExecuteNode::ResizeFunction& resize_fn);
+
 void add_view_node(
     ComputeGraph& graph,
     ValueRef in,
diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py
index f03b9a50737..e04ad80aa86 100644
--- a/backends/vulkan/test/op_tests/cases.py
+++ b/backends/vulkan/test/op_tests/cases.py
@@ -911,7 +911,28 @@ def get_view_inputs():
         "utils::kHeightPacked",
         "utils::kChannelsPacked",
     ]
-    return test_suite
+
+    highdim_test_suite = VkTestSuite(
+        [
+            ((1, 1, 3, 3, 3), (9, 3)),
+            ((2, 3, 4, 6, 5, 4), (6, 4, 6, 5, 4)),
+            ((2, 3, 3, 7, 8), (2, 3, 3, 8 * 7)),
+        ]
+    )
+    highdim_test_suite.storage_types = [
+        "utils::kBuffer",
+    ]
+    highdim_test_suite.test_name_suffix = "highdim"
+    highdim_test_suite.data_gen = "make_seq_tensor"
+
+    for suite in [test_suite, highdim_test_suite]:
+        suite.layouts = [
+            # "utils::kWidthPacked",
+            "utils::kHeightPacked",
+            "utils::kChannelsPacked",
+        ]
+
+    return [test_suite, highdim_test_suite]
 
 
 @register_test_suite("aten.slice_copy.Tensor")
@@ -1124,12 +1145,34 @@ def get_unsqueeze_inputs():
             ((1, 10), -1),
         ]
     )
-    test_suite.layouts = [
-        "utils::kWidthPacked",
-        "utils::kChannelsPacked",
+
+    highdim_test_suite = VkTestSuite(
+        [
+            ((2, 3, 4, 5, 6), 0),
+            ((2, 3, 4, 5, 6), 1),
+            ((2, 3, 4, 5, 6), 5),
+            ((2, 3, 4, 5, 6), -1),
+            ((2, 3, 4, 5, 6), -2),
+            ((1, 2, 3, 4, 5), 0),
+            ((1, 2, 3, 4, 5), 3),
+            ((1, 2, 3, 4, 5), -1),
+            ((2, 3, 4, 5), 0),
+            ((1, 2, 3, 4), 1),
+        ]
+    )
+    highdim_test_suite.storage_types = [
+        "utils::kBuffer",
     ]
-    test_suite.data_gen = "make_seq_tensor"
-    return test_suite
+    highdim_test_suite.test_name_suffix = "highdim"
+
+    for suite in [test_suite, highdim_test_suite]:
+        suite.layouts = [
+            "utils::kWidthPacked",
+            "utils::kChannelsPacked",
+        ]
+        suite.data_gen = "make_seq_tensor"
+
+    return [test_suite, highdim_test_suite]
 
 
 @register_test_suite("aten.clone.default")
@@ -1149,11 +1192,28 @@ def get_clone_inputs():
             ((XS,),),
         ]
     )
-    test_suite.layouts = [
-        "utils::kChannelsPacked",
+
+    highdim_test_suite = VkTestSuite(
+        [
+            ((2, 3, 4, 5, 6),),
+            ((2, 3, 4, 5, 1),),
+            ((1, 1, 3, 4, 5),),
+            ((2, 3, 4, 5, 6, 7),),
+            ((1, 2, 3, 4, 5, 6),),
+        ]
+    )
+    highdim_test_suite.storage_types = [
+        "utils::kBuffer",
     ]
-    test_suite.data_gen = "make_seq_tensor"
-    return test_suite
+    highdim_test_suite.test_name_suffix = "highdim"
+
+    for suite in [test_suite, highdim_test_suite]:
+        suite.layouts = [
+            "utils::kChannelsPacked",
+        ]
+        suite.data_gen = "make_seq_tensor"
+
+    return [test_suite, highdim_test_suite]
 
 
 @register_test_suite("aten.repeat.default")
@@ -1773,7 +1833,31 @@ def get_squeeze_copy_dim_inputs():
             ([1, M1, M1], 0),
         ]
     )
-    return test_suite
+
+    highdim_test_suite = VkTestSuite(
+        [
+            ([1, 2, 3, 4, 5, 1], 0),
+            ([1, 2, 3, 4, 5, 1], 5),
+            ([1, 2, 3, 4, 5, 1], [0, 5]),
+            ([2, 1, 3, 1, 5, 6], 1),
+            ([2, 1, 3, 1, 5, 6], 3),
+            ([2, 1, 3, 1, 5, 6], [1, 3]),
+            ([1, 1, 3, 4, 5, 6], [0, 1]),
+            ([2, 3, 4, 1, 1, 6], [3, 4]),
+        ]
+    )
+    highdim_test_suite.storage_types = [
+        "utils::kBuffer",
+    ]
+    highdim_test_suite.test_name_suffix = "highdim"
+
+    for suite in [test_suite, highdim_test_suite]:
+        suite.layouts = [
+            "utils::kWidthPacked",
+            "utils::kChannelsPacked",
+        ]
+
+    return [test_suite, highdim_test_suite]
 
 
 @register_test_suite("aten.flip.default")
diff --git a/backends/vulkan/test/test_vulkan_delegate.py b/backends/vulkan/test/test_vulkan_delegate.py
index 33536acb662..687a8761c6b 100644
--- a/backends/vulkan/test/test_vulkan_delegate.py
+++ b/backends/vulkan/test/test_vulkan_delegate.py
@@ -1777,20 +1777,6 @@ def forward(self, x):
             (torch.rand(size=[1, 5, 2, 3]),),
         )
 
-    def test_vulkan_backend_high_dim_tensors_fail(self):
-        class UnsqueezeHigherDim(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
-            def forward(self, x):
-                return torch.unsqueeze(x, 2)
-
-        self.lower_module_and_test_output(
-            UnsqueezeHigherDim(),
-            (torch.ones(size=[5, 4, 1, 2, 6]),),
-            expect_no_delegates=True,
-        )
-
     def test_vulkan_backend_large_linear_layer(self):
         class LinearModel(torch.nn.Module):
             def __init__(self, large_out_channels: int) -> None:
@@ -2298,6 +2284,28 @@ def forward(self, x1, x2, x3, x4, x5, x6):
             test_inputs=test_inputs,
         )
 
+    def test_vulkan_backend_high_dimensional_tensors(self):
+        class HighDimTensorModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, x, y):
+                # Unsqueeze each input twice to create 6-dim tensors
+                x_6d = torch.unsqueeze(torch.unsqueeze(x, 0), 0)
+                y_6d = torch.unsqueeze(torch.unsqueeze(y, 0), 0)
+                # Add the tensors together
+                result = x_6d + y_6d
+                return result
+
+        high_dim_module = HighDimTensorModule()
+        # Create two 4-dim inputs
+        sample_inputs = (
+            torch.rand(size=(2, 3, 4, 5), dtype=torch.float32),
+            torch.rand(size=(2, 3, 4, 5), dtype=torch.float32),
+        )
+
+        self.lower_module_and_test_output(high_dim_module, sample_inputs)
+
     def test_vulkan_backend_torchao_wo_quantized_linear(self):
         in_features = 1024
         out_features = 512
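
A note on the core technique, since every op touched above funnels into the same shader: view, squeeze, unsqueeze, and buffer-to-buffer clone all preserve each element's "contiguous" index, i.e. the flat index the element would have in a contiguous row-major tensor. Each invocation of view_buffer therefore converts its output buffer index to an output tensor index, recombines that into the shared contiguous index, decomposes the contiguous index against the input sizes, and finally applies the input strides to locate the source element. The standalone C++ sketch below mirrors that chain on the CPU for a (2, 3) -> (3, 2) reshape. The helper names echo the GLSL functions in indexing.glslh, but they are illustrative reimplementations over plain std::vector metadata, not the actual BufferMetadata API.

// Illustrative sketch only: mirrors the index math in view_buffer.glsl using
// hypothetical helpers over std::vector metadata, not the real ExecuTorch API.
#include <cstdint>
#include <iostream>
#include <vector>

// Row-major (contiguous) strides for the given sizes.
std::vector<int64_t> contiguous_strides(const std::vector<int64_t>& sizes) {
  std::vector<int64_t> strides(sizes.size(), 1);
  for (int64_t d = static_cast<int64_t>(sizes.size()) - 2; d >= 0; --d) {
    strides[d] = strides[d + 1] * sizes[d + 1];
  }
  return strides;
}

// Decompose a flat contiguous index into a per-dimension tensor index.
std::vector<int64_t> contiguous_idx_to_tensor_idx(
    const std::vector<int64_t>& sizes,
    int64_t contig_idx) {
  std::vector<int64_t> tidx(sizes.size());
  for (int64_t d = static_cast<int64_t>(sizes.size()) - 1; d >= 0; --d) {
    tidx[d] = contig_idx % sizes[d];
    contig_idx /= sizes[d];
  }
  return tidx;
}

// Recombine a tensor index into the flat contiguous index.
int64_t tensor_idx_to_contiguous_idx(
    const std::vector<int64_t>& sizes,
    const std::vector<int64_t>& tidx) {
  int64_t contig_idx = 0;
  for (size_t d = 0; d < sizes.size(); ++d) {
    contig_idx = contig_idx * sizes[d] + tidx[d];
  }
  return contig_idx;
}

// Apply strides to a tensor index to locate the element in the backing buffer.
int64_t tensor_idx_to_buffer_idx(
    const std::vector<int64_t>& strides,
    const std::vector<int64_t>& tidx) {
  int64_t bufi = 0;
  for (size_t d = 0; d < strides.size(); ++d) {
    bufi += strides[d] * tidx[d];
  }
  return bufi;
}

int main() {
  // View a contiguous (2, 3) tensor as (3, 2). Both tensors are contiguous
  // here, so the output tensor index can be derived directly from outp_bufi.
  const std::vector<int64_t> inp_sizes = {2, 3};
  const std::vector<int64_t> outp_sizes = {3, 2};
  const std::vector<float> inp = {0, 1, 2, 3, 4, 5};
  std::vector<float> outp(inp.size());

  const std::vector<int64_t> inp_strides = contiguous_strides(inp_sizes);

  for (int64_t outp_bufi = 0; outp_bufi < static_cast<int64_t>(outp.size());
       ++outp_bufi) {
    // Output buffer index -> output tensor index -> shared contiguous index.
    const auto outp_tidx = contiguous_idx_to_tensor_idx(outp_sizes, outp_bufi);
    const int64_t contig_idx =
        tensor_idx_to_contiguous_idx(outp_sizes, outp_tidx);
    // Contiguous index -> input tensor index -> input buffer index.
    const auto inp_tidx = contiguous_idx_to_tensor_idx(inp_sizes, contig_idx);
    const int64_t inp_bufi = tensor_idx_to_buffer_idx(inp_strides, inp_tidx);
    outp[outp_bufi] = inp[inp_bufi];
  }

  for (float v : outp) {
    std::cout << v << " ";  // Prints 0 1 2 3 4 5: a straight copy.
  }
  std::cout << "\n";
  return 0;
}

For fully contiguous tensors the chain collapses to outp[i] = inp[i], which is why a single kernel can back view_copy, squeeze_copy, unsqueeze_copy, and clone, with only the resize function that recomputes the output sizes differing between ops.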