From 41550c795a9ef1cf53d405c9729537d6fd6c8564 Mon Sep 17 00:00:00 2001
From: Stephen Jia
Date: Thu, 12 Jun 2025 08:03:05 -0700
Subject: [PATCH] [ET-VK] Use dim order when converting buffer index to tensor
 index

## Changes

* Update callsites of `bufi_to_tidx` to account for the tensor's dim order.
* Remove the existing functions which do not accept the dim order as an argument.

## Motivation

> Update callsites of `bufi_to_tidx` to account for the tensor's dim order
> Remove the existing functions which do not accept the dim order as an argument

As mentioned in the diff below, the dim order is required to correctly convert a linear buffer index into an N-dimensional tensor index using the tensor's strides. Technically, the dim order can be inferred from the strides array by performing an index sort, but for efficiency it is better to pass the dim order directly into the compute shader.

Currently, the `bufi_to_tidx` function which converts a buffer index to a tensor index takes the packed dim as an input and assumes that the dim order follows a specific pattern derived from it. However, the tensor's actual dim order is not guaranteed to match that assumption.

Furthermore, there is an existing bug when calling `bufi_to_tidx` without providing `packed_dim`: in that case, the function infers the packed dim by finding the first dim with a stride of 1. This breaks down when multiple dims have a stride of 1, which can happen when some dims have a size of 1. For example, in a channels-packed buffer tensor with only one channel, the width dim will typically also have a stride of 1; since it is checked first, it is incorrectly inferred as the packed dim, and the assumed dim order is completely wrong.

To address these issues, make it standard to either pass the tensor's dim order when converting a buffer index to a tensor index, or to explicitly call out the assumption that the tensor is contiguous.
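To make the conversion concrete, here is a small host-side C++ sketch that mirrors the dim-order-aware `bufi_to_tidx` added to `indexing_utils.h` in this diff. The sizes, strides, and dim order in the example are hypothetical and are chosen so that two dims share a stride of 1, which is exactly the case the old stride-based inference gets wrong.

```cpp
// Host-side sketch mirroring the new dim-order-aware bufi_to_tidx in
// indexing_utils.h. The sizes/strides/dim order below are hypothetical.
#include <array>
#include <cstdio>

using ivec4 = std::array<int, 4>;

// dim_order[0] is the innermost (stride 1) dim, dim_order[3] is the outermost.
ivec4 bufi_to_tidx(int bufi, const ivec4& strides, const ivec4& dim_order) {
  ivec4 idx{};
  // Walk the dims from outermost to innermost, peeling off one coordinate
  // per dim using that dim's stride.
  for (int i = 3; i >= 0; i--) {
    const int dim = dim_order[i];
    idx[dim] = bufi / strides[dim];
    bufi %= strides[dim];
  }
  return idx;
}

int main() {
  // Hypothetical channels-packed tensor with WHCN sizes (4, 3, 1, 1):
  // strides = (1, 4, 1, 12). Both the width and channels dims have a stride
  // of 1, so the packed dim cannot be inferred from the strides alone.
  const ivec4 strides = {1, 4, 1, 12};
  const ivec4 dim_order = {2, 0, 1, 3}; // innermost to outermost: C, W, H, N

  const ivec4 tidx = bufi_to_tidx(7, strides, dim_order);
  // 7 = 0 * 12 + 1 * 4 + 3 * 1 + 0 * 1  ->  prints "w=3 h=1 c=0 n=0"
  std::printf("w=%d h=%d c=%d n=%d\n", tidx[0], tidx[1], tidx[2], tidx[3]);
  return 0;
}
```

With the dim order passed explicitly there is no ambiguity about which dim owns the remainder, whereas inferring the packed dim from the strides alone would pick the width dim here and produce the wrong tensor index.
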
"DEFAULT_LAYOUT")} + const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); $else: - ${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); const lowp int packed_dim = unhash_packed_dim(out_layout); - ${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); - ${layout_declare_spec_const(C, "int", "other_layout", "DEFAULT_LAYOUT")} const lowp ivec4 other_axis_map = unhash_axis_map(other_layout); #ifdef USING_BUFFER @@ -77,7 +76,7 @@ void main() { return; } - const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); + const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order); const ivec4 in_tidx = min(out_tidx, in_sizes - 1); const ivec4 other_tidx = min(out_tidx, other_sizes - 1); diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h index 2b41d2b7e1a..7e326878fa2 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h +++ b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h @@ -68,21 +68,6 @@ */ #define mod4(x) ((x) & 3) -/* - * Find the packed dimension of a tensor given its strides. The packed dimension - * is the "fastest moving" dimension which will have a stride of 1. - */ -int find_packed_dim(const ivec4 strides) { - int packed_dim = 0; - for (int i = 0; i <= 3; i++) { - if (strides[i] == 1) { - packed_dim = i; - break; - } - } - return packed_dim; -} - /* * Get the staging buffer indices that contain the data of the texel that * corresponds to the provided tensor index. Since the texel have 4 elements, @@ -129,27 +114,26 @@ int tidx_to_nchwi(const ivec4 tidx, const ivec4 sizes) { tidx.x; } -// TODO(ssjia): make this function use dim order so that it can work with any -// dim order. Currently it assumes that the dim order is contiguous, except for -// the packed dim. -ivec4 bufi_to_tidx(int bufi, const ivec4 strides, const int packed_dim) { +ivec4 bufi_to_tidx(int bufi, const ivec4 strides, const ivec4 dim_order) { ivec4 idx; for (int i = 3; i >= 0; i--) { - if (i != packed_dim) { - idx[i] = bufi / strides[i]; - bufi %= strides[i]; - } + int dim = dim_order[i]; + idx[dim] = bufi / strides[dim]; + bufi %= strides[dim]; } - idx[packed_dim] = bufi; return idx; } -// Convenience overload of the above function, which will determine the packed -// dim from the strides automatically so it doesn't have to be passed in as a -// function argument. 
-ivec4 bufi_to_tidx(const int bufi, const ivec4 strides) {
-  int packed_dim = find_packed_dim(strides);
-  return bufi_to_tidx(bufi, strides, packed_dim);
+/*
+ * Same as bufi_to_tidx, but assumes that the tensor is contiguous (default dim order).
+ */
+ivec4 contiguous_bufi_to_tidx(int bufi, const ivec4 strides) {
+  ivec4 idx;
+  for (int i = 3; i >= 0; i--) {
+    idx[i] = bufi / strides[i];
+    bufi %= strides[i];
+  }
+  return idx;
 }
 
 int tidx_to_bufi(const ivec4 tidx, ivec4 strides) {
@@ -271,10 +255,20 @@ ivec3 lpos_to_pos(const ivec3 lpos, const ivec4 axis_map) {
 #define unhash_axis_map(hash) \
   ivec4(hash & 0xf, (hash >> 4) & 0xf, (hash >> 8 & 0xf), (hash >> 12 & 0xf))
 
+/*
+ * Extract the dim order from a hashed layout; each dim occupies 4 bits.
+ */
+#define unhash_dim_order(hash) \
+  ivec4(hash & 0xf, (hash >> 4) & 0xf, (hash >> 8 & 0xf), (hash >> 12 & 0xf))
+
 #define unhash_packed_dim(hash) int(hash >> 16 & 0xf)
 
 #define DEFAULT_LAYOUT 0x02210
 
+#define DEFAULT_DIM_ORDER 0x03210
+
+#define DEFAULT_DIM_ORDER_IVEC4 ivec4(0, 1, 2, 3)
+
 /************************
  * Deprecated Functions *
  ************************/
diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw.glsl
index dfb5f1f2f9c..4dd83f0d4ed 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw.glsl
@@ -62,7 +62,7 @@ void main() {
     return;
   }
 
-  const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, 0);
+  const ivec4 out_tidx = contiguous_bufi_to_tidx(out_bufi, out_strides);
 
   const FLOAT_T scale = t_scales[out_tidx.x];
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl
index ba4e4dd9dd9..62cd0610ffb 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl
@@ -10,8 +10,8 @@ ${define_required_extensions(DTYPE)}
 
 layout(std430) buffer;
 
-${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)}
-${layout_declare_tensor(1, "r", "nchw_in", DTYPE, STORAGE)}
+${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
+${layout_declare_tensor(B, "r", "nchw_in", DTYPE, STORAGE)}
 
 $if USE_PUSH_CONST:
   layout(push_constant) uniform restrict Block {
@@ -20,15 +20,14 @@ $if USE_PUSH_CONST:
     int numel;
   };
 $else:
-  ${layout_declare_ubo(2, "ivec4", "out_sizes")}
-  ${layout_declare_ubo(3, "ivec4", "out_strides")}
-  ${layout_declare_ubo(4, "int", "numel")}
+  ${layout_declare_ubo(B, "ivec4", "out_sizes")}
+  ${layout_declare_ubo(B, "ivec4", "out_strides")}
+  ${layout_declare_ubo(B, "int", "numel")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-// This constant is unused in this shader but is kept so that the signature is
-// consistent with nchw_to_image.
-${layout_declare_spec_const(C, "int", "UNUSED_layout", "0")} +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_DIM_ORDER")} +const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); ${layout_declare_spec_const(C, "int", "transpose_hw", "0")} void main() { @@ -37,7 +36,7 @@ void main() { return; } - ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides); + ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order); ivec4 sizes = out_sizes; if (transpose_hw == 1) { diff --git a/backends/vulkan/runtime/graph/ops/glsl/select.glslh b/backends/vulkan/runtime/graph/ops/glsl/select.glslh index 3bcbf04a3ba..6509015b4b6 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/select.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/select.glslh @@ -9,6 +9,8 @@ #ifndef SELECT_GLSLH #define SELECT_GLSLH +#ifndef USING_BUFFER + /* * Enable the fast path if a texel loaded from the input texture can be used as * is to store to the output texture. The following conditions must be met: @@ -29,6 +31,8 @@ bool can_use_fast_path() { return true; } +#endif // USING_BUFFER + /* * Given an output tensor index, return the corresponding input tensor index for * the select operator. This is done by "inserting" the select index at the diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice.glslh b/backends/vulkan/runtime/graph/ops/glsl/slice.glslh index 5d4cc70fdc1..87325754f4d 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/slice.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/slice.glslh @@ -9,6 +9,8 @@ #ifndef SLICE_GLSLH #define SLICE_GLSLH +#ifndef USING_BUFFER + /** * Enable the fast path if a texel loaded from the input texture can be used as * is to store to the output texture. The following conditions must be met: @@ -26,6 +28,8 @@ bool can_use_fast_path() { return true; } +#endif // USING_BUFFER + /* * Converts output tensor indices to input tensor indices for the slice operation. 
* This function maps the output indices to the corresponding input indices based on diff --git a/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl index 3ca854e0526..7e95b52d8f4 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl @@ -37,8 +37,10 @@ layout(push_constant) uniform restrict Block { int selected_dim; }; -${layout_declare_spec_const(C, "int", "out_packed_dim", "DEFAULT_LAYOUT")} -${layout_declare_spec_const(C, "int", "in_packed_dim", "DEFAULT_LAYOUT")} +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} +${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} + +const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -50,7 +52,7 @@ void main() { return; } - const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); + const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order); ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx); const int in_bufi = tidx_to_bufi(in_tidx, in_strides); diff --git a/backends/vulkan/runtime/graph/ops/glsl/where.glsl b/backends/vulkan/runtime/graph/ops/glsl/where.glsl index 5df813d1241..fe6304c0fa0 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/where.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/where.glsl @@ -37,40 +37,28 @@ $if STORAGE == "buffer": ${layout_declare_ubo(B, "ivec4", "cond_strides")} ${layout_declare_ubo(B, "ivec4", "self_strides")} ${layout_declare_ubo(B, "ivec4", "other_strides")} - - ${layout_declare_spec_const(C, "int", "out_packed_dim", "DEFAULT_LAYOUT")} - ${layout_declare_spec_const(C, "int", "cond_packed_dim", "DEFAULT_LAYOUT")} - ${layout_declare_spec_const(C, "int", "self_packed_dim", "DEFAULT_LAYOUT")} - ${layout_declare_spec_const(C, "int", "other_packed_dim", "DEFAULT_LAYOUT")} $else: ${layout_declare_ubo(B, "ivec3", "out_limits")} +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_DIM_ORDER")} + +const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); + layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; #ifdef USING_BUFFER void main() { int out_bufi = int(gl_GlobalInvocationID.x); - // ivec4 tidx = ivec4(gl_GlobalInvocationID, 0); - // int out_bufi = tidx_to_bufi(tidx, out_strides); - // int cond_bufi = tidx_to_bufi(tidx, cond_strides); - // int self_bufi = tidx_to_bufi(tidx, self_strides); - // int other_bufi = tidx_to_bufi(tidx, other_strides); if (out_bufi >= out_numl) { return; } - const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); - out_bufi = tidx_to_bufi(out_tidx, out_strides); - - const ivec4 cond_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); - const int cond_bufi = tidx_to_bufi(cond_tidx, cond_strides); - - const ivec4 self_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); - const int self_bufi = tidx_to_bufi(self_tidx, self_strides); + const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order); - const ivec4 other_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); - const int other_bufi = tidx_to_bufi(other_tidx, other_strides); + const int cond_bufi = tidx_to_bufi(out_tidx, cond_strides); + const int self_bufi = tidx_to_bufi(out_tidx, self_strides); + const int other_bufi = tidx_to_bufi(out_tidx, other_strides); COND_T cond = t_condition[cond_bufi] ; T v_self = t_self[self_bufi]; diff --git 
a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp index d260ed767d0..28279c196c0 100644 --- a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp @@ -143,9 +143,9 @@ void add_binary_op_buffer_node( PushConstantDataInfo(&alpha_val, sizeof(float)), }}, // Specialization Constants - {graph.packed_dim_of(out), - graph.packed_dim_of(in1), - graph.packed_dim_of(in2)}, + {graph.hashed_layout_of(out), + graph.hashed_layout_of(in1), + graph.hashed_layout_of(in2)}, // Resize Args {}, // Resizing Logic diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQCSNW.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQCSNW.cpp index 6e101195e3f..07502a7a107 100644 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQCSNW.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQCSNW.cpp @@ -43,6 +43,10 @@ void check_linear_qcsnw_args( VK_CHECK_COND( utils::val_at(-1, scales_sizes) == utils::val_at(-2, qmat2_sizes)); } + + if (graph.is_buffer_storage(out)) { + VK_CHECK_COND(graph.is_contiguous(out)); + } } void resize_linear_qcsnw_node( diff --git a/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp b/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp index 423c9789d67..7b5fad57483 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp @@ -55,7 +55,6 @@ void add_transfer_copy_node( } transfer_params{static_cast(dim_whcn)}; std::vector push_constants; - vkapi::SpecVarList spec_vars; if (graph.is_buffer_storage(out)) { push_constants = { @@ -64,23 +63,18 @@ void add_transfer_copy_node( graph.strides_pc_of(in), graph.numel_pc_of(out), PushConstantDataInfo(&transfer_params, sizeof(transfer_params))}; - - spec_vars = { - graph.packed_dim_of(out), - graph.packed_dim_of(in), - }; } else { push_constants = { graph.sizes_pc_of(out), graph.sizes_pc_of(in), PushConstantDataInfo(&transfer_params, sizeof(transfer_params))}; - - spec_vars = { - graph.hashed_layout_of(out), - graph.hashed_layout_of(in), - }; } + vkapi::SpecVarList spec_vars = { + graph.hashed_layout_of(out), + graph.hashed_layout_of(in), + }; + // Determine the shader directly std::string kernel_name; if (transfer_type == TransferType::SELECT) { diff --git a/backends/vulkan/runtime/graph/ops/impl/Where.cpp b/backends/vulkan/runtime/graph/ops/impl/Where.cpp index a3be34830d3..ea610b1fe74 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Where.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Where.cpp @@ -54,7 +54,7 @@ void add_where_texture_node( // Push Constants {}, // Specialization Constants - {graph.packed_dim_of(out)}, + {graph.hashed_layout_of(out)}, // Resize Arguments {}, // Resizing Logic @@ -96,10 +96,7 @@ void add_where_buffer_node( // Push Constants {}, // Specialization Constants - {graph.packed_dim_of(out), - graph.packed_dim_of(cond), - graph.packed_dim_of(self), - graph.packed_dim_of(other)}, + {graph.hashed_layout_of(out)}, // Resize Arguments {}, // Resizing Logic diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py index bd67933dc93..4ea61cd7ef3 100644 --- a/backends/vulkan/test/op_tests/cases.py +++ b/backends/vulkan/test/op_tests/cases.py @@ -52,13 +52,17 @@ def get_binary_elementwise_inputs(): ((S, S1, S2), (S, S1, 1), 2.0), ((S, S1, S2), (S, 1, S2), 2.0), ((XS, S, S1, S2), (XS, S, 1, 1), 2.0), + ((3, 64, 1), (1, 64, 1)), ] ) test_suite.layouts = [ "utils::kWidthPacked", 
"utils::kChannelsPacked", ] - test_suite.storage_types = ["utils::kBuffer", "utils::kTexture3D"] + test_suite.storage_types = [ + "utils::kBuffer", + "utils::kTexture3D", + ] return test_suite diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp index 3f5dba9e277..faa0e7d0c47 100644 --- a/backends/vulkan/test/utils/test_utils.cpp +++ b/backends/vulkan/test/utils/test_utils.cpp @@ -26,13 +26,14 @@ void record_nchw_to_buffer_op( vkapi::VulkanBuffer& src_buffer, api::vTensor& v_dst) { vkapi::PipelineBarrier pipeline_barrier{}; + vkapi::SpecVarList specialization_constants = {v_dst.hashed_layout()}; context->submit_compute_job( get_nchw_to_tensor_shader(v_dst, true, false), pipeline_barrier, {uint32_t(v_dst.numel()), 1, 1}, {64, 1, 1}, - {}, + specialization_constants, VK_NULL_HANDLE, 0, v_dst.buffer(