diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl index ce986d4e12f..a0a235154a0 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl @@ -48,19 +48,18 @@ $else: layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} +${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} +${layout_declare_spec_const(C, "int", "other_layout", "DEFAULT_LAYOUT")} + $if STORAGE == "buffer": - ${layout_declare_spec_const(C, "int", "out_packed_dim", "DEFAULT_LAYOUT")} - ${layout_declare_spec_const(C, "int", "in_packed_dim", "DEFAULT_LAYOUT")} - ${layout_declare_spec_const(C, "int", "other_packed_dim", "DEFAULT_LAYOUT")} + const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); $else: - ${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); const lowp int packed_dim = unhash_packed_dim(out_layout); - ${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); - ${layout_declare_spec_const(C, "int", "other_layout", "DEFAULT_LAYOUT")} const lowp ivec4 other_axis_map = unhash_axis_map(other_layout); #ifdef USING_BUFFER @@ -77,7 +76,7 @@ void main() { return; } - const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); + const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order); const ivec4 in_tidx = min(out_tidx, in_sizes - 1); const ivec4 other_tidx = min(out_tidx, other_sizes - 1); diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h index 2b41d2b7e1a..0cfd7f2f119 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h +++ 
b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h @@ -68,21 +68,6 @@ */ #define mod4(x) ((x) & 3) -/* - * Find the packed dimension of a tensor given its strides. The packed dimension - * is the "fastest moving" dimension which will have a stride of 1. - */ -int find_packed_dim(const ivec4 strides) { - int packed_dim = 0; - for (int i = 0; i <= 3; i++) { - if (strides[i] == 1) { - packed_dim = i; - break; - } - } - return packed_dim; -} - /* * Get the staging buffer indices that contain the data of the texel that * corresponds to the provided tensor index. Since the texel have 4 elements, @@ -129,27 +114,26 @@ int tidx_to_nchwi(const ivec4 tidx, const ivec4 sizes) { tidx.x; } -// TODO(ssjia): make this function use dim order so that it can work with any -// dim order. Currently it assumes that the dim order is contiguous, except for -// the packed dim. -ivec4 bufi_to_tidx(int bufi, const ivec4 strides, const int packed_dim) { +ivec4 bufi_to_tidx(int bufi, const ivec4 strides, const ivec4 dim_order) { ivec4 idx; for (int i = 3; i >= 0; i--) { - if (i != packed_dim) { - idx[i] = bufi / strides[i]; - bufi %= strides[i]; - } + int dim = dim_order[i]; + idx[dim] = bufi / strides[dim]; + bufi %= strides[dim]; } - idx[packed_dim] = bufi; return idx; } -// Convenience overload of the above function, which will determine the packed -// dim from the strides automatically so it doesn't have to be passed in as a -// function argument. -ivec4 bufi_to_tidx(const int bufi, const ivec4 strides) { - int packed_dim = find_packed_dim(strides); - return bufi_to_tidx(bufi, strides, packed_dim); +/* + * bufi_to_tidx but assumes that the tensor is contiguous + */ +ivec4 contiguous_bufi_to_tidx(int bufi, const ivec4 strides) { + ivec4 idx; + for (int i = 3; i >= 0; i--) { + idx[i] = bufi / strides[i]; + bufi %= strides[i]; + } + return idx; } int tidx_to_bufi(const ivec4 tidx, ivec4 strides) { @@ -269,12 +253,22 @@ ivec3 lpos_to_pos(const ivec3 lpos, const ivec4 axis_map) { * e.g. 
0x11021, 1 -> ivec4(1, 2, 0, 1) */ #define unhash_axis_map(hash) \ - ivec4(hash & 0xf, (hash >> 4) & 0xf, (hash >> 8 & 0xf), (hash >> 12 & 0xf)) + (ivec4(hash & 0xf, (hash >> 4) & 0xf, (hash >> 8 & 0xf), (hash >> 12 & 0xf))) + +/* + * Unhash a tensor's dim order from its hashed layout (one dim index per nibble). + */ +#define unhash_dim_order(hash) \ + (ivec4(hash & 0xf, (hash >> 4) & 0xf, (hash >> 8 & 0xf), (hash >> 12 & 0xf))) #define unhash_packed_dim(hash) int(hash >> 16 & 0xf) #define DEFAULT_LAYOUT 0x02210 +#define DEFAULT_DIM_ORDER 0x03210 + +#define DEFAULT_DIM_ORDER_IVEC4 ivec4(0, 1, 2, 3) + /************************ * Deprecated Functions * ************************/ diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw.glsl index dfb5f1f2f9c..4dd83f0d4ed 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw.glsl @@ -62,7 +62,7 @@ void main() { return; } - const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, 0); + const ivec4 out_tidx = contiguous_bufi_to_tidx(out_bufi, out_strides); const FLOAT_T scale = t_scales[out_tidx.x]; diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl index ba4e4dd9dd9..62cd0610ffb 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl @@ -10,8 +10,8 @@ ${define_required_extensions(DTYPE)} layout(std430) buffer; -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "nchw_in", DTYPE, STORAGE)} +${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "nchw_in", DTYPE, STORAGE)} $if USE_PUSH_CONST: layout(push_constant) uniform restrict Block { @@ -20,15 +20,14 @@ $if USE_PUSH_CONST: int numel; }; $else: - ${layout_declare_ubo(2, "ivec4", "out_sizes")} - ${layout_declare_ubo(3, "ivec4", "out_strides")} - ${layout_declare_ubo(4, 
"int", "numel")} + ${layout_declare_ubo(B, "ivec4", "out_sizes")} + ${layout_declare_ubo(B, "ivec4", "out_strides")} + ${layout_declare_ubo(B, "int", "numel")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; -// This constant is unused in this shader but is kept so that the signature is -// consistent with nchw_to_image. -${layout_declare_spec_const(C, "int", "UNUSED_layout", "0")} +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_DIM_ORDER")} +const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); ${layout_declare_spec_const(C, "int", "transpose_hw", "0")} void main() { @@ -37,7 +36,7 @@ void main() { return; } - ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides); + ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order); ivec4 sizes = out_sizes; if (transpose_hw == 1) { diff --git a/backends/vulkan/runtime/graph/ops/glsl/select.glslh b/backends/vulkan/runtime/graph/ops/glsl/select.glslh index 3bcbf04a3ba..6509015b4b6 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/select.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/select.glslh @@ -9,6 +9,8 @@ #ifndef SELECT_GLSLH #define SELECT_GLSLH +#ifndef USING_BUFFER + /* * Enable the fast path if a texel loaded from the input texture can be used as * is to store to the output texture. The following conditions must be met: @@ -29,6 +31,8 @@ bool can_use_fast_path() { return true; } +#endif // USING_BUFFER + /* * Given an output tensor index, return the corresponding input tensor index for * the select operator. 
This is done by "inserting" the select index at the diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice.glslh b/backends/vulkan/runtime/graph/ops/glsl/slice.glslh index 5d4cc70fdc1..87325754f4d 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/slice.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/slice.glslh @@ -9,6 +9,8 @@ #ifndef SLICE_GLSLH #define SLICE_GLSLH +#ifndef USING_BUFFER + /** * Enable the fast path if a texel loaded from the input texture can be used as * is to store to the output texture. The following conditions must be met: @@ -26,6 +28,8 @@ bool can_use_fast_path() { return true; } +#endif // USING_BUFFER + /* * Converts output tensor indices to input tensor indices for the slice operation. * This function maps the output indices to the corresponding input indices based on diff --git a/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl index 3ca854e0526..7e95b52d8f4 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl @@ -37,8 +37,10 @@ layout(push_constant) uniform restrict Block { int selected_dim; }; -${layout_declare_spec_const(C, "int", "out_packed_dim", "DEFAULT_LAYOUT")} -${layout_declare_spec_const(C, "int", "in_packed_dim", "DEFAULT_LAYOUT")} +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} +${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} + +const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -50,7 +52,7 @@ void main() { return; } - const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); + const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order); ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx); const int in_bufi = tidx_to_bufi(in_tidx, in_strides); diff --git 
a/backends/vulkan/runtime/graph/ops/glsl/where.glsl b/backends/vulkan/runtime/graph/ops/glsl/where.glsl index 5df813d1241..fe6304c0fa0 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/where.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/where.glsl @@ -37,40 +37,28 @@ $if STORAGE == "buffer": ${layout_declare_ubo(B, "ivec4", "cond_strides")} ${layout_declare_ubo(B, "ivec4", "self_strides")} ${layout_declare_ubo(B, "ivec4", "other_strides")} - - ${layout_declare_spec_const(C, "int", "out_packed_dim", "DEFAULT_LAYOUT")} - ${layout_declare_spec_const(C, "int", "cond_packed_dim", "DEFAULT_LAYOUT")} - ${layout_declare_spec_const(C, "int", "self_packed_dim", "DEFAULT_LAYOUT")} - ${layout_declare_spec_const(C, "int", "other_packed_dim", "DEFAULT_LAYOUT")} $else: ${layout_declare_ubo(B, "ivec3", "out_limits")} +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_DIM_ORDER")} + +const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); + layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; #ifdef USING_BUFFER void main() { int out_bufi = int(gl_GlobalInvocationID.x); - // ivec4 tidx = ivec4(gl_GlobalInvocationID, 0); - // int out_bufi = tidx_to_bufi(tidx, out_strides); - // int cond_bufi = tidx_to_bufi(tidx, cond_strides); - // int self_bufi = tidx_to_bufi(tidx, self_strides); - // int other_bufi = tidx_to_bufi(tidx, other_strides); if (out_bufi >= out_numl) { return; } - const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); - out_bufi = tidx_to_bufi(out_tidx, out_strides); - - const ivec4 cond_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); - const int cond_bufi = tidx_to_bufi(cond_tidx, cond_strides); - - const ivec4 self_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); - const int self_bufi = tidx_to_bufi(self_tidx, self_strides); + const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order); - const ivec4 other_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); - 
const int other_bufi = tidx_to_bufi(other_tidx, other_strides); + const int cond_bufi = tidx_to_bufi(out_tidx, cond_strides); + const int self_bufi = tidx_to_bufi(out_tidx, self_strides); + const int other_bufi = tidx_to_bufi(out_tidx, other_strides); COND_T cond = t_condition[cond_bufi] ; T v_self = t_self[self_bufi]; diff --git a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp index d260ed767d0..28279c196c0 100644 --- a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp @@ -143,9 +143,9 @@ void add_binary_op_buffer_node( PushConstantDataInfo(&alpha_val, sizeof(float)), }}, // Specialization Constants - {graph.packed_dim_of(out), - graph.packed_dim_of(in1), - graph.packed_dim_of(in2)}, + {graph.hashed_layout_of(out), + graph.hashed_layout_of(in1), + graph.hashed_layout_of(in2)}, // Resize Args {}, // Resizing Logic diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQCSNW.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQCSNW.cpp index 6e101195e3f..07502a7a107 100644 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQCSNW.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQCSNW.cpp @@ -43,6 +43,10 @@ void check_linear_qcsnw_args( VK_CHECK_COND( utils::val_at(-1, scales_sizes) == utils::val_at(-2, qmat2_sizes)); } + + if (graph.is_buffer_storage(out)) { + VK_CHECK_COND(graph.is_contiguous(out)); + } } void resize_linear_qcsnw_node( diff --git a/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp b/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp index 423c9789d67..7b5fad57483 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp @@ -55,7 +55,6 @@ void add_transfer_copy_node( } transfer_params{static_cast<int32_t>(dim_whcn)}; std::vector<PushConstantDataInfo> push_constants; - vkapi::SpecVarList spec_vars; if (graph.is_buffer_storage(out)) { push_constants = { @@ 
-64,23 +63,18 @@ void add_transfer_copy_node( graph.strides_pc_of(in), graph.numel_pc_of(out), PushConstantDataInfo(&transfer_params, sizeof(transfer_params))}; - - spec_vars = { - graph.packed_dim_of(out), - graph.packed_dim_of(in), - }; } else { push_constants = { graph.sizes_pc_of(out), graph.sizes_pc_of(in), PushConstantDataInfo(&transfer_params, sizeof(transfer_params))}; - - spec_vars = { - graph.hashed_layout_of(out), - graph.hashed_layout_of(in), - }; } + vkapi::SpecVarList spec_vars = { + graph.hashed_layout_of(out), + graph.hashed_layout_of(in), + }; + // Determine the shader directly std::string kernel_name; if (transfer_type == TransferType::SELECT) { diff --git a/backends/vulkan/runtime/graph/ops/impl/Where.cpp b/backends/vulkan/runtime/graph/ops/impl/Where.cpp index a3be34830d3..ea610b1fe74 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Where.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Where.cpp @@ -54,7 +54,7 @@ void add_where_texture_node( // Push Constants {}, // Specialization Constants - {graph.packed_dim_of(out)}, + {graph.hashed_layout_of(out)}, // Resize Arguments {}, // Resizing Logic @@ -96,10 +96,7 @@ void add_where_buffer_node( // Push Constants {}, // Specialization Constants - {graph.packed_dim_of(out), - graph.packed_dim_of(cond), - graph.packed_dim_of(self), - graph.packed_dim_of(other)}, + {graph.hashed_layout_of(out)}, // Resize Arguments {}, // Resizing Logic diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py index bd67933dc93..4ea61cd7ef3 100644 --- a/backends/vulkan/test/op_tests/cases.py +++ b/backends/vulkan/test/op_tests/cases.py @@ -52,13 +52,17 @@ def get_binary_elementwise_inputs(): ((S, S1, S2), (S, S1, 1), 2.0), ((S, S1, S2), (S, 1, S2), 2.0), ((XS, S, S1, S2), (XS, S, 1, 1), 2.0), + ((3, 64, 1), (1, 64, 1)), ] ) test_suite.layouts = [ "utils::kWidthPacked", "utils::kChannelsPacked", ] - test_suite.storage_types = ["utils::kBuffer", "utils::kTexture3D"] + 
test_suite.storage_types = [ + "utils::kBuffer", + "utils::kTexture3D", + ] return test_suite diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp index 3f5dba9e277..faa0e7d0c47 100644 --- a/backends/vulkan/test/utils/test_utils.cpp +++ b/backends/vulkan/test/utils/test_utils.cpp @@ -26,13 +26,14 @@ void record_nchw_to_buffer_op( vkapi::VulkanBuffer& src_buffer, api::vTensor& v_dst) { vkapi::PipelineBarrier pipeline_barrier{}; + vkapi::SpecVarList specialization_constants = {v_dst.hashed_layout()}; context->submit_compute_job( get_nchw_to_tensor_shader(v_dst, true, false), pipeline_barrier, {uint32_t(v_dst.numel()), 1, 1}, {64, 1, 1}, - {}, + specialization_constants, VK_NULL_HANDLE, 0, v_dst.buffer(