diff --git a/backends/vulkan/runtime/graph/ops/DispatchNode.h b/backends/vulkan/runtime/graph/ops/DispatchNode.h index 958637218e2..7d04f7714e9 100644 --- a/backends/vulkan/runtime/graph/ops/DispatchNode.h +++ b/backends/vulkan/runtime/graph/ops/DispatchNode.h @@ -46,12 +46,15 @@ class PushConstantDataInfo { payload_.attr = attr; } - explicit PushConstantDataInfo(const void* data, uint32_t dataLen) + explicit PushConstantDataInfo( + const void* data, + uint32_t dataLen, + uint32_t pushConstantLen = 0) : tensorUniformData(nullptr) { VK_CHECK_COND( dataLen <= 16, "Single push constant data size must be <= 16 bytes"); - payload_.dataSize = dataLen; - memcpy(payload_.data, data, payload_.dataSize); + payload_.dataSize = pushConstantLen ? pushConstantLen : dataLen; + memcpy(payload_.data, data, dataLen); } /* diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl index be0e1bfa20a..62aa2f810dc 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl @@ -19,11 +19,6 @@ layout(std430) buffer; ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} ${layout_declare_tensor(B, "r", "t_other", DTYPE, STORAGE)} -${layout_declare_ubo(B, "ivec4", "out_sizes")} -${layout_declare_ubo(B, "ivec4", "in_sizes")} -${layout_declare_ubo(B, "ivec4", "other_sizes")} -${layout_declare_ubo(B, "ivec2", "broadcast_params")} -${layout_declare_ubo(B, "float", "alpha")} #include "broadcasting_utils.h" #include "indexing_utils.h" @@ -40,6 +35,14 @@ const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); ${layout_declare_spec_const(C, "int", "other_layout", "DEFAULT_LAYOUT")} const lowp ivec4 other_axis_map = unhash_axis_map(other_layout); +layout(push_constant) uniform restrict Block { + ivec4 out_sizes; + ivec4 in_sizes; + ivec4 other_sizes; + ivec2 broadcast_params; + float alpha; +}; + void main() 
{ const ivec3 lpos = ivec3(gl_GlobalInvocationID); const ivec4 tidx = lpos_to_tidx(lpos, out_sizes, out_axis_map.w, packed_dim); diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute.glsl b/backends/vulkan/runtime/graph/ops/glsl/permute.glsl index 5378099d03f..59d6aecdc15 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/permute.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/permute.glsl @@ -19,15 +19,9 @@ layout(std430) buffer; layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} image_in; -layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits { - ivec3 out_limits; -}; - -layout(set = 0, binding = 3) uniform PRECISION restrict Sizes { +layout(push_constant) uniform PRECISION restrict Block { + ivec4 out_limits; ivec4 sizes; -}; - -layout(set = 0, binding = 4) uniform PRECISION restrict Block { // output dims ivec4 out_ndims; // x = output channels aligned to 4, y = input channels aligned to 4 @@ -41,7 +35,7 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const u16vec3 pos = u16vec3(gl_GlobalInvocationID); - if (any(greaterThanEqual(pos, out_limits))) { + if (any(greaterThanEqual(pos, out_limits.xyz))) { return; } diff --git a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp index 33f73cd6dad..7e88982aaee 100644 --- a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp @@ -67,7 +67,10 @@ void add_binary_op_node( alpha_val = graph.extract_scalar(alpha); } - const utils::ivec2 broadcast_params = create_broadcast_params(*t_in1, *t_in2); + const struct BinaryOpsParams { + const utils::ivec2 broadcast_params; + const float alpha_val; + } binary_ops_params{create_broadcast_params(*t_in1, *t_in2), alpha_val}; std::string kernel_name("binary_"); 
kernel_name.reserve(kShaderNameReserve); @@ -83,16 +86,16 @@ void add_binary_op_node( {{out, vkapi::MemoryAccessType::WRITE}, {{arg1, arg2}, vkapi::MemoryAccessType::READ}}, // Shader params buffers - {t_out->sizes_ubo(), - t_in1->sizes_ubo(), - t_in2->sizes_ubo(), - graph.create_params_buffer(broadcast_params), - graph.create_params_buffer(alpha_val)}, + {}, // Specialization Constants {t_out->hashed_layout(), t_in1->hashed_layout(), t_in2->hashed_layout()}, // Resizing Logic resize_binary_op_node, - {})); + {}, + {{graph.sizes_pc_of(out), + graph.sizes_pc_of(arg1), + graph.sizes_pc_of(arg2), + PushConstantDataInfo(&binary_ops_params, sizeof(binary_ops_params))}})); } #define DEFINE_BINARY_OP_WITH_ALPHA_FN(op_name) \ diff --git a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp index c107f288f34..a56925751e7 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp @@ -75,13 +75,7 @@ void add_permute_node( int32_t out_c_aligned = utils::align_up_4(out_channels); int32_t in_c_aligned = utils::align_up_4(in_channels); - const struct Block final { - ivec4 out_ndims; - ivec2 ch_info; - } params{ - out_dims, - {out_c_aligned, in_c_aligned}, - }; + const ivec2 ch_info = {out_c_aligned, in_c_aligned}; graph.execute_nodes().emplace_back(new DispatchNode( graph, @@ -90,14 +84,16 @@ void add_permute_node( graph.create_local_wg_size(out), {{out, vkapi::MemoryAccessType::WRITE}, {in, vkapi::MemoryAccessType::READ}}, - {t_out->logical_limits_ubo(), - t_out->sizes_ubo(), - graph.create_params_buffer(params)}, + {}, // Specialization Constants {}, // Resizing Logic nullptr, - {})); + {}, + {{graph.logical_limits_pc_of(out), + graph.sizes_pc_of(out), + PushConstantDataInfo(&out_dims, sizeof(out_dims)), + PushConstantDataInfo(&ch_info, sizeof(ch_info))}})); } void add_permute_node( diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp 
b/backends/vulkan/test/vulkan_compute_api_test.cpp index 77a0458d901..604ad26588d 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -1601,9 +1601,7 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { auto addFn = VK_GET_OP_FN("aten.add.Tensor"); addFn(graph, {a.value, b.value, kDummyValueRef, c}); - // +2: alpha UBO, broadcast UBO for arithmetic shader - // +1: t.sizes_ubo() for arithmetic shader output c - expected_vma_allocation_count += 3; + // no new allocations if binary op uses push constants EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); IOValueRef d = graph.add_input_tensor( @@ -1624,17 +1622,16 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { auto mulFn = VK_GET_OP_FN("aten.mul.Tensor"); mulFn(graph, {c, d.value, e}); - // +2: alpha UBO, broadcast UBO for arithmetic shader - // +1: t.sizes_ubo() for arithmetic shader output e - expected_vma_allocation_count += 3; + // no new allocations if binary op uses push constants EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); IOValueRef out = {}; out.value = e; out.staging = graph.set_output_tensor(out.value); + // +1: staging buffer for the input tensor // +1: staging buffer for the output tensor - expected_vma_allocation_count += 1; + expected_vma_allocation_count += 2; EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); graph.prepare();