From 63f650a6ceb28ed8a36603fd99a31d9ea763dead Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Tue, 10 Dec 2024 08:53:06 -0800 Subject: [PATCH 1/2] [ET-VK] Replace Uniform buffers with push constants for copy op This diff replaces uniform buffers with push constants for copy op in the Vulkan backend of Executorch. The changes include updating the GLSL code to use push constants instead of uniform buffers and updating the C++ code to pass the sizes as push constants to the shader. Differential Revision: [D66890851](https://our.internmc.facebook.com/intern/diff/D66890851/) [ghstack-poisoned] --- .../graph/ops/glsl/copy_channel_offset.glsl | 22 +++--- .../runtime/graph/ops/glsl/copy_offset.glsl | 6 +- .../vulkan/runtime/graph/ops/impl/Copy.cpp | 70 +++++++++---------- 3 files changed, 48 insertions(+), 50 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl index 862ccdad304..9fb251da771 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl @@ -18,17 +18,15 @@ ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} ${layout_declare_tensor(B, "r", "existing_out", DTYPE, STORAGE)} ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_ubo(B, "ivec4", "out_sizes")} -${layout_declare_ubo(B, "ivec4", "in_sizes")} - -layout(set = 0, binding = 5) uniform PRECISION restrict CopyArgs { +layout(push_constant) uniform restrict Block { + ivec4 out_sizes; + ivec4 in_sizes; // Operates on (x, y, z) logical extents. - ivec3 range; + ivec4 range; // Analogus to range variable in copy. It defines the # of channel being // copied. 
- int channel_range; - ivec3 dst_offset; - int dst_channel_offset; + // channel_range is stored in range.w + ivec4 dst_offset; int src_channel_offset; }; @@ -47,11 +45,11 @@ void main() { // Note: Unlike other shaders, the range is often not equal to the destination // texture extent. const ivec3 lpos = ivec3(gl_GlobalInvocationID); - if (any(greaterThanEqual(lpos, range))) { + if (any(greaterThanEqual(lpos, range.xyz))) { return; } - const ivec3 out_lpos = lpos + dst_offset; + const ivec3 out_lpos = lpos + dst_offset.xyz; const ivec4 out_tidx = lpos_to_tidx(out_lpos, out_sizes, out_axis_map.w, packed_dim); @@ -61,12 +59,12 @@ void main() { ivec4 in_tidx = out_tidx; for (int i=0; i<4; i++) { - in_tidx[packed_dim] = out_tidx[packed_dim] - dst_channel_offset + i; + in_tidx[packed_dim] = out_tidx[packed_dim] - dst_offset.w + i; // Handle the partial update for begining of channel in an existing tensor. // If the source channel index is below zero or exceeds the range, we skip // updating the element to avoid overwriting existing data. 
- if ((in_tidx[packed_dim] < 0) || (in_tidx[packed_dim] >= channel_range)) { + if ((in_tidx[packed_dim] < 0) || (in_tidx[packed_dim] >= range.w)) { continue; } diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl index 3dbc59e041a..a42a592762b 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl @@ -17,7 +17,11 @@ layout(std430) buffer; ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_ubo(B, "ivec3", "range", "ivec3", "src_offset", "ivec3", "dst_offset")} +layout(push_constant) uniform restrict Block { + ivec3 range; + ivec3 src_offset; + ivec3 dst_offset; +}; #include "indexing_utils.h" diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp index b98b2c504d4..6301f462536 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp @@ -33,16 +33,6 @@ void add_copy_offset_node( add_dtype_suffix(kernel_name, *t_out); add_storage_type_suffix(kernel_name, *t_out); - const struct Block final { - alignas(16) ivec3 range; - alignas(16) ivec3 src_offset; - alignas(16) ivec3 dst_offset; - } offset_params{ - range, - src_offset, - dst_offset, - }; - auto shader = VK_KERNEL_FROM_STR(kernel_name); graph.execute_nodes().emplace_back(new DispatchNode( @@ -56,11 +46,16 @@ void add_copy_offset_node( {in, vkapi::kRead}, }, // Parameter buffers - { - graph.create_params_buffer(offset_params), - }, + {}, // Specialization Constants - {graph.hashed_layout_of(out), graph.hashed_layout_of(in)})); + {graph.hashed_layout_of(out), graph.hashed_layout_of(in)}, + nullptr, + {}, + { + PushConstantDataInfo(&range, sizeof(utils::ivec4)), + PushConstantDataInfo(&src_offset, sizeof(utils::ivec4)), + PushConstantDataInfo(&dst_offset, sizeof(utils::ivec4)), + 
})); } void add_copy_channel_offset_node( @@ -128,28 +123,23 @@ void add_copy_channel_offset_node( // The shader combines the global invocation id and the dst_offset to get // the actual coordinate. - ivec3 dst_offset{ + const ivec3 dst_offset{ 0, 0, dst_first_z + batch_idx * utils::div_up_4(out_channels)}; - uvec3 global_size{ + const uvec3 global_size{ utils::safe_downcast<uint32_t>(dim_at<kWidth4D>(in_sizes)), utils::safe_downcast<uint32_t>(dim_at<kHeight4D>(in_sizes)), utils::safe_downcast<uint32_t>(dst_last_z - dst_first_z + 1)}; - uvec3 local_size = graph.create_local_wg_size(global_size); - - const struct Block final { - ivec3 range; - int32_t channel_range; - ivec3 dst_offset; - int32_t dst_channel_offset; - int32_t src_channel_offset; - } channel_offset_params{ - utils::make_ivec3(global_size), - channel_range, - dst_offset, - dst_channel_offset, - src_channel_offset, - }; + const uvec3 local_size = graph.create_local_wg_size(global_size); + + const utils::ivec4 range_params = { + static_cast<int>(global_size[0]), + static_cast<int>(global_size[1]), + static_cast<int>(global_size[2]), + channel_range}; + + const utils::ivec4 offset_params = { + dst_offset[0], dst_offset[1], dst_offset[2], dst_channel_offset}; auto shader = VK_KERNEL_FROM_STR(kernel_name); @@ -165,13 +155,19 @@ void add_copy_channel_offset_node( {in, vkapi::MemoryAccessType::READ}, }, // Parameter buffers - { - t_out->sizes_ubo(), - t_in->sizes_ubo(), - graph.create_params_buffer(channel_offset_params), - }, + {}, // Specialization Constants - {graph.hashed_layout_of(out), graph.hashed_layout_of(in)})); + {graph.hashed_layout_of(out), graph.hashed_layout_of(in)}, + nullptr, + {}, + {PushConstantDataInfo( + t_out->get_uniform_data(), api::vTensor::Attribute::SIZES), + PushConstantDataInfo( + t_in->get_uniform_data(), api::vTensor::Attribute::SIZES), + PushConstantDataInfo(&range_params, sizeof(range_params)), + PushConstantDataInfo(&offset_params, sizeof(offset_params)), + PushConstantDataInfo( + &src_channel_offset, sizeof(src_channel_offset))})); } } 
From 8c77b4de9f85a5190bfb4abef5ff1b548c992192 Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Tue, 10 Dec 2024 09:02:14 -0800 Subject: [PATCH 2/2] Update on "[ET-VK] Replace Uniform buffers with push constants for copy op" This diff replaces uniform buffers with push constants for copy op in the Vulkan backend of Executorch. The changes include updating the GLSL code to use push constants instead of uniform buffers and updating the C++ code to pass the sizes as push constants to the shader. Differential Revision: [D66890851](https://our.internmc.facebook.com/intern/diff/D66890851/) [ghstack-poisoned] --- .../vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl | 3 ++- backends/vulkan/runtime/graph/ops/impl/Copy.cpp | 6 ++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl index 9fb251da771..39aa9b11a0d 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl @@ -22,10 +22,11 @@ layout(push_constant) uniform restrict Block { ivec4 out_sizes; ivec4 in_sizes; // Operates on (x, y, z) logical extents. + // channel_range is stored in range.w ivec4 range; // Analogus to range variable in copy. It defines the # of channel being // copied. 
- // channel_range is stored in range.w + // dst channel offset is stored in dst_offset.w ivec4 dst_offset; int src_channel_offset; }; diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp index 6301f462536..a395b7e73e8 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp @@ -160,10 +160,8 @@ void add_copy_channel_offset_node( {graph.hashed_layout_of(out), graph.hashed_layout_of(in)}, nullptr, {}, - {PushConstantDataInfo( - t_out->get_uniform_data(), api::vTensor::Attribute::SIZES), - PushConstantDataInfo( - t_in->get_uniform_data(), api::vTensor::Attribute::SIZES), + {graph.sizes_pc_of(out), + graph.sizes_pc_of(in), PushConstantDataInfo(&range_params, sizeof(range_params)), PushConstantDataInfo(&offset_params, sizeof(offset_params)), PushConstantDataInfo(