diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl index 862ccdad304..39aa9b11a0d 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl @@ -18,17 +18,16 @@ ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} ${layout_declare_tensor(B, "r", "existing_out", DTYPE, STORAGE)} ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_ubo(B, "ivec4", "out_sizes")} -${layout_declare_ubo(B, "ivec4", "in_sizes")} - -layout(set = 0, binding = 5) uniform PRECISION restrict CopyArgs { +layout(push_constant) uniform restrict Block { + ivec4 out_sizes; + ivec4 in_sizes; // Operates on (x, y, z) logical extents. - ivec3 range; + // channel_range is stored in range.w + ivec4 range; // Analogus to range variable in copy. It defines the # of channel being // copied. - int channel_range; - ivec3 dst_offset; - int dst_channel_offset; + // dst channel offset is stored in dst_offset.w + ivec4 dst_offset; int src_channel_offset; }; @@ -47,11 +46,11 @@ void main() { // Note: Unlike other shaders, the range is often not equal to the destination // texture extent. const ivec3 lpos = ivec3(gl_GlobalInvocationID); - if (any(greaterThanEqual(lpos, range))) { + if (any(greaterThanEqual(lpos, range.xyz))) { return; } - const ivec3 out_lpos = lpos + dst_offset; + const ivec3 out_lpos = lpos + dst_offset.xyz; const ivec4 out_tidx = lpos_to_tidx(out_lpos, out_sizes, out_axis_map.w, packed_dim); @@ -61,12 +60,12 @@ void main() { ivec4 in_tidx = out_tidx; for (int i=0; i<4; i++) { - in_tidx[packed_dim] = out_tidx[packed_dim] - dst_channel_offset + i; + in_tidx[packed_dim] = out_tidx[packed_dim] - dst_offset.w + i; // Handle the partial update for begining of channel in an existing tensor. // If the source channel index is below zero or exceeds the range, we skip // updating the element to avoid overwriting existing data. - if ((in_tidx[packed_dim] < 0) || (in_tidx[packed_dim] >= channel_range)) { + if ((in_tidx[packed_dim] < 0) || (in_tidx[packed_dim] >= range.w)) { continue; } diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl index 3dbc59e041a..a42a592762b 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl @@ -17,7 +17,11 @@ layout(std430) buffer; ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_ubo(B, "ivec3", "range", "ivec3", "src_offset", "ivec3", "dst_offset")} +layout(push_constant) uniform restrict Block { + ivec3 range; + ivec3 src_offset; + ivec3 dst_offset; +}; #include "indexing_utils.h" diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp index b98b2c504d4..69378524afb 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp @@ -33,16 +33,6 @@ void add_copy_offset_node( add_dtype_suffix(kernel_name, *t_out); add_storage_type_suffix(kernel_name, *t_out); - const struct Block final { - alignas(16) ivec3 range; - alignas(16) ivec3 src_offset; - alignas(16) ivec3 dst_offset; - } offset_params{ - range, - src_offset, - dst_offset, - }; - auto shader = VK_KERNEL_FROM_STR(kernel_name); graph.execute_nodes().emplace_back(new DispatchNode( @@ -56,11 +46,18 @@ void add_copy_offset_node( {in, vkapi::kRead}, }, // Parameter buffers - { - graph.create_params_buffer(offset_params), - }, + {}, // Specialization Constants - {graph.hashed_layout_of(out), graph.hashed_layout_of(in)})); + {graph.hashed_layout_of(out), graph.hashed_layout_of(in)}, + nullptr, + {}, + { + PushConstantDataInfo(&range, sizeof(range), sizeof(utils::ivec4)), + PushConstantDataInfo( + &src_offset, sizeof(src_offset), sizeof(utils::ivec4)), + PushConstantDataInfo( + &dst_offset, sizeof(dst_offset), sizeof(utils::ivec4)), + })); } void add_copy_channel_offset_node( @@ -128,28 +125,23 @@ void add_copy_channel_offset_node( // The shader combines the global invocation id and the dst_offset to get // the actual coordinate. - ivec3 dst_offset{ + const ivec3 dst_offset{ 0, 0, dst_first_z + batch_idx * utils::div_up_4(out_channels)}; - uvec3 global_size{ + const uvec3 global_size{ utils::safe_downcast(dim_at(in_sizes)), utils::safe_downcast(dim_at(in_sizes)), utils::safe_downcast(dst_last_z - dst_first_z + 1)}; - uvec3 local_size = graph.create_local_wg_size(global_size); - - const struct Block final { - ivec3 range; - int32_t channel_range; - ivec3 dst_offset; - int32_t dst_channel_offset; - int32_t src_channel_offset; - } channel_offset_params{ - utils::make_ivec3(global_size), - channel_range, - dst_offset, - dst_channel_offset, - src_channel_offset, - }; + const uvec3 local_size = graph.create_local_wg_size(global_size); + + const utils::ivec4 range_params = { + static_cast(global_size[0]), + static_cast(global_size[1]), + static_cast(global_size[2]), + channel_range}; + + const utils::ivec4 offset_params = { + dst_offset[0], dst_offset[1], dst_offset[2], dst_channel_offset}; auto shader = VK_KERNEL_FROM_STR(kernel_name); @@ -165,13 +157,17 @@ void add_copy_channel_offset_node( {in, vkapi::MemoryAccessType::READ}, }, // Parameter buffers - { - t_out->sizes_ubo(), - t_in->sizes_ubo(), - graph.create_params_buffer(channel_offset_params), - }, + {}, // Specialization Constants - {graph.hashed_layout_of(out), graph.hashed_layout_of(in)})); + {graph.hashed_layout_of(out), graph.hashed_layout_of(in)}, + nullptr, + {}, + {graph.sizes_pc_of(out), + graph.sizes_pc_of(in), + PushConstantDataInfo(&range_params, sizeof(range_params)), + PushConstantDataInfo(&offset_params, sizeof(offset_params)), + PushConstantDataInfo( + &src_channel_offset, sizeof(src_channel_offset))})); } }