diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl index 4b18abbb1c5..1a2c257baec 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl @@ -42,47 +42,25 @@ const lowp int packed_dim = unhash_packed_dim(t_layout); * Extends sign of int8 */ int extend_sign(int x) { - if (x >> 7 == 1) { - return x | 0xFFFFFF00; - } - return x; + return x | mix(0, 0xFFFFFF00, x >= (1 << 7)); } ivec4 read_texel(ivec4 tidx) { - ivec4 tidx_to_use = tidx; - ivec4 sizes_to_use = sizes; - int packed_dim_to_use = packed_dim; - if (transpose_hw == 1) { - sizes_to_use.xy = sizes_to_use.yx; - tidx_to_use.xy = tidx.yx; - - if (packed_dim == 1) { - packed_dim_to_use = 0; - } - if (packed_dim == 0) { - packed_dim_to_use = 1; - } - } + const ivec4 tidx_to_use = ivec4(mix(tidx.xy, tidx.yx, bvec2(transpose_hw == 1)), tidx.zw); + const ivec4 sizes_to_use = ivec4(mix(sizes.xy, sizes.yx, bvec2(transpose_hw == 1)), sizes.zw); + const int packed_dim_to_use = mix(packed_dim, packed_dim ^ transpose_hw, packed_dim < 2); const ivec4 buf_indices = tidx_to_nchwi( tidx_to_use, sizes_to_use, packed_dim_to_use); - int shift = (1 << 8) - 1; - ivec4 masks; - // Masks used to unpack 4x 8-bit values from a 32 bit integer. Note that - // little endian is assumed, as most processors use little endian. Thus the - // most significant bytes correspond to the "latter" packed values. 
- masks.x = shift << (8 * (buf_indices.x % 4)); - masks.y = shift << (8 * (buf_indices.y % 4)); - masks.z = shift << (8 * (buf_indices.z % 4)); - masks.w = shift << (8 * (buf_indices.w % 4)); + const int mask = (1 << 8) - 1; ivec4 out_tex = ivec4(0); [[unroll]] for (int i = 0; i < 4; ++i) { if (tidx[packed_dim] + i < sizes[packed_dim]) { - int in_texel = nchw_in[buf_indices[i] / 4]; - int extracted_val = (in_texel & masks[i]) >> (8 * (buf_indices[i] % 4)); + const int in_texel = nchw_in[buf_indices[i] >> 2]; + int extracted_val = (in_texel >> (8 * (buf_indices[i] & 3))) & mask; extracted_val = extend_sign(extracted_val); out_tex[i] = extracted_val; } diff --git a/backends/vulkan/runtime/graph/ops/glsl/unary_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/unary_op.glsl index b645905939f..bb7ce482a7a 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/unary_op.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/unary_op.glsl @@ -25,12 +25,15 @@ layout(std430) buffer; ${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} ${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} + +layout(push_constant) uniform restrict Block { $if STORAGE == "buffer": - ${layout_declare_ubo(2, "int", "numel")} + int numel; $else: - ${layout_declare_ubo(2, "ivec3", "out_limits")} -${layout_declare_ubo(3, "float", "minimum")} -${layout_declare_ubo(4, "float", "maximum")} + ivec4 out_limits; +float minimum; +float maximum; +}; layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -53,7 +56,7 @@ void main() { void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - if (any(greaterThanEqual(pos, out_limits))) { + if (any(greaterThanEqual(pos, out_limits.xyz))) { return; } diff --git a/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp index 518148f12eb..ea8daf2ea64 100644 --- a/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp @@ -43,15 +43,7 @@ 
void add_unary_op_node( add_dtype_suffix(kernel_name, graph.dtype_of(out)); add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - vkapi::ParamsBindList ubos({}); - if (graph.is_buffer_storage(out)) { - ubos.append({graph.numel_ubo(out)}); - } else { - ubos.append({graph.logical_limits_ubo(out)}); - } - ubos.append( - {graph.create_params_buffer(min), graph.create_params_buffer(max)}); - + const utils::vec2 min_max = {min, max}; graph.execute_nodes().emplace_back(new DynamicDispatchNode( graph, VK_KERNEL_FROM_STR(kernel_name), @@ -60,9 +52,13 @@ void add_unary_op_node( // Inputs and Outputs {{out, vkapi::kWrite}, {in, vkapi::kRead}}, // Shader params buffers - ubos, - // Push Constants {}, + // Push Constants: numel or logical limits, then {min, max} + { + graph.is_buffer_storage(out) ? graph.numel_pc_of(out) + : graph.logical_limits_pc_of(out), + PushConstantDataInfo(&min_max, sizeof(min_max)), + }, // Specialization Constants {}, // Resize Args