From cb69a5b02562e02994415fd2289601e3965385ac Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Wed, 28 May 2025 07:09:34 -0700 Subject: [PATCH 1/9] [ET-VK] De vectorise conv2d pw shader to improve perf. Pull Request resolved: https://github.com/pytorch/executorch/pull/11108 This diff optimizes the performance of the `conv2d_pw` shader by de-vectorizing its implementation. * The original vectorized implementation of the `conv2d_pw` shader has been replaced with a de-vectorized approach to improve performance. * The `sum` array has been redefined to hold `float` values instead of `vec4` to accommodate the de-vectorized computation. These changes seem to allow shader compiler to better optimize operations within the shader hence improving perf. ghstack-source-id: 286652100 @exported-using-ghexport Differential Revision: [D75307267](https://our.internmc.facebook.com/intern/diff/D75307267/) --- .../runtime/graph/ops/glsl/conv2d_pw.glsl | 50 +++++++++++++------ 1 file changed, 36 insertions(+), 14 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl index 468b91f0535..0ee7b94a59a 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl @@ -88,10 +88,18 @@ void main() { ipos[i] = pos[i] * stride - padding; } - vec4 sum[TILE_SIZE_X * TILE_SIZE_Y]; - sum[0] = texelFetch(t_bias, ivec2(gpos.z, 0), 0); - for (int i = 1; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { - sum[i] = sum[0]; + // Final output array where each element is a tensor value. + // Tuple of consecutive 4 elements represents a single output texel. + float sum[TILE_SIZE_X * TILE_SIZE_Y * 4]; + + const vec4 bias = texelFetch(t_bias, ivec2(gpos.z, 0), 0); + + // Initialize the output array with the bias value + for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y * 4; i += 4) { + sum[i] = bias.x; + sum[i + 1] = bias.y; + sum[i + 2] = bias.z; + sum[i + 3] = bias.w; } int z4 = 0; @@ -100,14 +108,26 @@ void main() { // During prepacking, the weight tensor has been permuted so that the // channel (IC) dim is along the x-axis, and the batch (OC) dim is along // the z-axis. - const vec4 ktex_0 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(0, 0)); - const vec4 ktex_1 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(1, 0)); - const vec4 ktex_2 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(2, 0)); - const vec4 ktex_3 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(3, 0)); + float kernel_values[4 * 4]; // 4 channels, 4 elements per channel + + // Load kernel values from texels to array + for (int i = 0; i < 4; ++i) { + const vec4 k_tex = texelFetch(t_kernel, ivec2(z + i, gpos.z), 0); + kernel_values[i * 4 + 0] = k_tex.x; + kernel_values[i * 4 + 1] = k_tex.y; + kernel_values[i * 4 + 2] = k_tex.z; + kernel_values[i * 4 + 3] = k_tex.w; + } -#pragma unroll for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { const vec4 in_tex = texelFetch(t_in, ivec3(ipos[i], z4), 0); + // Load the input texel into an array + float tex_values[4]; + tex_values[0] = in_tex.x; + tex_values[1] = in_tex.y; + tex_values[2] = in_tex.z; + tex_values[3] = in_tex.w; + // For 2x2 tile size algorithm works as follows. // To explain the calculations below, the contents of one in_tex and the // group of 4 texels loaded from t_kernel are shown: @@ -141,10 +161,12 @@ void main() { // // which is what is expressed in the following calculations. 
This is done // for each output position. - sum[i] = fma(in_tex.xxxx, ktex_0, sum[i]); - sum[i] = fma(in_tex.yyyy, ktex_1, sum[i]); - sum[i] = fma(in_tex.zzzz, ktex_2, sum[i]); - sum[i] = fma(in_tex.wwww, ktex_3, sum[i]); + for (int j = 0; j < 4; ++j) { + sum[i * 4 + j] = tex_values[0] * kernel_values[0 + j] + sum[i * 4 + j]; + sum[i * 4 + j] = tex_values[1] * kernel_values[4 + j] + sum[i * 4 + j]; + sum[i * 4 + j] = tex_values[2] * kernel_values[8 + j] + sum[i * 4 + j]; + sum[i * 4 + j] = tex_values[3] * kernel_values[12 + j] + sum[i * 4 + j]; + } } } @@ -152,7 +174,7 @@ void main() { const uint index = (shared_mem_stride * i) + gl_LocalInvocationIndex; const ivec3 pos = pos_shared[offset_pos_index(index)]; if (all(lessThan(pos, out_limits.xyz))) { - imageStore(t_out, pos, op(sum[i], out_min, out_max)); + imageStore(t_out, pos, op(vec4(sum[i * 4], sum[i * 4 + 1], sum[i * 4 + 2], sum[i * 4 + 3]), out_min, out_max)); } } } From 83ad3d323964efebb627d25c63a4622c5a54e503 Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Wed, 28 May 2025 07:09:37 -0700 Subject: [PATCH 2/9] [ET-VK] Remove the use of shared memory in conv2d pw to improve perf. Pull Request resolved: https://github.com/pytorch/executorch/pull/11110 This diff removes the use of shared memory in the conv2d pw (pointwise) operation to improve performance. ghstack-source-id: 286652103 Differential Revision: [D75316188](https://our.internmc.facebook.com/intern/diff/D75316188/) --- .../runtime/graph/ops/glsl/conv2d_pw.glsl | 32 +++++++------------ 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl index 0ee7b94a59a..552037247fd 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl @@ -14,7 +14,6 @@ #define TILE_SIZE_X ${TILE_SIZE_X} #define TILE_SIZE_Y ${TILE_SIZE_Y} -#define LOCAL_WG_SIZE 64 #define op(X, A, B) ${OPERATOR} @@ -39,11 +38,6 @@ layout(push_constant) uniform restrict Block { layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; -// For performance improvement, reduce register usage by caching positions in shared memory. -// Offset index by 1 every 16 points to avoid bank access conflict. -#define offset_pos_index(index) (index + ((index) >> 4)) -shared ivec3 pos_shared[offset_pos_index(LOCAL_WG_SIZE * TILE_SIZE_X * TILE_SIZE_Y)]; - /* * Computes a 2D pointwise convolution of an NxN output tile. Calculating an * output tile for pointwise convolution is more efficient because the kernel @@ -51,7 +45,6 @@ shared ivec3 pos_shared[offset_pos_index(LOCAL_WG_SIZE * TILE_SIZE_X * TILE_SIZE */ void main() { const ivec2 out_limits_scaled = (out_limits.xy + ivec2(TILE_SIZE_X - 1, TILE_SIZE_Y - 1)) / ivec2(TILE_SIZE_X, TILE_SIZE_Y); - const uint shared_mem_stride = LOCAL_WG_SIZE; const uint div_by_x = gl_GlobalInvocationID.x / out_limits_scaled.x; const ivec3 gpos = ivec3( @@ -59,33 +52,32 @@ void main() { div_by_x % out_limits_scaled.y, div_by_x / out_limits_scaled.y); + // If the top left position is out of bounds, then this invocation will have + // no work to do. 
+ if (gpos.z >= out_limits.z) { + return; + } + // Output position for TILE_SIZE = 2 // +--------+--------+ // | pos[0] | pos[1] | // +--------+--------+ // | pos[2] | pos[3] | // +--------+--------+ - ivec2 pos[TILE_SIZE_X * TILE_SIZE_Y]; + ivec3 pos[TILE_SIZE_X * TILE_SIZE_Y]; for (int y = 0, i = 0; y < TILE_SIZE_Y; ++y) { for (int x = 0; x < TILE_SIZE_X; ++x) { - pos[i] = ivec2(gpos.x * TILE_SIZE_X + x, gpos.y * TILE_SIZE_Y + y); - pos_shared[offset_pos_index((shared_mem_stride * i) + gl_LocalInvocationIndex)] = ivec3(pos[i], gpos.z); + pos[i] = ivec3(gpos.x * TILE_SIZE_X + x, gpos.y * TILE_SIZE_Y + y, gpos.z); i++; } } - // If the top left position is out of bounds, then this invocation will have - // no work to do. - if (gpos.z >= out_limits.z) { - return; - } - // Compute the index of the input texture that needs to be loaded for each // output position. Note that negative indices can be produced indicating that // the top-left element is in a region added by padding. ivec2 ipos[TILE_SIZE_X * TILE_SIZE_Y]; for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { - ipos[i] = pos[i] * stride - padding; + ipos[i] = pos[i].xy * stride - padding; } // Final output array where each element is a tensor value. @@ -171,10 +163,8 @@ void main() { } for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { - const uint index = (shared_mem_stride * i) + gl_LocalInvocationIndex; - const ivec3 pos = pos_shared[offset_pos_index(index)]; - if (all(lessThan(pos, out_limits.xyz))) { - imageStore(t_out, pos, op(vec4(sum[i * 4], sum[i * 4 + 1], sum[i * 4 + 2], sum[i * 4 + 3]), out_min, out_max)); + if (all(lessThan(pos[i], out_limits.xyz))) { + imageStore(t_out, pos[i], op(vec4(sum[i * 4], sum[i * 4 + 1], sum[i * 4 + 2], sum[i * 4 + 3]), out_min, out_max)); } } } From 843f7d7be27659bbfb505dc9dbbd61710120e19f Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Wed, 28 May 2025 07:09:39 -0700 Subject: [PATCH 3/9] [ET-VK] Tuning conv 2d pw op tile size to improve perf. Pull Request resolved: https://github.com/pytorch/executorch/pull/11112 This diff tunes the tile size for the conv 2d pw op to improve performance. The changes include updating the `TILE_SIZE_X` and `TILE_SIZE_Y` values in the `conv2d_pw.yaml` files and modifying the `Convolution.cpp` files to adjust the image extents calculation. The `TILE_SIZE_X` value is changed from 2 to 1, and the `TILE_SIZE_Y` value is changed from 2 to 4. 
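As a sanity check on the new extents, here is a minimal C++ sketch of the global workgroup size calculation with the 1x4 tile; `div_up` is assumed to behave like the `utils::div_up` helper used in `Convolution.cpp`, and the sample extents are made up:

```cpp
#include <iostream>

// Round-up integer division, assumed to match utils::div_up.
constexpr unsigned div_up(unsigned n, unsigned d) {
  return (n + d - 1) / d;
}

int main() {
  // Hypothetical output image extents (width, height, depth).
  const unsigned extents[3] = {128, 130, 8};
  // With TILE_SIZE_X = 1 and TILE_SIZE_Y = 4, each invocation now covers a
  // 1x4 column of output texels: the X extent is unchanged and the Y extent
  // shrinks by 4x (rounded up).
  const unsigned wg[3] = {
      div_up(extents[0], 1u), div_up(extents[1], 4u), extents[2]};
  std::cout << wg[0] << " x " << wg[1] << " x " << wg[2] << "\n"; // 128 x 33 x 8
}
```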
ghstack-source-id: 286652109 Differential Revision: [D75317820](https://our.internmc.facebook.com/intern/diff/D75317820/) --- backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.yaml | 4 ++-- backends/vulkan/runtime/graph/ops/impl/Convolution.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.yaml index 1f0e8fb71be..d4cb69d7648 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.yaml @@ -9,8 +9,8 @@ conv2d_pw: OPERATOR: X NDIM: 3 DTYPE: float - TILE_SIZE_X: 2 - TILE_SIZE_Y: 2 + TILE_SIZE_X: 1 + TILE_SIZE_Y: 4 generate_variant_forall: DTYPE: - VALUE: half diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp index 381b9de0d6a..a0ac58ea9bc 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp @@ -305,8 +305,8 @@ utils::uvec3 create_conv2d_global_wg_size( if (method == Conv2dMethod::Pointwise) { const utils::uvec3 image_extents = graph.logical_limits_of(out); return { - utils::div_up(image_extents[0u], 2u), - utils::div_up(image_extents[1u], 2u), + utils::div_up(image_extents[0u], 1u), + utils::div_up(image_extents[1u], 4u), image_extents[2u]}; } else if (method == Conv2dMethod::Depthwise && stride_equals_dilation) { const utils::uvec3 image_extents = graph.create_global_wg_size(out); From c4780f3d1893cd810b9fcfab8e086ce9633c87d3 Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Wed, 28 May 2025 07:09:42 -0700 Subject: [PATCH 4/9] [ET-VK] Minor tuning for conv2d pw op to improve performance. Pull Request resolved: https://github.com/pytorch/executorch/pull/11113 The diff introduces minor tuning for the Conv2d pointwise (PW) operation in the Vulkan backend to improve performance. Conv 2d pw now issues a 2D dispatch instead of 1D, where dispatch axis y is now sized based on output texture's batch size. ghstack-source-id: 286652099 Differential Revision: [D75251145](https://our.internmc.facebook.com/intern/diff/D75251145/) --- backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl | 6 +++--- backends/vulkan/runtime/graph/ops/impl/Convolution.cpp | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl index 552037247fd..e44a41fc9bc 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl @@ -49,12 +49,12 @@ void main() { const uint div_by_x = gl_GlobalInvocationID.x / out_limits_scaled.x; const ivec3 gpos = ivec3( gl_GlobalInvocationID.x % out_limits_scaled.x, - div_by_x % out_limits_scaled.y, - div_by_x / out_limits_scaled.y); + div_by_x, + gl_GlobalInvocationID.y); // If the top left position is out of bounds, then this invocation will have // no work to do. 
- if (gpos.z >= out_limits.z) { + if (gpos.y >= out_limits_scaled.y || gpos.z >= out_limits.z) { return; } diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp index a0ac58ea9bc..5250c3baef2 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp @@ -398,8 +398,10 @@ void add_conv2d_node( utils::uvec3 wg_size = create_conv2d_global_wg_size( graph, method, out, weight_data, stride_equals_dilation); - if (method == Conv2dMethod::Pointwise || method == Conv2dMethod::Depthwise) { + if (method == Conv2dMethod::Depthwise) { wg_size = {wg_size[0] * wg_size[1] * wg_size[2], 1, 1}; + } else if (method == Conv2dMethod::Pointwise) { + wg_size = {wg_size[0] * wg_size[1], wg_size[2], 1}; } vkapi::ParamsBindList param_buffers; From 8b3eba79836670a0a96404f4ed8a56a872153b1f Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Wed, 28 May 2025 07:09:44 -0700 Subject: [PATCH 5/9] [ET-VK] De vectorise positions in conv2d pw shader to improve perf. Pull Request resolved: https://github.com/pytorch/executorch/pull/11122 This improves the performance of the conv2d pw shader by de-vectorizing position storage. The optimization involved replacing the `ivec3 pos` array with a plain `int pos` array to store the position values. The `x` and `y` coordinates are now stored in separate elements of the array instead of being stored together in an `ivec3`. This change allows for more efficient memory access and computation. ghstack-source-id: 286652097 @exported-using-ghexport Differential Revision: [D75335802](https://our.internmc.facebook.com/intern/diff/D75335802/) --- .../runtime/graph/ops/glsl/conv2d_pw.glsl | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl index e44a41fc9bc..ed07979afc0 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl @@ -64,10 +64,11 @@ void main() { // +--------+--------+ // | pos[2] | pos[3] | // +--------+--------+ - ivec3 pos[TILE_SIZE_X * TILE_SIZE_Y]; + int pos[TILE_SIZE_X * TILE_SIZE_Y * 2]; for (int y = 0, i = 0; y < TILE_SIZE_Y; ++y) { for (int x = 0; x < TILE_SIZE_X; ++x) { - pos[i] = ivec3(gpos.x * TILE_SIZE_X + x, gpos.y * TILE_SIZE_Y + y, gpos.z); + pos[i * 2] = gpos.x * TILE_SIZE_X + x; + pos[i * 2 + 1] = gpos.y * TILE_SIZE_Y + y; i++; } } @@ -75,9 +76,10 @@ void main() { // Compute the index of the input texture that needs to be loaded for each // output position. Note that negative indices can be produced indicating that // the top-left element is in a region added by padding. - ivec2 ipos[TILE_SIZE_X * TILE_SIZE_Y]; + int ipos[TILE_SIZE_X * TILE_SIZE_Y * 2]; for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { - ipos[i] = pos[i].xy * stride - padding; + ipos[i * 2] = pos[i * 2] * stride.x - padding.x; + ipos[i * 2 + 1] = pos[i * 2 + 1] * stride.y - padding.y; } // Final output array where each element is a tensor value. 
@@ -112,7 +114,7 @@ void main() { } for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { - const vec4 in_tex = texelFetch(t_in, ivec3(ipos[i], z4), 0); + const vec4 in_tex = texelFetch(t_in, ivec3(ipos[i * 2], ipos[i * 2 + 1], z4), 0); // Load the input texel into an array float tex_values[4]; tex_values[0] = in_tex.x; @@ -163,8 +165,9 @@ void main() { } for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { - if (all(lessThan(pos[i], out_limits.xyz))) { - imageStore(t_out, pos[i], op(vec4(sum[i * 4], sum[i * 4 + 1], sum[i * 4 + 2], sum[i * 4 + 3]), out_min, out_max)); + const ivec3 pos_l = ivec3(pos[i * 2], pos[i * 2 + 1], gpos.z); + if (all(lessThan(pos_l, out_limits.xyz))) { + imageStore(t_out, pos_l, op(vec4(sum[i * 4], sum[i * 4 + 1], sum[i * 4 + 2], sum[i * 4 + 3]), out_min, out_max)); } } } From 73481011b97fa9d7800814d5f95c9f47f153f7ac Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Wed, 28 May 2025 07:09:47 -0700 Subject: [PATCH 6/9] [ET-VK] Minor unroll tuning to improve conv2d pw perf. Pull Request resolved: https://github.com/pytorch/executorch/pull/11134 This diff provides a minor unroll tuning to improve the performance of the conv2d pointwise (pw) operation in the Executorch Vulkan backend. ghstack-source-id: 286652101 @exported-using-ghexport Differential Revision: [D75420510](https://our.internmc.facebook.com/intern/diff/D75420510/) --- backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl index ed07979afc0..c090c5d344f 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl @@ -38,6 +38,8 @@ layout(push_constant) uniform restrict Block { layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; +#extension GL_EXT_control_flow_attributes : require + /* * Computes a 2D pointwise convolution of an NxN output tile. Calculating an * output tile for pointwise convolution is more efficient because the kernel @@ -105,7 +107,7 @@ void main() { float kernel_values[4 * 4]; // 4 channels, 4 elements per channel // Load kernel values from texels to array - for (int i = 0; i < 4; ++i) { + [[unroll]] for (int i = 0; i < 4; ++i) { const vec4 k_tex = texelFetch(t_kernel, ivec2(z + i, gpos.z), 0); kernel_values[i * 4 + 0] = k_tex.x; kernel_values[i * 4 + 1] = k_tex.y; From e275147af8482b36dfc7aece327b961cbcdcfdc8 Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Wed, 28 May 2025 07:09:48 -0700 Subject: [PATCH 7/9] [ET-VK] Tuning local workgroup size calculation for conv2d pw to improve performance. Pull Request resolved: https://github.com/pytorch/executorch/pull/11135 This diff adjusts the local workgroup size (`local_wg_size`) based on batch count (stored in `wg_size[1]`), to improve conv2d pw performance. * If `wg_size[1]` is a multiple of 8, `local_wg_size_y` is set to 8. * If `wg_size[1]` is a multiple of 4, `local_wg_size_y` is set to 4. * If `wg_size[1]` is a multiple of 2, `local_wg_size_y` is set to 2. * Otherwise, we default to `local_wg_size_y` = 1. The dispatch size in 2 dimensions is then calculate based on `{64 / local_wg_size_y, local_wg_size_y, 1}`. 
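A self-contained C++ sketch of this selection logic, with hypothetical values for `wg_size[1]` to show the resulting local sizes (it mirrors the logic in the diff below rather than reproducing it exactly):

```cpp
#include <cstdio>
#include <initializer_list>

struct LocalWgSize { unsigned x, y, z; };

// Pick the largest local_wg_size_y in {8, 4, 2} that evenly divides the Y
// dispatch extent, defaulting to 1, and keep the total local size at 64.
LocalWgSize pick_local_wg_size(unsigned wg_size_y) {
  unsigned y = 1;
  if (wg_size_y % 8 == 0) {
    y = 8;
  } else if (wg_size_y % 4 == 0) {
    y = 4;
  } else if (wg_size_y % 2 == 0) {
    y = 2;
  }
  return {64 / y, y, 1};
}

int main() {
  for (unsigned n : {1u, 2u, 4u, 8u, 6u}) {
    const LocalWgSize l = pick_local_wg_size(n);
    std::printf("wg_size[1] = %u -> {%u, %u, %u}\n", n, l.x, l.y, l.z);
  }
}
```

Keeping the product fixed at 64 invocations means the X dimension absorbs whatever Y does not use, so the total local size stays the same regardless of the batch count.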
ghstack-source-id: 286652105 @exported-using-ghexport Differential Revision: [D75420517](https://our.internmc.facebook.com/intern/diff/D75420517/) --- .../runtime/graph/ops/impl/Convolution.cpp | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp index 5250c3baef2..ba1f50a23c1 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp @@ -404,6 +404,21 @@ void add_conv2d_node( wg_size = {wg_size[0] * wg_size[1], wg_size[2], 1}; } + utils::uvec3 local_wg_size; + if (method == Conv2dMethod::Pointwise) { + uint32_t local_wg_size_y = 1; + if (wg_size[1] % 8 == 0) { + local_wg_size_y = 8; + } else if (wg_size[1] % 4 == 0) { + local_wg_size_y = 4; + } else if (wg_size[1] % 2 == 0) { + local_wg_size_y = 2; + } + local_wg_size = {64 / local_wg_size_y, local_wg_size_y, 1}; + } else { + local_wg_size = graph.create_local_wg_size(wg_size); + } + vkapi::ParamsBindList param_buffers; std::vector push_constants; if (method == Conv2dMethod::Pointwise) { @@ -464,7 +479,7 @@ void add_conv2d_node( graph, shader, wg_size, - graph.create_local_wg_size(wg_size), + local_wg_size, // Inputs and Outputs {{out, vkapi::kWrite}, {{in, arg_weight, arg_bias}, vkapi::kRead}}, // Shader params buffers From fab78a41707c652339279e9bba611e30e0359f55 Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Wed, 28 May 2025 07:09:50 -0700 Subject: [PATCH 8/9] [ET-VK] De vectorise all vectors in conv2d pw shader to improve perf. Pull Request resolved: https://github.com/pytorch/executorch/pull/11136 This diff improves the performance of the conv2d pw shader by de-vectorizing all vectors. ghstack-source-id: 286652098 @exported-using-ghexport Differential Revision: [D75423245](https://our.internmc.facebook.com/intern/diff/D75423245/) --- .../runtime/graph/ops/glsl/conv2d_pw.glsl | 21 ++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl index c090c5d344f..c218b8ac8cc 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl @@ -46,17 +46,14 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; * size is only 1x1, making it easier to re-use loaded texels from t_kernel. */ void main() { - const ivec2 out_limits_scaled = (out_limits.xy + ivec2(TILE_SIZE_X - 1, TILE_SIZE_Y - 1)) / ivec2(TILE_SIZE_X, TILE_SIZE_Y); + const int out_limits_scaled[2] = {out_limits.x + (TILE_SIZE_X - 1) * TILE_SIZE_X, out_limits.y + (TILE_SIZE_Y - 1) * TILE_SIZE_Y}; - const uint div_by_x = gl_GlobalInvocationID.x / out_limits_scaled.x; - const ivec3 gpos = ivec3( - gl_GlobalInvocationID.x % out_limits_scaled.x, - div_by_x, - gl_GlobalInvocationID.y); + const int div_by_x = int(gl_GlobalInvocationID.x / out_limits_scaled[0]); + const int out_pos[3] = {int(gl_GlobalInvocationID.x % out_limits_scaled[0]), div_by_x, int(gl_GlobalInvocationID.y)}; // If the top left position is out of bounds, then this invocation will have // no work to do. 
- if (gpos.y >= out_limits_scaled.y || gpos.z >= out_limits.z) { + if (out_pos[1] >= out_limits_scaled[1] || out_pos[2] >= out_limits.z) { return; } @@ -69,8 +66,8 @@ void main() { int pos[TILE_SIZE_X * TILE_SIZE_Y * 2]; for (int y = 0, i = 0; y < TILE_SIZE_Y; ++y) { for (int x = 0; x < TILE_SIZE_X; ++x) { - pos[i * 2] = gpos.x * TILE_SIZE_X + x; - pos[i * 2 + 1] = gpos.y * TILE_SIZE_Y + y; + pos[i * 2] = out_pos[0] * TILE_SIZE_X + x; + pos[i * 2 + 1] = out_pos[1] * TILE_SIZE_Y + y; i++; } } @@ -88,7 +85,7 @@ void main() { // Tuple of consecutive 4 elements represents a single output texel. float sum[TILE_SIZE_X * TILE_SIZE_Y * 4]; - const vec4 bias = texelFetch(t_bias, ivec2(gpos.z, 0), 0); + const vec4 bias = texelFetch(t_bias, ivec2(out_pos[2], 0), 0); // Initialize the output array with the bias value for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y * 4; i += 4) { @@ -108,7 +105,7 @@ void main() { // Load kernel values from texels to array [[unroll]] for (int i = 0; i < 4; ++i) { - const vec4 k_tex = texelFetch(t_kernel, ivec2(z + i, gpos.z), 0); + const vec4 k_tex = texelFetch(t_kernel, ivec2(z + i, out_pos[2]), 0); kernel_values[i * 4 + 0] = k_tex.x; kernel_values[i * 4 + 1] = k_tex.y; kernel_values[i * 4 + 2] = k_tex.z; @@ -167,7 +164,7 @@ void main() { } for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { - const ivec3 pos_l = ivec3(pos[i * 2], pos[i * 2 + 1], gpos.z); + const ivec3 pos_l = ivec3(pos[i * 2], pos[i * 2 + 1], out_pos[2]); if (all(lessThan(pos_l, out_limits.xyz))) { imageStore(t_out, pos_l, op(vec4(sum[i * 4], sum[i * 4 + 1], sum[i * 4 + 2], sum[i * 4 + 3]), out_min, out_max)); } From a5ac0562fb3365ace4d2542d4a33d001a3f98131 Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Wed, 28 May 2025 07:09:52 -0700 Subject: [PATCH 9/9] [ET-VK] Creating specialized version of conv2d pw shader for X and Y stride = 1 and padding = 0. Pull Request resolved: https://github.com/pytorch/executorch/pull/11137 This diff creates a specialized version of the conv2d pw shader for X and Y stride equals 1 and padding equals 0. * It adds a new file `conv2d_pw_s1p0.glsl`, which implements the conv2d pw shader for X and Y stride equals 1 and padding equals 0. * It adds a new file `conv2d_pw_s1p0.yaml`, which defines the parameters and shader variants for the specialized conv2d pw shader. * The file `Convolution.cpp` is modified to add a new parameter `stride_1_padding_0` to the `conv2d` function, which enables the use of the specialized shader. ghstack-source-id: 286652107 @exported-using-ghexport Differential Revision: [D75423931](https://our.internmc.facebook.com/intern/diff/D75423931/) --- .../graph/ops/glsl/conv2d_pw_s1p0.glsl | 163 ++++++++++++++++++ .../graph/ops/glsl/conv2d_pw_s1p0.yaml | 21 +++ .../runtime/graph/ops/impl/Convolution.cpp | 12 +- 3 files changed, 193 insertions(+), 3 deletions(-) create mode 100644 backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.glsl create mode 100644 backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.yaml diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.glsl new file mode 100644 index 00000000000..36c7a61eb3d --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.glsl @@ -0,0 +1,163 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} + +#define TILE_SIZE_X ${TILE_SIZE_X} +#define TILE_SIZE_Y ${TILE_SIZE_Y} + +#define op(X, A, B) ${OPERATOR} + +#include "indexing_utils.h" + +layout(std430) buffer; + +${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")} +${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")} +${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")} +${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")} + +layout(push_constant) uniform restrict Block { + ivec4 out_limits; + ivec2 stride; + ivec2 padding; + int in_group_size; + int dummy_padding; + float out_min; + float out_max; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +#extension GL_EXT_control_flow_attributes : require + +/* + * Computes a 2D pointwise convolution of an NxN output tile. Calculating an + * output tile for pointwise convolution is more efficient because the kernel + * size is only 1x1, making it easier to re-use loaded texels from t_kernel. + */ +void main() { + const int out_limits_scaled[2] = {out_limits.x + (TILE_SIZE_X - 1) * TILE_SIZE_X, out_limits.y + (TILE_SIZE_Y - 1) * TILE_SIZE_Y}; + + const int div_by_x = int(gl_GlobalInvocationID.x / out_limits_scaled[0]); + const int out_pos[3] = {int(gl_GlobalInvocationID.x % out_limits_scaled[0]), div_by_x, int(gl_GlobalInvocationID.y)}; + + // If the top left position is out of bounds, then this invocation will have + // no work to do. + if (out_pos[1] >= out_limits_scaled[1] || out_pos[2] >= out_limits.z) { + return; + } + + // Output position for TILE_SIZE = 2 + // +--------+--------+ + // | pos[0] | pos[1] | + // +--------+--------+ + // | pos[2] | pos[3] | + // +--------+--------+ + int pos[TILE_SIZE_X * TILE_SIZE_Y * 2]; + for (int y = 0, i = 0; y < TILE_SIZE_Y; ++y) { + for (int x = 0; x < TILE_SIZE_X; ++x) { + pos[i * 2] = out_pos[0] * TILE_SIZE_X + x; + pos[i * 2 + 1] = out_pos[1] * TILE_SIZE_Y + y; + i++; + } + } + + // Final output array where each element is a tensor value. + // Tuple of consecutive 4 elements represents a single output texel. + float sum[TILE_SIZE_X * TILE_SIZE_Y * 4]; + + const vec4 bias = texelFetch(t_bias, ivec2(out_pos[2], 0), 0); + + // Initialize the output array with the bias value + for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y * 4; i += 4) { + sum[i] = bias.x; + sum[i + 1] = bias.y; + sum[i + 2] = bias.z; + sum[i + 3] = bias.w; + } + + int z4 = 0; + // Since the kernel is 1x1, we only have to loop over the depth dimension. + for (int z = 0; z < in_group_size; z += 4, ++z4) { + // During prepacking, the weight tensor has been permuted so that the + // channel (IC) dim is along the x-axis, and the batch (OC) dim is along + // the z-axis. 
+ float kernel_values[4 * 4]; // 4 channels, 4 elements per channel + + // Load kernel values from texels to array + [[unroll]] for (int i = 0; i < 4; ++i) { + const vec4 k_tex = texelFetch(t_kernel, ivec2(z + i, out_pos[2]), 0); + kernel_values[i * 4 + 0] = k_tex.x; + kernel_values[i * 4 + 1] = k_tex.y; + kernel_values[i * 4 + 2] = k_tex.z; + kernel_values[i * 4 + 3] = k_tex.w; + } + + for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { + const vec4 in_tex = texelFetch(t_in, ivec3(pos[i * 2], pos[i * 2 + 1], z4), 0); + // Load the input texel into an array + float tex_values[4]; + tex_values[0] = in_tex.x; + tex_values[1] = in_tex.y; + tex_values[2] = in_tex.z; + tex_values[3] = in_tex.w; + + // For 2x2 tile size algorithm works as follows. + // To explain the calculations below, the contents of one in_tex and the + // group of 4 texels loaded from t_kernel are shown: + // + // in_tex t_kernel + // -x-> ---x---> + // +---+ +----+----+----+----+ + // ^ | w | ^ | D0 | D1 | D2 | D3 | + // | +---+ | +----+----+----+----+ + // | | z | | | C0 | C1 | C2 | C3 | + // z +---+ z +----+----+----+----+ + // | | y | | | B0 | B2 | B2 | B3 | + // | +---+ | +----+----+----+----+ + // | x | | A0 | A1 | A2 | A3 | + // +---+ +----+----+----+----+ + // + // In the t_kernel graphic, cells sharing the same letter are from + // the same batch/output channel index, and the number denotes a unique + // channel index. To calculate the output texel, the following + // calculation is performed: + // + // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ + // | x | | D0 | | y | | D1 | | z | | D2 | | w | | D3 | + // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ + // | x | | C0 | | y | | C1 | | z | | C2 | | w | | C3 | + // +---+X+----+ + +---+X+----+ + +---+X+----+ + +---+X+----+ + // | x | | B0 | | y | | B1 | | z | | B2 | | w | | B3 | + // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ + // | x | | A0 | | y | | A1 | | z | | A2 | | w | | A3 | + // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ + // + // which is what is expressed in the following calculations. This is done + // for each output position. + for (int j = 0; j < 4; ++j) { + sum[i * 4 + j] = tex_values[0] * kernel_values[0 + j] + sum[i * 4 + j]; + sum[i * 4 + j] = tex_values[1] * kernel_values[4 + j] + sum[i * 4 + j]; + sum[i * 4 + j] = tex_values[2] * kernel_values[8 + j] + sum[i * 4 + j]; + sum[i * 4 + j] = tex_values[3] * kernel_values[12 + j] + sum[i * 4 + j]; + } + } + } + + for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { + const ivec3 pos_l = ivec3(pos[i * 2], pos[i * 2 + 1], out_pos[2]); + if (all(lessThan(pos_l, out_limits.xyz))) { + imageStore(t_out, pos_l, op(vec4(sum[i * 4], sum[i * 4 + 1], sum[i * 4 + 2], sum[i * 4 + 3]), out_min, out_max)); + } + } +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.yaml new file mode 100644 index 00000000000..ebfee11c405 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.yaml @@ -0,0 +1,21 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +conv2d_pw_s1p0: + parameter_names_with_default_values: + OPERATOR: X + NDIM: 3 + DTYPE: float + TILE_SIZE_X: 1 + TILE_SIZE_Y: 4 + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: conv2d_pw_s1p0 + - NAME: conv2d_pw_s1p0_clamp + OPERATOR: clamp(X, A, B) diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp index ba1f50a23c1..fbe4a61befc 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp @@ -127,7 +127,8 @@ vkapi::ShaderInfo get_conv2d_shader( const Conv2dMethod method, const ValueRef weight, const bool clamp_out = false, - const bool stride_equals_dilation = false) { + const bool stride_equals_dilation = false, + const bool stride_1_padding_0 = false) { std::string kernel_name; kernel_name.reserve(kShaderNameReserve); switch (method) { @@ -150,7 +151,7 @@ vkapi::ShaderInfo get_conv2d_shader( if (prepack_weights) { kernel_name = "conv2d"; } else { - kernel_name = "conv2d_pw"; + kernel_name = stride_1_padding_0 ? "conv2d_pw_s1p0" : "conv2d_pw"; } break; case Conv2dMethod::SlidingWindow: @@ -382,6 +383,10 @@ void add_conv2d_node( (kernel_params.stride[0] == kernel_params.dilation[0] && kernel_params.stride[1] == kernel_params.dilation[1]); + const bool stride_1_padding_0 = + (kernel_params.stride[0] == 1 && kernel_params.stride[1] == 1 && + kernel_params.padding[0] == 0 && kernel_params.padding[1] == 0); + OutputParams out_params = {out_min_val, out_max_val}; check_conv2d_params(kernel_params, transposed_val); @@ -393,7 +398,8 @@ void add_conv2d_node( method, weight_data, clamp_out, - stride_equals_dilation); + stride_equals_dilation, + stride_1_padding_0); utils::uvec3 wg_size = create_conv2d_global_wg_size( graph, method, out, weight_data, stride_equals_dilation);
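Taken together, the series replaces every `vec4` in the shader's hot loop with scalar arrays. To make that transformation concrete, here is a CPU-side C++ reference of the de-vectorized accumulation shared by `conv2d_pw` and `conv2d_pw_s1p0`; the flat array layout and the function signature are simplifications for illustration, not the packed texture layout the shaders actually read:

```cpp
#include <array>
#include <cstddef>
#include <vector>

// Scalar reference for one group of 4 output channels (oc4_index) of a 1x1
// pointwise convolution. Channels are grouped by 4 to mimic texel packing:
//   input:   [IC4][H][W][4]   flattened
//   weights: [OC4][IC4][4][4] flattened (kernel_values layout in the shader)
std::vector<float> pointwise_conv_reference(
    const std::vector<float>& input,
    const std::vector<float>& weights,
    const std::array<float, 4>& bias,
    int ic4, int h, int w, int oc4_index) {
  std::vector<float> out(static_cast<std::size_t>(h) * w * 4);
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      // Initialize the four output channels of this texel with the bias.
      float sum[4] = {bias[0], bias[1], bias[2], bias[3]};
      // 1x1 kernel: only loop over the (grouped-by-4) input channels.
      for (int z4 = 0; z4 < ic4; ++z4) {
        const float* in_tex =
            &input[((static_cast<std::size_t>(z4) * h + y) * w + x) * 4];
        const float* k =
            &weights[(static_cast<std::size_t>(oc4_index) * ic4 + z4) * 16];
        // De-vectorized equivalent of the original vectorized form:
        //   sum = fma(in_tex.xxxx, ktex_0, sum); ... fma(in_tex.wwww, ktex_3, sum);
        for (int j = 0; j < 4; ++j) {
          sum[j] += in_tex[0] * k[0 + j];
          sum[j] += in_tex[1] * k[4 + j];
          sum[j] += in_tex[2] * k[8 + j];
          sum[j] += in_tex[3] * k[12 + j];
        }
      }
      for (int j = 0; j < 4; ++j) {
        out[(static_cast<std::size_t>(y) * w + x) * 4 + j] = sum[j];
      }
    }
  }
  return out;
}
```

On the GPU the same arithmetic runs per invocation over a `TILE_SIZE_X` x `TILE_SIZE_Y` tile of output texels; the scalarization is what the commit messages credit with letting the shader compiler schedule the multiply-adds more effectively.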