From 73f82b96a438a0582a9dfefe28189128e8592120 Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Mon, 6 Jan 2025 09:51:31 -0800 Subject: [PATCH 1/3] [ET-VK] Adding a common utility function to calculate 3d output position based on unique index. This diff adds an indexing utils header file used in the Vulkan backend of ExecuTorch. The header file provides a function for converting a flat global index into a 3D u16 position (u16vec3) based on the given x and y sizes. Differential Revision: [D67821941](https://our.internmc.facebook.com/intern/diff/D67821941/) [ghstack-poisoned] --- .../runtime/graph/ops/glsl/conv2d_dw.glsl | 7 ++---- .../graph/ops/glsl/conv2d_dw_output_tile.glsl | 7 ++---- .../runtime/graph/ops/glsl/conv2d_pw.glsl | 10 +++------ .../graph/ops/glsl/indexing_utils_u16.h | 22 +++++++++++++++++++ 4 files changed, 29 insertions(+), 17 deletions(-) create mode 100644 backends/vulkan/runtime/graph/ops/glsl/indexing_utils_u16.h diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl index 43a4f7c8dc7..5d7c69ab654 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl @@ -14,7 +14,7 @@ #define op(X, A, B) ${OPERATOR} -#include "indexing_utils.h" +#include "indexing_utils_u16.h" layout(std430) buffer; @@ -35,10 +35,7 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; * output at a single output location. 
*/ void main() { - const ivec3 pos = ivec3( - gl_GlobalInvocationID.x % out_limits.x, - (gl_GlobalInvocationID.x / out_limits.x) % out_limits.y, - gl_GlobalInvocationID.x / (out_limits.x * out_limits.y)); + const ivec3 pos = idx_to_u16pos_x_wise(gl_GlobalInvocationID.x, out_limits.x, out_limits.y); if (any(greaterThanEqual(pos, out_limits))) { return; diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl index b2ae4953a78..93aa8e4d14c 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl @@ -18,7 +18,7 @@ #define op(X, A, B) ${OPERATOR} -#include "indexing_utils.h" +#include "indexing_utils_u16.h" layout(std430) buffer; @@ -45,10 +45,7 @@ void main() { // since work size is calculated by x * ((y + B_Y - 1) / B_Y) * z const uint out_limits_y_scaled = (out_limits.y + BATCH_SIZE_Y - 1) / BATCH_SIZE_Y; - u16vec3 pos = u16vec3( - gl_GlobalInvocationID.x % out_limits.x, - ((gl_GlobalInvocationID.x / out_limits.x) % out_limits_y_scaled), - gl_GlobalInvocationID.x / (out_limits.x * out_limits_y_scaled)); + u16vec3 pos = idx_to_u16pos_x_wise(gl_GlobalInvocationID.x, out_limits.x, out_limits_y_scaled); // scale pos.y by batch size, because that's the top pixel to be processed pos.y *= uint16_t(BATCH_SIZE_Y); diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl index 23ad912c11a..ad5d4adb134 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl @@ -16,7 +16,7 @@ #define op(X, A, B) ${OPERATOR} -#include "indexing_utils.h" +#include "indexing_utils_u16.h" layout(std430) buffer; @@ -43,13 +43,10 @@ shared u16vec2 pos_shared[gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroup * size is only 1x1, making it easier to re-use loaded texels from 
t_kernel. */ void main() { - const uvec2 out_limits_scaled = (out_limits.xy + TILE_SIZE - 1) / TILE_SIZE; + const ivec2 out_limits_scaled = (out_limits.xy + TILE_SIZE - 1) / TILE_SIZE; const uint shared_mem_stride = gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z; - const u16vec3 gpos = u16vec3( - gl_GlobalInvocationID.x % out_limits_scaled.x, - (gl_GlobalInvocationID.x / out_limits_scaled.x) % out_limits_scaled.y, - gl_GlobalInvocationID.x / (out_limits_scaled.x * out_limits_scaled.y)); + const u16vec3 gpos = idx_to_u16pos_x_wise(gl_GlobalInvocationID.x, out_limits_scaled.x, out_limits_scaled.y); // Output position for TILE_SIZE = 2 // +--------+--------+ @@ -98,7 +95,6 @@ void main() { const vec4 ktex_2 = texelFetchOffset(t_kernel, u16vec2(z, gpos.z), 0, u16vec2(2, 0)); const vec4 ktex_3 = texelFetchOffset(t_kernel, u16vec2(z, gpos.z), 0, u16vec2(3, 0)); - #pragma unroll for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) { const vec4 in_tex = texelFetch(t_in, u16vec3(ipos[i], z4), 0); diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils_u16.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils_u16.h new file mode 100644 index 00000000000..f66187dd466 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils_u16.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#ifndef INDEXING_UTILS_U16_H +#define INDEXING_UTILS_U16_H + +#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require + +u16vec3 idx_to_u16pos_x_wise(uint idx, int size_x, int size_y) { + const uint div_by_x = idx / size_x; + return u16vec3( + idx % size_x, + div_by_x % size_y, + div_by_x / size_y); +} + +#endif // INDEXING_UTILS_U16_H From a93a2f285636a8495fa4b00e935d0efbd8a2061c Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Mon, 6 Jan 2025 10:07:31 -0800 Subject: [PATCH 2/3] Update on "[ET-VK] Adding a common utility function to calculate 3d output position based on unique index." This diff adds an indexing utils header file used in the Vulkan backend of ExecuTorch. The header file provides a function for converting a flat global index into a 3D u16 position (u16vec3) based on the given x and y sizes. Differential Revision: [D67821941](https://our.internmc.facebook.com/intern/diff/D67821941/) [ghstack-poisoned] --- .../vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl index 93aa8e4d14c..20fb9374bec 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl @@ -43,7 +43,7 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { // y divided up by batch size is used to determine 3d position // since work size is calculated by x * ((y + B_Y - 1) / B_Y) * z - const uint out_limits_y_scaled = (out_limits.y + BATCH_SIZE_Y - 1) / BATCH_SIZE_Y; + const int out_limits_y_scaled = (out_limits.y + BATCH_SIZE_Y - 1) / BATCH_SIZE_Y; u16vec3 pos = idx_to_u16pos_x_wise(gl_GlobalInvocationID.x, out_limits.x, out_limits_y_scaled); From 563c3b14122cb5df2a387477e9dbc97878abe662 Mon Sep 17 00:00:00 2001 From: Vivek Trivedi 
<5340687+trivedivivek@users.noreply.github.com> Date: Mon, 6 Jan 2025 15:33:21 -0800 Subject: [PATCH 3/3] Update on "[ET-VK] Adding a common utility function to calculate 3d output position based on unique index." This diff adds an indexing utils header file used in the Vulkan backend of ExecuTorch. The header file provides a function for converting a flat global index into a 3D u16 position (u16vec3) based on the given x and y sizes. Differential Revision: [D67821941](https://our.internmc.facebook.com/intern/diff/D67821941/) [ghstack-poisoned] --- backends/vulkan/runtime/graph/ops/glsl/indexing_utils_u16.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils_u16.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils_u16.h index f66187dd466..6dc59b63039 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils_u16.h +++ b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils_u16.h @@ -13,10 +13,7 @@ u16vec3 idx_to_u16pos_x_wise(uint idx, int size_x, int size_y) { const uint div_by_x = idx / size_x; - return u16vec3( - idx % size_x, - div_by_x % size_y, - div_by_x / size_y); + return u16vec3(idx % size_x, div_by_x % size_y, div_by_x / size_y); } #endif // INDEXING_UTILS_U16_H