diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl index cd385718ce0..86e1e037261 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl @@ -14,8 +14,6 @@ #define TILE_SIZE ${TILE_SIZE} -#define STRIDE_EQ_DILATION ${STRIDE_EQ_DILATION} - #define BATCH_SIZE_X ${BATCH_SIZE_X} #define BATCH_SIZE_Y ${BATCH_SIZE_Y} @@ -43,7 +41,6 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; * output at a single output location. */ -#if STRIDE_EQ_DILATION void main() { // x and y are divided by batch size to determine 3d position // since work size is calculated by x * ((y + B_Y - 1) / B_Y) * z @@ -125,42 +122,3 @@ void main() { } } } - -#else -void main() { - const uint div_by_x = gl_GlobalInvocationID.x / out_limits.x; - const ivec3 pos = ivec3( - gl_GlobalInvocationID.x % out_limits.x, - div_by_x % out_limits.y, - div_by_x / out_limits.y); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - // Compute the index of the top-left element of the overlay region. Negative - // indices indicate that the top-left element is in a region added by padding. - const ivec2 ipos = pos.xy * stride - padding; - - // Compute the start and end of the input indices to load. Padding is assumed - // to be constant 0 padding, so any reads from the padding region is skipped. - const ivec2 start = ipos; - const ivec2 end = ipos + overlay_region.xy; - - VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0); - int kx = 0; - for (int y = start.y, i = 0; i < TILE_SIZE; y += dilation.y, i++) { - for (int x = start.x, j = 0; j < TILE_SIZE; x += dilation.x, j++) { - // The weight kernel was rearranged such that every NxN filter is - // flattened to fit in one row. Each filter was then stacked on top of - // each other vertically. 
- const vec4 in_texel = texelFetch(t_in, ivec3(x, y, pos.z), 0); - sum = fma(in_texel, texelFetch(t_kernel, ivec2(kx, pos.z), 0), sum); - kx++; - } - } - - imageStore(t_out, pos, op(sum, out_min, out_max)); -} - -#endif diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml index d3672f5ec2e..9cf6c22c6ca 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml @@ -12,7 +12,6 @@ conv2d_dw_output_tile: TILE_SIZE: 3 BATCH_SIZE_X: 4 BATCH_SIZE_Y: 2 - STRIDE_EQ_DILATION: 0 generate_variant_forall: DTYPE: - VALUE: half @@ -26,15 +25,3 @@ conv2d_dw_output_tile: - NAME: conv2d_dw_output_tile_5x5_clamp OPERATOR: clamp(X, A, B) TILE_SIZE: 5 - - NAME: conv2d_dw_sed_output_tile_3x3 - STRIDE_EQ_DILATION: 1 - - NAME: conv2d_dw_sed_output_tile_3x3_clamp - OPERATOR: clamp(X, A, B) - STRIDE_EQ_DILATION: 1 - - NAME: conv2d_dw_sed_output_tile_5x5 - TILE_SIZE: 5 - STRIDE_EQ_DILATION: 1 - - NAME: conv2d_dw_sed_output_tile_5x5_clamp - OPERATOR: clamp(X, A, B) - TILE_SIZE: 5 - STRIDE_EQ_DILATION: 1 diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl new file mode 100644 index 00000000000..d0fc6707bff --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl @@ -0,0 +1,74 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} + +#define TILE_SIZE ${TILE_SIZE} + +#define op(X, A, B) ${OPERATOR} + +#include "indexing_utils.h" + +layout(std430) buffer; + +${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")} +${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")} +${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")} +${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")} +${layout_declare_ubo(4, "ivec3", "out_limits")} +${layout_declare_ubo(5, "ivec4", "in_sizes")} +${layout_declare_ubo(6, "ivec2", "kernel_size", "ivec2", "stride", "ivec2", "padding", "ivec2", "dilation")} +${layout_declare_ubo(7, "ivec2", "overlay_region", "int", "in_group_size")} +${layout_declare_ubo(8, "float", "out_min", "float", "out_max")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +/* + * Computes a depthwise convolution. Each shader invocation calculates the + * output at a single output location. + */ + +void main() { + const uint div_by_x = gl_GlobalInvocationID.x / out_limits.x; + const ivec3 pos = ivec3( + gl_GlobalInvocationID.x % out_limits.x, + div_by_x % out_limits.y, + div_by_x / out_limits.y); + + if (any(greaterThanEqual(pos, out_limits))) { + return; + } + + // Compute the index of the top-left element of the overlay region. Negative + // indices indicate that the top-left element is in a region added by padding. + const ivec2 ipos = pos.xy * stride - padding; + + // Start/end of the input region to load. Padding is assumed to be constant 0, + // but reads from the padding region are not skipped here (`end` is unused). 
+ const ivec2 start = ipos; + const ivec2 end = ipos + overlay_region.xy; + + VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0); + int kx = 0; + for (int y = start.y, i = 0; i < TILE_SIZE; y += dilation.y, i++) { + for (int x = start.x, j = 0; j < TILE_SIZE; x += dilation.x, j++) { + // The weight kernel was rearranged such that every NxN filter is + // flattened to fit in one row. The filters were then stacked on top of + // each other vertically. + const vec4 in_texel = texelFetch(t_in, ivec3(x, y, pos.z), 0); + sum = fma(in_texel, texelFetch(t_kernel, ivec2(kx, pos.z), 0), sum); + kx++; + } + } + + imageStore(t_out, pos, op(sum, out_min, out_max)); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.yaml new file mode 100644 index 00000000000..f2ece8fa0f9 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.yaml @@ -0,0 +1,25 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +conv2d_dw_sned_output_tile: + parameter_names_with_default_values: + OPERATOR: X + NDIM: 3 + DTYPE: float + TILE_SIZE: 3 + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: conv2d_dw_sned_output_tile_3x3 + - NAME: conv2d_dw_sned_output_tile_3x3_clamp + OPERATOR: clamp(X, A, B) + - NAME: conv2d_dw_sned_output_tile_5x5 + TILE_SIZE: 5 + - NAME: conv2d_dw_sned_output_tile_5x5_clamp + OPERATOR: clamp(X, A, B) + TILE_SIZE: 5 diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp index a7c11cc8535..8c369914c1b 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp @@ -134,8 +134,8 @@ vkapi::ShaderInfo get_conv2d_shader( case Conv2dMethod::Depthwise: kernel_name = "conv2d_dw"; if (!prepack_weights) { - if (stride_equals_dilation) { - kernel_name += "_sed"; + if (!stride_equals_dilation) { + kernel_name += "_sned"; } const auto& weight_sizes = graph.get_tref(weight)->sizes; if (weight_sizes.at(2) == 3 && weight_sizes.at(3) == 3) {