Skip to content

Commit 0a2c0a7

Browse files
committed
Update on "[ET-VK] Using push constants for conv2d dw."
This diff switches the depthwise (dw) conv2d operator in ExecuTorch's Vulkan backend to use push constants. This optimization improves memory usage. Differential Revision: [D68493849](https://our.internmc.facebook.com/intern/diff/D68493849/) [ghstack-poisoned]
2 parents 90063b7 + d1ce0bd commit 0a2c0a7

File tree

3 files changed

+9
-11
lines changed

3 files changed

+9
-11
lines changed

backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020

2121
#define BATCH_SIZE_Y ${BATCH_SIZE_Y}
2222

23+
#define LOCAL_WG_SIZE 64
24+
2325
#define op(X, A, B) ${OPERATOR}
2426

2527
#include "indexing_utils.h"
@@ -47,12 +49,10 @@ layout(push_constant) uniform restrict Block {
4749

4850
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
4951

50-
// macro to offset shared memory access index. Padding position index by 1 offset per 16 positions avoids bank access conflict and thus improves performance.
52+
// For performance improvement, reduce register usage by caching positions in shared memory.
53+
// Offset index by 1 every 16 points to avoid bank access conflict.
5154
// Every use of `index` is parenthesized so that compound arguments (e.g. ones
// using |, &, shifts or ?:) are evaluated as a unit before the padding is added.
#define offset_pos_index(index) ((index) + ((index) >> 4))
52-
53-
// shared memory to hold calculated positions, this would reduce register usage thus improving performance.
54-
// 64 is the number of threads in the local wg
55-
shared ivec3 pos_shared[offset_pos_index(64)];
55+
shared ivec3 pos_shared[offset_pos_index(LOCAL_WG_SIZE)];
5656

5757
/*
5858
* Computes a depthwise convolution. Each shader invocation calculates the

backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,9 @@ layout(push_constant) uniform restrict Block {
4343

4444
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
4545

46-
// macro to offset shared memory access index. Padding position index by 1 offset per 16 positions avoids bank access conflict and thus improves performance.
46+
// For performance improvement, reduce register usage by caching positions in shared memory.
47+
// Offset index by 1 every 16 points to avoid bank access conflict.
4748
// Every use of `index` is parenthesized so that compound arguments (e.g. ones
// using |, &, shifts or ?:) are evaluated as a unit before the padding is added.
#define offset_pos_index(index) ((index) + ((index) >> 4))
48-
49-
// shared memory to hold calculated positions, this would reduce register usage thus improving performance.
50-
// 64 is the number of threads in the local wg
5149
shared ivec3 pos_shared[offset_pos_index(LOCAL_WG_SIZE * TILE_SIZE_X * TILE_SIZE_Y)];
5250

5351
/*

backends/vulkan/runtime/graph/ops/impl/Convolution.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -452,13 +452,13 @@ void add_conv2d_node(
452452
{{out, vkapi::MemoryAccessType::WRITE},
453453
{{in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}},
454454
// Shader params buffers
455-
std::move(param_buffers),
455+
param_buffers,
456456
// Specialization Constants
457457
{},
458458
// Resizing Logic
459459
resize_conv2d_node,
460460
{weight_data, stride, padding, dilation, transposed, output_padding},
461-
std::move(push_constants)));
461+
push_constants));
462462
}
463463

464464
void add_conv1d_node(

0 commit comments

Comments
 (0)