Skip to content

Commit 8288924

Browse files
committed
Update on "[ET-VK] Using shared memory to save position in conv2d dw output op."
This diff introduces a change to conv2d dw op to save output positions in shared memory, which reduces register usage and improves performance. Differential Revision: [D68400890](https://our.internmc.facebook.com/intern/diff/D68400890/) [ghstack-poisoned]
2 parents 64d164a + 67af6ff commit 8288924

File tree

2 files changed

+7
-9
lines changed

2 files changed

+7
-9
lines changed

backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020

2121
#define BATCH_SIZE_Y ${BATCH_SIZE_Y}
2222

23+
#define LOCAL_WG_SIZE 64
24+
2325
#define op(X, A, B) ${OPERATOR}
2426

2527
#include "indexing_utils.h"
@@ -38,12 +40,10 @@ ${layout_declare_ubo(8, "float", "out_min", "float", "out_max")}
3840

3941
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
4042

41-
// macro to offset shared memory access index. Padding position index by 1 offset per 16 positions avoids bank access conflict and thus improves performance.
43+
// For performance improvement, reduce register usage by caching positions in shared memory.
44+
// Offset index by 1 every 16 points to avoid bank access conflict.
4245
// Maps a position index to its padded slot: index plus one extra slot for every 16 entries (index >> 4).
// Every use of the argument is parenthesized so compound expressions (e.g. a | b) expand correctly.
#define offset_pos_index(index) ((index) + ((index) >> 4))
43-
44-
// shared memory to hold calculated positions, this would reduce register usage thus improving performance.
45-
// 64 is the number of threads in the local wg
46-
shared ivec3 pos_shared[offset_pos_index(64)];
46+
shared ivec3 pos_shared[offset_pos_index(LOCAL_WG_SIZE)];
4747

4848
/*
4949
* Computes a depthwise convolution. Each shader invocation calculates the

backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,9 @@ layout(push_constant) uniform restrict Block {
4343

4444
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
4545

46-
// macro to offset shared memory access index. Padding position index by 1 offset per 16 positions avoids bank access conflict and thus improves performance.
46+
// For performance improvement, reduce register usage by caching positions in shared memory.
47+
// Offset index by 1 every 16 points to avoid bank access conflict.
4748
// Maps a position index to its padded slot: index plus one extra slot for every 16 entries (index >> 4).
// Every use of the argument is parenthesized so compound expressions (e.g. a | b) expand correctly.
#define offset_pos_index(index) ((index) + ((index) >> 4))
48-
49-
// shared memory to hold calculated positions, this would reduce register usage thus improving performance.
50-
// 64 is the number of threads in the local wg
5149
shared ivec3 pos_shared[offset_pos_index(LOCAL_WG_SIZE * TILE_SIZE_X * TILE_SIZE_Y)];
5250

5351
/*

0 commit comments

Comments
 (0)