Skip to content

Commit d1ce0bd

Browse files
committed
Update base for Update on "[ET-VK] Using push constants for conv2d dw."
This diff uses push constants for the depthwise (dw) conv2d operator in ExecuTorch's Vulkan backend. This optimization improves memory usage. Differential Revision: [D68493849](https://our.internmc.facebook.com/intern/diff/D68493849/) [ghstack-poisoned]
1 parent af2897d commit d1ce0bd

File tree

2 files changed

+7
-9
lines changed

2 files changed

+7
-9
lines changed

backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020

2121
#define BATCH_SIZE_Y ${BATCH_SIZE_Y}
2222

23+
#define LOCAL_WG_SIZE 64
24+
2325
#define op(X, A, B) ${OPERATOR}
2426

2527
#include "indexing_utils.h"
@@ -38,12 +40,10 @@ ${layout_declare_ubo(8, "float", "out_min", "float", "out_max")}
3840

3941
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
4042

41-
// macro to offset shared memory access index. Padding position index by 1 offset per 16 positions avoidd bank access conflict and thus improves performance.
43+
// For performance improvement, reduce register usage by caching positions in shared memory.
44+
// Offset index by 1 every 16 points to avoid bank access conflict.
4245
#define offset_pos_index(index) (index + ((index) >> 4))
43-
44-
// shared memory to hold calculated positions, this would reduce register usage thus improving performance.
45-
// 64 is the number of threads in the local wg
46-
shared ivec3 pos_shared[offset_pos_index(64)];
46+
shared ivec3 pos_shared[offset_pos_index(LOCAL_WG_SIZE)];
4747

4848
/*
4949
* Computes a depthwise convolution. Each shader invocation calculates the

backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,9 @@ layout(push_constant) uniform restrict Block {
4343

4444
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
4545

46-
// macro to offset shared memory access index. Padding position index by 1 offset per 16 positions avoidd bank access conflict and thus improves performance.
46+
// For performance improvement, reduce register usage by caching positions in shared memory.
47+
// Offset index by 1 every 16 points to avoid bank access conflict.
4748
#define offset_pos_index(index) (index + ((index) >> 4))
48-
49-
// shared memory to hold calculated positions, this would reduce register usage thus improving performance.
50-
// 64 is the number of threads in the local wg
5149
shared ivec3 pos_shared[offset_pos_index(LOCAL_WG_SIZE * TILE_SIZE_X * TILE_SIZE_Y)];
5250

5351
/*

0 commit comments

Comments
 (0)