Skip to content

Commit 1c5ae12

Browse files
committed
Update base for Update on "[ET-VK] Using TmpTensor for width packed versions of q_linear op shader to reduce memory usage."
This diff introduces the use of temporary tensors to reduce memory usage in the width packed versions of the q_linear op shader. Differential Revision: [D68561647](https://our.internmc.facebook.com/intern/diff/D68561647/) [ghstack-poisoned]
1 parent 37cbf1b commit 1c5ae12

File tree

3 files changed

+9
-11
lines changed

3 files changed

+9
-11
lines changed

backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020

2121
#define BATCH_SIZE_Y ${BATCH_SIZE_Y}
2222

23+
// Number of invocations assumed per local work group; it sizes the shared
// position cache (pos_shared) declared below.
// NOTE(review): the actual local size is supplied through specialization
// constants (local_size_*_id) — confirm the dispatch really uses 64.
#define LOCAL_WG_SIZE 64
24+
2325
#define op(X, A, B) ${OPERATOR}
2426

2527
#include "indexing_utils.h"
@@ -47,12 +49,10 @@ layout(push_constant) uniform restrict Block {
4749

4850
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
4951

50-
// macro to offset shared memory access index. Padding position index by 1 offset per 16 positions avoidd bank access conflict and thus improves performance.
52+
// For performance improvement, reduce register usage by caching positions in shared memory.
53+
// Offset index by 1 every 16 points to avoid bank access conflict.
5154
// Maps a logical index to its padded slot: one extra slot is skipped for every
// 16 entries. The argument is parenthesized at each use so that expression
// arguments (e.g. `a << b` or a ternary) are evaluated as a whole before the
// padding is added.
#define offset_pos_index(index) ((index) + ((index) >> 4))
52-
53-
// shared memory to hold calculated positions, this would reduce register usage thus improving performance.
54-
// 64 is the number of threads in the local wg
55-
shared ivec3 pos_shared[offset_pos_index(64)];
55+
shared ivec3 pos_shared[offset_pos_index(LOCAL_WG_SIZE)];
5656

5757
/*
5858
* Computes a depthwise convolution. Each shader invocation calculates the

backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,9 @@ layout(push_constant) uniform restrict Block {
4343

4444
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
4545

46-
// macro to offset shared memory access index. Padding position index by 1 offset per 16 positions avoidd bank access conflict and thus improves performance.
46+
// For performance improvement, reduce register usage by caching positions in shared memory.
47+
// Offset index by 1 every 16 points to avoid bank access conflict.
4748
// Maps a logical index to its padded slot: one extra slot is skipped for every
// 16 entries. The argument is parenthesized at each use so that compound
// arguments (products, shifts, ternaries) are evaluated as a whole before the
// padding is added.
#define offset_pos_index(index) ((index) + ((index) >> 4))
48-
49-
// shared memory to hold calculated positions, this would reduce register usage thus improving performance.
50-
// 64 is the number of threads in the local wg
5149
shared ivec3 pos_shared[offset_pos_index(LOCAL_WG_SIZE * TILE_SIZE_X * TILE_SIZE_Y)];
5250

5351
/*

backends/vulkan/runtime/graph/ops/impl/Convolution.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -452,13 +452,13 @@ void add_conv2d_node(
452452
{{out, vkapi::MemoryAccessType::WRITE},
453453
{{in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}},
454454
// Shader params buffers
455-
std::move(param_buffers),
455+
param_buffers,
456456
// Specialization Constants
457457
{},
458458
// Resizing Logic
459459
resize_conv2d_node,
460460
{weight_data, stride, padding, dilation, transposed, output_padding},
461-
std::move(push_constants)));
461+
push_constants));
462462
}
463463

464464
void add_conv1d_node(

0 commit comments

Comments
 (0)