Commit 3d6f836

Update base for Update on "[ET-VK] Changing all conv 2d pw ints from uint16 to int since it slightly improves perf."

This diff changes all integers in the conv 2d pw op shader from uint16 to int in the Vulkan backend of ExecuTorch. The change is made to improve performance, since the shader does not appear to be register bound.

Differential Revision: [D67906023](https://our.internmc.facebook.com/intern/diff/D67906023/)

[ghstack-poisoned]
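For context, a minimal standalone GLSL sketch of the pattern this stack toggles (illustrative only, not the production shader; the variable names are made up): with GL_EXT_shader_explicit_arithmetic_types_int16 enabled, texel positions can be carried in 16-bit vectors (u16vec2/u16vec3) instead of the default 32-bit ivec2/ivec3.

#version 450
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require

layout(local_size_x = 64) in;

void main() {
  // 16-bit position type (the u16 form of the shader) next to the default 32-bit form.
  const u16vec2 pos16 = u16vec2(gl_GlobalInvocationID.xy);
  const ivec2 pos32 = ivec2(pos16);
}

The diffs below add the u16 form to the depthwise and pointwise conv shaders; the commit message above argues the pw shader is not register bound, so the narrower types bring little benefit there.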
1 parent: eeffa26

3 files changed: +41 −20 lines
backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl
Lines changed: 2 additions & 2 deletions

@@ -14,7 +14,7 @@
 
 #define op(X, A, B) ${OPERATOR}
 
-#include "indexing_utils.h"
+#include "indexing_utils_u16.h"
 
 layout(std430) buffer;
 
@@ -35,7 +35,7 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
  * output at a single output location.
  */
 void main() {
-  const ivec3 pos = idx_to_ipos_x_wise(gl_GlobalInvocationID.x, out_limits.x, out_limits.y);
+  const ivec3 pos = idx_to_u16pos_x_wise(gl_GlobalInvocationID.x, out_limits.x, out_limits.y);
 
   if (any(greaterThanEqual(pos, out_limits))) {
     return;
backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl
Lines changed: 20 additions & 18 deletions

@@ -16,7 +16,7 @@
 
 #define op(X, A, B) ${OPERATOR}
 
-#include "indexing_utils.h"
+#include "indexing_utils_u16.h"
 
 layout(std430) buffer;
 
@@ -32,8 +32,10 @@ ${layout_declare_ubo(8, "float", "out_min", "float", "out_max")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
+#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
+
 // shared memory to hold calculated positions, this would reduce register usage thus improving performance.
-shared ivec2 pos_shared[gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z * TILE_SIZE * TILE_SIZE];
+shared u16vec2 pos_shared[gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z * TILE_SIZE * TILE_SIZE];
 
 /*
  * Computes a 2D pointwise convolution of an NxN output tile. Calculating an
@@ -44,18 +46,18 @@ void main() {
   const ivec2 out_limits_scaled = (out_limits.xy + TILE_SIZE - 1) / TILE_SIZE;
   const uint shared_mem_stride = gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z;
 
-  const ivec3 gpos = idx_to_ipos_x_wise(gl_GlobalInvocationID.x, out_limits_scaled.x, out_limits_scaled.y);
+  const u16vec3 gpos = idx_to_u16pos_x_wise(gl_GlobalInvocationID.x, out_limits_scaled.x, out_limits_scaled.y);
 
   // Output position for TILE_SIZE = 2
   // +--------+--------+
   // | pos[0] | pos[1] |
   // +--------+--------+
   // | pos[2] | pos[3] |
   // +--------+--------+
-  ivec2 pos[TILE_SIZE * TILE_SIZE];
+  u16vec2 pos[TILE_SIZE * TILE_SIZE];
   for (int y = 0, i = 0; y < TILE_SIZE; ++y) {
     for (int x = 0; x < TILE_SIZE; ++x) {
-      pos[i] = ivec2(
+      pos[i] = u16vec2(
          gpos.x * TILE_SIZE + x, gpos.y * TILE_SIZE + y);
      pos_shared[(shared_mem_stride * i) + gl_LocalInvocationIndex] = pos[i];
      i++;
@@ -64,38 +66,38 @@ void main() {
 
   // If the top left position is out of bounds, then this invocation will have
   // no work to do.
-  if (any(greaterThanEqual(ivec3(pos[0], gpos.z), out_limits))) {
+  if (any(greaterThanEqual(u16vec3(pos[0], gpos.z), out_limits))) {
     return;
   }
 
   // Compute the index of the input texture that needs to be loaded for each
   // output position. Note that negative indices can be produced indicating that
   // the top-left element is in a region added by padding.
-  ivec2 ipos[TILE_SIZE * TILE_SIZE];
+  u16vec2 ipos[TILE_SIZE * TILE_SIZE];
   for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) {
-    ipos[i] = pos[i] * stride - padding;
+    ipos[i] = pos[i] * u16vec2(stride) - u16vec2(padding);
   }
 
   vec4 sum[TILE_SIZE * TILE_SIZE];
-  sum[0] = texelFetch(t_bias, ivec2(gpos.z, 0), 0);
+  sum[0] = texelFetch(t_bias, u16vec2(gpos.z, 0), 0);
   for (int i = 1; i < TILE_SIZE * TILE_SIZE; ++i) {
     sum[i] = sum[0];
   }
 
   int z4 = 0;
   // Since the kernel is 1x1, we only have to loop over the depth dimension.
-  for (int z = 0; z < in_group_size; z += 4, ++z4) {
+  for (uint16_t z = uint16_t(0); z < uint16_t(in_group_size); z += uint16_t(4), ++z4) {
     // During prepacking, the weight tensor has been permuted so that the
     // channel (IC) dim is along the x-axis, and the batch (OC) dim is along
     // the z-axis.
-    const vec4 ktex_0 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(0, 0));
-    const vec4 ktex_1 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(1, 0));
-    const vec4 ktex_2 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(2, 0));
-    const vec4 ktex_3 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(3, 0));
+    const vec4 ktex_0 = texelFetchOffset(t_kernel, u16vec2(z, gpos.z), 0, u16vec2(0, 0));
+    const vec4 ktex_1 = texelFetchOffset(t_kernel, u16vec2(z, gpos.z), 0, u16vec2(1, 0));
+    const vec4 ktex_2 = texelFetchOffset(t_kernel, u16vec2(z, gpos.z), 0, u16vec2(2, 0));
+    const vec4 ktex_3 = texelFetchOffset(t_kernel, u16vec2(z, gpos.z), 0, u16vec2(3, 0));
 
 #pragma unroll
     for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) {
-      const vec4 in_tex = texelFetch(t_in, ivec3(ipos[i], z4), 0);
+      const vec4 in_tex = texelFetch(t_in, u16vec3(ipos[i], z4), 0);
       // For 2x2 tile size algorithm works as follows.
       // To explain the calculations below, the contents of one in_tex and the
       // group of 4 texels loaded from t_kernel are shown:
@@ -137,9 +139,9 @@ void main() {
   }
 
   for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) {
-    const ivec2 pos = pos_shared[(shared_mem_stride * i) + gl_LocalInvocationIndex];
-    if (all(lessThan(ivec3(pos, gpos.z), out_limits))) {
-      imageStore(t_out, ivec3(pos, gpos.z), op(sum[i], out_min, out_max));
+    const u16vec2 pos = pos_shared[(shared_mem_stride * i) + gl_LocalInvocationIndex];
+    if (all(lessThan(u16vec3(pos, gpos.z), out_limits))) {
+      imageStore(t_out, u16vec3(pos, gpos.z), op(sum[i], out_min, out_max));
     }
   }
 }
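To make the tiling in this shader concrete, here is a small standalone sketch (not part of the diff; the gpos value is made up) that reproduces the output-tile position math for TILE_SIZE = 2. One invocation covers a 2x2 block of adjacent output pixels at a single channel index gpos.z.

#version 450

// Standalone sketch of the tile-position math from conv2d_pw.glsl, with a
// hard-coded example group position instead of idx_to_u16pos_x_wise().
#define TILE_SIZE 2

layout(local_size_x = 1) in;

void main() {
  const ivec3 gpos = ivec3(3, 5, 0);  // hypothetical example value
  ivec2 pos[TILE_SIZE * TILE_SIZE];
  for (int y = 0, i = 0; y < TILE_SIZE; ++y) {
    for (int x = 0; x < TILE_SIZE; ++x, ++i) {
      pos[i] = ivec2(gpos.x * TILE_SIZE + x, gpos.y * TILE_SIZE + y);
    }
  }
  // Resulting tile: pos[0] = (6, 10), pos[1] = (7, 10),
  //                 pos[2] = (6, 11), pos[3] = (7, 11).
}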
backends/vulkan/runtime/graph/ops/glsl/indexing_utils_u16.h
Lines changed: 19 additions & 0 deletions (new file)

@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#ifndef INDEXING_UTILS_U16_H
+#define INDEXING_UTILS_U16_H
+
+#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
+
+u16vec3 idx_to_u16pos_x_wise(uint idx, int size_x, int size_y) {
+  const uint div_by_x = idx / size_x;
+  return u16vec3(idx % size_x, div_by_x % size_y, div_by_x / size_y);
+}
+
+#endif // INDEXING_UTILS_U16_H
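As a rough usage sketch (standalone, not part of the diff; the 8x6 plane size is a made-up example), idx_to_u16pos_x_wise unpacks a flat invocation index into a 3D texel position, x-fastest, then y, then z:

#version 450
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require

// Helper copied from indexing_utils_u16.h above so the sketch compiles on its own.
u16vec3 idx_to_u16pos_x_wise(uint idx, int size_x, int size_y) {
  const uint div_by_x = idx / size_x;
  return u16vec3(idx % size_x, div_by_x % size_y, div_by_x / size_y);
}

layout(local_size_x = 64) in;

void main() {
  // For a hypothetical 8x6 output plane: idx = 13 gives
  // x = 13 % 8 = 5, y = (13 / 8) % 6 = 1, z = (13 / 8) / 6 = 0, i.e. (5, 1, 0).
  const u16vec3 pos = idx_to_u16pos_x_wise(gl_GlobalInvocationID.x, 8, 6);
}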
