
#define op(X, A, B) ${OPERATOR}

- #include "indexing_utils.h"
+ #include "indexing_utils_u16.h"

layout(std430) buffer;

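Note: the `${...}` tokens in this file are placeholders that the shader codegen fills in before compilation, as the `${layout_declare_ubo(...)}` line in the next hunk also shows. As a hedged illustration only (the real value comes from the generator's configuration, not from this diff), the `out_min`/`out_max` uniforms suggest a clamp-style activation such as:

```glsl
// Hypothetical substitution for ${OPERATOR}; not taken from this diff.
#define op(X, A, B) clamp(X, A, B)
```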
@@ -32,8 +32,10 @@ ${layout_declare_ubo(8, "float", "out_min", "float", "out_max")}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

+ #extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
+
// Shared memory to hold calculated positions; this reduces register usage and improves performance.
- shared ivec2 pos_shared[gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z * TILE_SIZE * TILE_SIZE];
+ shared u16vec2 pos_shared[gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z * TILE_SIZE * TILE_SIZE];

/*
 * Computes a 2D pointwise convolution of an NxN output tile. Calculating an
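Note on the sizing above: `pos_shared` holds one entry per output position per invocation, i.e. `gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z * TILE_SIZE * TILE_SIZE` elements. Assuming, say, a 64-invocation workgroup and TILE_SIZE = 2 (the local size is set via specialization constants, so this is an example, not a value from the diff), that is 256 entries, and shrinking each entry from ivec2 (8 bytes) to u16vec2 (4 bytes) halves the array from 2 KiB to 1 KiB.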
@@ -44,18 +46,18 @@ void main() {
  const ivec2 out_limits_scaled = (out_limits.xy + TILE_SIZE - 1) / TILE_SIZE;
  const uint shared_mem_stride = gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z;

-  const ivec3 gpos = idx_to_ipos_x_wise(gl_GlobalInvocationID.x, out_limits_scaled.x, out_limits_scaled.y);
+  const u16vec3 gpos = idx_to_u16pos_x_wise(gl_GlobalInvocationID.x, out_limits_scaled.x, out_limits_scaled.y);

  // Output position for TILE_SIZE = 2
  // +--------+--------+
  // | pos[0] | pos[1] |
  // +--------+--------+
  // | pos[2] | pos[3] |
  // +--------+--------+
-  ivec2 pos[TILE_SIZE * TILE_SIZE];
+  u16vec2 pos[TILE_SIZE * TILE_SIZE];
  for (int y = 0, i = 0; y < TILE_SIZE; ++y) {
    for (int x = 0; x < TILE_SIZE; ++x) {
-      pos[i] = ivec2(
+      pos[i] = u16vec2(
          gpos.x * TILE_SIZE + x, gpos.y * TILE_SIZE + y);
      pos_shared[(shared_mem_stride * i) + gl_LocalInvocationIndex] = pos[i];
      i++;
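The `idx_to_u16pos_x_wise` helper used above is declared in `indexing_utils_u16.h`, which this diff does not include. A minimal sketch of what such a helper could look like, assuming it mirrors the x-major flattening of the `idx_to_ipos_x_wise` variant it replaces:

```glsl
// Hypothetical sketch; the actual implementation lives in indexing_utils_u16.h.
// Unflattens a linear index into a 3D position, filling x fastest, then y, then z.
u16vec3 idx_to_u16pos_x_wise(uint idx, int size_x, int size_y) {
  const uint quot = idx / size_x;  // number of complete rows before this index
  return u16vec3(
      idx % size_x,   // x: offset within the row
      quot % size_y,  // y: row within the plane
      quot / size_y); // z: plane index
}
```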
@@ -64,38 +66,38 @@ void main() {

  // If the top left position is out of bounds, then this invocation will have
  // no work to do.
-  if (any(greaterThanEqual(ivec3(pos[0], gpos.z), out_limits))) {
+  if (any(greaterThanEqual(u16vec3(pos[0], gpos.z), out_limits))) {
    return;
  }

  // Compute the index of the input texture that needs to be loaded for each
  // output position. Note that negative indices can be produced indicating that
  // the top-left element is in a region added by padding.
-  ivec2 ipos[TILE_SIZE * TILE_SIZE];
+  u16vec2 ipos[TILE_SIZE * TILE_SIZE];
  for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) {
-    ipos[i] = pos[i] * stride - padding;
+    ipos[i] = pos[i] * u16vec2(stride) - u16vec2(padding);
  }
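One caveat: the comment above still says negative indices can be produced, but u16vec2 is unsigned, so `pos[i] * u16vec2(stride) - u16vec2(padding)` now wraps instead of going negative (uint16_t(0) - uint16_t(1) is 65535). Presumably this is safe here because pointwise 1x1 convolutions are normally run with zero padding; that is an inference from the kernel shape, not something this diff states.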

  vec4 sum[TILE_SIZE * TILE_SIZE];
-  sum[0] = texelFetch(t_bias, ivec2(gpos.z, 0), 0);
+  sum[0] = texelFetch(t_bias, u16vec2(gpos.z, 0), 0);
  for (int i = 1; i < TILE_SIZE * TILE_SIZE; ++i) {
    sum[i] = sum[0];
  }

  int z4 = 0;
  // Since the kernel is 1x1, we only have to loop over the depth dimension.
-  for (int z = 0; z < in_group_size; z += 4, ++z4) {
+  for (uint16_t z = uint16_t(0); z < uint16_t(in_group_size); z += uint16_t(4), ++z4) {
    // During prepacking, the weight tensor has been permuted so that the
    // channel (IC) dim is along the x-axis, and the batch (OC) dim is along
    // the z-axis.
-    const vec4 ktex_0 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(0, 0));
-    const vec4 ktex_1 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(1, 0));
-    const vec4 ktex_2 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(2, 0));
-    const vec4 ktex_3 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(3, 0));
+    const vec4 ktex_0 = texelFetchOffset(t_kernel, u16vec2(z, gpos.z), 0, u16vec2(0, 0));
+    const vec4 ktex_1 = texelFetchOffset(t_kernel, u16vec2(z, gpos.z), 0, u16vec2(1, 0));
+    const vec4 ktex_2 = texelFetchOffset(t_kernel, u16vec2(z, gpos.z), 0, u16vec2(2, 0));
+    const vec4 ktex_3 = texelFetchOffset(t_kernel, u16vec2(z, gpos.z), 0, u16vec2(3, 0));

#pragma unroll
    for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) {
-      const vec4 in_tex = texelFetch(t_in, ivec3(ipos[i], z4), 0);
+      const vec4 in_tex = texelFetch(t_in, u16vec3(ipos[i], z4), 0);
      // For a 2x2 tile size, the algorithm works as follows.
      // To explain the calculations below, the contents of one in_tex and the
      // group of 4 texels loaded from t_kernel are shown:
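The explicit `uint16_t(...)` casts in the rewritten loop header are needed because GL_EXT_shader_explicit_arithmetic_types_int16 only adds widening implicit conversions; 32-bit int values do not implicitly narrow to uint16_t. A minimal sketch of the rule:

```glsl
uint16_t z = uint16_t(0); // explicit construction required
z += uint16_t(4);         // OK: both operands are uint16_t
// z += 4;                // would not compile: no implicit int -> uint16_t narrowing
uint widened = z;         // OK: widening uint16_t -> uint is implicit
```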
@@ -137,9 +139,9 @@ void main() {
  }

  for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) {
-    const ivec2 pos = pos_shared[(shared_mem_stride * i) + gl_LocalInvocationIndex];
-    if (all(lessThan(ivec3(pos, gpos.z), out_limits))) {
-      imageStore(t_out, ivec3(pos, gpos.z), op(sum[i], out_min, out_max));
+    const u16vec2 pos = pos_shared[(shared_mem_stride * i) + gl_LocalInvocationIndex];
+    if (all(lessThan(u16vec3(pos, gpos.z), out_limits))) {
+      imageStore(t_out, u16vec3(pos, gpos.z), op(sum[i], out_min, out_max));
    }
  }
}
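A closing note on the u16vec coordinates passed to texelFetch, texelFetchOffset, and imageStore: those built-ins are declared with ivec parameters, so these calls rely on the extension's implicit widening conversions (uint16_t to int preserves every value) rather than on dedicated 16-bit overloads; this is an assumption about how the target drivers resolve the calls, not something the diff spells out. If a driver rejects them, wrapping each coordinate back in an ivec2/ivec3 constructor restores the old behavior while keeping the u16vec2 storage savings in pos_shared.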