@@ -42,47 +42,25 @@ const lowp int packed_dim = unhash_packed_dim(t_layout);
4242 * Extends sign of int8
4343 */
int extend_sign(int x) {
  // Smallest value that has bit 7 (the int8 sign bit) set.
  const int sign_threshold = 1 << 7;
  // Bits 8..31 are OR-ed in when x is at or above the threshold; otherwise
  // nothing is added and x is returned unchanged. For x in [0, 255] the
  // result is the two's-complement int8 value stored in the low byte.
  const int upper_bits = (x >= sign_threshold) ? 0xFFFFFF00 : 0;
  return x | upper_bits;
}
5047
5148ivec4 read_texel(ivec4 tidx) {
52- ivec4 tidx_to_use = tidx;
53- ivec4 sizes_to_use = sizes;
54- int packed_dim_to_use = packed_dim;
55- if (transpose_hw == 1 ) {
56- sizes_to_use.xy = sizes_to_use.yx;
57- tidx_to_use.xy = tidx.yx;
58-
59- if (packed_dim == 1 ) {
60- packed_dim_to_use = 0 ;
61- }
62- if (packed_dim == 0 ) {
63- packed_dim_to_use = 1 ;
64- }
65- }
49+ const ivec4 tidx_to_use = ivec4 (mix (tidx.xy, tidx.yx, bvec2 (transpose_hw == 1 )), tidx.zw);
50+ const ivec4 sizes_to_use = ivec4 (mix (sizes.xy, sizes.yx, bvec2 (transpose_hw == 1 )), sizes.zw);
51+ const int packed_dim_to_use = mix (packed_dim, packed_dim ^ transpose_hw, packed_dim < 2 );
6652
6753 const ivec4 buf_indices = tidx_to_nchwi(
6854 tidx_to_use, sizes_to_use, packed_dim_to_use);
6955
70- int shift = (1 << 8 ) - 1 ;
71- ivec4 masks;
72- // Masks used to unpack 4x 8-bit values from a 32 bit integer. Note that
73- // little endian is assumed, as most processors use little endian. Thus the
74- // most significant bytes correspond to the "latter" packed values.
75- masks.x = shift << (8 * (buf_indices.x % 4 ));
76- masks.y = shift << (8 * (buf_indices.y % 4 ));
77- masks.z = shift << (8 * (buf_indices.z % 4 ));
78- masks.w = shift << (8 * (buf_indices.w % 4 ));
56+ const int mask = (1 << 8 ) - 1 ;
7957
8058 ivec4 out_tex = ivec4 (0 );
8159
8260 [[unroll]] for (int i = 0 ; i < 4 ; ++ i) {
8361 if (tidx[packed_dim] + i < sizes[packed_dim]) {
84- int in_texel = nchw_in[buf_indices[i] / 4 ];
85- int extracted_val = (in_texel & masks[i]) >> (8 * (buf_indices[i] % 4 )) ;
62+ const int in_texel = nchw_in[buf_indices[i] >> 2 ];
63+ int extracted_val = (in_texel >> (8 * (buf_indices[i] & 3 ))) & mask ;
8664 extracted_val = extend_sign(extracted_val);
8765 out_tex[i] = extracted_val;
8866 }
0 commit comments