@@ -24,8 +24,25 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint v_im,
 
     const uint32_t scale_0_4_l = (scale4_u32 << 16) | scale0_u32;
     const uint32_t scale_0_4_h = (scale_0_4_l & 0xC0C0C0C0) >> 2;
+
+    #if defined(ADRENO)
+    const vec4 scale_0_4_l_f = vec4(
+        float((scale_0_4_l >> 0) & 0x3Fu),
+        float((scale_0_4_l >> 8) & 0x3Fu),
+        float((scale_0_4_l >> 16) & 0x3Fu),
+        float((scale_0_4_l >> 24) & 0x3Fu)
+    );
+
+    const vec4 scale8_f = vec4(
+        float(((((scale8_u32 << 12) | scale8_u32) & 0x0F0F0F0Fu) | scale_0_4_h) >> 0 & 0xFFu),
+        float(((((scale8_u32 << 12) | scale8_u32) & 0x0F0F0F0Fu) | scale_0_4_h) >> 8 & 0xFFu),
+        float(((((scale8_u32 << 12) | scale8_u32) & 0x0F0F0F0Fu) | scale_0_4_h) >> 16 & 0xFFu),
+        float(((((scale8_u32 << 12) | scale8_u32) & 0x0F0F0F0Fu) | scale_0_4_h) >> 24 & 0xFFu)
+    );
+    #else
     const vec4 scale_0_4_l_f = vec4(unpack8(scale_0_4_l & 0x3F3F3F3F));
     const vec4 scale8_f = vec4(unpack8((((scale8_u32 << 12) | scale8_u32) & 0x0F0F0F0F) | scale_0_4_h));
+    #endif
 
     const FLOAT_TYPE sc0 = scale_0_4_l_f.x;
     const FLOAT_TYPE sc1 = scale_0_4_l_f.y;
@@ -44,10 +61,17 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint v_im,
     const uint32_t qs64_u32_lo4 = qs64_u32 & 0x0F0F0F0F;
     const uint32_t qs64_u32_hi4 = (qs64_u32 >> 4) & 0x0F0F0F0F;
 
+    #if defined(ADRENO)
+    const vec4 qs0_lo4 = vec4(float(qs0_u32_lo4 & 0xFFu), float((qs0_u32_lo4 >> 8) & 0xFFu), float((qs0_u32_lo4 >> 16) & 0xFFu), float((qs0_u32_lo4 >> 24) & 0xFFu));
+    const vec4 qs64_lo4 = vec4(float(qs64_u32_lo4 & 0xFFu), float((qs64_u32_lo4 >> 8) & 0xFFu), float((qs64_u32_lo4 >> 16) & 0xFFu), float((qs64_u32_lo4 >> 24) & 0xFFu));
+    const vec4 qs0_hi4 = vec4(float(qs0_u32_hi4 & 0xFFu), float((qs0_u32_hi4 >> 8) & 0xFFu), float((qs0_u32_hi4 >> 16) & 0xFFu), float((qs0_u32_hi4 >> 24) & 0xFFu));
+    const vec4 qs64_hi4 = vec4(float(qs64_u32_hi4 & 0xFFu), float((qs64_u32_hi4 >> 8) & 0xFFu), float((qs64_u32_hi4 >> 16) & 0xFFu), float((qs64_u32_hi4 >> 24) & 0xFFu));
+    #else
     const vec4 qs0_lo4 = vec4(unpack8(qs0_u32_lo4));
     const vec4 qs64_lo4 = vec4(unpack8(qs64_u32_lo4));
     const vec4 qs0_hi4 = vec4(unpack8(qs0_u32_hi4));
     const vec4 qs64_hi4 = vec4(unpack8(qs64_u32_hi4));
+    #endif
 
     const FLOAT_TYPE q4_0 = qs0_lo4.x;
     const FLOAT_TYPE q4_1 = qs0_lo4.y;
@@ -66,7 +90,11 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint v_im,
     const FLOAT_TYPE q4_14 = qs64_hi4.z;
     const FLOAT_TYPE q4_15 = qs64_hi4.w;
 
+    #if defined(ADRENO)
+    for (uint j = 0; j < NUM_COLS; ++j) {
+    #else
     [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+    #endif
         vec4 by10 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y1_idx) / 4 ]);
         vec4 by132 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y1_idx) / 4 + 8]);
         vec4 by20 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y2_idx) / 4 ]);
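
Note on the ADRENO path: it replaces the unpack8() byte extraction with explicit shifts and masks and drops the [[unroll]] hint on the NUM_COLS loop, leaving unrolling to the compiler's default heuristic. A minimal GLSL sketch of the equivalence the patch relies on, using a hypothetical helper name that is not part of the patch:

    // Hypothetical helper (illustration only): widen the four bytes of a packed
    // 32-bit word to float, lowest byte first. For any uint32_t x this matches
    // vec4(unpack8(x)), which is what the non-ADRENO branch computes.
    vec4 bytes_as_vec4(const uint32_t x) {
        return vec4(float( x        & 0xFFu),
                    float((x >>  8) & 0xFFu),
                    float((x >> 16) & 0xFFu),
                    float((x >> 24) & 0xFFu));
    }

The ADRENO branches above inline this pattern directly (with a per-byte & 0x3Fu mask for the low scale bits instead of the word-wide 0x3F3F3F3F mask), so both code paths produce the same values.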