@@ -64,47 +64,47 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
6464 const FLOAT_TYPE sc6 = scale8_f.z;
6565 const FLOAT_TYPE sc7 = scale8_f.w;
6666
67- uint32_t qs0_16_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 8]) << 16);
68- uint32_t qs64_80_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 32]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 40]) << 16);
67+ const uint32_t qs0_16_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 8]) << 16);
68+ const uint32_t qs64_80_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 32]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 40]) << 16);
6969
7070 uint32_t qs0_16_u32_lo4 = qs0_16_u32 & 0x0F0F0F0F;
7171 uint32_t qs0_16_u32_hi4 = (qs0_16_u32 >> 4) & 0x0F0F0F0F;
7272 uint32_t qs64_80_u32_lo4 = qs64_80_u32 & 0x0F0F0F0F;
7373 uint32_t qs64_80_u32_hi4 = (qs64_80_u32 >> 4) & 0x0F0F0F0F;
7474
75- uint32_t qh = pack32(u16vec2(data_a_packed16[ib0 + i].qh[l0 / 2], data_a_packed16[ib0 + i].qh[l0 / 2 + 8]));
75+ const uint32_t qh = pack32(u16vec2(data_a_packed16[ib0 + i].qh[l0 / 2], data_a_packed16[ib0 + i].qh[l0 / 2 + 8]));
7676
77- uint32_t qs0_16_lo4_offset16 = ((qh >> (2*v_im)) & 0x01010101) << 4;
78- uint32_t qs0_16_hi4_offset16 = ((qh >> (2*v_im)) & 0x02020202) << 3;
79- uint32_t qs64_80_lo4_offset16 = ((qh >> (2*v_im)) & 0x10101010);
80- uint32_t qs64_80_hi4_offset16 = ((qh >> (2*v_im)) & 0x20202020) >> 1;
77+ const uint32_t qs0_16_lo4_offset16 = ((qh >> (2*v_im)) & 0x01010101) << 4;
78+ const uint32_t qs0_16_hi4_offset16 = ((qh >> (2*v_im)) & 0x02020202) << 3;
79+ const uint32_t qs64_80_lo4_offset16 = ((qh >> (2*v_im)) & 0x10101010);
80+ const uint32_t qs64_80_hi4_offset16 = ((qh >> (2*v_im)) & 0x20202020) >> 1;
8181
8282 qs0_16_u32_lo4 += qs0_16_lo4_offset16;
8383 qs0_16_u32_hi4 += qs0_16_hi4_offset16;
8484 qs64_80_u32_lo4 += qs64_80_lo4_offset16;
8585 qs64_80_u32_hi4 += qs64_80_hi4_offset16;
8686
87- uvec4 qs0_16_lo4 = uvec4 (unpack8(qs0_16_u32_lo4));
88- uvec4 qs64_80_lo4 = uvec4 (unpack8(qs64_80_u32_lo4));
89- uvec4 qs0_16_hi4 = uvec4 (unpack8(qs0_16_u32_hi4));
90- uvec4 qs64_80_hi4 = uvec4 (unpack8(qs64_80_u32_hi4));
91-
92- const uint32_t q4_0 = qs0_16_lo4.x;
93- const uint32_t q4_1 = qs0_16_lo4.y;
94- const uint32_t q4_2 = qs0_16_lo4.z;
95- const uint32_t q4_3 = qs0_16_lo4.w;
96- const uint32_t q4_4 = qs0_16_hi4.x;
97- const uint32_t q4_5 = qs0_16_hi4.y;
98- const uint32_t q4_6 = qs0_16_hi4.z;
99- const uint32_t q4_7 = qs0_16_hi4.w;
100- const uint32_t q4_8 = qs64_80_lo4.x;
101- const uint32_t q4_9 = qs64_80_lo4.y;
102- const uint32_t q4_10 = qs64_80_lo4.z;
103- const uint32_t q4_11 = qs64_80_lo4.w;
104- const uint32_t q4_12 = qs64_80_hi4.x;
105- const uint32_t q4_13 = qs64_80_hi4.y;
106- const uint32_t q4_14 = qs64_80_hi4.z;
107- const uint32_t q4_15 = qs64_80_hi4.w;
87+ const vec4 qs0_16_lo4 = vec4 (unpack8(qs0_16_u32_lo4));
88+ const vec4 qs64_80_lo4 = vec4 (unpack8(qs64_80_u32_lo4));
89+ const vec4 qs0_16_hi4 = vec4 (unpack8(qs0_16_u32_hi4));
90+ const vec4 qs64_80_hi4 = vec4 (unpack8(qs64_80_u32_hi4));
91+
92+ const FLOAT_TYPE q4_0 = qs0_16_lo4.x;
93+ const FLOAT_TYPE q4_1 = qs0_16_lo4.y;
94+ const FLOAT_TYPE q4_2 = qs0_16_lo4.z;
95+ const FLOAT_TYPE q4_3 = qs0_16_lo4.w;
96+ const FLOAT_TYPE q4_4 = qs0_16_hi4.x;
97+ const FLOAT_TYPE q4_5 = qs0_16_hi4.y;
98+ const FLOAT_TYPE q4_6 = qs0_16_hi4.z;
99+ const FLOAT_TYPE q4_7 = qs0_16_hi4.w;
100+ const FLOAT_TYPE q4_8 = qs64_80_lo4.x;
101+ const FLOAT_TYPE q4_9 = qs64_80_lo4.y;
102+ const FLOAT_TYPE q4_10 = qs64_80_lo4.z;
103+ const FLOAT_TYPE q4_11 = qs64_80_lo4.w;
104+ const FLOAT_TYPE q4_12 = qs64_80_hi4.x;
105+ const FLOAT_TYPE q4_13 = qs64_80_hi4.y;
106+ const FLOAT_TYPE q4_14 = qs64_80_hi4.z;
107+ const FLOAT_TYPE q4_15 = qs64_80_hi4.w;
108108
109109 [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
110110 B_TYPE_VEC2 by10 = data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2];
0 commit comments