@@ -1660,33 +1660,22 @@ void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
16601660
16611661 for (int i = 0; i < nb; i++) {
16621662 const float * x_block = x + i * QK_K;
1663- v128_t amax_vec = wasm_f32x4_splat(0.0f);
1664- v128_t max_vec = wasm_f32x4_splat(0.0f);
16651663
1666- // Vectorized max abs value search
1667- for (int j = 0; j < QK_K; j += 4) {
1664+ v128_t min_vec = wasm_v128_load(x_block);
1665+ v128_t max_vec = min_vec;
1666+
1667+ for (int j = 4; j < QK_K; j += 4) {
16681668 v128_t x_vec = wasm_v128_load(x_block + j);
1669- v128_t abs_x = wasm_f32x4_abs(x_vec);
1670- v128_t mask = wasm_f32x4_gt(abs_x, amax_vec);
1671- amax_vec = wasm_v128_bitselect(abs_x, amax_vec, mask);
1672- max_vec = wasm_v128_bitselect(x_vec, max_vec, mask);
1673- }
1674-
1675- // Manual unroll for lane extraction
1676- float amax = wasm_f32x4_extract_lane(amax_vec, 0);
1677- float max_val = wasm_f32x4_extract_lane(max_vec, 0);
1678- #define UPDATE_MAX(lane) \
1679- { \
1680- float a = wasm_f32x4_extract_lane(amax_vec, lane); \
1681- if (a > amax) { \
1682- amax = a; \
1683- max_val = wasm_f32x4_extract_lane(max_vec, lane); \
1684- } \
1685- }
1686- UPDATE_MAX(1)
1687- UPDATE_MAX(2)
1688- UPDATE_MAX(3)
1689- #undef UPDATE_MAX
1669+ max_vec = wasm_f32x4_pmax(max_vec, x_vec);
1670+ min_vec = wasm_f32x4_pmin(min_vec, x_vec);
1671+ }
1672+ max_vec = wasm_f32x4_pmax(max_vec, wasm_i32x4_shuffle(max_vec, max_vec, 2, 3, 0, 1));
1673+ max_vec = wasm_f32x4_pmax(max_vec, wasm_i32x4_shuffle(max_vec, max_vec, 1, 0, 3, 2));
1674+ min_vec = wasm_f32x4_pmin(min_vec, wasm_i32x4_shuffle(min_vec, min_vec, 2, 3, 0, 1));
1675+ min_vec = wasm_f32x4_pmin(min_vec, wasm_i32x4_shuffle(min_vec, min_vec, 1, 0, 3, 2));
1676+ float max = wasm_f32x4_extract_lane(max_vec, 0);
1677+ float min = wasm_f32x4_extract_lane(min_vec, 0);
1678+ float amax = -min > max ? min : max;
16901679
16911680 if (amax == 0.0f) {
16921681 yc[i].d = 0.0f;
0 commit comments