Skip to content

Commit 10dacab

Browse files
ngxson and camel-cdr committed
better quantize_row_q8_K
Co-authored-by: camel-cdr <[email protected]>
1 parent 9517aee commit 10dacab

File tree

1 file changed

+14
-25
lines changed

1 file changed

+14
-25
lines changed

ggml/src/ggml-cpu/ggml-cpu-quants.c

Lines changed: 14 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -1660,33 +1660,22 @@ void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
16601660

16611661
for (int i = 0; i < nb; i++) {
16621662
const float * x_block = x + i * QK_K;
1663-
v128_t amax_vec = wasm_f32x4_splat(0.0f);
1664-
v128_t max_vec = wasm_f32x4_splat(0.0f);
16651663

1666-
// Vectorized max abs value search
1667-
for (int j = 0; j < QK_K; j += 4) {
1664+
v128_t min_vec = wasm_v128_load(x_block);
1665+
v128_t max_vec = min_vec;
1666+
1667+
for (int j = 4; j < QK_K; j += 4) {
16681668
v128_t x_vec = wasm_v128_load(x_block + j);
1669-
v128_t abs_x = wasm_f32x4_abs(x_vec);
1670-
v128_t mask = wasm_f32x4_gt(abs_x, amax_vec);
1671-
amax_vec = wasm_v128_bitselect(abs_x, amax_vec, mask);
1672-
max_vec = wasm_v128_bitselect(x_vec, max_vec, mask);
1673-
}
1674-
1675-
// Manual unroll for lane extraction
1676-
float amax = wasm_f32x4_extract_lane(amax_vec, 0);
1677-
float max_val = wasm_f32x4_extract_lane(max_vec, 0);
1678-
#define UPDATE_MAX(lane) \
1679-
{ \
1680-
float a = wasm_f32x4_extract_lane(amax_vec, lane); \
1681-
if (a > amax) { \
1682-
amax = a; \
1683-
max_val = wasm_f32x4_extract_lane(max_vec, lane); \
1684-
} \
1685-
}
1686-
UPDATE_MAX(1)
1687-
UPDATE_MAX(2)
1688-
UPDATE_MAX(3)
1689-
#undef UPDATE_MAX
1669+
max_vec = wasm_f32x4_pmax(max_vec, x_vec);
1670+
min_vec = wasm_f32x4_pmin(min_vec, x_vec);
1671+
}
1672+
max_vec = wasm_f32x4_pmax(max_vec, wasm_i32x4_shuffle(max_vec, max_vec, 2, 3, 0, 1));
1673+
max_vec = wasm_f32x4_pmax(max_vec, wasm_i32x4_shuffle(max_vec, max_vec, 1, 0, 3, 2));
1674+
min_vec = wasm_f32x4_pmin(min_vec, wasm_i32x4_shuffle(min_vec, min_vec, 2, 3, 0, 1));
1675+
min_vec = wasm_f32x4_pmin(min_vec, wasm_i32x4_shuffle(min_vec, min_vec, 1, 0, 3, 2));
1676+
float max = wasm_f32x4_extract_lane(max_vec, 0);
1677+
float min = wasm_f32x4_extract_lane(min_vec, 0);
1678+
float amax = -min > max ? min : max;
16901679

16911680
if (amax == 0.0f) {
16921681
yc[i].d = 0.0f;

0 commit comments

Comments
 (0)