
Commit 7986ea1

ggml_vec_dot_q8_0_q8_0: MMA optimization
Tried this change with different batch sizes. With batch_size = 4, 8, 16, or 32, llama-batched-bench gives results similar to base; after batch_size = 64, performance degrades. llama-bench also gives results similar to base. Not much perf gain.

Signed-off-by: Shalini Salomi Bodapati <[email protected]>
1 parent 19e899c
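For context, ggml_vec_dot_q8_0_q8_0 computes the dot product of two rows of Q8_0 blocks: each block stores QK8_0 = 32 signed int8 quants plus an fp16 scale d, and the integer dot product of each block pair is weighted by the product of the two scales (the scalar form is visible in the context lines of the diff below). A minimal reference sketch of that per-block math; the struct and function names here are illustrative, and the fp16 scale is replaced by a plain float so the sketch stays self-contained:

#include <stdint.h>

#define QK8_0 32

typedef struct {
    float  d;            // block scale (fp16 ggml_half in the real block_q8_0)
    int8_t qs[QK8_0];    // 32 quantized signed bytes
} block_q8_0_sketch;

// Reference dot product over nb blocks: integer dot per block, scaled by
// the product of the two block deltas (GGML_FP16_TO_FP32 in the real code).
static float vec_dot_q8_0_ref(int nb, const block_q8_0_sketch *x,
                              const block_q8_0_sketch *y) {
    float sumf = 0.0f;
    for (int ib = 0; ib < nb; ++ib) {
        int sumi = 0;
        for (int j = 0; j < QK8_0; ++j) {
            sumi += x[ib].qs[j] * y[ib].qs[j];  // int8 x int8 dot product
        }
        sumf += sumi * (x[ib].d * y[ib].d);     // weight by both block scales
    }
    return sumf;
}

All of the SIMD variants in this file, including the POWER9/MMA path below, are different schedules of this same per-block computation.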


ggml/src/ggml-cpu/ggml-cpu-quants.c

Lines changed: 62 additions & 30 deletions
@@ -3920,46 +3920,78 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi

         sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
     }
-#elif defined(__POWER9_VECTOR__)
-    const vector signed int v0 = vec_splats((int32_t)0);
-    vector float vsumf0 = vec_splats(0.0f);
+#if defined(__POWER9_VECTOR__)
+    const int batch_size = 4;
+    float vsumf0 = 0.0f;
+
+    vector unsigned char xor_vector = vec_splats((unsigned char)0x80);
+    vector unsigned char one_vector = vec_splats((unsigned char)1);
+
+    for (; ib < nb; ib += batch_size) {
+        __vector_quad acc[batch_size * 2];
+        __vector_quad acc_hsum[batch_size * 2];
+        vector unsigned char q8x_store[batch_size][2];
+        float deltas[batch_size];
+
+        for (int i = 0; i < batch_size * 2; i++) {
+            __builtin_mma_xxsetaccz(&acc[i]);
+            __builtin_mma_xxsetaccz(&acc_hsum[i]);
+        }

-#pragma GCC unroll 8
-    for (; ib < nb; ++ib) {
-        __builtin_prefetch(x[ib].qs, 0, 1);
-        __builtin_prefetch(y[ib].qs, 0, 1);
+        for (int b = 0; b < batch_size; ++b) {
+            deltas[b] = GGML_FP16_TO_FP32(x[ib + b].d) * GGML_FP16_TO_FP32(y[ib + b].d);
+            __builtin_prefetch(x[ib + b].qs, 0, 1);
+            __builtin_prefetch(y[ib + b].qs, 0, 1);

-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
-        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
-        vector float vd = vec_mul(vxd, vyd);
+            __vector_pair X, Y;
+            vector unsigned char q8x[2], q8y[2];

-        vector signed char q8x0 = vec_xl( 0, x[ib].qs);
-        vector signed char q8x1 = vec_xl(16, x[ib].qs);
-        vector signed char q8y0 = vec_xl( 0, y[ib].qs);
-        vector signed char q8y1 = vec_xl(16, y[ib].qs);
+            X = __builtin_vsx_lxvp(0, (__vector_pair*)x[ib + b].qs);
+            Y = __builtin_vsx_lxvp(0, (__vector_pair*)y[ib + b].qs);

-        vector signed short qv0 = vec_mule(q8x0, q8y0);
-        vector signed short qv1 = vec_mulo(q8x0, q8y0);
-        vector signed short qv2 = vec_mule(q8x1, q8y1);
-        vector signed short qv3 = vec_mulo(q8x1, q8y1);
+            __builtin_vsx_disassemble_pair(q8x, &X);
+            __builtin_vsx_disassemble_pair(q8y, &Y);

-        vector signed int vsumi0 = v0;
-        vector signed int vsumi1 = v0;
+            q8y[0] = vec_xor(q8y[0], xor_vector);
+            q8y[1] = vec_xor(q8y[1], xor_vector);

-        vsumi0 = vec_sum4s(qv0, vsumi0);
-        vsumi1 = vec_sum4s(qv1, vsumi1);
-        vsumi0 = vec_sum4s(qv2, vsumi0);
-        vsumi1 = vec_sum4s(qv3, vsumi1);
+            q8x_store[b][0] = q8x[0];
+            q8x_store[b][1] = q8x[1];

-        vsumi0 = vec_add(vsumi0, vsumi1);
+            __builtin_mma_xvi8ger4pp(&acc[b * 2 + 0], q8x[0], q8y[0]);
+            __builtin_mma_xvi8ger4pp(&acc[b * 2 + 1], q8x[1], q8y[1]);

-        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
-    }
+            __builtin_mma_xvi8ger4pp(&acc_hsum[b * 2 + 0], q8x[0], one_vector);
+            __builtin_mma_xvi8ger4pp(&acc_hsum[b * 2 + 1], q8x[1], one_vector);
+        }

-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+        for (int b = 0; b < batch_size; ++b) {
+            vector signed int temp0[4], temp1[4];

-    sumf = vec_extract(vsumf0, 0);
+            __builtin_mma_disassemble_acc(temp0, &acc[b * 2 + 0]);
+            __builtin_mma_disassemble_acc(temp1, &acc[b * 2 + 1]);
+
+            int32_t mma_sum0 = *((int32_t*)&temp0[0] + 0) + *((int32_t*)&temp0[1] + 1) +
+                               *((int32_t*)&temp0[2] + 2) + *((int32_t*)&temp0[3] + 3);
+            int32_t mma_sum1 = *((int32_t*)&temp1[0] + 0) + *((int32_t*)&temp1[1] + 1) +
+                               *((int32_t*)&temp1[2] + 2) + *((int32_t*)&temp1[3] + 3);
+
+            __builtin_mma_disassemble_acc(temp0, &acc_hsum[b * 2 + 0]);
+            __builtin_mma_disassemble_acc(temp1, &acc_hsum[b * 2 + 1]);
+
+            int32_t sum0 = *((int32_t*)&temp0[0] + 0) + *((int32_t*)&temp0[1] + 1) +
+                           *((int32_t*)&temp0[2] + 2) + *((int32_t*)&temp0[3] + 3);
+            int32_t sum1 = *((int32_t*)&temp1[0] + 0) + *((int32_t*)&temp1[1] + 1) +
+                           *((int32_t*)&temp1[2] + 2) + *((int32_t*)&temp1[3] + 3);
+
+            int32_t vsum = mma_sum0 + mma_sum1 + (sum0 + sum1) * -128;
+            vsumf0 += (float)vsum * deltas[b];
+        }
+    }
+
+    return vsumf0;
+
+#endif

 #elif defined(__loongarch_asx)
     // Initialize accumulator with zeros
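A note on the arithmetic in the new path. xvi8ger4pp is a rank-4 integer GER: each 16-byte operand is treated as four groups of 4 bytes, and accumulator entry (i, j) accumulates group i of the first operand against group j of the second. For a dot product only the diagonal entries (i, i) matter, which is why the reduction reads element k of disassembled row k and discards the off-diagonal entries. Because y is biased by XOR with 0x80 (y + 128 as an unsigned byte), the main accumulator holds sum(x*(y+128)) = sum(x*y) + 128*sum(x); the kernel recovers sum(x*y) by also accumulating x against a ones vector (acc_hsum) and adding sum(x) * -128. Below is a plain-C model of that math, runnable without POWER10 hardware; model_xvi8ger4pp and its signed-x/unsigned-y operand convention are assumptions inferred from the kernel's own correction term, not a statement of the exact ISA semantics:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Model of one xvi8ger4pp: acc[i][j] += sum_k a[4i+k] * b[4j+k],
// with a treated as signed bytes and b as unsigned bytes (assumption).
static void model_xvi8ger4pp(int32_t acc[4][4], const int8_t a[16], const uint8_t b[16]) {
    for (int i = 0; i < 4; ++i)
        for (int j = 0; j < 4; ++j)
            for (int k = 0; k < 4; ++k)
                acc[i][j] += (int32_t)a[4*i + k] * (int32_t)b[4*j + k];
}

int main(void) {
    int8_t x[32], y[32];
    for (int i = 0; i < 32; ++i) { x[i] = (int8_t)(5*i - 77); y[i] = (int8_t)(90 - 5*i); }

    // Reference: plain signed dot product of one 32-byte Q8_0 block.
    int32_t ref = 0, xsum = 0;
    for (int i = 0; i < 32; ++i) { ref += x[i] * y[i]; xsum += x[i]; }

    // Kernel model: bias y (y ^ 0x80 == y + 128 as a byte), accumulate the
    // two 16-byte halves, and track sum(x) via a ones operand (acc_hsum).
    uint8_t yb[32], ones[16];
    for (int i = 0; i < 32; ++i) yb[i] = (uint8_t)(y[i] ^ 0x80);
    memset(ones, 1, sizeof(ones));

    int32_t acc0[4][4] = {0}, acc1[4][4] = {0}, hs0[4][4] = {0}, hs1[4][4] = {0};
    model_xvi8ger4pp(acc0, x,      yb);
    model_xvi8ger4pp(acc1, x + 16, yb + 16);
    model_xvi8ger4pp(hs0,  x,      ones);
    model_xvi8ger4pp(hs1,  x + 16, ones);

    // Only the diagonal pairs matching 4-byte groups of x and y contribute
    // to the dot product; off-diagonal entries mix group i with group j != i.
    int32_t mma = 0, hsum = 0;
    for (int i = 0; i < 4; ++i) { mma += acc0[i][i] + acc1[i][i]; hsum += hs0[i][i] + hs1[i][i]; }

    int32_t vsum = mma + hsum * -128;   // same correction as in the kernel
    assert(hsum == xsum);
    assert(vsum == ref);
    printf("ref=%d vsum=%d\n", (int)ref, (int)vsum);
    return 0;
}

Under this model the correction line in the diff, vsum = mma_sum0 + mma_sum1 + (sum0 + sum1) * -128, is exactly the identity sum(x*y) = sum(x*(y+128)) - 128*sum(x) applied per block.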
