@@ -3920,46 +3920,78 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
39203920
39213921 sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
39223922 }
3923- #elif defined(__POWER9_VECTOR__)
3924- const vector signed int v0 = vec_splats((int32_t)0);
3925- vector float vsumf0 = vec_splats(0.0f);
3923+ #if defined(__POWER9_VECTOR__)
3924+ const int batch_size = 4;
3925+ float vsumf0 = 0.0f;
3926+
3927+ vector unsigned char xor_vector = vec_splats((unsigned char)0x80);
3928+ vector unsigned char one_vector = vec_splats((unsigned char)1);
3929+
3930+ for (; ib < nb; ib += batch_size) {
3931+ __vector_quad acc[batch_size * 2];
3932+ __vector_quad acc_hsum[batch_size * 2];
3933+ vector unsigned char q8x_store[batch_size][2];
3934+ float deltas[batch_size];
3935+
3936+ for (int i = 0; i < batch_size * 2; i++) {
3937+ __builtin_mma_xxsetaccz(&acc[i]);
3938+ __builtin_mma_xxsetaccz(&acc_hsum[i]);
3939+ }
39263940
3927- #pragma GCC unroll 8
3928- for (; ib < nb; ++ib) {
3929- __builtin_prefetch(x[ib].qs, 0, 1);
3930- __builtin_prefetch(y[ib].qs, 0, 1);
3941+ for (int b = 0; b < batch_size; ++b) {
3942+ deltas[b] = GGML_FP16_TO_FP32(x[ib + b].d) * GGML_FP16_TO_FP32(y[ib + b].d);
3943+ __builtin_prefetch(x[ib + b ].qs, 0, 1);
3944+ __builtin_prefetch(y[ib + b ].qs, 0, 1);
39313945
3932- vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
3933- vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
3934- vector float vd = vec_mul(vxd, vyd);
3946+ __vector_pair X, Y;
3947+ __vector_unsigned char q8x[2], q8y[2];
39353948
3936- vector signed char q8x0 = vec_xl( 0, x[ib].qs);
3937- vector signed char q8x1 = vec_xl(16, x[ib].qs);
3938- vector signed char q8y0 = vec_xl( 0, y[ib].qs);
3939- vector signed char q8y1 = vec_xl(16, y[ib].qs);
3949+ X = __builtin_vsx_lxvp(0, (__vector_pair*)x[ib + b].qs);
3950+ Y = __builtin_vsx_lxvp(0, (__vector_pair*)y[ib + b].qs);
39403951
3941- vector signed short qv0 = vec_mule(q8x0, q8y0);
3942- vector signed short qv1 = vec_mulo(q8x0, q8y0);
3943- vector signed short qv2 = vec_mule(q8x1, q8y1);
3944- vector signed short qv3 = vec_mulo(q8x1, q8y1);
3952+ __builtin_vsx_disassemble_pair(q8x, &X);
3953+ __builtin_vsx_disassemble_pair(q8y, &Y);
39453954
3946- vector signed int vsumi0 = v0 ;
3947- vector signed int vsumi1 = v0 ;
3955+ q8y[0] = vec_xor(q8y[0], xor_vector) ;
3956+ q8y[1] = vec_xor(q8y[1], xor_vector) ;
39483957
3949- vsumi0 = vec_sum4s(qv0, vsumi0);
3950- vsumi1 = vec_sum4s(qv1, vsumi1);
3951- vsumi0 = vec_sum4s(qv2, vsumi0);
3952- vsumi1 = vec_sum4s(qv3, vsumi1);
3958+ q8x_store[b][0] = q8x[0];
3959+ q8x_store[b][1] = q8x[1];
39533960
3954- vsumi0 = vec_add(vsumi0, vsumi1);
3961+ __builtin_mma_xvi8ger4pp(&acc[b * 2 + 0], q8x[0], q8y[0]);
3962+ __builtin_mma_xvi8ger4pp(&acc[b * 2 + 1], q8x[1], q8y[1]);
39553963
3956- vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
3957- }
3964+ __builtin_mma_xvi8ger4pp(&acc_hsum[b * 2 + 0], q8x[0], one_vector);
3965+ __builtin_mma_xvi8ger4pp(&acc_hsum[b * 2 + 1], q8x[1], one_vector);
3966+ }
39583967
3959- vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
3960- vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)) ;
3968+ for (int b = 0; b < batch_size; ++b) {
3969+ vector signed int temp0[4], temp1[4] ;
39613970
3962- sumf = vec_extract(vsumf0, 0);
3971+ __builtin_mma_disassemble_acc(temp0, &acc[b * 2 + 0]);
3972+ __builtin_mma_disassemble_acc(temp1, &acc[b * 2 + 1]);
3973+
3974+ int32_t mma_sum0 = *((int32_t*)&temp0[0] + 0) + *((int32_t*)&temp0[1] + 1) +
3975+ *((int32_t*)&temp0[2] + 2) + *((int32_t*)&temp0[3] + 3);
3976+ int32_t mma_sum1 = *((int32_t*)&temp1[0] + 0) + *((int32_t*)&temp1[1] + 1) +
3977+ *((int32_t*)&temp1[2] + 2) + *((int32_t*)&temp1[3] + 3);
3978+
3979+ __builtin_mma_disassemble_acc(temp0, &acc_hsum[b * 2 + 0]);
3980+ __builtin_mma_disassemble_acc(temp1, &acc_hsum[b * 2 + 1]);
3981+
3982+ int32_t sum0 = *((int32_t*)&temp0[0] + 0) + *((int32_t*)&temp0[1] + 1) +
3983+ *((int32_t*)&temp0[2] + 2) + *((int32_t*)&temp0[3] + 3);
3984+ int32_t sum1 = *((int32_t*)&temp1[0] + 0) + *((int32_t*)&temp1[1] + 1) +
3985+ *((int32_t*)&temp1[2] + 2) + *((int32_t*)&temp1[3] + 3);
3986+
3987+ int32_t vsum = mma_sum0 + mma_sum1 + (sum0 + sum1) * -128;
3988+ vsumf0 += (float)vsum * deltas[b];
3989+ }
3990+ }
3991+
3992+ return vsumf0;
3993+
3994+ #endif
39633995
39643996#elif defined(__loongarch_asx)
39653997 // Initialize accumulator with zeros
0 commit comments