Skip to content

Commit 4b2490a

Browse files
committed
Exp: Perf Benefit with xvi8ger4pp signed version
The purpose of this patch is to measure the gains we would get if we had an xvi8ger4pp instruction that accepts signed values for both inputs. To do so, we comment out the pre- and post-processing in the tinyBLAS_Q0_PPC int8 implementation. Signed-off-by: Shalini Salomi Bodapati <[email protected]>
1 parent e298d2f commit 4b2490a

File tree

1 file changed

+42
-42
lines changed

1 file changed

+42
-42
lines changed

ggml/src/ggml-cpu/llamafile/sgemm.cpp

Lines changed: 42 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -1566,16 +1566,16 @@ class tinyBLAS_Q0_PPC {
15661566
}
15671567
}
15681568
}
1569-
15701569
template<int size>
15711570
inline void compute(acc_t* ACC, int c_idx, int s_idx, std::array<int, size>& comparray, vector float* vs, vector float* fin_res) {
15721571
vector signed int vec_C[4];
1573-
vector float CA[4] = {0};
1572+
//vector float CA[4] = {0};
15741573
vector float res[4] = {0};
15751574
__builtin_mma_disassemble_acc(vec_C, ACC);
15761575
for (int i = 0; i < 4; i++) {
1577-
CA[i] = vec_splats((float)(((double)comparray[c_idx+i]) * -128.0));
1578-
res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
1576+
//CA[i] = vec_splats((float)(((double)comparray[c_idx+i]) * -128.0));
1577+
//res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
1578+
res[i] = vec_ctf(vec_C[i], 0);
15791579
fin_res[s_idx+i] = vec_madd(res[i], vs[s_idx+i], fin_res[s_idx+i]);
15801580
}
15811581
}
@@ -1971,7 +1971,7 @@ class tinyBLAS_Q0_PPC {
19711971
}
19721972

19731973
template<typename VA, typename VB>
1974-
void packNormal(const TB* a, int64_t lda, int rows, int cols, VA* vec, bool flip) {
1974+
void packNormal(const TB* a, int64_t lda, int rows, int cols, VA* vec/*, bool flip*/) {
19751975
int64_t i, j;
19761976
TB *aoffset = NULL;
19771977
VA *vecOffset = NULL;
@@ -1981,9 +1981,9 @@ class tinyBLAS_Q0_PPC {
19811981
VB c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2]={0};
19821982
VB c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2]={0};
19831983
VB t1, t2, t3, t4, t5, t6, t7, t8;
1984-
vector unsigned char xor_vector;
1985-
uint8_t flip_vec = 0x80;
1986-
xor_vector = vec_splats(flip_vec);
1984+
//vector unsigned char xor_vector;
1985+
//uint8_t flip_vec = 0x80;
1986+
//xor_vector = vec_splats(flip_vec);
19871987
vector unsigned char swiz1 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
19881988
vector unsigned char swiz2 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
19891989
vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
@@ -2033,12 +2033,12 @@ class tinyBLAS_Q0_PPC {
20332033
t6 = vec_perm(t1, t3, swiz4);
20342034
t7 = vec_perm(t2, t4, swiz3);
20352035
t8 = vec_perm(t2, t4, swiz4);
2036-
if (flip == true) {
2036+
/*if (flip == true) {
20372037
t5 = vec_xor(t5, xor_vector);
20382038
t6 = vec_xor(t6, xor_vector);
20392039
t7 = vec_xor(t7, xor_vector);
20402040
t8 = vec_xor(t8, xor_vector);
2041-
}
2041+
}*/
20422042
vec_xst(t5, 0, vecOffset);
20432043
vec_xst(t6, 0, vecOffset+16);
20442044
vec_xst(t7, 0, vecOffset+32);
@@ -2052,12 +2052,12 @@ class tinyBLAS_Q0_PPC {
20522052
t6 = vec_perm(t1, t3, swiz4);
20532053
t7 = vec_perm(t2, t4, swiz3);
20542054
t8 = vec_perm(t2, t4, swiz4);
2055-
if (flip == true) {
2055+
/*if (flip == true) {
20562056
t5 = vec_xor(t5, xor_vector);
20572057
t6 = vec_xor(t6, xor_vector);
20582058
t7 = vec_xor(t7, xor_vector);
20592059
t8 = vec_xor(t8, xor_vector);
2060-
}
2060+
}*/
20612061
vec_xst(t5, 0, vecOffset+64);
20622062
vec_xst(t6, 0, vecOffset+80);
20632063
vec_xst(t7, 0, vecOffset+96);
@@ -2071,12 +2071,12 @@ class tinyBLAS_Q0_PPC {
20712071
t6 = vec_perm(t1, t3, swiz4);
20722072
t7 = vec_perm(t2, t4, swiz3);
20732073
t8 = vec_perm(t2, t4, swiz4);
2074-
if (flip == true) {
2074+
/*if (flip == true) {
20752075
t5 = vec_xor(t5, xor_vector);
20762076
t6 = vec_xor(t6, xor_vector);
20772077
t7 = vec_xor(t7, xor_vector);
20782078
t8 = vec_xor(t8, xor_vector);
2079-
}
2079+
}*/
20802080
vec_xst(t5, 0, vecOffset+128);
20812081
vec_xst(t6, 0, vecOffset+144);
20822082
vec_xst(t7, 0, vecOffset+160);
@@ -2090,12 +2090,12 @@ class tinyBLAS_Q0_PPC {
20902090
t6 = vec_perm(t1, t3, swiz4);
20912091
t7 = vec_perm(t2, t4, swiz3);
20922092
t8 = vec_perm(t2, t4, swiz4);
2093-
if (flip == true) {
2093+
/*if (flip == true) {
20942094
t5 = vec_xor(t5, xor_vector);
20952095
t6 = vec_xor(t6, xor_vector);
20962096
t7 = vec_xor(t7, xor_vector);
20972097
t8 = vec_xor(t8, xor_vector);
2098-
}
2098+
}*/
20992099
vec_xst(t5, 0, vecOffset+192);
21002100
vec_xst(t6, 0, vecOffset+208);
21012101
vec_xst(t7, 0, vecOffset+224);
@@ -2145,12 +2145,12 @@ class tinyBLAS_Q0_PPC {
21452145
t6 = vec_perm(t1, t3, swiz4);
21462146
t7 = vec_perm(t2, t4, swiz3);
21472147
t8 = vec_perm(t2, t4, swiz4);
2148-
if (flip == true) {
2148+
/*if (flip == true) {
21492149
t5 = vec_xor(t5, xor_vector);
21502150
t6 = vec_xor(t6, xor_vector);
21512151
t7 = vec_xor(t7, xor_vector);
21522152
t8 = vec_xor(t8, xor_vector);
2153-
}
2153+
}*/
21542154
vec_xst(t5, 0, vecOffset);
21552155
vec_xst(t6, 0, vecOffset+16);
21562156
vec_xst(t7, 0, vecOffset+32);
@@ -2164,12 +2164,12 @@ class tinyBLAS_Q0_PPC {
21642164
t6 = vec_perm(t1, t3, swiz4);
21652165
t7 = vec_perm(t2, t4, swiz3);
21662166
t8 = vec_perm(t2, t4, swiz4);
2167-
if (flip == true) {
2167+
/*if (flip == true) {
21682168
t5 = vec_xor(t5, xor_vector);
21692169
t6 = vec_xor(t6, xor_vector);
21702170
t7 = vec_xor(t7, xor_vector);
21712171
t8 = vec_xor(t8, xor_vector);
2172-
}
2172+
}*/
21732173
vec_xst(t5, 0, vecOffset+64);
21742174
vec_xst(t6, 0, vecOffset+80);
21752175
vec_xst(t7, 0, vecOffset+96);
@@ -2208,12 +2208,12 @@ class tinyBLAS_Q0_PPC {
22082208
t6 = vec_perm(t1, t3, swiz4);
22092209
t7 = vec_perm(t2, t4, swiz3);
22102210
t8 = vec_perm(t2, t4, swiz4);
2211-
if (flip == true) {
2211+
/*if (flip == true) {
22122212
t5 = vec_xor(t5, xor_vector);
22132213
t6 = vec_xor(t6, xor_vector);
22142214
t7 = vec_xor(t7, xor_vector);
22152215
t8 = vec_xor(t8, xor_vector);
2216-
}
2216+
}*/
22172217
vec_xst(t5, 0, vecOffset);
22182218
vec_xst(t6, 0, vecOffset+16);
22192219
vec_xst(t7, 0, vecOffset+32);
@@ -2227,12 +2227,12 @@ class tinyBLAS_Q0_PPC {
22272227
t6 = vec_perm(t1, t3, swiz4);
22282228
t7 = vec_perm(t2, t4, swiz3);
22292229
t8 = vec_perm(t2, t4, swiz4);
2230-
if (flip == true) {
2230+
/*if (flip == true) {
22312231
t5 = vec_xor(t5, xor_vector);
22322232
t6 = vec_xor(t6, xor_vector);
22332233
t7 = vec_xor(t7, xor_vector);
22342234
t8 = vec_xor(t8, xor_vector);
2235-
}
2235+
}*/
22362236
vec_xst(t5, 0, vecOffset+64);
22372237
vec_xst(t6, 0, vecOffset+80);
22382238
vec_xst(t7, 0, vecOffset+96);
@@ -2415,9 +2415,9 @@ class tinyBLAS_Q0_PPC {
24152415
if (std::is_same_v<TA, block_q4_0>) {
24162416
packNormalInt4<int8_t, vector signed char, 4>((A+(ii*lda)+l), lda, 4, 4, (int8_t*)vec_A, comparray);
24172417
} else {
2418-
packNormal<int8_t, vector signed char>((const TB*)(A+(ii*lda)+l), lda, 4, 8, (int8_t*)vec_A, false);
2418+
packNormal<int8_t, vector signed char>((const TB*)(A+(ii*lda)+l), lda, 4, 8, (int8_t*)vec_A/*, false*/);
24192419
}
2420-
packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true);
2420+
packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B/*, true*/);
24212421
for(int x = 0; x < 8; x++) {
24222422
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
24232423
__builtin_mma_xvi8ger4pp(&acc_1, vec_A[x], vec_B[x+8]);
@@ -2428,7 +2428,7 @@ class tinyBLAS_Q0_PPC {
24282428
*((float*)&vs[I+4]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J+4)*ldb)+l)->d));
24292429
}
24302430
}
2431-
if (!isAblock_q4) {
2431+
/*if (!isAblock_q4) {
24322432
auto aoffset = A+(ii*lda)+l;
24332433
for (int i = 0; i < 4; i++) {
24342434
comparray[i] = 0;
@@ -2439,7 +2439,7 @@ class tinyBLAS_Q0_PPC {
24392439
comparray[i] = ca;
24402440
aoffset += lda;
24412441
}
2442-
}
2442+
}*/
24432443
compute<4>(&acc_0, 0, 0, comparray, vs, fin_res);
24442444
compute<4>(&acc_1, 0, 4, comparray, vs, fin_res);
24452445
}
@@ -2460,9 +2460,9 @@ class tinyBLAS_Q0_PPC {
24602460
if (std::is_same_v<TA, block_q4_0>) {
24612461
packNormalInt4<int8_t, vector signed char, 8>((A+(ii*lda)+l), lda, 8, 4, (int8_t*)vec_A, comparray);
24622462
} else {
2463-
packNormal<int8_t, vector signed char>((const TB*)(A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
2463+
packNormal<int8_t, vector signed char>((const TB*)(A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A/*, false*/);
24642464
}
2465-
packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 4, 8, (uint8_t*)vec_B, true);
2465+
packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 4, 8, (uint8_t*)vec_B/*, true*/);
24662466
for(int x = 0; x < 8; x++) {
24672467
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
24682468
__builtin_mma_xvi8ger4pp(&acc_1, vec_A[x+8], vec_B[x]);
@@ -2472,7 +2472,7 @@ class tinyBLAS_Q0_PPC {
24722472
*((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
24732473
}
24742474
}
2475-
if (!isAblock_q4) {
2475+
/*if (!isAblock_q4) {
24762476
auto aoffset = A+(ii*lda)+l;
24772477
for (int i = 0; i < 8; i++) {
24782478
comparray[i] = 0;
@@ -2483,7 +2483,7 @@ class tinyBLAS_Q0_PPC {
24832483
comparray[i] = ca;
24842484
aoffset += lda;
24852485
}
2486-
}
2486+
}*/
24872487
compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
24882488
compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
24892489
}
@@ -2506,9 +2506,9 @@ class tinyBLAS_Q0_PPC {
25062506
if (std::is_same_v<TA, block_q4_0>) {
25072507
packNormalInt4<int8_t, vector signed char, 8>((A+(ii*lda)+l), lda, 8, 4, (int8_t*)vec_A, comparray);
25082508
} else {
2509-
packNormal<int8_t, vector signed char>((const TB*)(A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
2509+
packNormal<int8_t, vector signed char>((const TB*)(A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A/*, false*/);
25102510
}
2511-
packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true);
2511+
packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B/*, true*/);
25122512
for(int x = 0; x < 8; x++) {
25132513
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
25142514
__builtin_mma_xvi8ger4pp(&acc_1, vec_A[x+8], vec_B[x]);
@@ -2521,7 +2521,7 @@ class tinyBLAS_Q0_PPC {
25212521
*((float*)&vs[I+8]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J+4)*ldb)+l)->d));
25222522
}
25232523
}
2524-
if (!isAblock_q4) {
2524+
/*if (!isAblock_q4) {
25252525
auto aoffset = A+(ii*lda)+l;
25262526
for (int i = 0; i < 8; i++) {
25272527
comparray[i] = 0;
@@ -2532,7 +2532,7 @@ class tinyBLAS_Q0_PPC {
25322532
comparray[i] = ca;
25332533
aoffset += lda;
25342534
}
2535-
}
2535+
}*/
25362536
compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
25372537
compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
25382538
compute<8>(&acc_2, 0, 8, comparray, vs, fin_res);
@@ -2576,9 +2576,9 @@ class tinyBLAS_Q0_PPC {
25762576
if (isAblock_q4) {
25772577
packNormalInt4<int8_t, vector signed char, 4>((A+(ii*lda)+l), lda, RM, 4, (int8_t*)vec_A, comparray);
25782578
} else {
2579-
packNormal<int8_t, vector signed char>((const TB*)(A+(ii*lda)+l), lda, RM, 8, (int8_t*)vec_A, false);
2579+
packNormal<int8_t, vector signed char>((const TB*)(A+(ii*lda)+l), lda, RM, 8, (int8_t*)vec_A/*, false*/);
25802580
}
2581-
packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, RN, 8, (uint8_t*)vec_B, true);
2581+
packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, RN, 8, (uint8_t*)vec_B/*, true*/);
25822582
for(int x = 0; x < 8; x+=4) {
25832583
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
25842584
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+1], vec_B[x+1]);
@@ -2591,7 +2591,7 @@ class tinyBLAS_Q0_PPC {
25912591
}
25922592
}
25932593
__builtin_mma_disassemble_acc(vec_C, &acc_0);
2594-
if (!isAblock_q4) {
2594+
/*if (!isAblock_q4) {
25952595
auto aoffset = A+(ii*lda)+l;
25962596
for (int i = 0; i < RM; i++) {
25972597
comparray[i] = 0;
@@ -2602,10 +2602,10 @@ class tinyBLAS_Q0_PPC {
26022602
comparray[i] = ca;
26032603
aoffset += lda;
26042604
}
2605-
}
2605+
}*/
26062606
for (int i = 0; i < RM; i++) {
2607-
CA[i] = vec_splats((float)(((double)comparray[i]) * -128.0));
2608-
res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
2607+
//CA[i] = vec_splats((float)(((double)comparray[i]) * -128.0));
2608+
res[i] = vec_ctf(vec_C[i], 0);
26092609
fin_res[i] = vec_madd(res[i], vs[i], fin_res[i]);
26102610
}
26112611
}

0 commit comments

Comments
 (0)