From 4b2490a7c2907828b953815ea917d08931b9a3c4 Mon Sep 17 00:00:00 2001
From: Shalini Salomi Bodapati
Date: Mon, 16 Jun 2025 04:53:20 -0500
Subject: [PATCH] Exp: Perf Benefit with xvi8ger4pp signed version

The purpose of this patch is to measure the gains we would get if we had
an xvi8ger4pp variant that accepts signed values for both inputs. To
simulate that, we comment out the pre-processing (the 0x80 sign-flip in
packNormal) and the post-processing (the comparray/-128 compensation in
compute) of the tinyBLAS_Q0_PPC Int8 implementation. Since the existing
instruction still treats the B operand as unsigned, the results are not
expected to be numerically correct; this patch is for timing
measurements only.

Signed-off-by: Shalini Salomi Bodapati

---
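Reviewer note (below the ---, so not part of the commit message): the
pre/post processing removed here implements a bias-and-compensate
identity. packNormal flips B with XOR 0x80 (adding 128 to each byte) so
the signed-by-unsigned MMA instruction sees a valid unsigned operand,
and compute() subtracts 128 * rowsum(A) (kept in comparray) to recover
the true signed dot product. Below is a minimal scalar sketch of that
identity, assuming the signed-A-by-unsigned-B semantics implied by the
code; it is illustrative C++ only, not the MMA path, and all names are
invented for the example.

#include <cstdint>
#include <cstdio>

int main() {
    int8_t a[8] = {-3, 7, 1, -128, 5, -9, 2, 4};  // one packed row of A (signed)
    int8_t b[8] = {2, -6, 127, -1, 0, 8, -5, 3};  // one packed row of B (signed)

    // What we want: a fully signed dot product.
    int32_t want = 0;
    for (int k = 0; k < 8; k++) want += (int32_t)a[k] * b[k];

    // What the signed-by-unsigned instruction forces on us:
    int32_t acc = 0, rowsum = 0;
    for (int k = 0; k < 8; k++) {
        uint8_t bu = (uint8_t)(b[k] ^ 0x80);  // pre-processing: flip == b + 128
        acc    += (int32_t)a[k] * bu;         // acc = want + 128 * rowsum(a)
        rowsum += a[k];                       // comparray[i] in sgemm.cpp
    }
    int32_t got = acc - 128 * rowsum;         // post-processing: CA[i] = comparray[i] * -128

    printf("want=%d got=%d\n", want, got);    // prints the same value twice
    return 0;
}

A true signed-signed xvi8ger4pp would make both the flip and the
subtraction unnecessary, which is exactly the work this patch elides in
order to estimate an upper bound on the gain.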
 ggml/src/ggml-cpu/llamafile/sgemm.cpp | 84 +++++++++++++--------------
 1 file changed, 42 insertions(+), 42 deletions(-)

diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
index 1d46158f928c4..9e807745da660 100644
--- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp
+++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
@@ -1566,16 +1566,16 @@ class tinyBLAS_Q0_PPC {
             }
         }
     }
-
     template <int size>
     inline void compute(acc_t* ACC, int c_idx, int s_idx, std::array<int, size>& comparray, vector float* vs, vector float* fin_res) {
         vector signed int vec_C[4];
-        vector float CA[4] = {0};
+        //vector float CA[4] = {0};
         vector float res[4] = {0};
         __builtin_mma_disassemble_acc(vec_C, ACC);
         for (int i = 0; i < 4; i++) {
-            CA[i] = vec_splats((float)(((double)comparray[c_idx+i]) * -128.0));
-            res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
+            //CA[i] = vec_splats((float)(((double)comparray[c_idx+i]) * -128.0));
+            //res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
+            res[i] = vec_ctf(vec_C[i], 0);
             fin_res[s_idx+i] = vec_madd(res[i], vs[s_idx+i], fin_res[s_idx+i]);
         }
     }
@@ -1971,7 +1971,7 @@ class tinyBLAS_Q0_PPC {
     }

     template <typename VA, typename VB>
-    void packNormal(const TB* a, int64_t lda, int rows, int cols, VA* vec, bool flip) {
+    void packNormal(const TB* a, int64_t lda, int rows, int cols, VA* vec/*, bool flip*/) {
         int64_t i, j;
         TB *aoffset = NULL;
         VA *vecOffset = NULL;
@@ -1981,9 +1981,9 @@ class tinyBLAS_Q0_PPC {
         VB c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2]={0};
         VB c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2]={0};
         VB t1, t2, t3, t4, t5, t6, t7, t8;
-        vector unsigned char xor_vector;
-        uint8_t flip_vec = 0x80;
-        xor_vector = vec_splats(flip_vec);
+        //vector unsigned char xor_vector;
+        //uint8_t flip_vec = 0x80;
+        //xor_vector = vec_splats(flip_vec);
         vector unsigned char swiz1 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
         vector unsigned char swiz2 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
         vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
@@ -2033,12 +2033,12 @@ class tinyBLAS_Q0_PPC {
             t6 = vec_perm(t1, t3, swiz4);
             t7 = vec_perm(t2, t4, swiz3);
             t8 = vec_perm(t2, t4, swiz4);
-            if (flip == true) {
+            /*if (flip == true) {
                 t5 = vec_xor(t5, xor_vector);
                 t6 = vec_xor(t6, xor_vector);
                 t7 = vec_xor(t7, xor_vector);
                 t8 = vec_xor(t8, xor_vector);
-            }
+            }*/
             vec_xst(t5, 0, vecOffset);
             vec_xst(t6, 0, vecOffset+16);
             vec_xst(t7, 0, vecOffset+32);
@@ -2052,12 +2052,12 @@ class tinyBLAS_Q0_PPC {
             t6 = vec_perm(t1, t3, swiz4);
             t7 = vec_perm(t2, t4, swiz3);
             t8 = vec_perm(t2, t4, swiz4);
-            if (flip == true) {
+            /*if (flip == true) {
                 t5 = vec_xor(t5, xor_vector);
                 t6 = vec_xor(t6, xor_vector);
                 t7 = vec_xor(t7, xor_vector);
                 t8 = vec_xor(t8, xor_vector);
-            }
+            }*/
             vec_xst(t5, 0, vecOffset+64);
             vec_xst(t6, 0, vecOffset+80);
             vec_xst(t7, 0, vecOffset+96);
@@ -2071,12 +2071,12 @@ class tinyBLAS_Q0_PPC {
             t6 = vec_perm(t1, t3, swiz4);
             t7 = vec_perm(t2, t4, swiz3);
             t8 = vec_perm(t2, t4, swiz4);
-            if (flip == true) {
+            /*if (flip == true) {
                 t5 = vec_xor(t5, xor_vector);
                 t6 = vec_xor(t6, xor_vector);
                 t7 = vec_xor(t7, xor_vector);
                 t8 = vec_xor(t8, xor_vector);
-            }
+            }*/
             vec_xst(t5, 0, vecOffset+128);
             vec_xst(t6, 0, vecOffset+144);
             vec_xst(t7, 0, vecOffset+160);
@@ -2090,12 +2090,12 @@ class tinyBLAS_Q0_PPC {
             t6 = vec_perm(t1, t3, swiz4);
             t7 = vec_perm(t2, t4, swiz3);
             t8 = vec_perm(t2, t4, swiz4);
-            if (flip == true) {
+            /*if (flip == true) {
                 t5 = vec_xor(t5, xor_vector);
                 t6 = vec_xor(t6, xor_vector);
                 t7 = vec_xor(t7, xor_vector);
                 t8 = vec_xor(t8, xor_vector);
-            }
+            }*/
             vec_xst(t5, 0, vecOffset+192);
             vec_xst(t6, 0, vecOffset+208);
             vec_xst(t7, 0, vecOffset+224);
@@ -2145,12 +2145,12 @@ class tinyBLAS_Q0_PPC {
             t6 = vec_perm(t1, t3, swiz4);
             t7 = vec_perm(t2, t4, swiz3);
             t8 = vec_perm(t2, t4, swiz4);
-            if (flip == true) {
+            /*if (flip == true) {
                 t5 = vec_xor(t5, xor_vector);
                 t6 = vec_xor(t6, xor_vector);
                 t7 = vec_xor(t7, xor_vector);
                 t8 = vec_xor(t8, xor_vector);
-            }
+            }*/
             vec_xst(t5, 0, vecOffset);
             vec_xst(t6, 0, vecOffset+16);
             vec_xst(t7, 0, vecOffset+32);
@@ -2164,12 +2164,12 @@ class tinyBLAS_Q0_PPC {
             t6 = vec_perm(t1, t3, swiz4);
             t7 = vec_perm(t2, t4, swiz3);
             t8 = vec_perm(t2, t4, swiz4);
-            if (flip == true) {
+            /*if (flip == true) {
                 t5 = vec_xor(t5, xor_vector);
                 t6 = vec_xor(t6, xor_vector);
                 t7 = vec_xor(t7, xor_vector);
                 t8 = vec_xor(t8, xor_vector);
-            }
+            }*/
             vec_xst(t5, 0, vecOffset+64);
             vec_xst(t6, 0, vecOffset+80);
             vec_xst(t7, 0, vecOffset+96);
@@ -2208,12 +2208,12 @@ class tinyBLAS_Q0_PPC {
             t6 = vec_perm(t1, t3, swiz4);
             t7 = vec_perm(t2, t4, swiz3);
             t8 = vec_perm(t2, t4, swiz4);
-            if (flip == true) {
+            /*if (flip == true) {
                 t5 = vec_xor(t5, xor_vector);
                 t6 = vec_xor(t6, xor_vector);
                 t7 = vec_xor(t7, xor_vector);
                 t8 = vec_xor(t8, xor_vector);
-            }
+            }*/
             vec_xst(t5, 0, vecOffset);
             vec_xst(t6, 0, vecOffset+16);
             vec_xst(t7, 0, vecOffset+32);
@@ -2227,12 +2227,12 @@ class tinyBLAS_Q0_PPC {
             t6 = vec_perm(t1, t3, swiz4);
             t7 = vec_perm(t2, t4, swiz3);
             t8 = vec_perm(t2, t4, swiz4);
-            if (flip == true) {
+            /*if (flip == true) {
                 t5 = vec_xor(t5, xor_vector);
                 t6 = vec_xor(t6, xor_vector);
                 t7 = vec_xor(t7, xor_vector);
                 t8 = vec_xor(t8, xor_vector);
-            }
+            }*/
             vec_xst(t5, 0, vecOffset+64);
             vec_xst(t6, 0, vecOffset+80);
             vec_xst(t7, 0, vecOffset+96);
@@ -2415,9 +2415,9 @@ class tinyBLAS_Q0_PPC {
             if (std::is_same_v<TA, block_q4_0>) {
                 packNormalInt4((A+(ii*lda)+l), lda, 4, 4, (int8_t*)vec_A, comparray);
             } else {
-                packNormal<int8_t, vector signed char>((const TB*)(A+(ii*lda)+l), lda, 4, 8, (int8_t*)vec_A, false);
+                packNormal<int8_t, vector signed char>((const TB*)(A+(ii*lda)+l), lda, 4, 8, (int8_t*)vec_A/*, false*/);
             }
-            packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true);
+            packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B/*, true*/);
             for(int x = 0; x < 8; x++) {
                 __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
                 __builtin_mma_xvi8ger4pp(&acc_1, vec_A[x], vec_B[x+8]);
@@ -2428,7 +2428,7 @@ class tinyBLAS_Q0_PPC {
                     *((float*)&vs[I+4]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J+4)*ldb)+l)->d));
                 }
             }
-            if (!isAblock_q4) {
+            /*if (!isAblock_q4) {
                 auto aoffset = A+(ii*lda)+l;
                 for (int i = 0; i < 4; i++) {
                     comparray[i] = 0;
@@ -2439,7 +2439,7 @@ class tinyBLAS_Q0_PPC {
                     comparray[i] = ca;
                     aoffset += lda;
                 }
-            }
+            }*/
             compute<4>(&acc_0, 0, 0, comparray, vs, fin_res);
             compute<4>(&acc_1, 0, 4, comparray, vs, fin_res);
         }
@@ -2460,9 +2460,9 @@ class tinyBLAS_Q0_PPC {
             if (std::is_same_v<TA, block_q4_0>) {
                 packNormalInt4((A+(ii*lda)+l), lda, 8, 4, (int8_t*)vec_A, comparray);
             } else {
-                packNormal<int8_t, vector signed char>((const TB*)(A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
+                packNormal<int8_t, vector signed char>((const TB*)(A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A/*, false*/);
             }
-            packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 4, 8, (uint8_t*)vec_B, true);
+            packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 4, 8, (uint8_t*)vec_B/*, true*/);
             for(int x = 0; x < 8; x++) {
                 __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
                 __builtin_mma_xvi8ger4pp(&acc_1, vec_A[x+8], vec_B[x]);
@@ -2472,7 +2472,7 @@ class tinyBLAS_Q0_PPC {
                     *((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
                 }
             }
-            if (!isAblock_q4) {
+            /*if (!isAblock_q4) {
                 auto aoffset = A+(ii*lda)+l;
                 for (int i = 0; i < 8; i++) {
                     comparray[i] = 0;
@@ -2483,7 +2483,7 @@ class tinyBLAS_Q0_PPC {
                     comparray[i] = ca;
                     aoffset += lda;
                 }
-            }
+            }*/
             compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
             compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
         }
@@ -2506,9 +2506,9 @@ class tinyBLAS_Q0_PPC {
             if (std::is_same_v<TA, block_q4_0>) {
                 packNormalInt4((A+(ii*lda)+l), lda, 8, 4, (int8_t*)vec_A, comparray);
             } else {
-                packNormal<int8_t, vector signed char>((const TB*)(A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
+                packNormal<int8_t, vector signed char>((const TB*)(A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A/*, false*/);
             }
-            packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true);
+            packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B/*, true*/);
             for(int x = 0; x < 8; x++) {
                 __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
                 __builtin_mma_xvi8ger4pp(&acc_1, vec_A[x+8], vec_B[x]);
@@ -2521,7 +2521,7 @@ class tinyBLAS_Q0_PPC {
                     *((float*)&vs[I+8]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J+4)*ldb)+l)->d));
                 }
             }
-            if (!isAblock_q4) {
+            /*if (!isAblock_q4) {
                 auto aoffset = A+(ii*lda)+l;
                 for (int i = 0; i < 8; i++) {
                     comparray[i] = 0;
@@ -2532,7 +2532,7 @@ class tinyBLAS_Q0_PPC {
                     comparray[i] = ca;
                     aoffset += lda;
                 }
-            }
+            }*/
             compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
             compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
             compute<8>(&acc_2, 0, 8, comparray, vs, fin_res);
@@ -2576,9 +2576,9 @@ class tinyBLAS_Q0_PPC {
         if (isAblock_q4) {
             packNormalInt4((A+(ii*lda)+l), lda, RM, 4, (int8_t*)vec_A, comparray);
         } else {
-            packNormal<int8_t, vector signed char>((const TB*)(A+(ii*lda)+l), lda, RM, 8, (int8_t*)vec_A, false);
+            packNormal<int8_t, vector signed char>((const TB*)(A+(ii*lda)+l), lda, RM, 8, (int8_t*)vec_A/*, false*/);
         }
-        packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, RN, 8, (uint8_t*)vec_B, true);
+        packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, RN, 8, (uint8_t*)vec_B/*, true*/);
         for(int x = 0; x < 8; x+=4) {
             __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
             __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+1], vec_B[x+1]);
@@ -2591,7 +2591,7 @@ class tinyBLAS_Q0_PPC {
             }
         }
         __builtin_mma_disassemble_acc(vec_C, &acc_0);
-        if (!isAblock_q4) {
+        /*if (!isAblock_q4) {
             auto aoffset = A+(ii*lda)+l;
             for (int i = 0; i < RM; i++) {
                 comparray[i] = 0;
@@ -2602,10 +2602,10 @@ class tinyBLAS_Q0_PPC {
                 comparray[i] = ca;
                 aoffset += lda;
             }
-        }
+        }*/
         for (int i = 0; i < RM; i++) {
-            CA[i] = vec_splats((float)(((double)comparray[i]) * -128.0));
-            res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
+            //CA[i] = vec_splats((float)(((double)comparray[i]) * -128.0));
+            res[i] = vec_ctf(vec_C[i], 0);
             fin_res[i] = vec_madd(res[i], vs[i], fin_res[i]);
         }
     }