diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp index 2c4ad9d58b9f2..203723ecc08d7 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp +++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp @@ -117,8 +117,38 @@ inline float32x4_t mul(float32x4_t x, float32x4_t y) { return vec_mul(x, y); } #endif #if defined(__MMA__) +#include + typedef vector unsigned char vec_t; typedef __vector_quad acc_t; +// Global key for thread-local storage cleanup +static pthread_key_t t_data_key; + +// Structure to hold the thread-local buffers +typedef struct { + vec_t* A_pack; + vec_t* B_pack; + int* comparray; +} thread_scratchpad_t; + +// Function run automatically when a thread exits +void thread_cleanup(void* arg) { + thread_scratchpad_t* data = (thread_scratchpad_t*)arg; + if (data) { + // Use 'delete[]' as the memory was allocated with 'new[]' + delete[] data->A_pack; + delete[] data->B_pack; + delete[] data->comparray; + + // Free the structure + delete data; + } +} + +// Global flag to ensure key creation happens only once +static bool key_created = false; +// Explicit declaration of the Power ISA intrinsic to resolve template lookup issues. +extern "C" void __dcbst(int, const void*); #endif //////////////////////////////////////////////////////////////////////////////////////////////////// // VECTORIZED FUSED MULTIPLY ADD @@ -1582,10 +1612,20 @@ class tinyBLAS_Q0_PPC { float *C, int64_t ldc, int ith, int nth) : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) { + kc=64; + //kc=k; } - - void matmul(int64_t m, int64_t n) { - mnpack(0, m, 0, n); + void matmul_q8(int64_t m, int64_t n) { + mnpack(0, m, 0, n); + } + void matmul_q4(int64_t m, int64_t n) { + int mc = 64; int nc = 64; + const bool is_aligned = ((m & (mc - 1)) == 0) & ((n & (nc - 1)) == 0) & ((k & (kc - 1)) == 0); + if ( is_aligned) { + matmul_tiled(m, n, mc, nc, kc); + } else { + mnpack(0, m, 0, n); + } } private: @@ -1597,9 +1637,28 @@ class tinyBLAS_Q0_PPC { } } } + void compute_scale(int64_t ii, int64_t jj, int blk, vector float* vs){ + for (int I = 0; I<8; I++) { + float a_scale = unhalf((A+((ii+I)*lda)+blk)->d);// * unhalf((B+((jj+J)*ldb)+blk)->d)); + for (int J = 0; J<4; J++) { + //*((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+blk)->d) * unhalf((B+((jj+J)*ldb)+blk)->d)); + //*((float*)&vs[I+8]+J) = (unhalf((A+((ii+I)*lda)+blk)->d) * unhalf((B+((jj+J+4)*ldb)+blk)->d)); - template - inline void compute(acc_t* ACC, int c_idx, int s_idx, std::array& comparray, vector float* vs, vector float* fin_res) { + *((float*)&vs[I]+J) = (a_scale * unhalf((B+((jj+J)*ldb)+blk)->d)); + *((float*)&vs[I+8]+J) = (a_scale * unhalf((B+((jj+J+4)*ldb)+blk)->d)); + } + } + } + inline void add_save_res(int ii, int jj, int idx, vector float* fin_res, int RM=4, int RN=4) { + for (int I = 0; I < RM; I++) { + for (int J = 0; J < RN; J++) { + float * c_ptr = (float *)(C+ii+((jj+J)*ldc)+I); + *c_ptr += *((float*)&fin_res[idx+I]+J); + } + } + } + template + inline void compute(acc_t* ACC, int c_idx, int s_idx, ArrayType& comparray, vector float* vs, vector float* fin_res) { vector signed int vec_C[4]; vector float CA[4] = {0}; vector float res[4] = {0}; @@ -1610,6 +1669,18 @@ class tinyBLAS_Q0_PPC { fin_res[s_idx+i] = vec_madd(res[i], vs[s_idx+i], fin_res[s_idx+i]); } } + /*inline void compute_new(acc_t* ACC, int c_idx, int s_idx, int* comparray, vector float* vs, vector float* fin_res) { + vector signed int vec_C[4]; + vector float CA[4] = {0}; + vector float res[4] = {0}; + __builtin_mma_disassemble_acc(vec_C, ACC); + for (int i = 0; i < 4; i++) { + CA[i] = vec_splats((float)(((double)comparray[c_idx+i]) * -128.0)); + res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]); + fin_res[s_idx+i] = vec_madd(res[i], vs[s_idx+i], fin_res[s_idx+i]); + } + }*/ + /* This function processes quantized data from block_q4_0 elements. * First the we try to extract the two int4 values stored in single int8_t into two signed int8. * And then we subtract each of the resultant element with 8, to convert signed int8 to unsigned int8. @@ -1629,6 +1700,7 @@ class tinyBLAS_Q0_PPC { vsum = vec_add(vsum, vsum2); *(ca) = vsum[0] + vsum[1] + vsum[2] + vsum[3]; } + template inline void vector_permute_store(V2 &s1, V2 &s2, V2 &s3, V2 &s4, V1 *vecOffset, bool flip) { @@ -1660,8 +1732,61 @@ class tinyBLAS_Q0_PPC { vec_xst(t8, 0, vecOffset+48); } - template - void packNormalInt4(const TA* a, int64_t lda, int rows, int cols, int8_t* vec, std::array& comparray) { + void packNormalInt4_large(const TA* a, int64_t lda, int rows, int cols, int8_t* vec, int*comparray) { + int64_t i, j; + TA *aoffset = NULL; + int8_t *vecOffset = NULL; + TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL; + TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL; + vector signed char c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2] = {0}; + vector signed char c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2] = {0}; + aoffset = const_cast(a); + vecOffset = vec; + int index = 0; + j = (rows >> 3); + if (j > 0) { + do { + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset += 8 * lda; + for (int blk = 0; blk < kc; blk++) { + c1[1] = reinterpret_cast(vec_xl(0, (aoffset1+blk)->qs)); + c2[1] = reinterpret_cast(vec_xl(0, (aoffset2+blk)->qs)); + c3[1] = reinterpret_cast(vec_xl(0, (aoffset3+blk)->qs)); + c4[1] = reinterpret_cast(vec_xl(0, (aoffset4+blk)->qs)); + c5[1] = reinterpret_cast(vec_xl(0, (aoffset5+blk)->qs)); + c6[1] = reinterpret_cast(vec_xl(0, (aoffset6+blk)->qs)); + c7[1] = reinterpret_cast(vec_xl(0, (aoffset7+blk)->qs)); + c8[1] = reinterpret_cast(vec_xl(0, (aoffset8+blk)->qs)); + + process_q4_elements(c1, &comparray[index + 8*blk+0]); + process_q4_elements(c2, &comparray[index + 8*blk+1]); + process_q4_elements(c3, &comparray[index + 8*blk+2]); + process_q4_elements(c4, &comparray[index + 8*blk+3]); + process_q4_elements(c5, &comparray[index + 8*blk+4]); + process_q4_elements(c6, &comparray[index + 8*blk+5]); + process_q4_elements(c7, &comparray[index + 8*blk+6]); + process_q4_elements(c8, &comparray[index + 8*blk+7]); + vector_permute_store(c1[0], c2[0], c3[0], c4[0], vecOffset, false); + vector_permute_store(c1[1], c2[1], c3[1], c4[1], vecOffset+64, false); + vector_permute_store(c5[0], c6[0], c7[0], c8[0], vecOffset+128, false); + vector_permute_store(c5[1], c6[1], c7[1], c8[1], vecOffset+192, false); + vecOffset += 256; + } + j--; + index += 8*kc; + } while (j > 0); + } + } + +template +void packNormalInt4(const TA* a, int64_t lda, int rows, int cols, int8_t* vec, std::array& comparray) { int64_t i, j; TA *aoffset = NULL; int8_t *vecOffset = NULL; @@ -1711,7 +1836,7 @@ class tinyBLAS_Q0_PPC { aoffset2 += lda; aoffset3 += lda; aoffset4 += lda; - aoffset5 += lda; + aoffset5 += lda; aoffset6 += lda; aoffset7 += lda; aoffset8 += lda; @@ -1722,7 +1847,6 @@ class tinyBLAS_Q0_PPC { j--; } while (j > 0); } - if (rows & 4) { aoffset1 = aoffset; aoffset2 = aoffset1 + lda; @@ -1782,6 +1906,42 @@ class tinyBLAS_Q0_PPC { } } template + void packNormal_large(const block_q8_0* a, int64_t lda, int rows, int cols, VA* vec, bool flip) { + int64_t i, j; + block_q8_0 *aoffset = NULL; + VA *vecOffset = NULL; + block_q8_0* aoffsets[8]; + __vector_pair arr[8]; + VB c[8][2] = {0}; + VB c1[8] = {0}; VB c2[8] = {0}; + aoffset = const_cast(a); + vecOffset = vec; + j = (rows >> 3); + if (j > 0) { + do { + for (int it = 0; it < 8; it++) + aoffsets[it] = aoffset + it*lda; + aoffset += 8 * lda; + for (int blk = 0; blk < kc; blk++) { + for (int it = 0; it < 8; it++) { + arr[it] = __builtin_vsx_lxvp(0, (__vector_pair*)(aoffsets[it]+blk)->qs); + __builtin_vsx_disassemble_pair(c[it], &arr[it]); + c1[it] = c[it][0]; + c2[it] = c[it][1]; + } + vector_permute_store(c1[0], c1[1], c1[2], c1[3], vecOffset, flip); + vector_permute_store(c2[0], c2[1], c2[2], c2[3], vecOffset+64, flip); + vector_permute_store(c1[4], c1[5], c1[6], c1[7], vecOffset+128, flip); + vector_permute_store(c2[4], c2[5], c2[6], c2[7], vecOffset+192, flip); + /*for (int it = 0; it < 8; it++) + aoffsets[it] += lda;*/ + vecOffset += 256; + } + j--; + } while(j > 0); + } +} + template void packNormal(const block_q8_0* a, int64_t lda, int rows, int cols, VA* vec, bool flip) { int64_t i, j; block_q8_0 *aoffset = NULL; @@ -1822,7 +1982,6 @@ class tinyBLAS_Q0_PPC { j--; } while(j > 0); } - if (rows & 4) { aoffsets[0] = aoffset; for (int it = 1; it < 4; it++ ) @@ -1919,6 +2078,7 @@ class tinyBLAS_Q0_PPC { vec_t vec_A[8], vec_B[16] = {0}; acc_t acc_0, acc_1; std::array comparray {}; + //int comparray[8] = {0}; vector float fin_res[8] = {0}; vector float vs[8] = {0}; bool isAblock_q4 = std::is_same_v; @@ -1953,8 +2113,8 @@ class tinyBLAS_Q0_PPC { aoffset += lda; } } - compute<4>(&acc_0, 0, 0, comparray, vs, fin_res); - compute<4>(&acc_1, 0, 4, comparray, vs, fin_res); + compute(&acc_0, 0, 0, comparray, vs, fin_res); + compute(&acc_1, 0, 4, comparray, vs, fin_res); } save_res(ii, jj, 0, fin_res); save_res(ii, jj+4, 4, fin_res); @@ -1964,6 +2124,7 @@ class tinyBLAS_Q0_PPC { vec_t vec_A[16], vec_B[8] = {0}; acc_t acc_0, acc_1; std::array comparray {}; + //int comparray[8] = {0}; vector float fin_res[8] = {0}; vector float vs[8] = {0}; bool isAblock_q4 = std::is_same_v; @@ -1997,8 +2158,8 @@ class tinyBLAS_Q0_PPC { aoffset += lda; } } - compute<8>(&acc_0, 0, 0, comparray, vs, fin_res); - compute<8>(&acc_1, 4, 4, comparray, vs, fin_res); + compute(&acc_0, 0, 0, comparray, vs, fin_res); + compute(&acc_1, 4, 4, comparray, vs, fin_res); } save_res(ii, jj, 0, fin_res); save_res(ii+4, jj, 4, fin_res); @@ -2007,6 +2168,7 @@ class tinyBLAS_Q0_PPC { void KERNEL_8x8(int64_t ii, int64_t jj) { vec_t vec_A[16], vec_B[16] = {0}; acc_t acc_0, acc_1, acc_2, acc_3; + //int comparray[8] = {0}; std::array comparray {}; vector float fin_res[16] = {0}; vector float vs[16] = {0}; @@ -2046,16 +2208,111 @@ class tinyBLAS_Q0_PPC { aoffset += lda; } } - compute<8>(&acc_0, 0, 0, comparray, vs, fin_res); - compute<8>(&acc_1, 4, 4, comparray, vs, fin_res); - compute<8>(&acc_2, 0, 8, comparray, vs, fin_res); - compute<8>(&acc_3, 4, 12, comparray, vs, fin_res); + compute(&acc_0, 0, 0, comparray, vs, fin_res); + compute(&acc_1, 4, 4, comparray, vs, fin_res); + compute(&acc_2, 0, 8, comparray, vs, fin_res); + compute(&acc_3, 4, 12, comparray, vs, fin_res); } save_res(ii, jj, 0, fin_res); save_res(ii+4, jj, 4, fin_res); save_res(ii, jj+4, 8, fin_res); save_res(ii+4, jj+4, 12, fin_res); } + void KERNEL_Q4(int64_t ii, int64_t jj, int64_t mc, int64_t nc, int64_t kc, int64_t l, vec_t *vec_A, vec_t *vec_B, int *comparray) { + acc_t acc[4]; + for (int i = 0; i < mc ; i += 8) { + for (int j = 0; j < nc; j += 8) { + //printf("in kernel q4 with i=%d j = %d\n", i, j); + vector float fin_res[16] = {0}; + vector float vs[16] = {0}; + for (int64_t kk = 0; kk < kc; kk++) { + //printf("Block kk=%d\n", kk); + for (int x = 0; x < 4; x++) { + __builtin_mma_xxsetaccz(&acc[x]); + } + int A_block_idx = (i/8)*(16*kc) + kk*16; + int B_block_idx = (j/8)*(16*kc)+ kk*16; + vec_t *A_block = &vec_A[A_block_idx]; + vec_t *B_block = &vec_B[B_block_idx]; + /*printf("printing input vectors A and B\n"); + for (int i = 0; i< 16; i++){ + print_vec_q8("A", A_block[i]); + print_vec_q8("B", B_block[i]); + }*/ + for (int x = 0; x < 8; x++) { + __builtin_mma_xvi8ger4pp(&acc[0], A_block[x], B_block[x]); + __builtin_mma_xvi8ger4pp(&acc[1], A_block[x + 8], B_block[x]); + __builtin_mma_xvi8ger4pp(&acc[2], A_block[x], B_block[x+8]); + __builtin_mma_xvi8ger4pp(&acc[3], A_block[x+8], B_block[x+8]); + } + compute_scale(ii+i, jj+j, l+kk, vs); + int c_index = (i/8)*(8*kc)+ kk*8; + int* c_block = &comparray[c_index]; + compute(&acc[0], 0, 0, c_block, vs, fin_res); + compute(&acc[1], 4, 4, c_block, vs, fin_res); + compute(&acc[2], 0, 8, c_block, vs, fin_res); + compute(&acc[3], 4, 12, c_block, vs, fin_res); + } + if (l == 0) { + save_res(ii+i, jj+j, 0, fin_res); + save_res(ii+i+4, jj+j, 4, fin_res); + save_res(ii+i, jj+j+4, 8, fin_res); + save_res(ii+i+4, jj+j+4, 12, fin_res); + } else { + add_save_res(ii+i, jj+j, 0, fin_res); + add_save_res(ii+i+4, jj+j, 4, fin_res); + add_save_res(ii+i, jj+j+4, 8, fin_res); + add_save_res(ii+i+4, jj+j+4, 12, fin_res); + } + } + + } + } + + void matmul_tiled(int64_t m, int64_t n, int64_t mc, int64_t nc, int64_t kc) { + if (!key_created) { + if (pthread_key_create(&t_data_key, thread_cleanup) == 0) { + key_created = true; + } else { + return; + } + } + + thread_scratchpad_t* t_data = (thread_scratchpad_t*)pthread_getspecific(t_data_key); + + if (t_data == nullptr) { + t_data = new thread_scratchpad_t; + + t_data->A_pack = new vec_t[mc * kc * 2]; + t_data->B_pack = new vec_t[nc * kc * 2]; + t_data->comparray = new int[mc * kc]; + pthread_setspecific(t_data_key, t_data); + } + + vec_t* A_pack = t_data->A_pack; + vec_t* B_pack = t_data->B_pack; + int* comparray = t_data->comparray; + + int64_t ytiles = m / mc; + int64_t xtiles = n / nc; + int64_t tiles = xtiles * ytiles; + int64_t duty = (tiles + nth - 1) / nth; + int64_t start = duty * ith; + int64_t end = start + duty; + if (end > tiles) { + end = tiles; + } + for (int64_t job = start; job < end; ++job) { + int64_t ii = (job / xtiles) * mc; + int64_t jj = (job % xtiles) * nc; + for (int64_t kk = 0; kk < k; kk += kc) { + packNormalInt4_large(A + ii*lda + kk, lda, mc, 4, (int8_t*)A_pack, comparray); + packNormal_large(B + jj*ldb + kk, ldb, nc, 8, (uint8_t*)B_pack, true); + KERNEL_Q4(ii, jj, mc, nc, kc, kk, A_pack, B_pack, comparray); + } + } + } + void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n, int RM, int RN) { int64_t ytiles = (m - m0) / RM; @@ -2075,6 +2332,7 @@ class tinyBLAS_Q0_PPC { int64_t ii = m0 + job / xtiles * RM; int64_t jj = n0 + job % xtiles * RN; std::array comparray{}; + //int comparray[4] = {0}; vector float res[4] = {0}; vector float fin_res[4] = {0}; vector float vs[4] = {0}; @@ -2159,6 +2417,7 @@ class tinyBLAS_Q0_PPC { const block_q8_0 *const B; float *C; const int64_t k; + int64_t kc; const int64_t lda; const int64_t ldb; const int64_t ldc; @@ -2856,7 +3115,7 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64 (const block_q8_0 *)B, ldb, (float *)C, ldc, params->ith, params->nth}; - tb.matmul(m, n); + tb.matmul_q8(m, n); return true; #else return false; @@ -2893,7 +3152,7 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64 (const block_q8_0 *)B, ldb, (float *)C, ldc, params->ith, params->nth}; - tb.matmul(m, n); + tb.matmul_q4(m, n); return true; #else return false;