diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
index 2be54c31b5f3e..e1ad280121cbd 100644
--- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp
+++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
@@ -2175,14 +2175,39 @@ class tinyBLAS_PPC {
                  int ith, int nth)
         : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
     }
 
-    void matmul(int64_t m, int64_t n) {
-        mnpack(0, m, 0, n);
+    void matmul(int64_t m, int64_t n) {
+        int64_t mc = 256; int64_t nc = 256; int64_t kc = 256;
+        if (m % mc == 0 && n % nc == 0 && k % kc == 0) {
+            matmul_tiled(m, n, mc, nc, kc);
+        } else {
+            mnpack(0, m, 0, n);
+        }
     }
 
   private:
 
     void (tinyBLAS_PPC::*kernel)(int64_t, int64_t);
+
+    inline void save_acc(acc_t* ACC, int64_t ii, int64_t jj) {
+        vec_t vec_C[4];
+        __builtin_mma_disassemble_acc(vec_C, ACC);
+        for (int I = 0; I < 4; I++) {
+            for (int J = 0; J < 4; J++) {
+                *((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&vec_C[I]+J);
+            }
+        }
+    }
+
+    inline void add_save_acc(acc_t* ACC, int64_t ii, int64_t jj) {
+        vec_t vec_C[4];
+        __builtin_mma_disassemble_acc(vec_C, ACC);
+        for (int I = 0; I < 4; I++) {
+            for (int J = 0; J < 4; J++) {
+                float* c_ptr = (float*)(C+ii+((jj+J)*ldc)+I);
+                *c_ptr += *((float*)&vec_C[I]+J);
+            }
+        }
+    }
 
     inline void vector_permute_store_4(vector float *src, float *vecOffset) {
         vector float t1, t2, t3, t4, t5, t6, t7, t8;
@@ -2235,7 +2260,7 @@ class tinyBLAS_PPC {
         vec_xst(t8, 0, vecOffset + 28);
     }
 
-    void packTranspose(const float* a, int64_t lda, int rows, int cols, float* vec) {
+    void packTranspose(const float* a, int64_t lda, int rows, int cols, float* vec) {
         int64_t i, j;
         float * aoffsets[8];
         float *aoffset = NULL, *boffset = NULL;
@@ -2265,10 +2290,13 @@ class tinyBLAS_PPC {
             vector_permute_store_8(c1, boffset);
             vector_permute_store_8(c2, boffset+32);
-            for (int it = 0; it < 4; it++)
-                aoffsets[it] = aoffsets[it] + 8*lda;
             boffset += 64;
             i--;
+            if (i > 0) {
+                for (int it = 0; it < 8; it++) {
+                    aoffsets[it] = aoffsets[it] + 8;
+                }
+            }
         } while(i > 0);
     }
     if (cols & 4) {
@@ -2401,6 +2429,83 @@ class tinyBLAS_PPC {
         SAVE_ACC(&acc_3, ii+4, jj+4);
     }
 
+    inline void MMA_16x8(vec_t *vec_A0, vec_t *vec_A1, vec_t *vec_B, acc_t *acc) {
+        for (int x = 0; x < 16; x += 2) {
+            __builtin_mma_xvf32gerpp(&acc[0], vec_A0[x + 0], vec_B[x]);
+            __builtin_mma_xvf32gerpp(&acc[1], vec_A0[x + 0], vec_B[x + 1]);
+            __builtin_mma_xvf32gerpp(&acc[2], vec_A0[x + 1], vec_B[x]);
+            __builtin_mma_xvf32gerpp(&acc[3], vec_A0[x + 1], vec_B[x + 1]);
+            __builtin_mma_xvf32gerpp(&acc[4], vec_A1[x + 0], vec_B[x]);
+            __builtin_mma_xvf32gerpp(&acc[5], vec_A1[x + 0], vec_B[x + 1]);
+            __builtin_mma_xvf32gerpp(&acc[6], vec_A1[x + 1], vec_B[x]);
+            __builtin_mma_xvf32gerpp(&acc[7], vec_A1[x + 1], vec_B[x + 1]);
+        }
+    }
+
+    void KERNEL(int64_t ii, int64_t jj, int64_t mc, int64_t nc, int64_t kc, vec_t *vec_A, vec_t *vec_B, int64_t kk) {
+        for (int64_t i = 0; i < mc; i += 16) {
[...]
+    }
+
+    void matmul_tiled(int64_t m, int64_t n, int64_t mc, int64_t nc, int64_t kc) {
+        int64_t ytiles = m / mc;
+        int64_t xtiles = n / nc;
+        int64_t tiles = xtiles * ytiles;
+        int64_t duty = (tiles + nth - 1) / nth;
+        int64_t start = duty * ith;
+        int64_t end = start + duty;
+        if (end > tiles) {
+            end = tiles;
+        }
+        for (int64_t job = start; job < end; ++job) {
+            int64_t ii = (job / xtiles) * mc;
+            int64_t jj = (job % xtiles) * nc;
+            for (int64_t kk = 0; kk < k; kk += kc) {
+                vec_t A_pack[kc*mc/4];
+                vec_t B_pack[kc*nc/4];
+                packTranspose(A+(ii*lda)+kk, lda, kc, mc, (float*)A_pack);
+                packTranspose(B+(jj*ldb)+kk, ldb, kc, nc, (float*)B_pack);
+                KERNEL(ii, jj, mc, nc, kc, A_pack, B_pack, kk);
+            }
+        }
+    }
+
     void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
         int m_rem = MIN(m - m0, 8);
         int n_rem = MIN(n - n0, 8);
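
A note on the dispatch above: matmul only takes the tiled path when m, n, and k are exact multiples of the 256-element block sizes, so every thread sees whole tiles and whole k-slices. The sketch below is a minimal scalar model of that job split, assuming the same duty/start/end prologue used by the other tinyBLAS kernels in sgemm.cpp; matmul_tiled_model and its printf driver are illustrative names, not part of the patch, and the real loop body packs A/B panels with packTranspose and runs the MMA kernel instead of printing.

#include <cstdint>
#include <cstdio>

// Scalar model of matmul_tiled's work distribution: the (m/mc) x (n/nc)
// grid of output tiles is flattened into `tiles` jobs and split into
// contiguous ranges across `nth` threads.
static void matmul_tiled_model(int64_t m, int64_t n, int64_t k,
                               int64_t mc, int64_t nc, int64_t kc,
                               int ith, int nth) {
    int64_t ytiles = m / mc;                  // tile rows
    int64_t xtiles = n / nc;                  // tile columns
    int64_t tiles  = xtiles * ytiles;         // total jobs
    int64_t duty   = (tiles + nth - 1) / nth; // ceil(tiles / nth) per thread
    int64_t start  = duty * ith;
    int64_t end    = start + duty > tiles ? tiles : start + duty;
    for (int64_t job = start; job < end; ++job) {
        int64_t ii = (job / xtiles) * mc;     // tile's first row in C
        int64_t jj = (job % xtiles) * nc;     // tile's first column in C
        for (int64_t kk = 0; kk < k; kk += kc) {
            // Inferred from the kk parameter of KERNEL: the first k-slice
            // overwrites the C tile (save_acc), later slices accumulate
            // into it (add_save_acc).
            printf("thread %d: C tile (%lld,%lld), k-slice [%lld,%lld): %s\n",
                   ith, (long long)ii, (long long)jj,
                   (long long)kk, (long long)(kk + kc),
                   kk == 0 ? "store" : "add");
        }
    }
}

int main() {
    // e.g. 512x512x512 with 256-wide blocks on two threads: each thread
    // owns two of the four C tiles, each built from two k-slices.
    for (int t = 0; t < 2; ++t)
        matmul_tiled_model(512, 512, 512, 256, 256, 256, t, 2);
    return 0;
}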
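
For reading MMA_16x8, it may also help to model what a single __builtin_mma_xvf32gerpp does in scalar code: each acc_t holds a 4x4 tile of C, and each call adds the outer product of four A values (an m-slice) and four B values (an n-slice). The stand-in below is plain C++, not the POWER10 instruction; the row/column mapping in the comments is inferred from the packed layout and the C + ii + (jj+J)*ldc + I addressing in save_acc.

#include <cstdio>

// acc += a (4x1) * b (1x4): the rank-1 update one xvf32gerpp performs.
static void xvf32gerpp_model(float acc[4][4], const float *a, const float *b) {
    for (int r = 0; r < 4; r++)
        for (int c = 0; c < 4; c++)
            acc[r][c] += a[r] * b[c];
}

int main() {
    // One k-step of the 16x8 micro-kernel: 16 A rows arrive as four
    // 4-float slices (vec_A0[x], vec_A0[x+1], vec_A1[x], vec_A1[x+1]),
    // and 8 B columns as two 4-float slices (vec_B[x], vec_B[x+1]).
    float A[16], B[8];
    for (int i = 0; i < 16; i++) A[i] = (float)(i + 1);
    for (int j = 0; j < 8; j++)  B[j] = 0.5f * (float)(j + 1);

    float acc[8][4][4] = {};                  // the eight accumulators
    xvf32gerpp_model(acc[0], &A[0],  &B[0]);  // A rows 0..3   x B cols 0..3
    xvf32gerpp_model(acc[1], &A[0],  &B[4]);  // A rows 0..3   x B cols 4..7
    xvf32gerpp_model(acc[2], &A[4],  &B[0]);  // A rows 4..7   x B cols 0..3
    xvf32gerpp_model(acc[3], &A[4],  &B[4]);  // A rows 4..7   x B cols 4..7
    xvf32gerpp_model(acc[4], &A[8],  &B[0]);  // A rows 8..11  x B cols 0..3
    xvf32gerpp_model(acc[5], &A[8],  &B[4]);  // A rows 8..11  x B cols 4..7
    xvf32gerpp_model(acc[6], &A[12], &B[0]);  // A rows 12..15 x B cols 0..3
    xvf32gerpp_model(acc[7], &A[12], &B[4]);  // A rows 12..15 x B cols 4..7
    printf("acc[2][0][0] = %g (= A[4] * B[0])\n", acc[2][0][0]);
    return 0;
}

Eight accumulators therefore cover a 16x8 block of C, which is why KERNEL steps its row loop by 16 (and presumably its column loop by 8); save_acc/add_save_acc then write each 4x4 tile back to C.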