Skip to content

Commit 6728495

Browse files
committed
Try to Optimize ppc Fp32 tiny blas kernels
This patch removes the redundant vec_perm instructions by reordering the matrix multiplication in the kernel. Signed-off-by: Shalini Salomi Bodapati <[email protected]>
1 parent 19e899c commit 6728495

File tree

1 file changed

+62
-12
lines changed

1 file changed

+62
-12
lines changed

ggml/src/ggml-cpu/llamafile/sgemm.cpp

Lines changed: 62 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2188,6 +2188,32 @@ class tinyBLAS_PPC {
21882188

21892189
void (tinyBLAS_PPC::*kernel)(int64_t, int64_t);
21902190

2191+
// Writes the contents of one MMA accumulator into the output matrix C.
//
// The accumulator is first disassembled into four vectors. The float in
// lane `lane` of vector `v` is then stored at C + ii + (jj + v) * ldc + lane,
// so 16 floats are written in total.
//
// ACC: accumulator to disassemble and store.
// ii, jj: element offsets into C; jj is scaled by ldc, ii is not.
// NOTE(review): this looks like a 4x4 tile store with ldc as the leading
// dimension of C - confirm against the callers.
inline void save_acc(acc_t* ACC, int64_t ii, int64_t jj) {
    vec_t tile[4];
    __builtin_mma_disassemble_acc(tile, ACC);
    for (int lane = 0; lane < 4; lane++) {
        for (int v = 0; v < 4; v++) {
            // View the v-th disassembled vector as four consecutive floats.
            const float* src = (const float*)&tile[v];
            float* dst = (float*)(C + ii + ((jj + v) * ldc) + lane);
            *dst = src[lane];
        }
    }
}
2200+
2201+
void dump_vec_f(const char * name, vector float vec){
2202+
printf("%s:\t", name);
2203+
for(int i =0; i< 4; i++){
2204+
printf("%-12.4f", (float)vec[i]);
2205+
}
2206+
printf("\n");
2207+
}
2208+
// Debug helper: disassembles `acc` into `vec_C` and prints the result to
// stdout as four lines of four floats each.
//
// acc:   accumulator to dump.
// vec_C: scratch buffer that receives the disassembled accumulator; entries
//        vec_C[0] .. vec_C[3] are overwritten and then read back as floats.
void dump_acc(acc_t * acc, vector unsigned char* vec_C){
    __builtin_mma_disassemble_acc(vec_C, acc);
    for (int row = 0; row < 4; ++row) {
        // Reinterpret the row's vector as four consecutive floats.
        const float* lanes = (float*)&vec_C[row];
        for (int col = 0; col < 4; ++col) {
            printf("%-12.4f ", lanes[col]);
        }
        printf("\n");
    }
}
21912217
template<typename VA>
21922218
void packTranspose(const TA* a, int64_t lda, int rows, int cols, TA* vec) {
21932219
int64_t i, j;
@@ -2497,19 +2523,34 @@ class tinyBLAS_PPC {
24972523
__builtin_mma_xxsetaccz(&acc_2);
24982524
__builtin_mma_xxsetaccz(&acc_3);
24992525
for (int l = 0; l < k; l+=8) {
2500-
packTranspose<vector float>(A+(ii*lda)+l, lda, 8, 8, (TA*)vec_A);
2501-
packTranspose<vector float>(B+(jj*ldb)+l, ldb, 8, 8, (TA*)vec_B);
2502-
for(int x = 0; x < 16; x+=2) {
2503-
__builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[x], vec_B[x]);
2504-
__builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[x], vec_B[x+1]);
2505-
__builtin_mma_xvf32gerpp(&acc_2, (vec_t)vec_A[x+1], vec_B[x]);
2506-
__builtin_mma_xvf32gerpp(&acc_3, (vec_t)vec_A[x+1], vec_B[x+1]);
2507-
}
2526+
for (int x = 0; x < 8; ++x) {
2527+
vec_A[2 * x] = (vec_t)vec_xl(0, (float *)(A + (l + x) * lda + ii));
2528+
vec_A[2 * x + 1] = (vec_t)vec_xl(0, (float *)(A + (l + x) * lda + ii + 4));
2529+
2530+
vec_B[2 * x] = (vec_t)vec_xl(0, (float *)(B + (l + x) * ldb + jj));
2531+
vec_B[2 * x + 1] = (vec_t)vec_xl(0, (float *)(B + (l + x) * ldb + jj + 4));
2532+
}
2533+
2534+
//packTranspose<vector float>(A+(ii*lda)+l, lda, 8, 8, (TA*)vec_A);
2535+
//packTranspose<vector float>(B+(jj*lda)+l, lda, 8, 8, (TA*)vec_B);
2536+
for (int i = 0; i< 16; i++) {
2537+
dump_vec_f("A", (vector float)vec_A[i]);
2538+
dump_vec_f("B", (vector float)vec_B[i]);
2539+
}
2540+
for (int x = 0; x < 16; x += 2) {
2541+
__builtin_mma_xvf32gerpp(&acc_0, vec_B[x], vec_A[x]);
2542+
__builtin_mma_xvf32gerpp(&acc_1, vec_B[x], vec_A[x+1]);
2543+
__builtin_mma_xvf32gerpp(&acc_2, vec_B[x+1], vec_A[x]);
2544+
__builtin_mma_xvf32gerpp(&acc_3, vec_B[x+1], vec_A[x+1]);
2545+
}
2546+
2547+
printf("dumping acc_0 adfter l=%d\n", l);
2548+
dump_acc(&acc_0, vec_C);
25082549
}
2509-
SAVE_ACC(&acc_0, ii, jj);
2510-
SAVE_ACC(&acc_1, ii, jj+4);
2511-
SAVE_ACC(&acc_2, ii+4, jj);
2512-
SAVE_ACC(&acc_3, ii+4, jj+4);
2550+
SAVE_ACC(&acc_0, jj, ii );
2551+
SAVE_ACC(&acc_1, jj+4, ii);
2552+
SAVE_ACC(&acc_2, jj, ii+4);
2553+
SAVE_ACC(&acc_3, jj+4, ii+4);
25132554
}
25142555

25152556
void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
@@ -2838,6 +2879,15 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
28382879
#elif defined(__MMA__)
28392880
if (k % 8)
28402881
return false;
2882+
printf("Inside tinyblas\n");
2883+
float * Ap = (float*)A;
2884+
float * Bp = (float*)B;
2885+
for (int i = 0; i<( k*m); i++)
2886+
printf("%f\t", *(Ap++));
2887+
printf("\n*****************\n");
2888+
for (int i = 0; i<( k*n); i++)
2889+
printf("%f\t", *(Bp++));
2890+
printf("\n*****************\n");
28412891
tinyBLAS_PPC<float, float, float> tb{
28422892
k, (const float *)A, lda,
28432893
(const float *)B, ldb,

0 commit comments

Comments
 (0)