@@ -2188,6 +2188,32 @@ class tinyBLAS_PPC {
21882188
21892189 void (tinyBLAS_PPC::*kernel)(int64_t , int64_t );
21902190
2191+ inline void save_acc (acc_t * ACC, int64_t ii, int64_t jj) {
2192+ vec_t vec_C[4 ];
2193+ __builtin_mma_disassemble_acc (vec_C, ACC);
2194+ for (int I = 0 ; I < 4 ; I++) {
2195+ for (int J = 0 ; J < 4 ; J++) {
2196+ *((float *)(C+ii+((jj+J)*ldc)+I)) = *((float *)&vec_C[J]+I);
2197+ }
2198+ }
2199+ }
2200+
2201+ void dump_vec_f (const char * name, vector float vec){
2202+ printf (" %s:\t " , name);
2203+ for (int i =0 ; i< 4 ; i++){
2204+ printf (" %-12.4f" , (float )vec[i]);
2205+ }
2206+ printf (" \n " );
2207+ }
2208+ void dump_acc (acc_t * acc, vector unsigned char * vec_C){
2209+ __builtin_mma_disassemble_acc (vec_C, acc);
2210+ for (int j = 0 ; j<4 ; j++) {
2211+ for (int i = 0 ; i< 4 ; i++){
2212+ printf (" %-12.4f " , *((float *)&vec_C[j]+i));
2213+ }
2214+ printf (" \n " );
2215+ }
2216+ }
21912217 template <typename VA>
21922218 void packTranspose (const TA* a, int64_t lda, int rows, int cols, TA* vec) {
21932219 int64_t i, j;
@@ -2497,19 +2523,34 @@ class tinyBLAS_PPC {
24972523 __builtin_mma_xxsetaccz (&acc_2);
24982524 __builtin_mma_xxsetaccz (&acc_3);
24992525 for (int l = 0 ; l < k; l+=8 ) {
2500- packTranspose<vector float >(A+(ii*lda)+l, lda, 8 , 8 , (TA*)vec_A);
2501- packTranspose<vector float >(B+(jj*ldb)+l, ldb, 8 , 8 , (TA*)vec_B);
2502- for (int x = 0 ; x < 16 ; x+=2 ) {
2503- __builtin_mma_xvf32gerpp (&acc_0, (vec_t )vec_A[x], vec_B[x]);
2504- __builtin_mma_xvf32gerpp (&acc_1, (vec_t )vec_A[x], vec_B[x+1 ]);
2505- __builtin_mma_xvf32gerpp (&acc_2, (vec_t )vec_A[x+1 ], vec_B[x]);
2506- __builtin_mma_xvf32gerpp (&acc_3, (vec_t )vec_A[x+1 ], vec_B[x+1 ]);
2507- }
2526+ for (int x = 0 ; x < 8 ; ++x) {
2527+ vec_A[2 * x] = (vec_t )vec_xl (0 , (float *)(A + (l + x) * lda + ii));
2528+ vec_A[2 * x + 1 ] = (vec_t )vec_xl (0 , (float *)(A + (l + x) * lda + ii + 4 ));
2529+
2530+ vec_B[2 * x] = (vec_t )vec_xl (0 , (float *)(B + (l + x) * ldb + jj));
2531+ vec_B[2 * x + 1 ] = (vec_t )vec_xl (0 , (float *)(B + (l + x) * ldb + jj + 4 ));
2532+ }
2533+
2534+ // packTranspose<vector float>(A+(ii*lda)+l, lda, 8, 8, (TA*)vec_A);
2535+ // packTranspose<vector float>(B+(jj*lda)+l, lda, 8, 8, (TA*)vec_B);
2536+ for (int i = 0 ; i< 16 ; i++) {
2537+ dump_vec_f (" A" , (vector float )vec_A[i]);
2538+ dump_vec_f (" B" , (vector float )vec_B[i]);
2539+ }
2540+ for (int x = 0 ; x < 16 ; x += 2 ) {
2541+ __builtin_mma_xvf32gerpp (&acc_0, vec_B[x], vec_A[x]);
2542+ __builtin_mma_xvf32gerpp (&acc_1, vec_B[x], vec_A[x+1 ]);
2543+ __builtin_mma_xvf32gerpp (&acc_2, vec_B[x+1 ], vec_A[x]);
2544+ __builtin_mma_xvf32gerpp (&acc_3, vec_B[x+1 ], vec_A[x+1 ]);
2545+ }
2546+
2547+ printf (" dumping acc_0 adfter l=%d\n " , l);
2548+ dump_acc (&acc_0, vec_C);
25082549 }
2509- SAVE_ACC (&acc_0, ii, jj );
2510- SAVE_ACC (&acc_1, ii, jj+4 );
2511- SAVE_ACC (&acc_2, ii+4 , jj );
2512- SAVE_ACC (&acc_3, ii +4 , jj +4 );
2550+ SAVE_ACC (&acc_0, jj, ii );
2551+ SAVE_ACC (&acc_1, jj+4 , ii );
2552+ SAVE_ACC (&acc_2, jj, ii+4 );
2553+ SAVE_ACC (&acc_3, jj +4 , ii +4 );
25132554 }
25142555
25152556 void mnpack (int64_t m0, int64_t m, int64_t n0, int64_t n) {
@@ -2838,6 +2879,15 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
28382879#elif defined(__MMA__)
28392880 if (k % 8 )
28402881 return false ;
2882+ printf (" Inside tinyblas\n " );
2883+ float * Ap = (float *)A;
2884+ float * Bp = (float *)B;
2885+ for (int i = 0 ; i<( k*m); i++)
2886+ printf (" %f\t " , *(Ap++));
2887+ printf (" \n *****************\n " );
2888+ for (int i = 0 ; i<( k*n); i++)
2889+ printf (" %f\t " , *(Bp++));
2890+ printf (" \n *****************\n " );
28412891 tinyBLAS_PPC<float , float , float > tb{
28422892 k, (const float *)A, lda,
28432893 (const float *)B, ldb,
0 commit comments