@@ -1582,13 +1582,79 @@ class tinyBLAS_Q0_PPC {
15821582                float  *C, int64_t  ldc,
15831583                int  ith, int  nth)
15841584        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
1585+ 		kc = 8 ;
15851586    }
15861587
15871588    void  matmul (int64_t  m, int64_t  n) {
1589+ 	int  mc = 8 ; int  nc = 8 ;
1590+ 	if  (m%mc == 0  && n%nc == 0  && k%kc == 0 ) {
1591+ 	   // debug_print_q4_0((const block_q4_0 *)A, lda, m);
1592+ 	   // debug_print_q8_0((const block_q8_0 *)B, ldb, n);
1593+ 	   matmul_tiled (m, n, mc, nc, kc);
1594+ 	}
1595+ 	else  {
1596+ 	// debug_print_q4_0((const block_q4_0 *)A, lda, m);
1597+ 	// debug_print_q8_0((const block_q8_0 *)B, ldb, n);
15881598        mnpack (0 , m, 0 , n);
1599+ 	}
15891600    }
15901601
15911602  private: 
1603+      void  debug_print_q4_0 (const  block_q4_0 *A, int  lda, int  m) {
1604+        printf (" \n ===== Matrix A (Q4_0) =====\n "  );
1605+        for  (int  i = 0 ; i < m; i++) {
1606+            //  each block holds QK4_0 values (usually 32)
1607+            for  (int  blk = 0 ; blk < lda; blk++) {
1608+ 	       const  block_q4_0* bb = A + i*lda + blk;
1609+        	   float  d = GGML_FP16_TO_FP32 (bb->d );
1610+            printf (" Row %d: d = %f, qs = "  , i, d);
1611+ 	       for  ( int  x = 0 ; x< QK4_0/2 ; x++)  {
1612+                    uint8_t  q = bb->qs [x];
1613+                    int8_t  q0 = (q & 0x0F ) - 8 ;      //  lower nibble
1614+                    int8_t  q1 = ((q >> 4 ) & 0x0F ) - 8 ; //  upper nibble
1615+                    printf (" %d %d "  , q0, q1);
1616+                }
1617+            printf (" \n "  );
1618+    }
1619+        }
1620+    }
1621+ 
1622+ 
1623+    void  debug_print_q8_0 (const  block_q8_0 *B, int  ldb, int  n) {
1624+        printf (" \n ===== Matrix B (Q8_0) =====\n "  );
1625+        for  (int  j = 0 ; j < n; j++) {
1626+            printf (" Col %d : "  , j);
1627+            for  (int  blk = 0 ; blk < k; blk++) {
1628+                const  block_q8_0 *bb = B + j*ldb + blk;
1629+        	   float  d = GGML_FP16_TO_FP32 (bb->d );
1630+                printf ("  [d=%f, qs="  , d);
1631+                for  (int  x = 0 ; x < QK8_0; x++) {
1632+                    printf (" %d "  , bb->qs [x]);
1633+                }
1634+                printf (" ]\n "  );
1635+            }
1636+            printf (" \n "  );
1637+        }
1638+    }
1639+     void  print_vec_q4 (const  char * name, vec_t  vec) {
1640+     printf (" %s:\t "  , name);
1641+     for  (int  i = 0 ; i < 16 ; i++) {
1642+         uint8_t  byte = (uint8_t ) vec[i];   //  take the raw 8-bit value
1643+ 
1644+         int8_t  lo = (byte & 0x0F ) - 8 ;     //  lower nibble (0–15) → shift to signed (-8..7)
1645+         int8_t  hi = ((byte >> 4 ) & 0x0F ) - 8 ; //  upper nibble
1646+ 
1647+         printf (" (%2d,%2d) "  , lo, hi);
1648+     }
1649+     printf (" \n "  );
1650+    }
1651+ 
1652+    void  print_vec_q8 (vec_t   vec){
1653+         for  (int  i = 0 ; i<16 ; i++) {
1654+             printf (" %-5d "  , *((int8_t *)&vec[i]));
1655+         }
1656+         printf (" \n "  );
1657+     }
15921658
15931659    inline  void  save_res (int  ii, int  jj, int  idx, vector float * fin_res, int  RM=4 , int  RN=4 ) {
15941660       for  (int  I = 0 ; I < RM; I++) {
@@ -1598,8 +1664,17 @@ class tinyBLAS_Q0_PPC {
15981664       }
15991665    }
16001666
1667+     inline  void  add_save_res (int  ii, int  jj, int  idx, vector float * fin_res, int  RM=4 , int  RN=4 ) {
1668+        for  (int  I = 0 ; I < RM; I++) {
1669+           for  (int  J = 0 ; J < RN; J++) {
1670+ 	      float  * c_ptr = (float  *)(C+ii+((jj+J)*ldc)+I);
1671+              *c_ptr += *((float *)&fin_res[idx+I]+J);
1672+           }
1673+        }
1674+     }
1675+ 
16011676    template <int  size>
1602-     inline  void  compute (acc_t * ACC, int  c_idx, int  s_idx, std::array< int , size>&  comparray, vector float * vs, vector float * fin_res) {
1677+     inline  void  compute (acc_t * ACC, int  c_idx, int  s_idx, int *  comparray, vector float * vs, vector float * fin_res) {
16031678       vector signed  int  vec_C[4 ];
16041679       vector float  CA[4 ] = {0 };
16051680       vector float  res[4 ] = {0 };
@@ -1630,6 +1705,27 @@ class tinyBLAS_Q0_PPC {
16301705        *(ca) = vsum[0 ] + vsum[1 ] + vsum[2 ] + vsum[3 ];
16311706    }
16321707
1708+     inline  void  compute_scale (int64_t  ii, int64_t  jj, int  blk, vector float * vs){
1709+ 	float  a_scales[8 ];
1710+ 	for  (int  I = 0 ; I < 8 ; ++I) {
1711+     	    a_scales[I] = unhalf ((A + ((ii + I) * lda) + blk)->d );
1712+ 	}
1713+ 
1714+ 	float  tmp_bl[4 ], tmp_br[4 ];
1715+ 	for  (int  J = 0 ; J < 4 ; ++J) {
1716+     	    tmp_bl[J] = unhalf ((B + ((jj + J) * ldb) + blk)->d );
1717+     	    tmp_br[J] = unhalf ((B + ((jj + J + 4 ) * ldb) + blk)->d );
1718+ 	}
1719+ 	vector float  vec_bl = vec_xl (0 , tmp_bl);   //  or vec_xl(0, tmp_bl)
1720+ 	vector float  vec_br = vec_xl (0 , tmp_br);
1721+ 
1722+ 	for  (int  I = 0 ; I < 8 ; ++I) {
1723+     	    vector float  a_vec = vec_splats (a_scales[I]);
1724+     	    vs[I]     = vec_mul (a_vec, vec_bl);   //  left half
1725+             vs[I + 8 ] = vec_mul (a_vec, vec_br);   //  right half
1726+ 	}
1727+     }
1728+ 
16331729    template  <typename  V1, typename  V2>
16341730    inline  void  vector_permute_store (V2 &s1, V2 &s2, V2 &s3, V2 &s4, V1 *vecOffset, bool  flip) {
16351731        vector unsigned  char  swiz1 = {0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 16 , 17 , 18 , 19 , 20 , 21 , 22 , 23 };
@@ -1661,7 +1757,7 @@ class tinyBLAS_Q0_PPC {
16611757    }
16621758
16631759    template <int  size>
1664-     void  packNormalInt4 (const  TA* a, int64_t  lda, int  rows, int  cols, int8_t * vec, std::array< int , size>&  comparray) {
1760+     void  packNormalInt4 (const  TA* a, int64_t  lda, int  rows, int  cols, int8_t * vec, int *  comparray) {
16651761        int64_t  i, j;
16661762        TA *aoffset = NULL ;
16671763        int8_t  *vecOffset = NULL ;
@@ -1670,7 +1766,9 @@ class tinyBLAS_Q0_PPC {
16701766        vector signed  char  c1[2 ] = {0 }, c2[2 ] = {0 }, c3[2 ] = {0 }, c4[2 ] = {0 };
16711767        vector signed  char  c5[2 ] = {0 }, c6[2 ] = {0 }, c7[2 ] = {0 }, c8[2 ] = {0 };
16721768        aoffset = const_cast <TA*>(a);
1769+ 	int  index = 0 ;
16731770        vecOffset = vec;
1771+ 	// int kc = 1;
16741772        j = (rows >> 3 );
16751773        if  (j > 0 ) {
16761774            do  {
@@ -1683,43 +1781,36 @@ class tinyBLAS_Q0_PPC {
16831781                aoffset7 = aoffset6 + lda;
16841782                aoffset8 = aoffset7 + lda;
16851783                aoffset += 8  * lda;
1686-                 i = (cols >> 2 );
1687-                 if  (i > 0 ) {
1688-                     do  {
1689-                         c1[1 ] = reinterpret_cast <vector signed  char >(vec_xl (0 , aoffset1->qs ));
1690-                         c2[1 ] = reinterpret_cast <vector signed  char >(vec_xl (0 , aoffset2->qs ));
1691-                         c3[1 ] = reinterpret_cast <vector signed  char >(vec_xl (0 , aoffset3->qs ));
1692-                         c4[1 ] = reinterpret_cast <vector signed  char >(vec_xl (0 , aoffset4->qs ));
1693-                         c5[1 ] = reinterpret_cast <vector signed  char >(vec_xl (0 , aoffset5->qs ));
1694-                         c6[1 ] = reinterpret_cast <vector signed  char >(vec_xl (0 , aoffset6->qs ));
1695-                         c7[1 ] = reinterpret_cast <vector signed  char >(vec_xl (0 , aoffset7->qs ));
1696-                         c8[1 ] = reinterpret_cast <vector signed  char >(vec_xl (0 , aoffset8->qs ));
1697- 
1698-                         process_q4_elements (c1, &comparray[0 ]);
1699-                         process_q4_elements (c2, &comparray[1 ]);
1700-                         process_q4_elements (c3, &comparray[2 ]);
1701-                         process_q4_elements (c4, &comparray[3 ]);
1702-                         process_q4_elements (c5, &comparray[4 ]);
1703-                         process_q4_elements (c6, &comparray[5 ]);
1704-                         process_q4_elements (c7, &comparray[6 ]);
1705-                         process_q4_elements (c8, &comparray[7 ]);
1784+                 for  (int  blk = 0 ; blk < kc; blk++) {
1785+ 		    // float scale = GGML_FP16_TO_FP32((aoffset1+blk)->d);
1786+ 		    // printf("packed block0 with scale=%f\n", scale);
1787+                         c1[1 ] = reinterpret_cast <vector signed  char >(vec_xl (0 , (aoffset1+blk)->qs ));
1788+                         c2[1 ] = reinterpret_cast <vector signed  char >(vec_xl (0 , (aoffset2+blk)->qs ));
1789+                         c3[1 ] = reinterpret_cast <vector signed  char >(vec_xl (0 , (aoffset3+blk)->qs ));
1790+                         c4[1 ] = reinterpret_cast <vector signed  char >(vec_xl (0 , (aoffset4+blk)->qs ));
1791+                         c5[1 ] = reinterpret_cast <vector signed  char >(vec_xl (0 , (aoffset5+blk)->qs ));
1792+                         c6[1 ] = reinterpret_cast <vector signed  char >(vec_xl (0 , (aoffset6+blk)->qs ));
1793+                         c7[1 ] = reinterpret_cast <vector signed  char >(vec_xl (0 , (aoffset7+blk)->qs ));
1794+                         c8[1 ] = reinterpret_cast <vector signed  char >(vec_xl (0 , (aoffset8+blk)->qs ));
1795+ 		    // scale = GGML_FP16_TO_FP32((aoffset8+blk)->d);
1796+ 		    // printf("packed block8 with scale=%f\n", scale);
1797+ 
1798+                         process_q4_elements (c1, &comparray[index + 8 *blk+0 ]);
1799+                         process_q4_elements (c2, &comparray[index + 8 *blk+1 ]);
1800+                         process_q4_elements (c3, &comparray[index + 8 *blk+2 ]);
1801+                         process_q4_elements (c4, &comparray[index + 8 *blk+3 ]);
1802+                         process_q4_elements (c5, &comparray[index + 8 *blk+4 ]);
1803+                         process_q4_elements (c6, &comparray[index + 8 *blk+5 ]);
1804+                         process_q4_elements (c7, &comparray[index + 8 *blk+6 ]);
1805+                         process_q4_elements (c8, &comparray[index + 8 *blk+7 ]);
17061806                        vector_permute_store<int8_t , vector signed  char >(c1[0 ], c2[0 ], c3[0 ], c4[0 ], vecOffset, false );
17071807                        vector_permute_store<int8_t , vector signed  char >(c1[1 ], c2[1 ], c3[1 ], c4[1 ], vecOffset+64 , false );
17081808                        vector_permute_store<int8_t , vector signed  char >(c5[0 ], c6[0 ], c7[0 ], c8[0 ], vecOffset+128 , false );
17091809                        vector_permute_store<int8_t , vector signed  char >(c5[1 ], c6[1 ], c7[1 ], c8[1 ], vecOffset+192 , false );
1710-                         aoffset1 += lda;
1711-                         aoffset2 += lda;
1712-                         aoffset3 += lda;
1713-                         aoffset4 += lda;
1714-                         aoffset5 += lda;
1715-                         aoffset6 += lda;
1716-                         aoffset7 += lda;
1717-                         aoffset8 += lda;
17181810                        vecOffset += 256 ;
1719-                         i--;
1720-                     } while  (i > 0 );
1721-                 }
1811+                     }
17221812                j--;
1813+ 		index += 8 *kc;
17231814            } while  (j > 0 );
17241815        }
17251816
@@ -1792,19 +1883,16 @@ class tinyBLAS_Q0_PPC {
17921883        VB c1[8 ] = {0 }; VB c2[8 ] = {0 };
17931884        aoffset = const_cast <block_q8_0*>(a);
17941885        vecOffset = vec;
1886+ 	// int kc = 1;
17951887        j = (rows >> 3 );
17961888        if  (j > 0 ) {
17971889            do  {
1798-                 aoffsets[0 ] = aoffset;
1799-                 for  (int  it = 1 ; it < 8 ; it++)
1800-                     aoffsets[it] = aoffsets[it-1 ] + lda;
1890+                 for  (int  it = 0 ; it < 8 ; it++)
1891+                     aoffsets[it] = aoffset + it*lda;
18011892                aoffset += 8  * lda;
1802- 
1803-                 i = (cols >> 3 );
1804-                 if  (i > 0 ) {
1805-                 do  {
1893+                 for  (int  blk = 0 ; blk < kc; blk++) {
18061894                    for  (int  it = 0 ; it < 8 ; it++) {
1807-                         arr[it] = __builtin_vsx_lxvp (0 , (__vector_pair*)aoffsets[it]->qs );
1895+                         arr[it] = __builtin_vsx_lxvp (0 , (__vector_pair*)( aoffsets[it]+blk) ->qs );
18081896                        __builtin_vsx_disassemble_pair (c[it], &arr[it]);
18091897                        c1[it] = c[it][0 ];
18101898                        c2[it] = c[it][1 ];
@@ -1813,12 +1901,8 @@ class tinyBLAS_Q0_PPC {
18131901                    vector_permute_store<VA, VB>(c2[0 ], c2[1 ], c2[2 ], c2[3 ], vecOffset+64 , flip);
18141902                    vector_permute_store<VA, VB>(c1[4 ], c1[5 ], c1[6 ], c1[7 ], vecOffset+128 , flip);
18151903                    vector_permute_store<VA, VB>(c2[4 ], c2[5 ], c2[6 ], c2[7 ], vecOffset+192 , flip);
1816-                     for  (int  it = 0 ; it < 8 ; it++)
1817-                         aoffsets[it] += lda;
18181904                    vecOffset += 256 ;
1819-                     i--;
1820-                } while (i > 0 );
1821-             }
1905+ 		}
18221906            j--;
18231907        } while (j > 0 );
18241908    }
@@ -1918,7 +2002,8 @@ class tinyBLAS_Q0_PPC {
19182002    void  KERNEL_4x8 (int64_t  ii, int64_t  jj) {
19192003        vec_t  vec_A[8 ], vec_B[16 ] = {0 };
19202004        acc_t  acc_0, acc_1;
1921-         std::array<int , 4 > comparray {};
2005+         // std::array<int, 4> comparray {};
2006+         int  comparray[8 ] = {0 };
19222007        vector float  fin_res[8 ] = {0 };
19232008        vector float  vs[8 ] = {0 };
19242009        bool  isAblock_q4 = std::is_same_v<TA, block_q4_0>;
@@ -1963,7 +2048,8 @@ class tinyBLAS_Q0_PPC {
19632048    void  KERNEL_8x4 (int64_t  ii, int64_t  jj) {
19642049        vec_t  vec_A[16 ], vec_B[8 ] = {0 };
19652050        acc_t  acc_0, acc_1;
1966-         std::array<int , 8 > comparray {};
2051+         // std::array<int, 8> comparray {};
2052+         int  comparray[8 ] = {0 };
19672053        vector float  fin_res[8 ] = {0 };
19682054        vector float  vs[8 ] = {0 };
19692055        bool  isAblock_q4 = std::is_same_v<TA, block_q4_0>;
@@ -2005,9 +2091,11 @@ class tinyBLAS_Q0_PPC {
20052091    }
20062092
20072093    void  KERNEL_8x8 (int64_t  ii, int64_t  jj) {
2094+ 	printf (" In kernel 8x8 with ii = %ld jj = %ld\n "  , ii, jj);
20082095        vec_t  vec_A[16 ], vec_B[16 ] = {0 };
20092096        acc_t  acc_0, acc_1, acc_2, acc_3;
2010-         std::array<int , 8 > comparray {};
2097+         // std::array<int, 8> comparray {};
2098+         int  comparray[8 ] = {0 };
20112099        vector float  fin_res[16 ] = {0 };
20122100        vector float  vs[16 ] = {0 };
20132101        bool  isAblock_q4 = std::is_same_v<TA, block_q4_0>;
@@ -2017,10 +2105,12 @@ class tinyBLAS_Q0_PPC {
20172105            __builtin_mma_xxsetaccz (&acc_2);
20182106            __builtin_mma_xxsetaccz (&acc_3);
20192107            if  (std::is_same_v<TA, block_q4_0>) {
2108+ 		    printf (" calling packNormal for A matrix l = %d\n "  , l);
20202109               packNormalInt4<8 >((A+(ii*lda)+l), lda, 8 , 4 , (int8_t *)vec_A, comparray);
20212110            } else  {
20222111               packNormal<int8_t , vector signed  char >((const  block_q8_0*)(A+(ii*lda)+l), lda, 8 , 8 , (int8_t *)vec_A, false );
20232112            }
2113+ 		    printf (" calling packNormal for B matrix l = %d\n "  , l);
20242114            packNormal<uint8_t , vector unsigned  char >((B+(jj*ldb)+l), ldb, 8 , 8 , (uint8_t *)vec_B, true );
20252115            for (int  x = 0 ; x < 8 ; x++) {
20262116                __builtin_mma_xvi8ger4pp (&acc_0, vec_A[x], vec_B[x]);
@@ -2057,6 +2147,64 @@ class tinyBLAS_Q0_PPC {
20572147        save_res (ii+4 , jj+4 , 12 , fin_res);
20582148    }
20592149
2150+     void  KERNEL_Q4 (int64_t  ii, int64_t  jj, int64_t  mc, int64_t  nc, int64_t  kc, vec_t  *vec_A, vec_t  *vec_B, int  *comparray) {
2151+ 	acc_t  acc[4 ];
2152+ 	for  (int  i = 0 ; i < mc ; i += 8 ) {
2153+ 	    for  (int  j = 0 ; j < nc; j += 8 ) {
2154+ 		vector float  fin_res[16 ] = {0 };
2155+ 		vector float  vs[16 ] = {0 };
2156+     		for  (int64_t  kk = 0 ; kk < kc; kk++) {
2157+     		    for  (int  x = 0 ; x < 4 ; x++) {
2158+             		__builtin_mma_xxsetaccz (&acc[x]);
2159+     		    }
2160+             	    int  A_block_idx = (i/8 )*(16 *kc) + kk*16 ;
2161+             	    int  B_block_idx = (j/8 )*(16 *kc)+ kk*16 ;
2162+             	    vec_t  *A_block = &vec_A[A_block_idx];
2163+             	    vec_t  *B_block = &vec_B[B_block_idx];
2164+ 
2165+                     for  (int  x = 0 ; x < 8 ; x++) {
2166+                         __builtin_mma_xvi8ger4pp (&acc[0 ], A_block[x],     B_block[x]);
2167+                         __builtin_mma_xvi8ger4pp (&acc[1 ], A_block[x + 8 ], B_block[x]);
2168+                         __builtin_mma_xvi8ger4pp (&acc[2 ], A_block[x],     B_block[x+8 ]);
2169+                         __builtin_mma_xvi8ger4pp (&acc[3 ], A_block[x+8 ],   B_block[x+8 ]);
2170+                     }
2171+ 		    compute_scale (ii+i, jj+j, kk, vs);
2172+ 	    	    int  c_index = (i/8 )*(8 *kc)+ kk*8 ;
2173+ 	    	    int * c_block = &comparray[c_index];
2174+     	            compute<8 >(&acc[0 ], 0 ,  0 ,  c_block, vs, fin_res);
2175+                     compute<8 >(&acc[1 ], 4 ,  4 ,  c_block, vs, fin_res);
2176+                     compute<8 >(&acc[2 ], 0 ,  8 ,  c_block, vs, fin_res);
2177+                     compute<8 >(&acc[3 ], 4 , 12 ,  c_block, vs, fin_res);
2178+ 		}
2179+         	add_save_res (ii+i,   jj+j,    0 ,  fin_res);
2180+         	add_save_res (ii+i+4 , jj+j,    4 ,  fin_res);
2181+         	add_save_res (ii+i,   jj+j+4 ,  8 ,  fin_res);
2182+         	add_save_res (ii+i+4 , jj+j+4 , 12 ,  fin_res);
2183+ 	    }
2184+         
2185+            }
2186+     }
2187+ 
2188+     void  matmul_tiled (int64_t  m, int64_t  n, int64_t  mc, int64_t  nc, int64_t  kc) {
2189+         int64_t  ytiles = m / mc;
2190+         int64_t  xtiles = n / nc;
2191+         int64_t  tiles  = xtiles * ytiles;
2192+ 
2193+         for  (int64_t  job = 0 ; job < tiles; job++) {
2194+             int64_t  ii = (job / xtiles) * mc;
2195+             int64_t  jj = (job % xtiles) * nc;
2196+ 
2197+             for  (int64_t  kk = 0 ; kk < k; kk += kc) {
2198+                 vec_t  A_pack[mc*kc*2 ];   //  int4 → int8_t storage
2199+                 vec_t  B_pack[nc*kc*2 ];
2200+                 int  comparray[mc*kc];       //  scales for A
2201+                 packNormalInt4<8 >(A + ii*lda + kk, lda, mc, 4 , (int8_t *)A_pack, comparray);
2202+                 packNormal<uint8_t , vector unsigned  char >(B + jj*ldb + kk, ldb, nc, 8 , (uint8_t *)B_pack, true );
2203+                 KERNEL_Q4 (ii, jj, mc, nc, kc, A_pack, B_pack, comparray);
2204+         }
2205+       }
2206+     }
2207+ 
20602208    void  gemm_small (int64_t  m0, int64_t  m, int64_t  n0, int64_t  n, int  RM, int  RN) {
20612209        int64_t  ytiles = (m - m0) / RM;
20622210        int64_t  xtiles = (n - n0) / RN;
@@ -2074,7 +2222,8 @@ class tinyBLAS_Q0_PPC {
20742222        for  (int64_t  job = start; job < end; ++job) {
20752223            int64_t  ii = m0 + job / xtiles * RM;
20762224            int64_t  jj = n0 + job % xtiles * RN;
2077-             std::array<int , 4 > comparray{};
2225+             // std::array<int, 4> comparray{};
2226+             int  comparray[4 ] = {0 };
20782227            vector float  res[4 ] = {0 };
20792228            vector float  fin_res[4 ] = {0 };
20802229            vector float  vs[4 ] = {0 };
@@ -2159,6 +2308,7 @@ class tinyBLAS_Q0_PPC {
21592308    const  block_q8_0 *const  B;
21602309    float  *C;
21612310    const  int64_t  k;
2311+     int64_t  kc;
21622312    const  int64_t  lda;
21632313    const  int64_t  ldb;
21642314    const  int64_t  ldc;
0 commit comments