diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
index 2c4ad9d58b9f2..203723ecc08d7 100644
--- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp
+++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
@@ -117,8 +117,38 @@ inline float32x4_t mul(float32x4_t x, float32x4_t y) { return vec_mul(x, y); }
 #endif
 
 #if defined(__MMA__)
+#include <pthread.h>
+
 typedef vector unsigned char vec_t;
 typedef __vector_quad acc_t;
+// pthread TLS key under which each thread stores its thread_scratchpad_t.
+static pthread_key_t t_data_key;
+
+// Per-thread scratch buffers for the tiled path; released by thread_cleanup().
+struct thread_scratchpad_t {
+    vec_t* A_pack;      // packed panel of A
+    vec_t* B_pack;      // packed panel of B
+    int*   comparray;   // values written while packing A
+};
+
+// TLS destructor passed to pthread_key_create(): runs when a thread exits and
+// releases that thread's scratch buffers (allocated with new[] / new).
+// static: keeps this generically named helper out of the global symbol table.
+static void thread_cleanup(void* arg) {
+    thread_scratchpad_t* data = static_cast<thread_scratchpad_t*>(arg);
+    if (data == nullptr) {
+        return;
+    }
+    delete[] data->A_pack;
+    delete[] data->B_pack;
+    delete[] data->comparray;
+    delete data;
+}
+
+// True once t_data_key has been created. NOTE(review): plain bool with no synchronization of its own.
+static bool key_created = false;
+// Explicit declaration of the Power ISA intrinsic to resolve template lookup issues.
+extern "C" void __dcbst(int, const void*);
 #endif
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // VECTORIZED FUSED MULTIPLY ADD
@@ -1582,10 +1612,20 @@ class tinyBLAS_Q0_PPC {
                 float *C, int64_t ldc,
                 int ith, int nth)
         : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
+		kc=64;
+		//kc=k;
     }
-
-    void matmul(int64_t m, int64_t n) {
-        mnpack(0, m, 0, n);
+    void matmul_q8(int64_t m, int64_t n) {  // always the generic mnpack path over the full m x n range
+        mnpack(0, m, 0, n);
+    }
+    void matmul_q4(int64_t m, int64_t n) {
+        // Tiled path only when m, n are multiples of 64 and k passes the (kc - 1) mask test; else mnpack.
+        const int mc = 64, nc = 64;
+        if ((m & (mc - 1)) || (n & (nc - 1)) || (k & (kc - 1))) {
+            mnpack(0, m, 0, n);
+            return;
+        }
+        matmul_tiled(m, n, mc, nc, kc);
+    }
 
   private:
@@ -1597,9 +1637,28 @@ class tinyBLAS_Q0_PPC {
           }
        }
     }
+     void compute_scale(int64_t ii, int64_t jj, int blk, vector float* vs){
+       for (int I = 0; I<8; I++) {
+            float a_scale = unhalf((A+((ii+I)*lda)+blk)->d);// scale `d` of A row ii+I at block blk, read once per row
+            for (int J = 0; J<4; J++) {
+                    // Fills 16 vectors of combined scales for the 8x8 tile at (ii, jj), block blk:
+                    // lane J of vs[I] pairs A row ii+I with B row jj+J; lane J of vs[I+8] pairs it with B row jj+J+4.
 
-    template<int size>
-    inline void compute(acc_t* ACC, int c_idx, int s_idx, std::array<int, size>& comparray, vector float* vs, vector float* fin_res) {
+                    *((float*)&vs[I]+J) = (a_scale * unhalf((B+((jj+J)*ldb)+blk)->d));
+                    *((float*)&vs[I+8]+J) = (a_scale * unhalf((B+((jj+J+4)*ldb)+blk)->d));
+                }
+            }
+    }
+    // Adds lane `col` of fin_res[idx+row] onto C[(ii+row) + (jj+col)*ldc]; KERNEL_Q4 uses it when l != 0.
+    inline void add_save_res(int ii, int jj, int idx, vector float* fin_res, int RM=4, int RN=4) {
+        for (int row = 0; row < RM; row++) {
+            const float * lanes = (const float *)&fin_res[idx+row];
+            for (int col = 0; col < RN; col++)
+                C[(jj + col)*ldc + ii + row] += lanes[col];
+        }
+    }
+    template<typename ArrayType>
+    inline void compute(acc_t* ACC, int c_idx, int s_idx, ArrayType& comparray, vector float* vs, vector float* fin_res) {
        vector signed int vec_C[4];
        vector float CA[4] = {0};
        vector float res[4] = {0};
@@ -1610,6 +1669,18 @@ class tinyBLAS_Q0_PPC {
           fin_res[s_idx+i] = vec_madd(res[i], vs[s_idx+i], fin_res[s_idx+i]);
        }
     }
+    /*inline void compute_new(acc_t* ACC, int c_idx, int s_idx, int* comparray, vector float* vs, vector float* fin_res) {
+       vector signed int vec_C[4];
+       vector float CA[4] = {0};
+       vector float res[4] = {0};
+       __builtin_mma_disassemble_acc(vec_C, ACC);
+       for (int i = 0; i < 4; i++) {
+          CA[i] = vec_splats((float)(((double)comparray[c_idx+i]) * -128.0));
+          res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
+          fin_res[s_idx+i] = vec_madd(res[i], vs[s_idx+i], fin_res[s_idx+i]);
+       }
+    }*/
+
     /* This function processes quantized data from block_q4_0 elements.
      * First the we try to extract the two int4 values stored in single int8_t into two signed int8.
      * And then we subtract each of the resultant element with 8, to convert signed int8 to unsigned int8.
@@ -1629,6 +1700,7 @@ class tinyBLAS_Q0_PPC {
         vsum = vec_add(vsum, vsum2);
         *(ca) = vsum[0] + vsum[1] + vsum[2] + vsum[3];
     }
+    
 
     template <typename V1, typename V2>
     inline void vector_permute_store(V2 &s1, V2 &s2, V2 &s3, V2 &s4, V1 *vecOffset, bool flip) {
@@ -1660,8 +1732,61 @@ class tinyBLAS_Q0_PPC {
         vec_xst(t8, 0, vecOffset+48);
     }
 
-    template<int size>
-    void packNormalInt4(const TA* a, int64_t lda, int rows, int cols, int8_t* vec, std::array<int, size>& comparray) {
+    // Packs rows of A for the tiled MMA kernel, kc consecutive blocks per row.
+    //   a         - first block of the first row to pack
+    //   lda       - distance between consecutive rows of `a`, in blocks
+    //   rows      - row count; consumed in groups of 8, any remainder is ignored
+    //   cols      - not used by this routine
+    //   vec       - destination; 256 bytes are written per (row group, block)
+    //   comparray - destination for the value process_q4_elements() produces for
+    //               each (row, block); laid out as [group][blk][row within group]
+    void packNormalInt4_large(const TA* a, int64_t lda, int rows, int cols, int8_t* vec, int*comparray) {
+        TA *row_ptr[8];
+        // Slot [1] receives the raw qs load; process_q4_elements() is handed the
+        // whole pair and both slots are stored afterwards.
+        vector signed char quant[8][2] = {0};
+        TA *src = const_cast<TA*>(a);
+        int8_t *dst = vec;
+        int comp_base = 0;   // start of the current row group's slice of comparray
+
+        for (int64_t group = (rows >> 3); group > 0; group--) {
+            // Row pointers for this group of 8 consecutive rows.
+            for (int r = 0; r < 8; r++) {
+                row_ptr[r] = src + r*lda;
+            }
+            src += 8 * lda;
+
+            for (int blk = 0; blk < kc; blk++) {
+                for (int r = 0; r < 8; r++) {
+                    quant[r][1] = reinterpret_cast<vector signed char>(vec_xl(0, (row_ptr[r]+blk)->qs));
+                }
+                for (int r = 0; r < 8; r++) {
+                    process_q4_elements(quant[r], &comparray[comp_base + 8*blk + r]);
+                }
+
+                // Destination layout for this (group, block):
+                //   dst +   0 : rows 0-3, slot [0]
+                //   dst +  64 : rows 0-3, slot [1]
+                //   dst + 128 : rows 4-7, slot [0]
+                //   dst + 192 : rows 4-7, slot [1]
+                vector_permute_store<int8_t, vector signed char>(
+                    quant[0][0], quant[1][0], quant[2][0], quant[3][0], dst, false);
+                vector_permute_store<int8_t, vector signed char>(
+                    quant[0][1], quant[1][1], quant[2][1], quant[3][1], dst + 64, false);
+                vector_permute_store<int8_t, vector signed char>(
+                    quant[4][0], quant[5][0], quant[6][0], quant[7][0], dst + 128, false);
+                vector_permute_store<int8_t, vector signed char>(
+                    quant[4][1], quant[5][1], quant[6][1], quant[7][1], dst + 192, false);
+                dst += 256;
+            }
+
+            // Next group's comparray slice starts 8 entries x kc blocks further on.
+            comp_base += 8*kc;
+        }
+    }
+
+template<int size>
+void packNormalInt4(const TA* a, int64_t lda, int rows, int cols, int8_t* vec, std::array<int, size>& comparray) {
         int64_t i, j;
         TA *aoffset = NULL;
         int8_t *vecOffset = NULL;
@@ -1711,7 +1836,7 @@ class tinyBLAS_Q0_PPC {
                         aoffset2 += lda;
                         aoffset3 += lda;
                         aoffset4 += lda;
-                        aoffset5 += lda;
+			aoffset5 += lda;
                         aoffset6 += lda;
                         aoffset7 += lda;
                         aoffset8 += lda;
@@ -1722,7 +1847,6 @@ class tinyBLAS_Q0_PPC {
                 j--;
             } while (j > 0);
         }
-
         if (rows & 4) {
             aoffset1 = aoffset;
             aoffset2 = aoffset1 + lda;
@@ -1782,6 +1906,42 @@ class tinyBLAS_Q0_PPC {
         }
     }
     template<typename VA, typename VB>
+    // Packs rows of q8_0 blocks for the tiled MMA kernel, kc consecutive blocks per row.
+    //   a    - first block of the first row to pack
+    //   lda  - distance between consecutive rows of `a`, in blocks
+    //   rows - row count; consumed in groups of 8, any remainder is ignored
+    //   cols - not used by this routine
+    //   vec  - destination; advanced by 256 elements per (row group, block)
+    //   flip - forwarded unchanged to vector_permute_store
+    void packNormal_large(const block_q8_0* a, int64_t lda, int rows, int cols, VA* vec, bool flip) {
+        block_q8_0 *row_ptr[8];
+        __vector_pair pairs[8];
+        VB halves[8][2] = {0};   // the two vectors disassembled from each row's pair load
+        block_q8_0 *src = const_cast<block_q8_0*>(a);
+        VA *dst = vec;
+
+        for (int64_t group = (rows >> 3); group > 0; group--) {
+            for (int r = 0; r < 8; r++) {
+                row_ptr[r] = src + r*lda;
+            }
+            src += 8 * lda;
+
+            for (int blk = 0; blk < kc; blk++) {
+                for (int r = 0; r < 8; r++) {
+                    pairs[r] = __builtin_vsx_lxvp(0, (__vector_pair*)(row_ptr[r]+blk)->qs);
+                    __builtin_vsx_disassemble_pair(halves[r], &pairs[r]);
+                }
+
+                // Rows 0-3 first, then rows 4-7; within each, half 0 precedes half 1.
+                vector_permute_store<VA, VB>(halves[0][0], halves[1][0], halves[2][0], halves[3][0], dst, flip);
+                vector_permute_store<VA, VB>(halves[0][1], halves[1][1], halves[2][1], halves[3][1], dst + 64, flip);
+                vector_permute_store<VA, VB>(halves[4][0], halves[5][0], halves[6][0], halves[7][0], dst + 128, flip);
+                vector_permute_store<VA, VB>(halves[4][1], halves[5][1], halves[6][1], halves[7][1], dst + 192, flip);
+                dst += 256;
+            }
+        }
+    }
+ template<typename VA, typename VB>
     void packNormal(const block_q8_0* a, int64_t lda, int rows, int cols, VA* vec, bool flip) {
         int64_t i, j;
         block_q8_0 *aoffset = NULL;
@@ -1822,7 +1982,6 @@ class tinyBLAS_Q0_PPC {
             j--;
         } while(j > 0);
     }
-
     if (rows & 4) {
             aoffsets[0]  = aoffset;
             for (int it = 1; it < 4; it++ )
@@ -1919,6 +2078,7 @@ class tinyBLAS_Q0_PPC {
         vec_t vec_A[8], vec_B[16] = {0};
         acc_t acc_0, acc_1;
         std::array<int, 4> comparray {};
+	//int comparray[8] = {0};
         vector float fin_res[8] = {0};
         vector float vs[8] = {0};
         bool isAblock_q4 = std::is_same_v<TA, block_q4_0>;
@@ -1953,8 +2113,8 @@ class tinyBLAS_Q0_PPC {
                     aoffset += lda;
                 }
             }
-            compute<4>(&acc_0, 0, 0, comparray, vs, fin_res);
-            compute<4>(&acc_1, 0, 4, comparray, vs, fin_res);
+            compute(&acc_0, 0, 0, comparray, vs, fin_res);
+            compute(&acc_1, 0, 4, comparray, vs, fin_res);
         }
         save_res(ii, jj, 0, fin_res);
         save_res(ii, jj+4, 4, fin_res);
@@ -1964,6 +2124,7 @@ class tinyBLAS_Q0_PPC {
         vec_t vec_A[16], vec_B[8] = {0};
         acc_t acc_0, acc_1;
         std::array<int, 8> comparray {};
+	//int comparray[8] = {0};
         vector float fin_res[8] = {0};
         vector float vs[8] = {0};
         bool isAblock_q4 = std::is_same_v<TA, block_q4_0>;
@@ -1997,8 +2158,8 @@ class tinyBLAS_Q0_PPC {
                     aoffset += lda;
                 }
             }
-            compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
-            compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
+            compute(&acc_0, 0, 0, comparray, vs, fin_res);
+            compute(&acc_1, 4, 4, comparray, vs, fin_res);
         }
         save_res(ii, jj, 0, fin_res);
         save_res(ii+4, jj, 4, fin_res);
@@ -2007,6 +2168,7 @@ class tinyBLAS_Q0_PPC {
     void KERNEL_8x8(int64_t ii, int64_t jj) {
         vec_t vec_A[16], vec_B[16] = {0};
         acc_t acc_0, acc_1, acc_2, acc_3;
+	//int comparray[8] = {0};
         std::array<int, 8> comparray {};
         vector float fin_res[16] = {0};
         vector float vs[16] = {0};
@@ -2046,16 +2208,111 @@ class tinyBLAS_Q0_PPC {
                     aoffset += lda;
                 }
             }
-            compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
-            compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
-            compute<8>(&acc_2, 0, 8, comparray, vs, fin_res);
-            compute<8>(&acc_3, 4, 12, comparray, vs, fin_res);
+            compute(&acc_0, 0, 0, comparray, vs, fin_res);
+            compute(&acc_1, 4, 4, comparray, vs, fin_res);
+            compute(&acc_2, 0, 8, comparray, vs, fin_res);
+            compute(&acc_3, 4, 12, comparray, vs, fin_res);
         }
         save_res(ii, jj, 0, fin_res);
         save_res(ii+4, jj, 4, fin_res);
         save_res(ii, jj+4, 8, fin_res);
         save_res(ii+4, jj+4, 12, fin_res);
     }
+    // Multiplies one packed mc x nc tile over a panel of kc blocks, one 8x8 sub-tile at a time.
+    //   ii, jj    - row / column origin of the tile
+    //   mc, nc    - tile extent; stepped through in units of 8
+    //   kc        - number of blocks in the panel
+    //   l         - index of the panel's first block: results go through save_res when
+    //               l == 0 and through add_save_res (which adds onto C) otherwise
+    //   vec_A/B   - panels produced by packNormalInt4_large / packNormal_large
+    //   comparray - per-(row, block) values produced by packNormalInt4_large
+    void KERNEL_Q4(int64_t ii, int64_t jj, int64_t mc, int64_t nc, int64_t kc, int64_t l, vec_t *vec_A, vec_t *vec_B, int *comparray) {
+        acc_t acc[4];
+        for (int i = 0; i < mc; i += 8) {
+            for (int j = 0; j < nc; j += 8) {
+                vector float fin_res[16] = {0};
+                vector float vs[16] = {0};
+                for (int64_t kk = 0; kk < kc; kk++) {
+                    for (int x = 0; x < 4; x++) {
+                        __builtin_mma_xxsetaccz(&acc[x]);
+                    }
+                    // Packed panels hold 16 vec_t per (8-row group, block).
+                    vec_t *A_block = vec_A + (i/8)*(16*kc) + kk*16;
+                    vec_t *B_block = vec_B + (j/8)*(16*kc) + kk*16;
+                    for (int x = 0; x < 8; x++) {
+                        __builtin_mma_xvi8ger4pp(&acc[0], A_block[x],     B_block[x]);
+                        __builtin_mma_xvi8ger4pp(&acc[1], A_block[x + 8], B_block[x]);
+                        __builtin_mma_xvi8ger4pp(&acc[2], A_block[x],     B_block[x + 8]);
+                        __builtin_mma_xvi8ger4pp(&acc[3], A_block[x + 8], B_block[x + 8]);
+                    }
+                    compute_scale(ii+i, jj+j, l+kk, vs);
+                    // comparray holds 8 ints per (8-row group, block); compute() binds it by reference.
+                    int *c_block = comparray + (i/8)*(8*kc) + kk*8;
+                    compute(&acc[0], 0,  0, c_block, vs, fin_res);
+                    compute(&acc[1], 4,  4, c_block, vs, fin_res);
+                    compute(&acc[2], 0,  8, c_block, vs, fin_res);
+                    compute(&acc[3], 4, 12, c_block, vs, fin_res);
+                }
+
+                if (l == 0) {
+                    save_res(ii+i,   jj+j,    0, fin_res);
+                    save_res(ii+i+4, jj+j,    4, fin_res);
+                    save_res(ii+i,   jj+j+4,  8, fin_res);
+                    save_res(ii+i+4, jj+j+4, 12, fin_res);
+                } else {
+                    add_save_res(ii+i,   jj+j,    0, fin_res);
+                    add_save_res(ii+i+4, jj+j,    4, fin_res);
+                    add_save_res(ii+i,   jj+j+4,  8, fin_res);
+                    add_save_res(ii+i+4, jj+j+4, 12, fin_res);
+                }
+            }
+        }
+    }
+
+    // Tiled matmul for m, n, k that are whole multiples of mc, nc, kc.
+    // Tiles of C are split into contiguous per-thread ranges; for each tile, k is
+    // walked in kc-block panels that are packed into per-thread scratch buffers.
+    // NOTE: scratch is sized by each thread's first call; mc/nc/kc must not grow later.
+    void matmul_tiled(int64_t m, int64_t n, int64_t mc, int64_t nc, int64_t kc) {
+        // A function-local static is initialized exactly once even when several
+        // worker threads enter together, so the key cannot be created twice.
+        static const bool key_ok = [] {
+            if (!key_created) {
+                key_created = (pthread_key_create(&t_data_key, thread_cleanup) == 0);
+            }
+            return key_created;
+        }();
+        if (!key_ok) {
+            mnpack(0, m, 0, n);  // same outcome on every thread: generic path instead of leaving C unwritten
+            return;
+        }
+        thread_scratchpad_t* t_data = static_cast<thread_scratchpad_t*>(pthread_getspecific(t_data_key));
+        bool in_tls = (t_data != nullptr);
+        if (!in_tls) {
+            t_data = new thread_scratchpad_t;
+            t_data->A_pack = new vec_t[mc * kc * 2];
+            t_data->B_pack = new vec_t[nc * kc * 2];
+            t_data->comparray = new int[mc * kc];
+            in_tls = (pthread_setspecific(t_data_key, t_data) == 0);
+        }
+        const int64_t ytiles = m / mc;
+        const int64_t xtiles = n / nc;
+        const int64_t tiles  = xtiles * ytiles;
+        const int64_t duty   = (tiles + nth - 1) / nth;
+        const int64_t start  = duty * ith;
+        const int64_t end    = (start + duty > tiles) ? tiles : start + duty;
+        for (int64_t job = start; job < end; ++job) {
+            const int64_t ii = (job / xtiles) * mc;
+            const int64_t jj = (job % xtiles) * nc;
+            for (int64_t kk = 0; kk < k; kk += kc) {
+                packNormalInt4_large(A + ii*lda + kk, lda, mc, 4, (int8_t*)t_data->A_pack, t_data->comparray);
+                packNormal_large<uint8_t, vector unsigned char>(B + jj*ldb + kk, ldb, nc, 8, (uint8_t*)t_data->B_pack, true);
+                KERNEL_Q4(ii, jj, mc, nc, kc, kk, t_data->A_pack, t_data->B_pack, t_data->comparray);
+            }
+        }
+        if (!in_tls) thread_cleanup(t_data);  // never registered with the key: free now instead of leaking
+    }
+
 
     void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n, int RM, int RN) {
         int64_t ytiles = (m - m0) / RM;
@@ -2075,6 +2332,7 @@ class tinyBLAS_Q0_PPC {
             int64_t ii = m0 + job / xtiles * RM;
             int64_t jj = n0 + job % xtiles * RN;
             std::array<int, 4> comparray{};
+	    //int comparray[4] = {0};
             vector float res[4] = {0};
             vector float fin_res[4] = {0};
             vector float vs[4] = {0};
@@ -2159,6 +2417,7 @@ class tinyBLAS_Q0_PPC {
     const block_q8_0 *const B;
     float *C;
     const int64_t k;
+    int64_t kc;
     const int64_t lda;
     const int64_t ldb;
     const int64_t ldc;
@@ -2856,7 +3115,7 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
             (const block_q8_0 *)B, ldb,
             (float *)C, ldc,
             params->ith, params->nth};
-        tb.matmul(m, n);
+        tb.matmul_q8(m, n);
         return true;
 #else
         return false;
@@ -2893,7 +3152,7 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
             (const block_q8_0 *)B, ldb,
             (float *)C, ldc,
             params->ith, params->nth};
-        tb.matmul(m, n);
+        tb.matmul_q4(m, n);
         return true;
 #else
         return false;