From e516cd0056afaff096dd2c4564c4659f345f3c0e Mon Sep 17 00:00:00 2001
From: Uttam Pawar <upawar@amd.com>
Date: Mon, 17 Nov 2025 23:47:00 +0000
Subject: [PATCH] Keep barrier at the end of the loop to synchronise threads,
 reducing cache-line contention (cache HITM). This improves throughput in
 cases where threads have to wait due to lack of work, causing the process
 to spend many cycles in a spin loop. This makes it possible to replace the
 dynamic chunk counter with static stride partitioning, which further helps
 by eliminating the shared counter.

* remove one barrier in sgemm()

* static stride partitioning
---
 ggml/src/ggml-cpu/llamafile/sgemm.cpp | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
index 2c4ad9d58..231522683 100644
--- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp
+++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
@@ -446,10 +446,7 @@ class tinyBLAS {
             ggml_threadpool_chunk_set(params->threadpool, params->nth);
         }
 
-        ggml_barrier(params->threadpool);
-
-        int64_t job = params->ith;
-        while (job < nb_job) {
+        for (int64_t job = params->ith; job < nb_job; job += params->nth) {
             const int64_t ii = (job % ytiles) * RM * BM;
             const int64_t jb =  job / ytiles;
             const int64_t jr0 = BLOC_POS(jb  , jj_BN, SIZE_BN);
@@ -472,7 +469,6 @@ class tinyBLAS {
                 GGML_ASSERT(jj == jj2);
             }
 
-            job = ggml_threadpool_chunk_add(params->threadpool, 1);
         }
 
         ggml_barrier(params->threadpool);