From e516cd0056afaff096dd2c4564c4659f345f3c0e Mon Sep 17 00:00:00 2001 From: Uttam Pawar Date: Mon, 17 Nov 2025 23:47:00 +0000 Subject: [PATCH] Keep barrier at the end of loop to synchronise threads reducing cache-line contention (cache HITM) This improves throughput in cases where threads have to wait due to a lack of work, which causes the process to spend many cycles in a spin loop. It also makes it possible to replace the dynamic chunk counter with static stride partitioning, which eliminates the shared counter. * remove one barrier in sgemm() * static stride partitioning --- ggml/src/ggml-cpu/llamafile/sgemm.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp index 2c4ad9d58..231522683 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp +++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp @@ -446,10 +446,7 @@ class tinyBLAS { ggml_threadpool_chunk_set(params->threadpool, params->nth); } - ggml_barrier(params->threadpool); - - int64_t job = params->ith; - while (job < nb_job) { + for (int64_t job = params->ith; job < nb_job; job += params->nth) { const int64_t ii = (job % ytiles) * RM * BM; const int64_t jb = job / ytiles; const int64_t jr0 = BLOC_POS(jb , jj_BN, SIZE_BN); @@ -472,7 +469,6 @@ class tinyBLAS { GGML_ASSERT(jj == jj2); } - job = ggml_threadpool_chunk_add(params->threadpool, 1); } ggml_barrier(params->threadpool);