From e0101cfe5a188d0416abb772c5d7f58216e7c474 Mon Sep 17 00:00:00 2001 From: Saood Karim Date: Sun, 2 Feb 2025 13:18:33 -0600 Subject: [PATCH 1/7] NUMA-aware KV cache buffer type (experimental) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Stanisław Szymczyk --- ggml/include/ggml-backend.h | 2 + ggml/src/ggml-backend.c | 84 +++++++++++++++++++++++++++++++++++++ src/llama.cpp | 10 +++++ 3 files changed, 96 insertions(+) diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index 5f3f1e286..156b5d823 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -109,6 +109,8 @@ extern "C" { GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void); + GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_numa_buffer_type(void); + #ifdef GGML_USE_CPU_HBM GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void); #endif diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c index fd538f50e..8d30298c5 100644 --- a/ggml/src/ggml-backend.c +++ b/ggml/src/ggml-backend.c @@ -936,6 +936,90 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, siz return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size); } + + +// NUMA buffer interface - similar to CPU, but with pages allocated accordingly to a NUMA first-touch policy + +#include + +GGML_CALL static void ggml_backend_numa_buffer_free_buffer(ggml_backend_buffer_t buffer) { + if (munmap((char *) buffer->context, buffer->size)) { + GGML_LOG_WARN("warning: munmap failed: %s\n", strerror(errno)); + } +} + +GGML_CALL static void ggml_backend_numa_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + if (posix_madvise(buffer->context, buffer->size, POSIX_MADV_DONTNEED)) { + GGML_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_DONTNEED) failed: %s\n", + strerror(errno)); + } +} + +static const struct ggml_backend_buffer_i ggml_backend_numa_buffer_i = { + /* .free_buffer = */ ggml_backend_numa_buffer_free_buffer, + /* .get_base = */ ggml_backend_cpu_buffer_get_base, + /* .init_tensor = */ NULL, // no initialization required + /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor, + /* .clear = */ ggml_backend_numa_buffer_clear, + /* .reset = */ NULL, +}; + +// NUMA buffer type - similar to CPU, but with pages allocated accordingly to a NUMA first-touch policy + +GGML_CALL static const char * ggml_backend_numa_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + return "NUMA"; + + GGML_UNUSED(buft); +} + +GGML_CALL static ggml_backend_buffer_t ggml_backend_numa_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + int flags = MAP_SHARED | MAP_ANONYMOUS; + void * data = mmap(NULL, size, PROT_READ|PROT_WRITE, flags, -1, 0); + if (data == MAP_FAILED) { + GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size); + return NULL; + } + if (posix_madvise(data, size, POSIX_MADV_RANDOM)) { + GGML_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n", + strerror(errno)); + } + + return ggml_backend_buffer_init(buft, ggml_backend_numa_buffer_i, data, size); +} + +GGML_CALL static size_t ggml_backend_numa_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + return TENSOR_ALIGNMENT; + + 
GGML_UNUSED(buft); +} + +GGML_CALL static bool ggml_backend_numa_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + return true; + + GGML_UNUSED(buft); +} + +GGML_CALL ggml_backend_buffer_type_t ggml_backend_numa_buffer_type(void) { + static struct ggml_backend_buffer_type ggml_backend_numa_buffer_type = { + /* .iface = */ { + /* .get_name = */ ggml_backend_numa_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_numa_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_numa_buffer_type_get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .is_host = */ ggml_backend_numa_buffer_type_is_host, + }, + /* .device = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), + /* .context = */ NULL, + }; + + return &ggml_backend_numa_buffer_type; +} + + GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) { return ggml_backend_cpu_init(); diff --git a/src/llama.cpp b/src/llama.cpp index 247372653..3aecba7e9 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3249,6 +3249,15 @@ static bool llama_kv_cache_init( bool warn = true; int n_mla = 0; + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU)); + auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa"); + bool is_numa = is_numa_fn(); + if (!offload && is_numa) { + LLAMA_LOG_INFO("%s: NUMA usage detected, using NUMA-aware buffer for KV cache\n", __func__); + } + + + for (int i = 0; i < (int) n_layer; i++) { const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s(); const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s(); @@ -3257,6 +3266,7 @@ static bool llama_kv_cache_init( const uint32_t n_embd_head_k= hparams.n_embd_head_k; +<<<<<<< HEAD struct ggml_context * ctx = offload ? 
ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front(); ggml_tensor * k; ggml_tensor * v; From 8ab6b155d55a2c777b25739cee1abe4b09b3505d Mon Sep 17 00:00:00 2001 From: Saood Karim Date: Sun, 2 Feb 2025 14:25:19 -0600 Subject: [PATCH 2/7] Fixes to make previous commits compile --- ggml/src/ggml-backend.c | 22 ++++++++++++++-------- src/llama.cpp | 28 +++++++++++++++++++++------- 2 files changed, 35 insertions(+), 15 deletions(-) diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c index 8d30298c5..726bd9ffa 100644 --- a/ggml/src/ggml-backend.c +++ b/ggml/src/ggml-backend.c @@ -944,22 +944,29 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, siz GGML_CALL static void ggml_backend_numa_buffer_free_buffer(ggml_backend_buffer_t buffer) { if (munmap((char *) buffer->context, buffer->size)) { - GGML_LOG_WARN("warning: munmap failed: %s\n", strerror(errno)); + //GGML_LOG_WARN("warning: munmap failed: %s\n", strerror(errno)); } } GGML_CALL static void ggml_backend_numa_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { if (posix_madvise(buffer->context, buffer->size, POSIX_MADV_DONTNEED)) { - GGML_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_DONTNEED) failed: %s\n", - strerror(errno)); + //GGML_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_DONTNEED) failed: %s\n", + // strerror(errno)); } } +GGML_CALL static const char * ggml_backend_numa_buffer_name(ggml_backend_buffer_t buffer) { + return "CPU NUMA"; + + GGML_UNUSED(buffer); +} + static const struct ggml_backend_buffer_i ggml_backend_numa_buffer_i = { + /* .get_name = */ ggml_backend_numa_buffer_name, /* .free_buffer = */ ggml_backend_numa_buffer_free_buffer, /* .get_base = */ ggml_backend_cpu_buffer_get_base, /* .init_tensor = */ NULL, // no initialization required - /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor, +// / .memset_tensor = / ggml_backend_cpu_buffer_memset_tensor, /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor, /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor, /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor, @@ -979,12 +986,12 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_numa_buffer_type_alloc_buffe int flags = MAP_SHARED | MAP_ANONYMOUS; void * data = mmap(NULL, size, PROT_READ|PROT_WRITE, flags, -1, 0); if (data == MAP_FAILED) { - GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size); + //GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size); return NULL; } if (posix_madvise(data, size, POSIX_MADV_RANDOM)) { - GGML_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n", - strerror(errno)); + //GGML_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n", + // strerror(errno)); } return ggml_backend_buffer_init(buft, ggml_backend_numa_buffer_i, data, size); @@ -1012,7 +1019,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_numa_buffer_type(void) { /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes /* .is_host = */ ggml_backend_numa_buffer_type_is_host, }, - /* .device = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), /* .context = */ NULL, }; diff --git a/src/llama.cpp b/src/llama.cpp index 3aecba7e9..c616009fd 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2260,7 +2260,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer #endif if (buft == nullptr) { - buft = ggml_backend_cpu_buffer_type(); + buft = ggml_backend_numa_buffer_type(); } return buft; @@ -3249,10 +3249,10 @@ static bool 
llama_kv_cache_init( bool warn = true; int n_mla = 0; - auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU)); - auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa"); - bool is_numa = is_numa_fn(); - if (!offload && is_numa) { + //auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_TYPE_CPU)); + //auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa"); + //bool is_numa = is_numa_fn(); + if (!offload) { LLAMA_LOG_INFO("%s: NUMA usage detected, using NUMA-aware buffer for KV cache\n", __func__); } @@ -3265,8 +3265,6 @@ static bool llama_kv_cache_init( const uint32_t n_head_kv = hparams.n_head_kv(i); const uint32_t n_embd_head_k= hparams.n_embd_head_k; - -<<<<<<< HEAD struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front(); ggml_tensor * k; ggml_tensor * v; @@ -3300,6 +3298,22 @@ static bool llama_kv_cache_init( ggml_format_name(v, "cache_v_l%d", i); cache.k_l.push_back(k); cache.v_l.push_back(v); + //Commented out old method + struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front(); + + //ggml_backend_buffer_type_t buft; + //ggml_context * ctx; + + //if (offload) { + // ctx = ctx_map.at(model.buft_layer[i].buft); + //} else { + // buft = ggml_backend_numa_buffer_type(); + // ctx = get_ctx_for_buft(buft); + //} + + if (!ctx) { + LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__); + return false; } } if (cparams.mla_attn && n_mla < n_layer && n_mla > 0) { From 9b6b55f4419ea8bcafc8a473f91dcd08f051d695 Mon Sep 17 00:00:00 2001 From: Saood Karim Date: Tue, 25 Mar 2025 12:16:43 -0500 Subject: [PATCH 3/7] More fix --- src/llama.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index c616009fd..58a76c2da 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3273,7 +3273,7 @@ static bool llama_kv_cache_init( const uint32_t n_embd_head_qk_rope = hparams.n_rot; const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; const uint32_t kv_lora_rank = hparams.n_lora_kv; - LLAMA_LOG_INFO("%s: layer %d: n_embd_head_qk_rope = %d, kv_lora_rank = %d\n", __func__, i, n_embd_head_qk_rope, kv_lora_rank); + //LLAMA_LOG_INFO("%s: layer %d: n_embd_head_qk_rope = %d, kv_lora_rank = %d\n", __func__, i, n_embd_head_qk_rope, kv_lora_rank); if (cparams.flash_attn) { ggml_tensor * kv = ggml_new_tensor_2d(ctx, cache.type_k, kv_lora_rank + n_embd_head_qk_rope, kv_size); ggml_format_name(kv, "cache_kv_l%d", i); @@ -3299,7 +3299,7 @@ static bool llama_kv_cache_init( cache.k_l.push_back(k); cache.v_l.push_back(v); //Commented out old method - struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front(); + //struct ggml_context * ctx = offload ? 
ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front(); //ggml_backend_buffer_type_t buft; //ggml_context * ctx; From c821129fcb0f045dd18e127f411b2753f69a9757 Mon Sep 17 00:00:00 2001 From: Saood Karim Date: Tue, 25 Mar 2025 13:42:02 -0500 Subject: [PATCH 4/7] More fix --- src/llama.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llama.cpp b/src/llama.cpp index 58a76c2da..01a8af1c0 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3298,6 +3298,7 @@ static bool llama_kv_cache_init( ggml_format_name(v, "cache_v_l%d", i); cache.k_l.push_back(k); cache.v_l.push_back(v); + } //Commented out old method //struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front(); From 109f5c0cd8fb23b5e52f1949b168b6a0b9bb1c5b Mon Sep 17 00:00:00 2001 From: Saood Karim Date: Tue, 25 Mar 2025 14:23:11 -0500 Subject: [PATCH 5/7] Cleanup --- src/llama.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 01a8af1c0..691d5d9c3 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3312,10 +3312,10 @@ static bool llama_kv_cache_init( // ctx = get_ctx_for_buft(buft); //} - if (!ctx) { - LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__); - return false; - } + //if (!ctx) { + // LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__); + // return false; + //} } if (cparams.mla_attn && n_mla < n_layer && n_mla > 0) { LLAMA_LOG_ERROR("%s: unexpected situation with %d out of %d layers having MLA enabled\n", __func__, n_mla, int(n_layer)); From cc8c0e1b49a34478919fff8509cb52b32b7b4a9d Mon Sep 17 00:00:00 2001 From: Saood Karim Date: Tue, 25 Mar 2025 14:29:00 -0500 Subject: [PATCH 6/7] More cleanup --- src/llama.cpp | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 691d5d9c3..ec0a57431 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3299,23 +3299,6 @@ static bool llama_kv_cache_init( cache.k_l.push_back(k); cache.v_l.push_back(v); } - //Commented out old method - //struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front(); - - //ggml_backend_buffer_type_t buft; - //ggml_context * ctx; - - //if (offload) { - // ctx = ctx_map.at(model.buft_layer[i].buft); - //} else { - // buft = ggml_backend_numa_buffer_type(); - // ctx = get_ctx_for_buft(buft); - //} - - //if (!ctx) { - // LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__); - // return false; - //} } if (cparams.mla_attn && n_mla < n_layer && n_mla > 0) { LLAMA_LOG_ERROR("%s: unexpected situation with %d out of %d layers having MLA enabled\n", __func__, n_mla, int(n_layer)); From f31aca2d4028f53ad28489df92ec19cf72d50aab Mon Sep 17 00:00:00 2001 From: Saood Karim Date: Tue, 25 Mar 2025 14:30:11 -0500 Subject: [PATCH 7/7] Whitespace --- src/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index ec0a57431..f2706c3eb 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3298,7 +3298,7 @@ static bool llama_kv_cache_init( ggml_format_name(v, "cache_v_l%d", i); cache.k_l.push_back(k); cache.v_l.push_back(v); - } + } } if (cparams.mla_attn && n_mla < n_layer && n_mla > 0) { LLAMA_LOG_ERROR("%s: unexpected situation with %d out of %d layers having MLA enabled\n", __func__, n_mla, int(n_layer));
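
Note on the "NUMA first-touch policy" mentioned in the buffer comments of PATCH 1/7: with an anonymous mmap such as the one in ggml_backend_numa_buffer_type_alloc_buffer, physical pages are only allocated when they are first touched, and the kernel places each page on the NUMA node of the CPU that performs that first write. The allocator in the patch deliberately does nothing beyond the mmap and an access-pattern hint, so the KV cache pages end up on whichever nodes the compute threads that write them are running on. The standalone program below only illustrates that mechanism and is not part of the patches; the worker count, CPU numbering, and slice layout are made-up assumptions for a Linux/pthreads system.

// first_touch_demo.c - illustration only, not part of the patch series.
// Each worker pins itself to a CPU and writes ("first-touches") its own
// slice of an anonymous mapping, so the kernel backs those pages on that
// worker's NUMA node. Build with: cc -O2 -pthread first_touch_demo.c
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

enum { N_WORKERS = 4 };                       // assumed worker/CPU count

typedef struct {
    unsigned char * base;                     // start of this worker's slice
    size_t          len;                      // slice length in bytes
    int             cpu;                      // CPU to pin this worker to
} slice_t;

static void * touch_slice(void * arg) {
    slice_t * s = (slice_t *) arg;

    // Pin the thread so the first write comes from a known CPU (and node).
    cpu_set_t set;
    CPU_ZERO(&set);
    CPU_SET(s->cpu, &set);
    pthread_setaffinity_np(pthread_self(), sizeof(set), &set);

    // First touch: the kernel allocates these pages on this thread's node.
    memset(s->base, 0, s->len);
    return NULL;
}

int main(void) {
    const size_t size = 64u * 1024 * 1024;    // 64 MiB toy buffer

    // Same mapping flags as ggml_backend_numa_buffer_type_alloc_buffer.
    unsigned char * data = mmap(NULL, size, PROT_READ | PROT_WRITE,
                                MAP_SHARED | MAP_ANONYMOUS, -1, 0);
    if (data == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    pthread_t th[N_WORKERS];
    slice_t   sl[N_WORKERS];
    const size_t chunk = size / N_WORKERS;

    for (int i = 0; i < N_WORKERS; i++) {
        sl[i].base = data + (size_t) i * chunk;
        sl[i].len  = chunk;
        sl[i].cpu  = i;                       // assumes CPUs 0..3 exist
        pthread_create(&th[i], NULL, touch_slice, &sl[i]);
    }
    for (int i = 0; i < N_WORKERS; i++) {
        pthread_join(th[i], NULL);
    }

    printf("%zu bytes first-touched by %d pinned workers\n", size, N_WORKERS);
    munmap(data, size);
    return 0;
}

In the patches themselves no explicit touching happens at allocation time: the buffer returned by the NUMA buffer type is simply left to be first-touched by whichever threads later write the KV cache, which is what makes the plain MAP_ANONYMOUS mapping NUMA-aware without any libnuma dependency.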
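
As a usage sketch (assuming this codebase's existing ggml API): a CPU-side caller could opt into the new buffer type roughly as follows. ggml_backend_numa_buffer_type() is the function PATCH 1/7 adds; ggml_is_numa(), ggml_backend_cpu_buffer_type(), and ggml_backend_buft_alloc_buffer() are assumed to be the long-standing helpers declared in ggml.h / ggml-backend.h; the wrapper function itself is hypothetical. Note that PATCH 2/7 ends up keying the llama.cpp KV-cache path off "!offload" alone, because the registry-based NUMA probe introduced in PATCH 1/7 is commented out there.

#include "ggml.h"
#include "ggml-backend.h"

// Hypothetical helper, not taken from the patches: prefer the NUMA-aware
// buffer type for host-resident KV cache data on a NUMA machine, and fall
// back to the regular CPU buffer type otherwise.
static ggml_backend_buffer_t alloc_kv_cache_buffer(size_t size, bool offload) {
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
    if (!offload && ggml_is_numa()) {
        buft = ggml_backend_numa_buffer_type();        // added by PATCH 1/7
    }
    // For the NUMA type this reaches the mmap-based alloc_buffer above,
    // which returns NULL if the mapping fails.
    return ggml_backend_buft_alloc_buffer(buft, size);
}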