Commit eb3041a

ggml : add NUMA-aware buffer type that allocates pages according to the first-touch policy
llama : use NUMA-aware buffer type for KV cache
1 parent 5bbc736 commit eb3041a

File tree

3 files changed: +94 -1 lines changed


ggml/include/ggml-backend.h

Lines changed: 1 addition & 0 deletions
@@ -348,6 +348,7 @@ extern "C" {
     // CPU buffer types are always available
     GGML_API ggml_backend_buffer_t      ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
     GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
+    GGML_API ggml_backend_buffer_type_t ggml_backend_numa_buffer_type(void);
 
 #ifdef __cplusplus
 }
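The new declaration sits next to the existing CPU buffer types, so callers can use it through the generic buffer-type helpers already exposed by ggml-backend.h. A minimal usage sketch (the 64 MiB size and the memset are illustrative only; ggml_backend_buft_alloc_buffer, ggml_backend_buffer_get_base/get_size and ggml_backend_buffer_free are the existing public helpers):

#include "ggml-backend.h"

#include <stdio.h>
#include <string.h>

int main(void) {
    // the NUMA-aware host buffer type added by this commit
    ggml_backend_buffer_type_t buft = ggml_backend_numa_buffer_type();

    // reserve 64 MiB of address space; physical pages are not placed yet
    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 64u * 1024 * 1024);
    if (buf == NULL) {
        fprintf(stderr, "allocation failed\n");
        return 1;
    }

    // the first write to each page decides which NUMA node it lands on
    memset(ggml_backend_buffer_get_base(buf), 0, ggml_backend_buffer_get_size(buf));

    ggml_backend_buffer_free(buf);
    return 0;
}

From the caller's side nothing is NUMA-specific; placement happens implicitly when threads first write the pages.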

ggml/src/ggml-backend.cpp

Lines changed: 81 additions & 0 deletions
@@ -2000,3 +2000,84 @@ ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size)
     GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
     return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
 }
+
+// NUMA buffer interface - similar to CPU, but with pages allocated according to a NUMA first-touch policy
+
+#include <sys/mman.h>
+
+static void ggml_backend_numa_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    if (munmap((char *) buffer->context, buffer->size)) {
+        GGML_LOG_WARN("warning: munmap failed: %s\n", strerror(errno));
+    }
+}
+
+static void ggml_backend_numa_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    if (posix_madvise(buffer->context, buffer->size, POSIX_MADV_DONTNEED)) {
+        GGML_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_DONTNEED) failed: %s\n",
+                strerror(errno));
+    }
+}
+
+static const struct ggml_backend_buffer_i ggml_backend_numa_buffer_i = {
+    /* .free_buffer     = */ ggml_backend_numa_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_cpu_buffer_get_base,
+    /* .init_tensor     = */ NULL, // no initialization required
+    /* .memset_tensor   = */ ggml_backend_cpu_buffer_memset_tensor,
+    /* .set_tensor      = */ ggml_backend_cpu_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_cpu_buffer_get_tensor,
+    /* .cpy_tensor      = */ ggml_backend_cpu_buffer_cpy_tensor,
+    /* .clear           = */ ggml_backend_numa_buffer_clear,
+    /* .reset           = */ NULL,
+};
+
+// NUMA buffer type - similar to CPU, but with pages allocated according to a NUMA first-touch policy
+
+static const char * ggml_backend_numa_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "NUMA";
+
+    GGML_UNUSED(buft);
+}
+
+static ggml_backend_buffer_t ggml_backend_numa_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    int flags = MAP_SHARED | MAP_ANONYMOUS;
+    void * data = mmap(NULL, size, PROT_READ|PROT_WRITE, flags, -1, 0);
+    if (data == MAP_FAILED) {
+        GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
+        return NULL;
+    }
+    if (posix_madvise(data, size, POSIX_MADV_RANDOM)) {
+        GGML_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
+                strerror(errno));
+    }
+
+    return ggml_backend_buffer_init(buft, ggml_backend_numa_buffer_i, data, size);
+}
+
+static size_t ggml_backend_numa_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    return TENSOR_ALIGNMENT;
+
+    GGML_UNUSED(buft);
+}
+
+static bool ggml_backend_numa_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return true;
+
+    GGML_UNUSED(buft);
+}
+
+ggml_backend_buffer_type_t ggml_backend_numa_buffer_type(void) {
+    static struct ggml_backend_buffer_type ggml_backend_numa_buffer_type = {
+        /* .iface   = */ {
+            /* .get_name         = */ ggml_backend_numa_buffer_type_get_name,
+            /* .alloc_buffer     = */ ggml_backend_numa_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_numa_buffer_type_get_alignment,
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
+            /* .is_host          = */ ggml_backend_numa_buffer_type_is_host,
+        },
+        /* .device  = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_numa_buffer_type;
+}
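The implementation does not pin pages to nodes explicitly: it maps anonymous memory and leaves physical placement to the kernel's first-touch policy, so each page lands on the NUMA node of the CPU that first writes it. A standalone sketch of that behavior, separate from ggml (pinning the two workers to CPUs 0 and 1 and assuming those CPUs sit on different nodes is purely illustrative; pthread_setaffinity_np is the Linux/glibc call):

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stddef.h>
#include <string.h>
#include <sys/mman.h>

// Each worker is pinned to one CPU and first-touches its half of the mapping;
// under the first-touch policy those pages are placed on the NUMA node that
// owns the touching CPU.
struct slice { unsigned char * ptr; size_t len; int cpu; };

static void * touch(void * arg) {
    struct slice * s = (struct slice *) arg;

    cpu_set_t set;
    CPU_ZERO(&set);
    CPU_SET(s->cpu, &set);
    pthread_setaffinity_np(pthread_self(), sizeof(set), &set);

    memset(s->ptr, 0, s->len); // first write => physical pages on the local node
    return NULL;
}

int main(void) {
    const size_t size = 64u * 1024 * 1024;
    unsigned char * data = (unsigned char *) mmap(NULL, size, PROT_READ | PROT_WRITE,
                                                  MAP_SHARED | MAP_ANONYMOUS, -1, 0);
    if (data == MAP_FAILED) {
        return 1;
    }

    // CPUs 0 and 1 stand in for cores on two different NUMA nodes
    struct slice s0 = { data,            size / 2, 0 };
    struct slice s1 = { data + size / 2, size / 2, 1 };

    pthread_t t0, t1;
    pthread_create(&t0, NULL, touch, &s0);
    pthread_create(&t1, NULL, touch, &s1);
    pthread_join(t0, NULL);
    pthread_join(t1, NULL);

    munmap(data, size);
    return 0;
}

In the KV cache case the touching threads are the CPU backend's compute threads, so when those are spread across nodes the cache pages presumably end up local to the threads that use them.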

src/llama-kv-cache.cpp

Lines changed: 12 additions & 1 deletion
@@ -71,6 +71,13 @@ bool llama_kv_cache_init(
     cache.k_l.reserve(n_layer);
     cache.v_l.reserve(n_layer);
 
+    auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
+    auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
+    bool is_numa = is_numa_fn();
+    if (!offload && is_numa) {
+        LLAMA_LOG_INFO("%s: NUMA usage detected, using NUMA-aware buffer for KV cache\n", __func__);
+    }
+
     for (int i = 0; i < n_layer; i++) {
         const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
         const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
@@ -82,7 +89,11 @@ bool llama_kv_cache_init(
             auto * dev = model.dev_layer(i);
             buft = ggml_backend_dev_buffer_type(dev);
         } else {
-            buft = ggml_backend_cpu_buffer_type();
+            if (is_numa) {
+                buft = ggml_backend_numa_buffer_type();
+            } else {
+                buft = ggml_backend_cpu_buffer_type();
+            }
         }
         ggml_context * ctx = ctx_for_buft(buft);
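The NUMA check is resolved through the backend registry rather than by calling the CPU backend directly, which keeps llama.cpp decoupled from the backend's symbols. A rough standalone sketch of the same lookup, assuming the declaration of ggml_is_numa from ggml-cpu.h and adding a (not in the diff) guard for builds where the proc address is missing:

#include "ggml-backend.h"
#include "ggml-cpu.h"

// Resolve the CPU backend's NUMA query through the backend registry, the same
// way llama_kv_cache_init does above; fall back to "no NUMA" if the symbol is
// not exported by this build (added safety, not part of the commit).
static bool kv_cache_wants_numa_buffer(bool offload) {
    ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    if (dev == NULL) {
        return false;
    }

    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
    auto * is_numa_fn = (decltype(ggml_is_numa) *)
        ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");

    // only use the NUMA-aware host buffer when the cache stays on the CPU
    return !offload && is_numa_fn != NULL && is_numa_fn();
}

As far as the CPU backend is concerned, this check reports true only when NUMA support has been initialized (in llama.cpp typically via the --numa option) and more than one node is visible, so single-node machines keep using the plain CPU buffer type.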
