Skip to content

Commit 4f7562d

Browse files
committed
optimisation: force all cplan work buffers to allocate on NUMA node 0
1 parent 48d8d59 commit 4f7562d

File tree

3 files changed

+78
-8
lines changed

3 files changed

+78
-8
lines changed

ggml/include/ggml-cpu.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,13 @@ extern "C" {
104104
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
105105
GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
106106

107+
//
108+
// NUMA work buffer allocation
109+
//
110+
111+
GGML_BACKEND_API void * ggml_numa_alloc_work_buffer (size_t size);
112+
GGML_BACKEND_API void ggml_numa_free_work_buffer (void * ptr);
113+
107114
// Internal types and functions exposed for tests and benchmarks
108115

109116
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 69 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -769,10 +769,64 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
769769

770770
bool ggml_is_numa(void) {
771771
// Return true if:
772-
// 1. Multiple physical NUMA nodes are present, OR
773-
// 2. User explicitly requested NUMA mirror strategy (--numa mirror)
774-
return g_state.numa.n_nodes > 1 ||
775-
g_state.numa.numa_strategy == GGML_NUMA_STRATEGY_MIRROR;
772+
// 1. Multiple physical NUMA nodes are present, AND
773+
// 2. User explicitly requested a NUMA strategy
774+
return g_state.numa.n_nodes > 1 &&
775+
g_state.numa.numa_strategy != GGML_NUMA_STRATEGY_DISABLED;
776+
}
777+
//
// NUMA-aware work buffer allocation:
// Based on empirical testing, allocating work buffers on node 0 provides
// the best speed. Interleaving actually slows things down considerably.
// If we optimised kernels for NUMA awareness, this could be revisited.
//

void * ggml_numa_alloc_work_buffer(size_t size) {
    // Plain malloc gives us the virtual range; physical placement is decided
    // by the first touch below (or by an explicit bind when available).
    void * buf = malloc(size);
    if (buf == NULL) {
        return NULL;
    }

#ifdef GGML_USE_NUMA
    if (ggml_is_numa() && numa_available() >= 0) {
        // Try to bind allocations to node 0 for the duration of the first
        // touch, then restore the default policy afterwards.
        unsigned long node0_mask = 1UL; // Only node 0
        const bool bound = set_mempolicy(MPOL_BIND, &node0_mask, sizeof(node0_mask) * 8) == 0;

        // First-touch every page while the (possibly bound) policy is active.
        memset(buf, 0, size);

        if (bound) {
            set_mempolicy(MPOL_DEFAULT, NULL, 0);
            GGML_LOG_DEBUG("NUMA: Work buffer allocated on node 0 (size: %zu bytes)\n", size);
        } else {
            GGML_LOG_DEBUG("NUMA: Work buffer allocated with first-touch (size: %zu bytes)\n", size);
        }
        return buf;
    }
#endif

    // Non-NUMA path: still zero the buffer so behaviour is consistent.
    memset(buf, 0, size);
    return buf;
}
825+
826+
// Release a buffer obtained from ggml_numa_alloc_work_buffer().
// Note: free(NULL) is a no-op per the C standard, so no guard is needed.
void ggml_numa_free_work_buffer(void * ptr) {
    free(ptr);
}
777831

778832
#if defined(__ARM_ARCH)
@@ -3285,9 +3339,18 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
32853339
enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
32863340
struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads, NULL);
32873341

3288-
cplan.work_data = (uint8_t *)ggml_new_buffer(ctx, cplan.work_size);
3342+
// Use NUMA-aware work buffer allocation instead of ggml_new_buffer
3343+
cplan.work_data = (uint8_t *)ggml_numa_alloc_work_buffer(cplan.work_size);
3344+
if (cplan.work_size > 0 && !cplan.work_data) {
3345+
return GGML_STATUS_ALLOC_FAILED;
3346+
}
32893347

3290-
return ggml_graph_compute(cgraph, &cplan);
3348+
enum ggml_status status = ggml_graph_compute(cgraph, &cplan);
3349+
3350+
// Free the work buffer
3351+
ggml_numa_free_work_buffer(cplan.work_data);
3352+
3353+
return status;
32913354
}
32923355

32933356
void ggml_cpu_fp32_to_fp32(const float * x, float * y, int64_t n) {

ggml/src/ggml-cpu/ggml-cpu.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
124124
cpu_plan->cgraph = *cgraph; // FIXME: deep copy
125125

126126
if (cpu_plan->cplan.work_size > 0) {
127-
cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
127+
cpu_plan->cplan.work_data = (uint8_t*)ggml_numa_alloc_work_buffer(cpu_plan->cplan.work_size);
128128
if (cpu_plan->cplan.work_data == NULL) {
129129
delete cpu_plan;
130130
return NULL;
@@ -140,7 +140,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
140140
static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
141141
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
142142

143-
delete[] cpu_plan->cplan.work_data;
143+
ggml_numa_free_work_buffer(cpu_plan->cplan.work_data);
144144
delete cpu_plan;
145145

146146
GGML_UNUSED(backend);

0 commit comments

Comments
 (0)