Skip to content

Commit fa3a5b4

Browse files
committed
update docs
1 parent 98135c9 commit fa3a5b4

File tree

1 file changed

+207
-0
lines changed

1 file changed

+207
-0
lines changed

.github/instructions/numa-mirroring-implementation.instructions.md

Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,213 @@ static void ggml_openmp_bind_thread_to_numa_node(int thread_id, int n_threads) {
286286
}
287287
}
288288
}
289+
290+
//
291+
// NUMA-aware work buffer allocation with interleaved default:
292+
//
293+
// By default, work buffers are allocated using an interleaved first-touch strategy
294+
// to distribute memory across all NUMA nodes. This can improve aggregate memory
295+
// bandwidth when the buffer is accessed uniformly by threads across all nodes.
296+
//
297+
// Override this behavior to force allocation on a specific node using:
298+
// GGML_NUMA_WORK_NODE=<node_number> (e.g., GGML_NUMA_WORK_NODE=0)
299+
//
300+
301+
// Helper function to capture current thread affinity
302+
static void ggml_numa_affinity_capture(cpu_set_t * original_affinity) {
303+
#if defined(__gnu_linux__)
304+
if (pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), original_affinity) != 0) {
305+
// If capture fails, just zero the set as a fallback
306+
CPU_ZERO(original_affinity);
307+
}
308+
#else
309+
// Non-Linux platforms: initialize to empty set
310+
CPU_ZERO(original_affinity);
311+
#endif
312+
}
313+
314+
// Helper function to bind current thread to a specific CPU
315+
static bool ggml_numa_affinity_bind_single(uint32_t cpu_id, cpu_set_t * backup_affinity) {
316+
#if defined(__gnu_linux__)
317+
UNUSED(backup_affinity); // Reserved for future use
318+
319+
cpu_set_t cpu_mask;
320+
CPU_ZERO(&cpu_mask);
321+
CPU_SET(cpu_id, &cpu_mask);
322+
323+
if (pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpu_mask) == 0) {
324+
return true;
325+
} else {
326+
GGML_LOG_DEBUG("NUMA: Failed to bind thread to CPU %u: %s\n", cpu_id, strerror(errno));
327+
return false;
328+
}
329+
#else
330+
UNUSED(cpu_id);
331+
UNUSED(backup_affinity);
332+
return false;
333+
#endif
334+
}
335+
336+
// Restore a previously captured thread affinity mask.
static void ggml_numa_affinity_restore(const cpu_set_t * original_affinity) {
#if defined(__gnu_linux__)
    // An empty set means the earlier capture failed (ggml_numa_affinity_capture
    // zeroes the set on failure); binding to an empty mask would fail with
    // EINVAL, so skip the call in that case.
    if (CPU_COUNT(original_affinity) == 0) {
        return;
    }
    pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), original_affinity);
#else
    UNUSED(original_affinity);
#endif
}
344+
345+
// Helper function to perform interleaved first-touch allocation
346+
static bool ggml_numa_alloc_interleaved_first_touch(void * ptr, size_t size) {
347+
if (g_state.numa.n_nodes <= 1) {
348+
return false;
349+
}
350+
351+
const long page_size = sysconf(_SC_PAGESIZE);
352+
if (page_size <= 0) {
353+
GGML_LOG_DEBUG("NUMA: Could not determine page size for interleaving\n");
354+
return false;
355+
}
356+
357+
const size_t page_size_t = (size_t)page_size;
358+
const size_t n_pages = (size + page_size_t - 1) / page_size_t;
359+
char * base = (char *)ptr;
360+
361+
// Capture original thread affinity to restore later
362+
cpu_set_t original_affinity;
363+
ggml_numa_affinity_capture(&original_affinity);
364+
365+
bool success = true;
366+
367+
// Touch each page on a different NUMA node in round-robin fashion
368+
for (size_t page_idx = 0; page_idx < n_pages; ++page_idx) {
369+
const uint32_t node_idx = page_idx % g_state.numa.n_nodes;
370+
const struct ggml_numa_node * node = &g_state.numa.nodes[node_idx];
371+
372+
if (node->n_cpus == 0) {
373+
// Skip nodes with no CPUs, fall back to default allocation for this page
374+
continue;
375+
}
376+
377+
// Bind to the first CPU of the target node for first-touch
378+
const uint32_t cpu_id = node->cpus[0];
379+
if (ggml_numa_affinity_bind_single(cpu_id, &original_affinity)) {
380+
// First-touch the page to allocate it on the current NUMA node
381+
volatile char * page_start = (volatile char *)(base + page_idx * page_size_t);
382+
page_start[0] = 0;
383+
384+
GGML_LOG_DEBUG("NUMA: Page %zu touched on node %u (CPU %u)\n",
385+
page_idx, node_idx, cpu_id);
386+
} else {
387+
// Could not bind to target CPU, skip this optimization for this page
388+
GGML_LOG_DEBUG("NUMA: Could not bind to CPU %u for page %zu, using default allocation\n",
389+
cpu_id, page_idx);
390+
success = false;
391+
}
392+
}
393+
394+
// Restore original thread affinity
395+
ggml_numa_affinity_restore(&original_affinity);
396+
397+
return success;
398+
}
399+
400+
void* ggml_numa_alloc_work_buffer(size_t size) {
401+
void* ptr = malloc(size);
402+
if (!ptr) {
403+
return NULL;
404+
}
405+
406+
// Check if NUMA is available and we have multiple nodes
407+
if (!ggml_is_numa()) {
408+
// No NUMA support, just initialize the buffer
409+
memset(ptr, 0, size);
410+
return ptr;
411+
}
412+
413+
#if defined(__gnu_linux__)
414+
// Check allocation strategy preference (one-time check with caching)
415+
static int allocation_strategy_checked = 0;
416+
static bool use_specific_node_allocation = false;
417+
static uint32_t target_numa_node = 0;
418+
419+
if (!allocation_strategy_checked) {
420+
const char * env_value = getenv("GGML_NUMA_WORK_NODE");
421+
if (env_value != NULL && env_value[0] != '\0') {
422+
// Parse the node number
423+
char * endptr;
424+
long node_num = strtol(env_value, &endptr, 10);
425+
426+
if (endptr != env_value && *endptr == '\0' && node_num >= 0 &&
427+
node_num < (long)g_state.numa.n_nodes) {
428+
use_specific_node_allocation = true;
429+
target_numa_node = (uint32_t)node_num;
430+
GGML_LOG_INFO("NUMA: Work buffer allocation forced to node %u via GGML_NUMA_WORK_NODE\n",
431+
target_numa_node);
432+
} else {
433+
GGML_LOG_WARN("NUMA: Invalid node number '%s' in GGML_NUMA_WORK_NODE, using default interleaving\n",
434+
env_value);
435+
}
436+
} else {
437+
GGML_LOG_DEBUG("NUMA: Using default interleaved work buffer allocation\n");
438+
}
439+
allocation_strategy_checked = 1;
440+
}
441+
442+
if (use_specific_node_allocation) {
443+
// Force allocation to specific node using memory policy
444+
if (numa_available() >= 0) {
445+
unsigned long nodemask = 1UL << target_numa_node;
446+
if (set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask) * 8) == 0) {
447+
// Touch all pages to ensure allocation on target node
448+
memset(ptr, 0, size);
449+
450+
// Reset memory policy to default
451+
set_mempolicy(MPOL_DEFAULT, NULL, 0);
452+
453+
GGML_LOG_DEBUG("NUMA: Work buffer allocated on node %u (size: %zu bytes)\n",
454+
target_numa_node, size);
455+
return ptr;
456+
} else {
457+
GGML_LOG_DEBUG("NUMA: Failed to set MPOL_BIND policy for node %u: %s\n",
458+
target_numa_node, strerror(errno));
459+
}
460+
}
461+
462+
// Fallback: first-touch initialization without specific node binding
463+
memset(ptr, 0, size);
464+
GGML_LOG_DEBUG("NUMA: Work buffer allocated with first-touch fallback (size: %zu bytes)\n", size);
465+
return ptr;
466+
}
467+
468+
// Default strategy: interleaved allocation across all nodes
469+
if (g_state.numa.n_nodes > 1) {
470+
if (ggml_numa_alloc_interleaved_first_touch(ptr, size)) {
471+
GGML_LOG_DEBUG("NUMA: Work buffer interleaved across %u nodes (size: %zu bytes)\n",
472+
g_state.numa.n_nodes, size);
473+
return ptr;
474+
} else {
475+
GGML_LOG_DEBUG("NUMA: Interleaved allocation failed, falling back to default initialization\n");
476+
}
477+
}
478+
479+
// Final fallback: simple initialization
480+
memset(ptr, 0, size);
481+
GGML_LOG_DEBUG("NUMA: Work buffer allocated with fallback initialization (size: %zu bytes)\n", size);
482+
return ptr;
483+
484+
#else
485+
// Non-Linux platforms: simple initialization
486+
memset(ptr, 0, size);
487+
return ptr;
488+
#endif
489+
}
490+
491+
// Release a buffer obtained from ggml_numa_alloc_work_buffer().
// Safe to call with NULL.
void ggml_numa_free_work_buffer(void* ptr) {
    // free(NULL) is a defined no-op, so no guard is needed
    free(ptr);
}
289496
```
290497

291498
In `llama-mmap.cpp`: First-touch allocation at model weight loading time

0 commit comments

Comments
 (0)