@@ -286,6 +286,213 @@ static void ggml_openmp_bind_thread_to_numa_node(int thread_id, int n_threads) {
         }
     }
 }
+
+//
+// NUMA-aware work buffer allocation with interleaved default:
+//
+// By default, work buffers are allocated using an interleaved first-touch strategy
+// to distribute memory across all NUMA nodes. This can improve aggregate memory
+// bandwidth when the buffer is accessed uniformly by threads across all nodes.
+//
+// Override this behavior to force allocation on a specific node using:
+//   GGML_NUMA_WORK_NODE=<node_number> (e.g., GGML_NUMA_WORK_NODE=0)
+//
+
+// Helper function to capture current thread affinity
+static void ggml_numa_affinity_capture(cpu_set_t * original_affinity) {
+#if defined(__gnu_linux__)
+    if (pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), original_affinity) != 0) {
+        // If capture fails, just zero the set as a fallback
+        CPU_ZERO(original_affinity);
+    }
+#else
+    // Non-Linux platforms: initialize to an empty set
+    CPU_ZERO(original_affinity);
+#endif
+}
+
+// Helper function to bind the current thread to a specific CPU
+static bool ggml_numa_affinity_bind_single(uint32_t cpu_id, cpu_set_t * backup_affinity) {
+#if defined(__gnu_linux__)
+    UNUSED(backup_affinity); // Reserved for future use
+
+    cpu_set_t cpu_mask;
+    CPU_ZERO(&cpu_mask);
+    CPU_SET(cpu_id, &cpu_mask);
+
+    const int err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpu_mask);
+    if (err == 0) {
+        return true;
+    } else {
+        // pthread_setaffinity_np() returns the error code directly; it does not set errno
+        GGML_LOG_DEBUG("NUMA: Failed to bind thread to CPU %u: %s\n", cpu_id, strerror(err));
+        return false;
+    }
+#else
+    UNUSED(cpu_id);
+    UNUSED(backup_affinity);
+    return false;
+#endif
+}
+
+// Helper function to restore thread affinity
+static void ggml_numa_affinity_restore(const cpu_set_t * original_affinity) {
+#if defined(__gnu_linux__)
+    pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), original_affinity);
+#else
+    UNUSED(original_affinity);
+#endif
+}
+
+// Helper function to perform interleaved first-touch allocation
+static bool ggml_numa_alloc_interleaved_first_touch(void * ptr, size_t size) {
+    if (g_state.numa.n_nodes <= 1) {
+        return false;
+    }
+
+    const long page_size = sysconf(_SC_PAGESIZE);
+    if (page_size <= 0) {
+        GGML_LOG_DEBUG("NUMA: Could not determine page size for interleaving\n");
+        return false;
+    }
+
+    const size_t page_size_t = (size_t) page_size;
+    const size_t n_pages = (size + page_size_t - 1) / page_size_t;
+    char * base = (char *) ptr;
+
+    // Capture original thread affinity to restore later
+    cpu_set_t original_affinity;
+    ggml_numa_affinity_capture(&original_affinity);
+
+    bool success = true;
+
+    // Touch each page on a different NUMA node in round-robin fashion
+    for (size_t page_idx = 0; page_idx < n_pages; ++page_idx) {
+        const uint32_t node_idx = page_idx % g_state.numa.n_nodes;
+        const struct ggml_numa_node * node = &g_state.numa.nodes[node_idx];
+
+        if (node->n_cpus == 0) {
+            // Skip nodes with no CPUs, fall back to default allocation for this page
+            continue;
+        }
+
+        // Bind to the first CPU of the target node for first-touch
+        const uint32_t cpu_id = node->cpus[0];
+        if (ggml_numa_affinity_bind_single(cpu_id, &original_affinity)) {
+            // First-touch the page to allocate it on the current NUMA node
+            volatile char * page_start = (volatile char *)(base + page_idx * page_size_t);
+            page_start[0] = 0;
+
+            GGML_LOG_DEBUG("NUMA: Page %zu touched on node %u (CPU %u)\n",
+                           page_idx, node_idx, cpu_id);
+        } else {
+            // Could not bind to target CPU, skip this optimization for this page
+            GGML_LOG_DEBUG("NUMA: Could not bind to CPU %u for page %zu, using default allocation\n",
+                           cpu_id, page_idx);
+            success = false;
+        }
+    }
+
+    // Restore original thread affinity
+    ggml_numa_affinity_restore(&original_affinity);
+
+    return success;
+}
+
+void * ggml_numa_alloc_work_buffer(size_t size) {
+    void * ptr = malloc(size);
+    if (!ptr) {
+        return NULL;
+    }
+
+    // Check if NUMA is available and we have multiple nodes
+    if (!ggml_is_numa()) {
+        // No NUMA support, just initialize the buffer
+        memset(ptr, 0, size);
+        return ptr;
+    }
+
+#if defined(__gnu_linux__)
+    // Check allocation strategy preference (one-time check with caching)
+    static int allocation_strategy_checked = 0;
+    static bool use_specific_node_allocation = false;
+    static uint32_t target_numa_node = 0;
+
+    if (!allocation_strategy_checked) {
+        const char * env_value = getenv("GGML_NUMA_WORK_NODE");
+        if (env_value != NULL && env_value[0] != '\0') {
+            // Parse the node number
+            char * endptr;
+            long node_num = strtol(env_value, &endptr, 10);
+
+            if (endptr != env_value && *endptr == '\0' && node_num >= 0 &&
+                node_num < (long) g_state.numa.n_nodes) {
+                use_specific_node_allocation = true;
+                target_numa_node = (uint32_t) node_num;
+                GGML_LOG_INFO("NUMA: Work buffer allocation forced to node %u via GGML_NUMA_WORK_NODE\n",
+                              target_numa_node);
+            } else {
+                GGML_LOG_WARN("NUMA: Invalid node number '%s' in GGML_NUMA_WORK_NODE, using default interleaving\n",
+                              env_value);
+            }
+        } else {
+            GGML_LOG_DEBUG("NUMA: Using default interleaved work buffer allocation\n");
+        }
+        allocation_strategy_checked = 1;
+    }
+
+    if (use_specific_node_allocation) {
+        // Force allocation to a specific node using a memory policy
+        if (numa_available() >= 0) {
+            unsigned long nodemask = 1UL << target_numa_node;
+            if (set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask) * 8) == 0) {
+                // Touch all pages to ensure allocation on the target node
+                memset(ptr, 0, size);
+
+                // Reset memory policy to default
+                set_mempolicy(MPOL_DEFAULT, NULL, 0);
+
+                GGML_LOG_DEBUG("NUMA: Work buffer allocated on node %u (size: %zu bytes)\n",
+                               target_numa_node, size);
+                return ptr;
+            } else {
+                GGML_LOG_DEBUG("NUMA: Failed to set MPOL_BIND policy for node %u: %s\n",
+                               target_numa_node, strerror(errno));
+            }
+        }
+
+        // Fallback: first-touch initialization without specific node binding
+        memset(ptr, 0, size);
+        GGML_LOG_DEBUG("NUMA: Work buffer allocated with first-touch fallback (size: %zu bytes)\n", size);
+        return ptr;
+    }
+
+    // Default strategy: interleaved allocation across all nodes
+    if (g_state.numa.n_nodes > 1) {
+        if (ggml_numa_alloc_interleaved_first_touch(ptr, size)) {
+            GGML_LOG_DEBUG("NUMA: Work buffer interleaved across %u nodes (size: %zu bytes)\n",
+                           g_state.numa.n_nodes, size);
+            return ptr;
+        } else {
+            GGML_LOG_DEBUG("NUMA: Interleaved allocation failed, falling back to default initialization\n");
+        }
+    }
+
+    // Final fallback: simple initialization
+    memset(ptr, 0, size);
+    GGML_LOG_DEBUG("NUMA: Work buffer allocated with fallback initialization (size: %zu bytes)\n", size);
+    return ptr;
+
+#else
+    // Non-Linux platforms: simple initialization
+    memset(ptr, 0, size);
+    return ptr;
+#endif
+}
+
+void ggml_numa_free_work_buffer(void * ptr) {
+    if (ptr) {
+        free(ptr);
+    }
+}
```

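As a usage sketch (not part of the diff), a caller pairs the two new entry points like this; the buffer size and the compute step are placeholders, and the declarations are repeated here only to keep the sketch self-contained:

```c
#include <stddef.h>
#include <stdio.h>

// Entry points added by the patch above, declared directly for this sketch.
void * ggml_numa_alloc_work_buffer(size_t size);
void   ggml_numa_free_work_buffer(void * ptr);

int main(void) {
    // Hypothetical 64 MiB scratch buffer for a threaded compute pass.
    const size_t work_size = 64u * 1024u * 1024u;

    // Pages land round-robin across NUMA nodes by default; setting
    // GGML_NUMA_WORK_NODE=<n> in the environment pins them all to node n.
    void * work = ggml_numa_alloc_work_buffer(work_size);
    if (!work) {
        fprintf(stderr, "work buffer allocation failed\n");
        return 1;
    }

    // ... hand `work` to the graph compute threads ...

    ggml_numa_free_work_buffer(work);
    return 0;
}
```

Note that `GGML_NUMA_WORK_NODE` is read once and cached, so it must be set before the first allocation.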
In `llama-mmap.cpp`: first-touch allocation at model weight loading time
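Conceptually (this is an illustrative sketch, not the PR's code), first-touch placement of mmap'd weights means faulting each node's slice of the mapping from a thread bound to that node, so the pages are populated on, or near, that node:

```c
// Illustrative sketch only: fault in one slice of an mmap'd weight file
// from a thread pinned to a chosen CPU, so the pages are populated on
// (or near) that CPU's NUMA node. All names here are hypothetical.
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stddef.h>
#include <unistd.h>

static void first_touch_slice(const char * base, size_t size, int cpu_id) {
    cpu_set_t mask;
    CPU_ZERO(&mask);
    CPU_SET(cpu_id, &mask);
    pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);

    const size_t page = (size_t) sysconf(_SC_PAGESIZE);
    volatile char sink = 0;
    for (size_t off = 0; off < size; off += page) {
        sink += base[off]; // read fault populates the page near this CPU
    }
    (void) sink;
}
```

Whether the pages actually end up local also depends on whether the file is already resident in the page cache from an earlier run.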