Skip to content

Commit 98135c9

Browse files
committed
experimental - interleave work buffers
1 parent b41a837 commit 98135c9

File tree

1 file changed

+177
-18
lines changed

1 file changed

+177
-18
lines changed

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 177 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -780,45 +780,204 @@ enum ggml_numa_strategy ggml_numa_get_strategy(void) {
780780
}
781781

782782
//
// NUMA-aware work buffer allocation with interleaved default:
//
// By default, work buffers are allocated using an interleaved first-touch strategy
// to distribute memory across all NUMA nodes. This can improve aggregate memory
// bandwidth when the buffer is accessed uniformly by threads across all nodes.
//
// Override this behavior to force allocation on a specific node using:
//   GGML_NUMA_WORK_NODE=<node_number> (e.g., GGML_NUMA_WORK_NODE=0)
//
// Helper function to capture current thread affinity
794+
static void ggml_numa_affinity_capture(cpu_set_t * original_affinity) {
795+
#if defined(__gnu_linux__)
796+
if (pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), original_affinity) != 0) {
797+
// If capture fails, just zero the set as a fallback
798+
CPU_ZERO(original_affinity);
799+
}
800+
#else
801+
// Non-Linux platforms: initialize to empty set
802+
CPU_ZERO(original_affinity);
803+
#endif
804+
}
805+
806+
// Helper function to bind current thread to a specific CPU
807+
static bool ggml_numa_affinity_bind_single(uint32_t cpu_id, cpu_set_t * backup_affinity) {
808+
#if defined(__gnu_linux__)
809+
UNUSED(backup_affinity); // Reserved for future use
810+
811+
cpu_set_t cpu_mask;
812+
CPU_ZERO(&cpu_mask);
813+
CPU_SET(cpu_id, &cpu_mask);
814+
815+
if (pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpu_mask) == 0) {
816+
return true;
817+
} else {
818+
GGML_LOG_DEBUG("NUMA: Failed to bind thread to CPU %u: %s\n", cpu_id, strerror(errno));
819+
return false;
820+
}
821+
#else
822+
UNUSED(cpu_id);
823+
UNUSED(backup_affinity);
824+
return false;
825+
#endif
826+
}
827+
828+
// Helper function to restore thread affinity
829+
static void ggml_numa_affinity_restore(const cpu_set_t * original_affinity) {
830+
#if defined(__gnu_linux__)
831+
pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), original_affinity);
832+
#else
833+
UNUSED(original_affinity);
834+
#endif
835+
}
836+
837+
// Helper function to perform interleaved first-touch allocation
838+
static bool ggml_numa_alloc_interleaved_first_touch(void * ptr, size_t size) {
839+
if (g_state.numa.n_nodes <= 1) {
840+
return false;
841+
}
842+
843+
const long page_size = sysconf(_SC_PAGESIZE);
844+
if (page_size <= 0) {
845+
GGML_LOG_DEBUG("NUMA: Could not determine page size for interleaving\n");
846+
return false;
847+
}
848+
849+
const size_t page_size_t = (size_t)page_size;
850+
const size_t n_pages = (size + page_size_t - 1) / page_size_t;
851+
char * base = (char *)ptr;
852+
853+
// Capture original thread affinity to restore later
854+
cpu_set_t original_affinity;
855+
ggml_numa_affinity_capture(&original_affinity);
856+
857+
bool success = true;
858+
859+
// Touch each page on a different NUMA node in round-robin fashion
860+
for (size_t page_idx = 0; page_idx < n_pages; ++page_idx) {
861+
const uint32_t node_idx = page_idx % g_state.numa.n_nodes;
862+
const struct ggml_numa_node * node = &g_state.numa.nodes[node_idx];
863+
864+
if (node->n_cpus == 0) {
865+
// Skip nodes with no CPUs, fall back to default allocation for this page
866+
continue;
867+
}
868+
869+
// Bind to the first CPU of the target node for first-touch
870+
const uint32_t cpu_id = node->cpus[0];
871+
if (ggml_numa_affinity_bind_single(cpu_id, &original_affinity)) {
872+
// First-touch the page to allocate it on the current NUMA node
873+
volatile char * page_start = (volatile char *)(base + page_idx * page_size_t);
874+
page_start[0] = 0;
875+
876+
GGML_LOG_DEBUG("NUMA: Page %zu touched on node %u (CPU %u)\n",
877+
page_idx, node_idx, cpu_id);
878+
} else {
879+
// Could not bind to target CPU, skip this optimization for this page
880+
GGML_LOG_DEBUG("NUMA: Could not bind to CPU %u for page %zu, using default allocation\n",
881+
cpu_id, page_idx);
882+
success = false;
883+
}
884+
}
885+
886+
// Restore original thread affinity
887+
ggml_numa_affinity_restore(&original_affinity);
888+
889+
return success;
890+
}
891+
788892
// Allocate a compute work buffer of `size` bytes, zero-initialized on every
// successful return path. Placement strategy (Linux with NUMA only):
//   - GGML_NUMA_WORK_NODE=<n> forces all pages onto node <n> via MPOL_BIND
//   - otherwise pages are interleaved across nodes using first-touch
// Returns NULL if the underlying malloc fails.
// Free the buffer with ggml_numa_free_work_buffer().
void* ggml_numa_alloc_work_buffer(size_t size) {
    void* ptr = malloc(size);
    if (!ptr) {
        return NULL;
    }

    // Check if NUMA is available and we have multiple nodes
    if (!ggml_is_numa()) {
        // No NUMA support, just initialize the buffer
        memset(ptr, 0, size);
        return ptr;
    }

#if defined(__gnu_linux__)
    // Check allocation strategy preference (one-time check with caching).
    // NOTE(review): this lazy init is not thread-safe; concurrent first calls
    // may parse the env var more than once. Benign today (every racer computes
    // the same values), but confirm, or guard with ggml's atomics if needed.
    static int allocation_strategy_checked = 0;
    static bool use_specific_node_allocation = false;
    static uint32_t target_numa_node = 0;

    if (!allocation_strategy_checked) {
        const char * env_value = getenv("GGML_NUMA_WORK_NODE");
        if (env_value != NULL && env_value[0] != '\0') {
            // Parse the node number; reject trailing junk and out-of-range
            // values (ERANGE clamps to LONG_MAX/MIN, which the range check
            // below also rejects).
            char * endptr;
            long node_num = strtol(env_value, &endptr, 10);

            if (endptr != env_value && *endptr == '\0' && node_num >= 0 &&
                node_num < (long)g_state.numa.n_nodes) {
                use_specific_node_allocation = true;
                target_numa_node = (uint32_t)node_num;
                GGML_LOG_INFO("NUMA: Work buffer allocation forced to node %u via GGML_NUMA_WORK_NODE\n",
                              target_numa_node);
            } else {
                GGML_LOG_WARN("NUMA: Invalid node number '%s' in GGML_NUMA_WORK_NODE, using default interleaving\n",
                              env_value);
            }
        } else {
            GGML_LOG_DEBUG("NUMA: Using default interleaved work buffer allocation\n");
        }
        allocation_strategy_checked = 1;
    }

    if (use_specific_node_allocation) {
        // Force allocation to a specific node using a temporary memory policy.
        // NOTE(review): the single-word nodemask limits this to nodes 0..63.
        if (numa_available() >= 0) {
            unsigned long nodemask = 1UL << target_numa_node;
            if (set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask) * 8) == 0) {
                // Touch all pages to ensure allocation on target node
                memset(ptr, 0, size);

                // Reset memory policy to default
                set_mempolicy(MPOL_DEFAULT, NULL, 0);

                GGML_LOG_DEBUG("NUMA: Work buffer allocated on node %u (size: %zu bytes)\n",
                               target_numa_node, size);
                return ptr;
            } else {
                GGML_LOG_DEBUG("NUMA: Failed to set MPOL_BIND policy for node %u: %s\n",
                               target_numa_node, strerror(errno));
            }
        }

        // Fallback: first-touch initialization without specific node binding
        memset(ptr, 0, size);
        GGML_LOG_DEBUG("NUMA: Work buffer allocated with first-touch fallback (size: %zu bytes)\n", size);
        return ptr;
    }

    // Default strategy: interleaved allocation across all nodes
    if (g_state.numa.n_nodes > 1) {
        if (ggml_numa_alloc_interleaved_first_touch(ptr, size)) {
            // Guarantee the zero-initialized contract on this path too:
            // page placement is already fixed by the helper's first touch,
            // so zeroing here cannot move pages between nodes.
            memset(ptr, 0, size);
            GGML_LOG_DEBUG("NUMA: Work buffer interleaved across %u nodes (size: %zu bytes)\n",
                           g_state.numa.n_nodes, size);
            return ptr;
        } else {
            GGML_LOG_DEBUG("NUMA: Interleaved allocation failed, falling back to default initialization\n");
        }
    }

    // Final fallback: simple initialization
    memset(ptr, 0, size);
    GGML_LOG_DEBUG("NUMA: Work buffer allocated with fallback initialization (size: %zu bytes)\n", size);
    return ptr;

#else
    // Non-Linux platforms: simple initialization
    memset(ptr, 0, size);
    return ptr;
#endif
}
823982

824983
void ggml_numa_free_work_buffer(void* ptr) {

0 commit comments

Comments
 (0)