@@ -780,45 +780,204 @@ enum ggml_numa_strategy ggml_numa_get_strategy(void) {
 }

 //
-// NUMA-aware work buffer allocation:
-// Based on empirical testing, allocating work buffers on node 0 provides
-// the best speed. Interleaving actually slows things down considerably.
-// If we optimised kernels for Numa awareness, this could be revisited.
+// NUMA-aware work buffer allocation with interleaved default:
 //
+// By default, work buffers are allocated using an interleaved first-touch strategy
+// to distribute memory across all NUMA nodes. This can improve aggregate memory
+// bandwidth when the buffer is accessed uniformly by threads across all nodes.
+//
+// Override this behavior to force allocation on a specific node using:
+//   GGML_NUMA_WORK_NODE=<node_number>   (e.g., GGML_NUMA_WORK_NODE=0)
+//
+
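+// Example (illustrative only; the binary name is hypothetical):
+//   GGML_NUMA_WORK_NODE=1 ./my-ggml-app    # force work buffers onto node 1
+//   ./my-ggml-app                          # default: interleave across nodes
+//
+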
+// Helper function to capture current thread affinity
+static void ggml_numa_affinity_capture(cpu_set_t * original_affinity) {
+#if defined(__gnu_linux__)
+    if (pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), original_affinity) != 0) {
+        // If capture fails, just zero the set as a fallback
+        CPU_ZERO(original_affinity);
+    }
+#else
+    // Non-Linux platforms: initialize to empty set
+    CPU_ZERO(original_affinity);
+#endif
+}
+
+// Helper function to bind current thread to a specific CPU
+static bool ggml_numa_affinity_bind_single(uint32_t cpu_id, cpu_set_t * backup_affinity) {
+#if defined(__gnu_linux__)
+    UNUSED(backup_affinity); // Reserved for future use
+
+    cpu_set_t cpu_mask;
+    CPU_ZERO(&cpu_mask);
+    CPU_SET(cpu_id, &cpu_mask);
+
+    if (pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpu_mask) == 0) {
+        return true;
+    } else {
+        GGML_LOG_DEBUG("NUMA: Failed to bind thread to CPU %u: %s\n", cpu_id, strerror(errno));
+        return false;
+    }
+#else
+    UNUSED(cpu_id);
+    UNUSED(backup_affinity);
+    return false;
+#endif
+}
+
+// Helper function to restore thread affinity
+static void ggml_numa_affinity_restore(const cpu_set_t * original_affinity) {
+#if defined(__gnu_linux__)
+    pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), original_affinity);
+#else
+    UNUSED(original_affinity);
+#endif
+}
+
+// Helper function to perform interleaved first-touch allocation
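+//
+// This relies on the Linux kernel's default first-touch placement: a page is
+// physically backed on the NUMA node of the CPU that first writes to it, so
+// the thread is temporarily pinned to a CPU on each target node before
+// touching the corresponding page. Returns false if binding to any target
+// CPU failed; the caller then falls back to plain initialization.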
+static bool ggml_numa_alloc_interleaved_first_touch(void * ptr, size_t size) {
+    if (g_state.numa.n_nodes <= 1) {
+        return false;
+    }
+
+    const long page_size = sysconf(_SC_PAGESIZE);
+    if (page_size <= 0) {
+        GGML_LOG_DEBUG("NUMA: Could not determine page size for interleaving\n");
+        return false;
+    }
+
+    const size_t page_size_t = (size_t) page_size;
+    const size_t n_pages = (size + page_size_t - 1) / page_size_t;
+    char * base = (char *) ptr;
+
+    // Capture original thread affinity to restore later
+    cpu_set_t original_affinity;
+    ggml_numa_affinity_capture(&original_affinity);
+
+    bool success = true;
+
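+    // Note: first-touch can only place pages that have not been faulted in
+    // yet; pages recycled by malloc from previously-touched memory keep
+    // their existing NUMA placement and are not migrated by this loop.
+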
+    // Touch each page on a different NUMA node in round-robin fashion
+    for (size_t page_idx = 0; page_idx < n_pages; ++page_idx) {
+        const uint32_t node_idx = page_idx % g_state.numa.n_nodes;
+        const struct ggml_numa_node * node = &g_state.numa.nodes[node_idx];
+
+        if (node->n_cpus == 0) {
+            // Skip nodes with no CPUs, fall back to default allocation for this page
+            continue;
+        }
+
+        // Bind to the first CPU of the target node for first-touch
+        const uint32_t cpu_id = node->cpus[0];
+        if (ggml_numa_affinity_bind_single(cpu_id, &original_affinity)) {
+            // First-touch the page to allocate it on the current NUMA node
+            volatile char * page_start = (volatile char *)(base + page_idx * page_size_t);
+            page_start[0] = 0;
+
+            GGML_LOG_DEBUG("NUMA: Page %zu touched on node %u (CPU %u)\n",
+                           page_idx, node_idx, cpu_id);
+        } else {
+            // Could not bind to target CPU, skip this optimization for this page
+            GGML_LOG_DEBUG("NUMA: Could not bind to CPU %u for page %zu, using default allocation\n",
+                           cpu_id, page_idx);
+            success = false;
+        }
+    }
+
+    // Restore original thread affinity
+    ggml_numa_affinity_restore(&original_affinity);
+
+    return success;
+}
+
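+// Allocation strategy, in order of precedence:
+//   1. GGML_NUMA_WORK_NODE is set      -> bind all pages to that node (MPOL_BIND)
+//   2. more than one NUMA node present -> interleaved first-touch across nodes
+//   3. otherwise                       -> plain malloc + memset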
 void * ggml_numa_alloc_work_buffer(size_t size) {
     void * ptr = malloc(size);
     if (!ptr) {
         return NULL;
     }

-    if (ggml_is_numa()) {
-        // Bind to NUMA node 0 using first-touch policy
+    // Check if NUMA is available and we have multiple nodes
+    if (!ggml_is_numa()) {
+        // No NUMA support, just initialize the buffer
+        memset(ptr, 0, size);
+        return ptr;
+    }
+
+#if defined(__gnu_linux__)
+    // Check allocation strategy preference (one-time check with caching)
+    static int allocation_strategy_checked = 0;
+    static bool use_specific_node_allocation = false;
+    static uint32_t target_numa_node = 0;
+
+    if (!allocation_strategy_checked) {
+        const char * env_value = getenv("GGML_NUMA_WORK_NODE");
+        if (env_value != NULL && env_value[0] != '\0') {
+            // Parse the node number
+            char * endptr;
+            long node_num = strtol(env_value, &endptr, 10);
+
+            if (endptr != env_value && *endptr == '\0' && node_num >= 0 &&
+                node_num < (long) g_state.numa.n_nodes) {
+                use_specific_node_allocation = true;
+                target_numa_node = (uint32_t) node_num;
+                GGML_LOG_INFO("NUMA: Work buffer allocation forced to node %u via GGML_NUMA_WORK_NODE\n",
+                              target_numa_node);
+            } else {
+                GGML_LOG_WARN("NUMA: Invalid node number '%s' in GGML_NUMA_WORK_NODE, using default interleaving\n",
+                              env_value);
+            }
+        } else {
+            GGML_LOG_DEBUG("NUMA: Using default interleaved work buffer allocation\n");
+        }
+        allocation_strategy_checked = 1;
+    }
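+    // Note: the one-time check above is not synchronized; concurrent first
+    // calls may both parse the environment, but they compute identical
+    // values, so the race is benign in practice.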
+
+    if (use_specific_node_allocation) {
+        // Force allocation to specific node using memory policy
         if (numa_available() >= 0) {
-            // Set memory policy to bind to node 0
-            unsigned long nodemask = 1UL; // Only node 0
+            unsigned long nodemask = 1UL << target_numa_node; // one mask bit per node index
             if (set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask) * 8) == 0) {
-                // Touch all pages to allocate them on node 0
+                // Touch all pages to ensure allocation on target node
                 memset(ptr, 0, size);

                 // Reset memory policy to default
                 set_mempolicy(MPOL_DEFAULT, NULL, 0);

-                GGML_LOG_DEBUG("NUMA: Work buffer allocated on node 0 (size: %zu bytes)\n", size);
+                GGML_LOG_DEBUG("NUMA: Work buffer allocated on node %u (size: %zu bytes)\n",
+                               target_numa_node, size);
+                return ptr;
             } else {
-                // Fallback: just touch the pages without specific binding
-                memset(ptr, 0, size);
-                GGML_LOG_DEBUG("NUMA: Work buffer allocated with first-touch (size: %zu bytes)\n", size);
+                GGML_LOG_DEBUG("NUMA: Failed to set MPOL_BIND policy for node %u: %s\n",
+                               target_numa_node, strerror(errno));
             }
-        } else {
-            // NUMA not available, just use regular allocation
-            memset(ptr, 0, size);
         }
-    } else {
-        // No NUMA, just touch the pages for consistency
+
+        // Fallback: first-touch initialization without specific node binding
         memset(ptr, 0, size);
+        GGML_LOG_DEBUG("NUMA: Work buffer allocated with first-touch fallback (size: %zu bytes)\n", size);
+        return ptr;
     }

+    // Default strategy: interleaved allocation across all nodes
+    if (g_state.numa.n_nodes > 1) {
+        if (ggml_numa_alloc_interleaved_first_touch(ptr, size)) {
+            GGML_LOG_DEBUG("NUMA: Work buffer interleaved across %u nodes (size: %zu bytes)\n",
+                           g_state.numa.n_nodes, size);
+            return ptr;
+        } else {
+            GGML_LOG_DEBUG("NUMA: Interleaved allocation failed, falling back to default initialization\n");
+        }
+    }
+
+    // Final fallback: simple initialization
+    memset(ptr, 0, size);
+    GGML_LOG_DEBUG("NUMA: Work buffer allocated with fallback initialization (size: %zu bytes)\n", size);
     return ptr;
+
+#else
+    // Non-Linux platforms: simple initialization
+    memset(ptr, 0, size);
+    return ptr;
+#endif
 }

 void ggml_numa_free_work_buffer(void * ptr) {