@@ -769,10 +769,64 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
 
 bool ggml_is_numa(void) {
     // Return true if:
-    // 1. Multiple physical NUMA nodes are present, OR
-    // 2. User explicitly requested NUMA mirror strategy (--numa mirror)
-    return g_state.numa.n_nodes > 1 ||
-           g_state.numa.numa_strategy == GGML_NUMA_STRATEGY_MIRROR;
+    // 1. Multiple physical NUMA nodes are present, AND
+    // 2. User explicitly requested a NUMA strategy
+    return g_state.numa.n_nodes > 1 &&
+           g_state.numa.numa_strategy != GGML_NUMA_STRATEGY_DISABLED;
+}
+
+//
+// NUMA-aware work buffer allocation:
+// Based on empirical testing, allocating work buffers on node 0 provides
+// the best speed. Interleaving actually slows things down considerably.
+// If we optimised kernels for NUMA awareness, this could be revisited.
+//
+
+void * ggml_numa_alloc_work_buffer(size_t size) {
+    void * ptr = malloc(size);
+    if (!ptr) {
+        return NULL;
+    }
+
+#ifdef GGML_USE_NUMA
+    if (ggml_is_numa()) {
+        // Bind to NUMA node 0 using first-touch policy
+        if (numa_available() >= 0) {
+            // Set memory policy to bind to node 0
+            unsigned long nodemask = 1UL; // Only node 0
+            if (set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask) * 8) == 0) {
+                // Touch all pages to allocate them on node 0
+                memset(ptr, 0, size);
+
+                // Reset memory policy to default
+                set_mempolicy(MPOL_DEFAULT, NULL, 0);
+
+                GGML_LOG_DEBUG("NUMA: Work buffer allocated on node 0 (size: %zu bytes)\n", size);
+            } else {
+                // Fallback: just touch the pages without specific binding
+                memset(ptr, 0, size);
+                GGML_LOG_DEBUG("NUMA: Work buffer allocated with first-touch (size: %zu bytes)\n", size);
+            }
+        } else {
+            // NUMA not available, just use regular allocation
+            memset(ptr, 0, size);
+        }
+    } else {
+        // No NUMA, just touch the pages for consistency
+        memset(ptr, 0, size);
+    }
+#else
+    // No NUMA support, just touch the pages
+    memset(ptr, 0, size);
+#endif
+
+    return ptr;
+}
+
+void ggml_numa_free_work_buffer(void * ptr) {
+    if (ptr) {
+        free(ptr);
+    }
 }
 
 #if defined(__ARM_ARCH)
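
For reference, the bind-then-first-touch pattern that ggml_numa_alloc_work_buffer relies on can be exercised in isolation. The following standalone sketch is not part of the patch: the 16 MiB buffer size and the placement check via get_mempolicy(MPOL_F_NODE | MPOL_F_ADDR) are illustrative assumptions, and it requires Linux with libnuma (link with -lnuma).

/* Standalone sketch (not part of the patch): bind-then-first-touch on node 0,
 * mirroring ggml_numa_alloc_work_buffer above. The 16 MiB size and the
 * placement check are illustrative only. Build with: cc numa_sketch.c -lnuma
 */
#include <numa.h>     /* numa_available() */
#include <numaif.h>   /* set_mempolicy(), get_mempolicy(), MPOL_* */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void) {
    if (numa_available() < 0) {
        fprintf(stderr, "NUMA is not available on this system\n");
        return 1;
    }

    size_t size = 16u * 1024 * 1024;   /* hypothetical work buffer size */
    void * buf = malloc(size);
    if (!buf) {
        return 1;
    }

    unsigned long nodemask = 1UL;      /* node 0 only */
    if (set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask) * 8) == 0) {
        memset(buf, 0, size);                 /* first touch: pages land on node 0 */
        set_mempolicy(MPOL_DEFAULT, NULL, 0); /* restore the default policy */
    } else {
        memset(buf, 0, size);                 /* fallback: plain first touch */
    }

    int node = -1;
    /* MPOL_F_NODE | MPOL_F_ADDR reports the node backing the given address */
    if (get_mempolicy(&node, NULL, 0, buf, MPOL_F_NODE | MPOL_F_ADDR) == 0) {
        printf("first page of the buffer resides on node %d\n", node);
    }

    free(buf);
    return 0;
}
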
@@ -3285,9 +3339,18 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
 enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
     struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads, NULL);
 
-    cplan.work_data = (uint8_t *)ggml_new_buffer(ctx, cplan.work_size);
+    // Use NUMA-aware work buffer allocation instead of ggml_new_buffer
+    cplan.work_data = (uint8_t *)ggml_numa_alloc_work_buffer(cplan.work_size);
+    if (cplan.work_size > 0 && !cplan.work_data) {
+        return GGML_STATUS_ALLOC_FAILED;
+    }
 
-    return ggml_graph_compute(cgraph, &cplan);
+    enum ggml_status status = ggml_graph_compute(cgraph, &cplan);
+
+    // Free the work buffer
+    ggml_numa_free_work_buffer(cplan.work_data);
+
+    return status;
 }
 
 void ggml_cpu_fp32_to_fp32(const float * x, float * y, int64_t n) {
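
Because the work buffer no longer comes from ggml_new_buffer (whose memory lives inside the ggml_context and is released with it), the caller of the new helpers owns the buffer and must free it after the compute call, which is why the hunk above captures the status before returning. A hedged caller-side sketch of the same allocate/compute/free pattern, assuming a pre-built graph gf, a thread count n_threads, and an enclosing function that returns enum ggml_status:

/* Hypothetical call site, not taken from the patch: gf, n_threads and the
 * enclosing function returning enum ggml_status are assumed to exist. */
struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, /*threadpool=*/NULL);

cplan.work_data = (uint8_t *) ggml_numa_alloc_work_buffer(cplan.work_size);
if (cplan.work_size > 0 && !cplan.work_data) {
    return GGML_STATUS_ALLOC_FAILED;   /* same error code the patched function uses */
}

enum ggml_status status = ggml_graph_compute(gf, &cplan);

ggml_numa_free_work_buffer(cplan.work_data);   /* safe even when work_data is NULL */

return status;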