@@ -522,7 +522,9 @@ struct ggml_numa_nodes {
522522#endif
523523
524524#ifdef GGML_USE_NUMA_MIGRATE
525- bool even_distributed ;
525+ int * node_num_of_cpu ;
526+ int * cpu_core_mapping ; // x logic core, y physical core
527+ int cores_per_numa [GGML_NUMA_MIGRATE_NODES ];
526528#endif
527529};
528530
@@ -583,16 +585,18 @@ int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value) {
583585}
584586
585587#ifdef GGML_USE_NUMA_MIGRATE
586- int ggml_get_node_from_cpu (int cpu ) {
587- return cpu / g_state .numa .nodes [0 ].n_cpus ;
588+ int ggml_get_node_from_cpu (int ith ) {
589+ int cpu = g_state .numa .cpu_core_mapping [ith ];
590+ return g_state .numa .node_num_of_cpu [cpu ];
588591}
589592
590- int ggml_cores_per_numa (void ) {
591- return g_state .numa .nodes [0 ].n_cpus ;
593+ int ggml_cores_per_numa (int ith ) {
594+ int node = ggml_get_node_from_cpu (ith );
595+ return g_state .numa .cores_per_numa [node ];
592596}
593597
594598void ggml_barrier_numa_aware (struct ggml_threadpool * tp , int ith , int node_n ) {
595- if (( g_state .numa .numa_strategy != GGML_NUMA_STRATEGY_MIGRATE ) || ! g_state . numa . even_distributed ) {
599+ if (g_state .numa .numa_strategy != GGML_NUMA_STRATEGY_MIGRATE ) {
596600 ggml_barrier (tp );
597601 return ;
598602 }
@@ -602,13 +606,8 @@ void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n) {
602606 return ;
603607 }
604608
605- int cores_per_numa = ggml_cores_per_numa ();
606- int numa_nodes = n_threads / cores_per_numa ;
607- int remaining_cores = n_threads % cores_per_numa ;
608- if ((numa_nodes != GGML_NUMA_MIGRATE_NODES ) || remaining_cores ) {
609- ggml_barrier (tp );
610- return ;
611- }
609+ int cores_per_numa = ggml_cores_per_numa (ith );
610+ int numa_nodes = GGML_NUMA_MIGRATE_NODES ;
612611
613612 int node = ggml_get_node_from_cpu (ith );
614613
@@ -720,9 +719,6 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
720719 GGML_PRINT_DEBUG ("CPUs on node %u:" , n );
721720 node -> n_cpus = 0 ;
722721
723- #ifdef GGML_USE_NUMA_MIGRATE
724- g_state .numa .even_distributed = true;
725- #endif
726722 for (uint32_t c = 0 ; c < g_state .numa .total_cpus ; ++ c ) {
727723 rv = snprintf (path , sizeof (path ), "/sys/devices/system/node/node%u/cpu%u" , n , c );
728724 GGML_ASSERT (rv > 0 && (unsigned )rv < sizeof (path ));
@@ -732,13 +728,41 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
732728 }
733729 }
734730 GGML_PRINT_DEBUG ("\n" );
731+ }
732+
735733#ifdef GGML_USE_NUMA_MIGRATE
736- if ((n != 0 ) && (g_state .numa .nodes [n ].n_cpus != g_state .numa .nodes [0 ].n_cpus )) {
737- g_state .numa .even_distributed = false;
734+ g_state .numa .node_num_of_cpu = (int * )malloc (g_state .numa .total_cpus * sizeof (int ));
735+ g_state .numa .cpu_core_mapping = (int * )malloc (g_state .numa .total_cpus * sizeof (int ));
736+ for (uint32_t i = 0 ; i < g_state .numa .total_cpus ; i ++ ) {
737+ g_state .numa .node_num_of_cpu [i ] = numa_node_of_cpu (i );
738+ }
739+
740+ FILE * fp = fopen ("/sys/devices/system/cpu/online" , "r" );
741+ if (fp == NULL ) {
742+ perror ("fopen" );
743+ exit (EXIT_FAILURE );
744+ }
745+
746+ int cpu0 , cpu1 ;
747+ int logic_core_index = 0 ;
748+ while (fscanf (fp , "%d" , & cpu0 ) != EOF ) {
749+ cpu1 = cpu0 ;
750+ while (fgetc (fp ) == '-' ) {
751+ fscanf (fp , "%d" , & cpu1 );
752+ }
753+
754+ for (int cpu_index = cpu0 ; cpu_index <= cpu1 ; cpu_index ++ ) {
755+ g_state .numa .cpu_core_mapping [logic_core_index ++ ] = cpu_index ;
756+ int node = g_state .numa .node_num_of_cpu [cpu_index ];
757+ if (node < GGML_NUMA_MIGRATE_NODES ) {
758+ g_state .numa .cores_per_numa [node ]++ ;
759+ }
738760 }
739- #endif
740761 }
741762
763+ fclose (fp );
764+ #endif
765+
742766 if (ggml_is_numa ()) {
743767 FILE * fptr = fopen ("/proc/sys/kernel/numa_balancing" , "r" );
744768 if (fptr != NULL ) {
@@ -2169,7 +2193,8 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
21692193#if defined(__gnu_linux__ )
21702194
21712195#ifdef GGML_USE_NUMA_MIGRATE
2172- static void set_numa_migrate_affinity (int core_no ) {
2196+ static void set_numa_migrate_affinity (int thread_no ) {
2197+ int core_no = g_state .numa .cpu_core_mapping [thread_no ];
21732198 // Check if the core number is valid
21742199 if (core_no < 0 || core_no >= (int )g_state .numa .total_cpus ) {
21752200 printf ("%s, Warn: core_no not between 0 and %d, failback.\n" , __func__ , g_state .numa .total_cpus );
0 commit comments