@@ -524,6 +524,7 @@ struct ggml_numa_nodes {
 #ifdef GGML_USE_NUMA_MIGRATE
     int * node_num_of_cpu;
     int * cpu_core_mapping; // x logic core, y physical core
+    int logic_core_cnts;
    int cores_per_numa[GGML_NUMA_MIGRATE_NODES];
 #endif
 };
@@ -585,11 +586,80 @@ int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value) {
 }
 
 #ifdef GGML_USE_NUMA_MIGRATE
+
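+// Allocate a num_nodes x max_cores table of physical core ids; -1 marks an unused slot.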
+static int ** ggml_allocate_core_ids(int num_nodes, int max_cores) {
+    int ** core_ids = malloc(num_nodes * sizeof(int *));
+    for (int i = 0; i < num_nodes; i++) {
+        core_ids[i] = malloc(max_cores * sizeof(int));
+        for (int j = 0; j < max_cores; j++) {
+            core_ids[i][j] = -1;
+        }
+    }
+    return core_ids;
+}
+
+static void ggml_free_core_ids(int ** core_ids, int num_nodes) {
+    for (int i = 0; i < num_nodes; i++) {
+        free(core_ids[i]);
+    }
+    free(core_ids);
+}
+
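+// Parse GGML_NUMA_CORE_IDS into core_ids. Nodes are separated by '|', entries within a
+// node by ',', and each entry is either a single physical core id or an inclusive
+// "start-end" range, e.g. "0-7,16-23|8-15,24-31" (illustrative values).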
+static void ggml_parse_cpu_core_ids(const char * env_var, int ** core_ids, int max_numa_nodes, int max_cores_per_node) {
+    char * numa_node;
+    char * node_copy = strdup(env_var);
+    char * context;
+
+    numa_node = strtok_r(node_copy, "|", &context);
+    int node_count = 0;
+
+    while (numa_node != NULL && node_count < max_numa_nodes) {
+        int core_index = 0;
+
+        char * core_context;
+        char * core_range = strtok_r(numa_node, ",", &core_context);
+        while (core_range != NULL && core_index < max_cores_per_node) {
+            if (strchr(core_range, '-') != NULL) {
+                int start = 0, end = -1; // end < start leaves the loop empty if the range fails to parse
+                sscanf(core_range, "%d-%d", &start, &end);
+                for (int i = start; i <= end && core_index < max_cores_per_node; i++) {
+                    core_ids[node_count][core_index++] = i;
+                }
+            } else {
+                int core_id = atoi(core_range);
+                if (core_index < max_cores_per_node) {
+                    core_ids[node_count][core_index++] = core_id;
+                }
+            }
+            core_range = strtok_r(NULL, ",", &core_context);
+        }
+        node_count++;
+        numa_node = strtok_r(NULL, "|", &context);
+    }
+
+    free(node_copy);
+}
+
+
 int ggml_get_node_from_cpu(int ith) {
     int cpu = g_state.numa.cpu_core_mapping[ith];
     return g_state.numa.node_num_of_cpu[cpu];
 }
 
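+// Return the 0-based position of logical core ith within its NUMA node, derived from
+// the per-node counts accumulated in g_state.numa.cores_per_numa.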
+int ggml_get_start_id_in_node(int ith) {
+    int total_cpus = 0;
+    int prev_total_cpus = 0;
+    for (int node = 0; node < GGML_NUMA_MIGRATE_NODES; node++) {
+        prev_total_cpus = total_cpus;
+        total_cpus += g_state.numa.cores_per_numa[node];
+        if (ith < total_cpus) {
+            return (ith - prev_total_cpus);
+        }
+    }
+
+    assert(0);
+    return -1;
+}
+
 int ggml_cores_per_numa(int ith) {
     int node = ggml_get_node_from_cpu(ith);
     return g_state.numa.cores_per_numa[node];
@@ -605,6 +675,11 @@ void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n) {
     if (n_threads == 1) {
         return;
     }
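+    // Fall back to the generic barrier when the threadpool size does not match the
+    // number of logical cores mapped during NUMA initialization.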
+    if (n_threads != g_state.numa.logic_core_cnts) {
+        printf("bolt-test: n_threads: %d, g_state.numa.logic_core_cnts: %d\n", n_threads, g_state.numa.logic_core_cnts);
+        ggml_barrier(tp);
+        return;
+    }
 
     int cores_per_numa = ggml_cores_per_numa(ith);
     int numa_nodes = GGML_NUMA_MIGRATE_NODES;
@@ -733,34 +808,60 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
 #ifdef GGML_USE_NUMA_MIGRATE
     g_state.numa.node_num_of_cpu = (int *)malloc(g_state.numa.total_cpus * sizeof(int));
     g_state.numa.cpu_core_mapping = (int *)malloc(g_state.numa.total_cpus * sizeof(int));
-    for (uint32_t i = 0; i < g_state.numa.total_cpus; i++) {
-        g_state.numa.node_num_of_cpu[i] = numa_node_of_cpu(i);
-    }
-
-    FILE * fp = fopen("/sys/devices/system/cpu/online", "r");
-    if (fp == NULL) {
-        perror("fopen");
-        exit(EXIT_FAILURE);
-    }
-
-    int cpu0, cpu1;
     int logic_core_index = 0;
-    while (fscanf(fp, "%d", &cpu0) != EOF) {
-        cpu1 = cpu0;
-        while (fgetc(fp) == '-') {
-            fscanf(fp, "%d", &cpu1);
+
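+    // Optional override: GGML_NUMA_CORE_IDS lets the user pin explicit physical cores to
+    // each NUMA node; otherwise the mapping is read from /sys/devices/system/cpu/online.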
+    const char * env_var = getenv("GGML_NUMA_CORE_IDS");
+    if (env_var) {
+        int max_numa_nodes = GGML_NUMA_MIGRATE_NODES;
+        int ** core_ids = ggml_allocate_core_ids(max_numa_nodes, g_state.numa.total_cpus);
+        ggml_parse_cpu_core_ids(env_var, core_ids, max_numa_nodes, g_state.numa.total_cpus);
+
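+        // Every valid entry becomes the next logical core: record its physical id in
+        // cpu_core_mapping and count it towards its node.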
+        for (int node = 0; node < max_numa_nodes; node++) {
+            for (int core = 0; core < (int)g_state.numa.total_cpus; core++) {
+                int phy_core_id = core_ids[node][core];
+                if (phy_core_id != -1) {
+                    g_state.numa.node_num_of_cpu[phy_core_id] = node;
+                    g_state.numa.cpu_core_mapping[logic_core_index] = phy_core_id;
+                    g_state.numa.cores_per_numa[node]++;
+                    GGML_PRINT_DEBUG("setting core ids, core: %d, logic_core_index: %d, mapping: %d, cores_per_numa: %d, node_num_of_cpu: %d\n",
+                                     phy_core_id,
+                                     logic_core_index,
+                                     g_state.numa.cpu_core_mapping[logic_core_index],
+                                     g_state.numa.cores_per_numa[node],
+                                     g_state.numa.node_num_of_cpu[phy_core_id]);
+                    logic_core_index++;
+                    g_state.numa.logic_core_cnts++;
+                }
+            }
+        }
+        ggml_free_core_ids(core_ids, max_numa_nodes);
+    } else {
+        FILE * fp = fopen("/sys/devices/system/cpu/online", "r");
+        if (fp == NULL) {
+            perror("fopen");
+            exit(EXIT_FAILURE);
         }
 
-        for (int cpu_index = cpu0; cpu_index <= cpu1; cpu_index++) {
-            g_state.numa.cpu_core_mapping[logic_core_index++] = cpu_index;
-            int node = g_state.numa.node_num_of_cpu[cpu_index];
-            if (node < GGML_NUMA_MIGRATE_NODES) {
-                g_state.numa.cores_per_numa[node]++;
+        int cpu0, cpu1;
+        while (fscanf(fp, "%d", &cpu0) != EOF) {
+            cpu1 = cpu0;
+            while (fgetc(fp) == '-') {
+                fscanf(fp, "%d", &cpu1);
+            }
+
+            for (int cpu_index = cpu0; cpu_index <= cpu1; cpu_index++) {
+                g_state.numa.cpu_core_mapping[logic_core_index++] = cpu_index;
+                g_state.numa.node_num_of_cpu[cpu_index] = numa_node_of_cpu(cpu_index);
+                int node = g_state.numa.node_num_of_cpu[cpu_index];
+                if (node < GGML_NUMA_MIGRATE_NODES) {
+                    g_state.numa.logic_core_cnts++;
+                    g_state.numa.cores_per_numa[node]++;
+                }
             }
         }
-    }
 
-    fclose(fp);
+        fclose(fp);
+    }
 #endif
 
     if (ggml_is_numa()) {
@@ -3219,10 +3320,12 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
     threadpool->n_barrier_passed = 0;
 
 #ifdef GGML_USE_NUMA_MIGRATE
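+    // Resolve each logical node index to a physical NUMA node id before placing the
+    // per-node barrier counters with numa_alloc_onnode.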
+    ggml_backend_init_node_id();
     for (int node = 0; node < GGML_NUMA_MIGRATE_NODES; node++) {
-        threadpool->n_barrier_node[node] = (atomic_int *)numa_alloc_onnode(sizeof(atomic_int), node);
+        int node_id = ggml_backend_get_node_id(node);
+        threadpool->n_barrier_node[node] = (atomic_int *)numa_alloc_onnode(sizeof(atomic_int), node_id);
         *threadpool->n_barrier_node[node] = 0;
-        threadpool->n_barrier_passed_node[node] = (atomic_int *)numa_alloc_onnode(sizeof(atomic_int), node);
+        threadpool->n_barrier_passed_node[node] = (atomic_int *)numa_alloc_onnode(sizeof(atomic_int), node_id);
         *threadpool->n_barrier_passed_node[node] = 0;
     }
 