Skip to content

Commit 1f3fdfb

Browse files
committed
Add GGML_NUMA_CORE_IDS to specify which cores the NUMA migrate feature uses (with 54 threads: 41 S_TG t/s, a 55% uplift)
1 parent f6a69ed commit 1f3fdfb

File tree

6 files changed

+184
-35
lines changed

6 files changed

+184
-35
lines changed

ggml/include/ggml-backend.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -350,6 +350,8 @@ extern "C" {
350350
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
351351
#ifdef GGML_USE_NUMA_MIGRATE
352352
GGML_API size_t ggml_backend_get_page_size(void);
353+
GGML_API int ggml_backend_get_node_id(int index);
354+
GGML_API void ggml_backend_init_node_id(void);
353355
#endif
354356

355357
#ifdef __cplusplus

ggml/src/ggml-backend.cpp

Lines changed: 51 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ class numa_migrate_mapping_cache {
6767

6868
static std::set<numa_migrate_mapping_cache> ggml_mapping_cache;
6969
static size_t ggml_backend_page_size = 0;
70+
static int ggml_backend_node_id[GGML_NUMA_MIGRATE_NODES];
71+
static bool ggml_backend_node_id_inited = false;
7072
static std::mutex ggml_mapping_mutex;
7173
#endif
7274

@@ -1718,7 +1720,7 @@ static int check_numa_pages_migration(void *addr, size_t total_size) {
17181720
int num_nodes = GGML_NUMA_MIGRATE_NODES;
17191721

17201722
for (int i = 0; i < num_nodes; ++i) {
1721-
int target_node = i;
1723+
int target_node = ggml_backend_node_id[i];
17221724
size_t size_to_migrate = total_size / num_nodes;
17231725

17241726
if (size_to_migrate > total_size - offset) {
@@ -1765,7 +1767,7 @@ static int check_numa_pages_migration(void *addr, size_t total_size) {
17651767
(void **)malloc(num_pages_to_migrate * sizeof(void *));
17661768
for (size_t j = 0; j < num_pages_to_migrate; j++) {
17671769
status[j] = 0;
1768-
nodes[j] = i;
1770+
nodes[j] = target_node;
17691771
addr_to_migrate[j] = (void *)((char *)migrate_start_addr +
17701772
j * ggml_backend_page_size);
17711773
}
@@ -1842,7 +1844,7 @@ static int migrate_pages_multiple_nodes(void *addr, size_t total_size) {
18421844
int num_nodes = GGML_NUMA_MIGRATE_NODES;
18431845

18441846
for (int i = 0; i < num_nodes; ++i) {
1845-
int target_node = i;
1847+
int target_node = ggml_backend_node_id[i];
18461848
size_t size_to_migrate = total_size / num_nodes;
18471849

18481850
if (size_to_migrate > total_size - offset) {
@@ -1891,7 +1893,7 @@ static int migrate_pages_multiple_nodes(void *addr, size_t total_size) {
18911893
(void **)malloc(num_pages_to_migrate * sizeof(void *));
18921894
for (size_t j = 0; j < num_pages_to_migrate; j++) {
18931895
status[j] = 0;
1894-
nodes[j] = i;
1896+
nodes[j] = target_node;
18951897
addr_to_migrate[j] = (void *)((char *)migrate_start_addr +
18961898
j * ggml_backend_page_size);
18971899
}
@@ -2244,18 +2246,62 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty
22442246
}
22452247

22462248
#ifdef GGML_USE_NUMA_MIGRATE
2249+
int ggml_backend_get_node_id(int index) {
2250+
return ggml_backend_node_id[index];
2251+
}
2252+
22472253
size_t ggml_backend_get_page_size(void) {
22482254
if (ggml_backend_page_size == 0) {
22492255
ggml_backend_page_size = sysconf(_SC_PAGE_SIZE);
22502256
}
22512257
return ggml_backend_page_size;
22522258
}
2259+
2260+
static void parse_numa_node_ids(const char *input, int *count, int *node_ids) {
2261+
*count = 0;
2262+
char *input_copy = strdup(input);
2263+
char *range = strtok(input_copy, ",");
2264+
2265+
while (range != NULL) {
2266+
if (strchr(range, '-') != NULL) {
2267+
int start, end;
2268+
sscanf(range, "%d-%d", &start, &end);
2269+
for (int i = start; i <= end; i++) {
2270+
node_ids[(*count)++] = i;
2271+
}
2272+
} else {
2273+
int node_id = atoi(range);
2274+
node_ids[(*count)++] = node_id;
2275+
}
2276+
range = strtok(NULL, ",");
2277+
}
2278+
2279+
free(input_copy);
2280+
}
2281+
2282+
void ggml_backend_init_node_id(void) {
2283+
if (ggml_backend_node_id_inited) {
2284+
return;
2285+
}
2286+
const char *env_var = getenv("GGML_NUMA_NODE_IDS");
2287+
2288+
if (env_var) {
2289+
int count = 0;
2290+
parse_numa_node_ids(env_var, &count, ggml_backend_node_id);
2291+
} else {
2292+
for (int node = 0; node < GGML_NUMA_MIGRATE_NODES; node++) {
2293+
ggml_backend_node_id[node] = node;
2294+
}
2295+
}
2296+
ggml_backend_node_id_inited = true;
2297+
}
22532298
#endif
22542299

22552300
static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
22562301
#ifdef GGML_USE_NUMA_MIGRATE
2302+
ggml_backend_init_node_id();
22572303
ggml_backend_get_page_size();
2258-
void * data = numa_alloc_onnode(size, 0);
2304+
void * data = numa_alloc_onnode(size, ggml_backend_node_id[0]);
22592305
#else
22602306
void * data = ggml_aligned_malloc(size);
22612307
#endif

ggml/src/ggml-cpu/ggml-cpu-impl.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -515,6 +515,7 @@ enum ggml_barrier_node_index {
515515
void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n);
516516
int ggml_cores_per_numa(int ith);
517517
int ggml_get_node_from_cpu(int ith);
518+
int ggml_get_start_id_in_node(int ith);
518519
#endif
519520

520521
void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value);

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 127 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -524,6 +524,7 @@ struct ggml_numa_nodes {
524524
#ifdef GGML_USE_NUMA_MIGRATE
525525
int *node_num_of_cpu;
526526
int *cpu_core_mapping; // x logic core, y physical core
527+
int logic_core_cnts;
527528
int cores_per_numa[GGML_NUMA_MIGRATE_NODES];
528529
#endif
529530
};
@@ -585,11 +586,80 @@ int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value) {
585586
}
586587

587588
#ifdef GGML_USE_NUMA_MIGRATE
589+
590+
// Allocates a num_nodes x max_cores table of core ids with every slot set to
// -1, the value used to mark an unused slot.
//
// Returns NULL when either dimension is not positive or when an allocation
// fails; rows allocated before the failure are released, so nothing leaks.
// On success the caller owns the table and must free every row and then the
// row array (see ggml_free_core_ids).
static int** ggml_allocate_core_ids(int num_nodes, int max_cores) {
    if (num_nodes <= 0 || max_cores <= 0) {
        return NULL;
    }

    int **core_ids = malloc((size_t) num_nodes * sizeof(*core_ids));
    if (core_ids == NULL) {
        return NULL;
    }

    for (int i = 0; i < num_nodes; i++) {
        core_ids[i] = malloc((size_t) max_cores * sizeof(*core_ids[i]));
        if (core_ids[i] == NULL) {
            // undo the partially built table
            for (int k = 0; k < i; k++) {
                free(core_ids[k]);
            }
            free(core_ids);
            return NULL;
        }
        for (int j = 0; j < max_cores; j++) {
            core_ids[i][j] = -1;
        }
    }
    return core_ids;
}
600+
601+
// Releases a table of core ids: frees each of the num_nodes rows, then the row
// array itself.
//
// core_ids:  table to free; NULL is accepted and ignored, mirroring free().
// num_nodes: number of rows in the table. Individual rows may be NULL.
static void ggml_free_core_ids(int **core_ids, int num_nodes) {
    if (core_ids == NULL) {
        return;
    }
    for (int i = 0; i < num_nodes; i++) {
        free(core_ids[i]);
    }
    free(core_ids);
}
607+
608+
// Parses one core entry spanning [token, token_end): either a single id "N"
// or an inclusive ascending range "N-M".
//
// On success stores the bounds in *first / *last (equal for a single id) and
// returns 1. Returns 0 when the entry has no digits, is negative, does not
// fit in an int, is a descending range, or has trailing characters other than
// blanks.
static int ggml_parse_core_range(const char *token, const char *token_end, long *first, long *last) {
    char *stop = NULL;

    // strtol cannot run past token_end: it only consumes blanks, a sign and
    // digits, none of which is a ',' or '|' separator
    const long lo = strtol(token, &stop, 10);
    // the int round-trip rejects values that do not fit in an int
    if (stop == token || lo < 0 || lo != (long) (int) lo) {
        return 0;
    }

    long hi = lo;
    if (*stop == '-') {
        const char *rhs = stop + 1;
        hi = strtol(rhs, &stop, 10);
        if (stop == rhs || hi < lo || hi != (long) (int) hi) {
            return 0;
        }
    }

    while (*stop == ' ' || *stop == '\t') {
        stop++;
    }
    if (stop != token_end) {
        return 0; // trailing garbage such as "3x" or "1-2-3"
    }

    *first = lo;
    *last  = hi;
    return 1;
}

// Parses a core-id specification of the form "a-b,c|d,e-f": groups are
// separated by '|' (one group per NUMA node, in order) and each group is a
// comma-separated list of core ids or inclusive ranges.
//
// env_var:            the specification; NULL is ignored.
// core_ids:           table written as core_ids[group][k]; slots that are not
//                     written keep whatever value the caller stored there.
// max_numa_nodes:     at most this many groups are read; later ones are ignored.
// max_cores_per_node: at most this many ids are stored per group.
//
// Empty groups ("||") and empty entries (",,") are skipped. Malformed entries
// are reported on stderr and skipped rather than being stored as core 0 or
// expanded from uninitialized bounds. The input is only read: no copy is
// allocated and no strtok state is touched.
static void ggml_parse_cpu_core_ids(const char *env_var, int **core_ids, int max_numa_nodes, int max_cores_per_node) {
    if (env_var == NULL || core_ids == NULL) {
        return;
    }

    const char *cursor = env_var;
    int node_count = 0;

    while (*cursor != '\0' && node_count < max_numa_nodes) {
        if (*cursor == '|') {
            cursor++; // empty group: does not consume a node slot
            continue;
        }

        int core_index = 0;

        // consume one group, up to the next '|' or the end of the string
        while (*cursor != '\0' && *cursor != '|') {
            if (*cursor == ',') {
                cursor++;
                continue;
            }

            const char *token_end = cursor;
            while (*token_end != '\0' && *token_end != ',' && *token_end != '|') {
                token_end++;
            }

            long first = 0;
            long last  = 0;
            if (ggml_parse_core_range(cursor, token_end, &first, &last)) {
                // long loop variable: no overflow when last is the largest int
                for (long id = first; id <= last && core_index < max_cores_per_node; id++) {
                    core_ids[node_count][core_index++] = (int) id;
                }
            } else {
                fprintf(stderr, "%s: ignoring malformed core id entry '%.*s'\n",
                        __func__, (int) (token_end - cursor), cursor);
            }

            cursor = token_end;
        }

        node_count++;
    }
}
641+
642+
588643
int ggml_get_node_from_cpu(int ith) {
589644
int cpu = g_state.numa.cpu_core_mapping[ith];
590645
return g_state.numa.node_num_of_cpu[cpu];
591646
}
592647

648+
int ggml_get_start_id_in_node(int ith) {
649+
int total_cpus = 0;
650+
int prev_total_cpus = 0;
651+
for (int node = 0; node < GGML_NUMA_MIGRATE_NODES; node++) {
652+
prev_total_cpus = total_cpus;
653+
total_cpus += g_state.numa.cores_per_numa[node];
654+
if (ith < total_cpus) {
655+
return (ith - prev_total_cpus);
656+
}
657+
}
658+
659+
assert(0);
660+
return -1;
661+
}
662+
593663
int ggml_cores_per_numa(int ith) {
594664
int node = ggml_get_node_from_cpu(ith);
595665
return g_state.numa.cores_per_numa[node];
@@ -605,6 +675,11 @@ void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n) {
605675
if (n_threads == 1) {
606676
return;
607677
}
678+
if (n_threads != g_state.numa.logic_core_cnts) {
679+
printf("bolt-test: n_threads: %d, g_state.numa.logic_core_cnts: %d\n", n_threads, g_state.numa.logic_core_cnts);
680+
ggml_barrier(tp);
681+
return;
682+
}
608683

609684
int cores_per_numa = ggml_cores_per_numa(ith);
610685
int numa_nodes = GGML_NUMA_MIGRATE_NODES;
@@ -733,34 +808,60 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
733808
#ifdef GGML_USE_NUMA_MIGRATE
734809
g_state.numa.node_num_of_cpu = (int *)malloc(g_state.numa.total_cpus * sizeof(int));
735810
g_state.numa.cpu_core_mapping = (int *)malloc(g_state.numa.total_cpus * sizeof(int));
736-
for (uint32_t i = 0; i < g_state.numa.total_cpus; i++) {
737-
g_state.numa.node_num_of_cpu[i] = numa_node_of_cpu(i);
738-
}
739-
740-
FILE *fp = fopen("/sys/devices/system/cpu/online", "r");
741-
if (fp == NULL) {
742-
perror("fopen");
743-
exit(EXIT_FAILURE);
744-
}
745-
746-
int cpu0, cpu1;
747811
int logic_core_index = 0;
748-
while (fscanf(fp, "%d", &cpu0) != EOF) {
749-
cpu1 = cpu0;
750-
while (fgetc(fp) == '-') {
751-
fscanf(fp, "%d", &cpu1);
812+
813+
const char *env_var = getenv("GGML_NUMA_CORE_IDS");
814+
if (env_var) {
815+
int max_numa_nodes = GGML_NUMA_MIGRATE_NODES;
816+
int **core_ids = ggml_allocate_core_ids(max_numa_nodes, g_state.numa.total_cpus);
817+
ggml_parse_cpu_core_ids(env_var, core_ids, max_numa_nodes, g_state.numa.total_cpus);
818+
819+
for (int node = 0; node < max_numa_nodes; node++) {
820+
for (int core = 0; core < (int)g_state.numa.total_cpus; core++) {
821+
int phy_core_id = core_ids[node][core];
822+
if (phy_core_id != -1) {
823+
g_state.numa.node_num_of_cpu[phy_core_id] = node;
824+
g_state.numa.cpu_core_mapping[logic_core_index] = phy_core_id;
825+
g_state.numa.cores_per_numa[node]++;
826+
GGML_PRINT_DEBUG("setting core ids, core: %d, logic_core_index: %d, mapping: %d, cores_per_numa: %d, node_num_of_cpu: %d\n",
827+
phy_core_id,
828+
logic_core_index,
829+
g_state.numa.cpu_core_mapping[logic_core_index],
830+
g_state.numa.cores_per_numa[node],
831+
g_state.numa.node_num_of_cpu[phy_core_id]);
832+
logic_core_index++;
833+
g_state.numa.logic_core_cnts++;
834+
}
835+
}
836+
}
837+
ggml_free_core_ids(core_ids, max_numa_nodes);
838+
} else {
839+
FILE *fp = fopen("/sys/devices/system/cpu/online", "r");
840+
if (fp == NULL) {
841+
perror("fopen");
842+
exit(EXIT_FAILURE);
752843
}
753844

754-
for (int cpu_index = cpu0; cpu_index <= cpu1; cpu_index++) {
755-
g_state.numa.cpu_core_mapping[logic_core_index++] = cpu_index;
756-
int node = g_state.numa.node_num_of_cpu[cpu_index];
757-
if (node < GGML_NUMA_MIGRATE_NODES) {
758-
g_state.numa.cores_per_numa[node]++;
845+
int cpu0, cpu1;
846+
while (fscanf(fp, "%d", &cpu0) != EOF) {
847+
cpu1 = cpu0;
848+
while (fgetc(fp) == '-') {
849+
fscanf(fp, "%d", &cpu1);
850+
}
851+
852+
for (int cpu_index = cpu0; cpu_index <= cpu1; cpu_index++) {
853+
g_state.numa.cpu_core_mapping[logic_core_index++] = cpu_index;
854+
g_state.numa.node_num_of_cpu[cpu_index] = numa_node_of_cpu(cpu_index);
855+
int node = g_state.numa.node_num_of_cpu[cpu_index];
856+
if (node < GGML_NUMA_MIGRATE_NODES) {
857+
g_state.numa.logic_core_cnts++;
858+
g_state.numa.cores_per_numa[node]++;
859+
}
759860
}
760861
}
761-
}
762862

763-
fclose(fp);
863+
fclose(fp);
864+
}
764865
#endif
765866

766867
if (ggml_is_numa()) {
@@ -3219,10 +3320,12 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
32193320
threadpool->n_barrier_passed = 0;
32203321

32213322
#ifdef GGML_USE_NUMA_MIGRATE
3323+
ggml_backend_init_node_id();
32223324
for (int node = 0; node < GGML_NUMA_MIGRATE_NODES; node++) {
3223-
threadpool->n_barrier_node[node] = (atomic_int *)numa_alloc_onnode(sizeof(atomic_int), node);
3325+
int node_id = ggml_backend_get_node_id(node);
3326+
threadpool->n_barrier_node[node] = (atomic_int *)numa_alloc_onnode(sizeof(atomic_int), node_id);
32243327
*threadpool->n_barrier_node[node] = 0;
3225-
threadpool->n_barrier_passed_node[node] = (atomic_int *)numa_alloc_onnode(sizeof(atomic_int), node);
3328+
threadpool->n_barrier_passed_node[node] = (atomic_int *)numa_alloc_onnode(sizeof(atomic_int), node_id);
32263329
*threadpool->n_barrier_passed_node[node] = 0;
32273330
}
32283331

ggml/src/ggml-cpu/ggml-cpu.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s
176176

177177
#ifdef GGML_USE_NUMA_MIGRATE
178178
for (int i = 0; i < GGML_NUMA_MIGRATE_NODES; i++) {
179-
cpu_ctx->work_data_numa[i] = (uint8_t *)numa_alloc_onnode(cplan.work_size, i);
179+
cpu_ctx->work_data_numa[i] = (uint8_t *)numa_alloc_onnode(cplan.work_size, ggml_backend_get_node_id(i));
180180
if (cpu_ctx->work_data_numa[i] == NULL) {
181181
cpu_ctx->work_size = 0;
182182
return GGML_STATUS_ALLOC_FAILED;

ggml/src/ggml-cpu/repack.cpp

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1250,11 +1250,8 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
12501250
int64_t i11_processed = 0;
12511251
#ifdef GGML_USE_NUMA_MIGRATE
12521252
int round_cnts = ggml_cores_per_numa(ith);
1253-
int start_id = ith - round_cnts * node_id;
1254-
if (round_cnts == 0) {
1255-
round_cnts = nth;
1256-
start_id = ith;
1257-
}
1253+
assert(round_cnts);
1254+
int start_id = ggml_get_start_id_in_node(ith);
12581255
#else
12591256
int round_cnts = nth;
12601257
int start_id = ith;

0 commit comments

Comments
 (0)