Skip to content

Commit f6a69ed

Browse files
committed
Support any number of cores per NUMA node; S_TG t/s increases from 36 to 41 with fewer cores
1 parent 53370cb commit f6a69ed

File tree

3 files changed

+48
-23
lines changed

3 files changed

+48
-23
lines changed

ggml/src/ggml-cpu/ggml-cpu-impl.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -513,8 +513,8 @@ enum ggml_barrier_node_index {
513513
GGML_BARRIER_NODE_CNTS = 3
514514
};
515515
void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n);
516-
int ggml_cores_per_numa(void);
517-
int ggml_get_node_from_cpu(int cpu);
516+
int ggml_cores_per_numa(int ith);
517+
int ggml_get_node_from_cpu(int ith);
518518
#endif
519519

520520
void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value);

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 45 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -522,7 +522,9 @@ struct ggml_numa_nodes {
522522
#endif
523523

524524
#ifdef GGML_USE_NUMA_MIGRATE
525-
bool even_distributed;
525+
int *node_num_of_cpu;
526+
int *cpu_core_mapping; // index: logical core (thread) number, value: CPU id taken from the online-CPU list
527+
int cores_per_numa[GGML_NUMA_MIGRATE_NODES];
526528
#endif
527529
};
528530

@@ -583,16 +585,18 @@ int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value) {
583585
}
584586

585587
#ifdef GGML_USE_NUMA_MIGRATE
586-
int ggml_get_node_from_cpu(int cpu) {
587-
return cpu / g_state.numa.nodes[0].n_cpus;
588+
int ggml_get_node_from_cpu(int ith) {
589+
int cpu = g_state.numa.cpu_core_mapping[ith];
590+
return g_state.numa.node_num_of_cpu[cpu];
588591
}
589592

590-
int ggml_cores_per_numa(void) {
591-
return g_state.numa.nodes[0].n_cpus;
593+
int ggml_cores_per_numa(int ith) {
594+
int node = ggml_get_node_from_cpu(ith);
595+
return g_state.numa.cores_per_numa[node];
592596
}
593597

594598
void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n) {
595-
if ((g_state.numa.numa_strategy != GGML_NUMA_STRATEGY_MIGRATE) || !g_state.numa.even_distributed) {
599+
if (g_state.numa.numa_strategy != GGML_NUMA_STRATEGY_MIGRATE) {
596600
ggml_barrier(tp);
597601
return;
598602
}
@@ -602,13 +606,8 @@ void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n) {
602606
return;
603607
}
604608

605-
int cores_per_numa = ggml_cores_per_numa();
606-
int numa_nodes = n_threads / cores_per_numa;
607-
int remaining_cores = n_threads % cores_per_numa;
608-
if ((numa_nodes != GGML_NUMA_MIGRATE_NODES) || remaining_cores) {
609-
ggml_barrier(tp);
610-
return;
611-
}
609+
int cores_per_numa = ggml_cores_per_numa(ith);
610+
int numa_nodes = GGML_NUMA_MIGRATE_NODES;
612611

613612
int node = ggml_get_node_from_cpu(ith);
614613

@@ -720,9 +719,6 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
720719
GGML_PRINT_DEBUG("CPUs on node %u:", n);
721720
node->n_cpus = 0;
722721

723-
#ifdef GGML_USE_NUMA_MIGRATE
724-
g_state.numa.even_distributed = true;
725-
#endif
726722
for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) {
727723
rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c);
728724
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
@@ -732,13 +728,41 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
732728
}
733729
}
734730
GGML_PRINT_DEBUG("\n");
731+
}
732+
735733
#ifdef GGML_USE_NUMA_MIGRATE
736-
if ((n != 0) && (g_state.numa.nodes[n].n_cpus != g_state.numa.nodes[0].n_cpus)) {
737-
g_state.numa.even_distributed = false;
734+
g_state.numa.node_num_of_cpu = (int *)malloc(g_state.numa.total_cpus * sizeof(int));
735+
g_state.numa.cpu_core_mapping = (int *)malloc(g_state.numa.total_cpus * sizeof(int));
736+
for (uint32_t i = 0; i < g_state.numa.total_cpus; i++) {
737+
g_state.numa.node_num_of_cpu[i] = numa_node_of_cpu(i);
738+
}
739+
740+
FILE *fp = fopen("/sys/devices/system/cpu/online", "r");
741+
if (fp == NULL) {
742+
perror("fopen");
743+
exit(EXIT_FAILURE);
744+
}
745+
746+
int cpu0, cpu1;
747+
int logic_core_index = 0;
748+
while (fscanf(fp, "%d", &cpu0) != EOF) {
749+
cpu1 = cpu0;
750+
while (fgetc(fp) == '-') {
751+
fscanf(fp, "%d", &cpu1);
752+
}
753+
754+
for (int cpu_index = cpu0; cpu_index <= cpu1; cpu_index++) {
755+
g_state.numa.cpu_core_mapping[logic_core_index++] = cpu_index;
756+
int node = g_state.numa.node_num_of_cpu[cpu_index];
757+
if (node < GGML_NUMA_MIGRATE_NODES) {
758+
g_state.numa.cores_per_numa[node]++;
759+
}
738760
}
739-
#endif
740761
}
741762

763+
fclose(fp);
764+
#endif
765+
742766
if (ggml_is_numa()) {
743767
FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
744768
if (fptr != NULL) {
@@ -2169,7 +2193,8 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
21692193
#if defined(__gnu_linux__)
21702194

21712195
#ifdef GGML_USE_NUMA_MIGRATE
2172-
static void set_numa_migrate_affinity(int core_no) {
2196+
static void set_numa_migrate_affinity(int thread_no) {
2197+
int core_no = g_state.numa.cpu_core_mapping[thread_no];
21732198
// Check if the core number is valid
21742199
if (core_no < 0 || core_no >= (int)g_state.numa.total_cpus) {
21752200
printf("%s, Warn: core_no not between 0 and %d, failback.\n", __func__, g_state.numa.total_cpus);

ggml/src/ggml-cpu/repack.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1249,7 +1249,7 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
12491249

12501250
int64_t i11_processed = 0;
12511251
#ifdef GGML_USE_NUMA_MIGRATE
1252-
int round_cnts = ggml_cores_per_numa();
1252+
int round_cnts = ggml_cores_per_numa(ith);
12531253
int start_id = ith - round_cnts * node_id;
12541254
if (round_cnts == 0) {
12551255
round_cnts = nth;

0 commit comments

Comments
 (0)