Skip to content

Commit 8835823

Browse files
committed
fix the perf regression caused by calling the numa_node_of_cpu() libnuma library function (which performs /sys lookups) on the hot path, and fix the out-of-bounds n_barrier_passed_last indexing issue
1 parent e9c9371 commit 8835823

File tree

3 files changed

+15
-13
lines changed

3 files changed

+15
-13
lines changed

ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6096,7 +6096,7 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
60966096
// GGML_ASSERT(ggml_n_dims(op->src[1]) == 2);
60976097

60986098
#ifdef GGML_USE_NUMA_MIGRATE
6099-
int node_id = numa_node_of_cpu(ith);
6099+
int node_id = ggml_get_node_from_cpu(ith);
61006100
char * wdata = static_cast<char *>(params->wdata_numa[node_id]);
61016101
#else
61026102
char * wdata = static_cast<char *>(params->wdata);

ggml/src/ggml-cpu/ggml-cpu-impl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -513,10 +513,12 @@ enum ggml_barrier_node_index {
513513
GGML_BARRIER_NODE_PING = 0,
514514
GGML_BARRIER_NODE_PONG = 1,
515515
GGML_BARRIER_NODE_LAST = 2,
516+
GGML_BARRIER_NODE_CNTS = 3
516517
};
517518
void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n);
518519
#ifdef GGML_USE_NUMA_MIGRATE
519520
int ggml_cores_per_numa(void);
521+
int ggml_get_node_from_cpu(int cpu);
520522
#endif
521523

522524
#ifdef __cplusplus

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -445,7 +445,7 @@ struct ggml_threadpool {
445445
#ifdef GGML_USE_NUMA_MIGRATE
446446
atomic_int GGML_CACHE_ALIGN *n_barrier_node[GGML_NUMA_MIGRATE_NODES];
447447
atomic_int GGML_CACHE_ALIGN *n_barrier_passed_node[GGML_NUMA_MIGRATE_NODES];
448-
atomic_int GGML_CACHE_ALIGN *n_barrier_passed_last;
448+
atomic_int GGML_CACHE_ALIGN *n_barrier_passed_last[GGML_BARRIER_NODE_CNTS];
449449
#endif
450450

451451
atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
@@ -568,8 +568,8 @@ void ggml_barrier(struct ggml_threadpool * tp) {
568568
}
569569

570570
#ifdef GGML_USE_NUMA_MIGRATE
571-
static int get_node_from_cpu(int cpu, int cores_per_numa) {
572-
return cpu / cores_per_numa;
571+
int ggml_get_node_from_cpu(int cpu) {
572+
return cpu / g_state.numa.nodes[0].n_cpus;
573573
}
574574

575575
int ggml_cores_per_numa(void) {
@@ -594,15 +594,15 @@ void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n) {
594594
return;
595595
}
596596

597-
int cores_per_numa = g_state.numa.nodes[0].n_cpus;
597+
int cores_per_numa = ggml_cores_per_numa();
598598
int numa_nodes = n_threads / cores_per_numa;
599599
int remaining_cores = n_threads % cores_per_numa;
600600
if ((numa_nodes != GGML_NUMA_MIGRATE_NODES) || remaining_cores) {
601601
ggml_barrier(tp);
602602
return;
603603
}
604604

605-
int node = get_node_from_cpu(ith, cores_per_numa);
605+
int node = ggml_get_node_from_cpu(ith);
606606

607607
int n_passed = atomic_load_explicit(tp->n_barrier_passed_node[node], memory_order_relaxed);
608608

@@ -613,13 +613,13 @@ void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n) {
613613
// last thread of current numa node
614614
atomic_store_explicit(tp->n_barrier_node[node], 0, memory_order_seq_cst);
615615

616-
int n_passed_node = atomic_fetch_add_explicit(&tp->n_barrier_passed_last[node_n], 1, memory_order_seq_cst);
616+
int n_passed_node = atomic_fetch_add_explicit(tp->n_barrier_passed_last[node_n], 1, memory_order_seq_cst);
617617

618618
if (n_passed_node == (numa_nodes - 1)) { // last numa node cpu
619619
atomic_fetch_add_explicit(tp->n_barrier_passed_node[node], 1, memory_order_seq_cst);
620-
atomic_store_explicit(&tp->n_barrier_passed_last[node_n], 0, memory_order_seq_cst);
620+
atomic_store_explicit(tp->n_barrier_passed_last[node_n], 0, memory_order_seq_cst);
621621
} else {
622-
while (atomic_load_explicit(&tp->n_barrier_passed_last[node_n], memory_order_relaxed)) {
622+
while (atomic_load_explicit(tp->n_barrier_passed_last[node_n], memory_order_relaxed)) {
623623
ggml_thread_cpu_relax();
624624
}
625625
atomic_fetch_add_explicit(tp->n_barrier_passed_node[node], 1, memory_order_seq_cst);
@@ -2968,7 +2968,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
29682968

29692969
#ifdef GGML_USE_NUMA_MIGRATE
29702970
for (int i = 0; i < GGML_NUMA_MIGRATE_NODES; i++) {
2971-
params.wdata_numa[i] = cplan->work_data_numa[numa_node_of_cpu(state->ith)];
2971+
params.wdata_numa[i] = cplan->work_data_numa[ggml_get_node_from_cpu(state->ith)];
29722972
}
29732973
#endif
29742974

@@ -3161,9 +3161,9 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
31613161
*threadpool->n_barrier_passed_node[node] = 0;
31623162
}
31633163

3164-
threadpool->n_barrier_passed_last = (atomic_int *)malloc(GGML_BARRIER_NODE_LAST * sizeof(atomic_int));
3165-
for (int i = 0; i < GGML_BARRIER_NODE_LAST; i++) {
3166-
threadpool->n_barrier_passed_last[i] = 0;
3164+
for (int i = 0; i < GGML_BARRIER_NODE_CNTS; i++) {
3165+
threadpool->n_barrier_passed_last[i] = (atomic_int *)malloc(sizeof(atomic_int));
3166+
*threadpool->n_barrier_passed_last[i] = 0;
31673167
}
31683168
#endif
31693169

0 commit comments

Comments (0)