Skip to content

Commit b6bd497

Browse files
committed
use a different atomic for each expert [wip]
1 parent 7fd0ae5 commit b6bd497

File tree

2 files changed

+29
-14
lines changed

2 files changed

+29
-14
lines changed

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 27 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1297,7 +1297,11 @@ struct ggml_threadpool {
12971297
atomic_int n_graph; // incremented when there is work to be done (i.e each graph)
12981298
atomic_int GGML_CACHE_ALIGN n_barrier;
12991299
atomic_int GGML_CACHE_ALIGN n_barrier_passed;
1300-
atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
1300+
atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
1301+
1302+
// store each counter in a separate cache line
1303+
char GGML_CACHE_ALIGN current_chunks[256][GGML_CACHE_LINE];
1304+
//atomic_int GGML_CACHE_ALIGN current_chunks[256];
13011305

13021306
// these are atomic as an annotation for thread-sanitizer
13031307
atomic_bool stop; // Used for stopping the threadpool altogether
@@ -7744,7 +7748,8 @@ static void ggml_compute_forward_mul_mat_id(
77447748

77457749
if (ith == 0) {
77467750
// Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
7747-
atomic_store_explicit(&params->threadpool->current_chunk, nth, memory_order_relaxed);
7751+
//atomic_store_explicit(&params->threadpool->current_chunk, nth, memory_order_relaxed);
7752+
//memset(params->threadpool->current_chunks, 0, n_as*sizeof(params->threadpool->current_chunks[0]));
77487753

77497754
// initialize matrix_row_counts
77507755
memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
@@ -7760,6 +7765,12 @@ static void ggml_compute_forward_mul_mat_id(
77607765
matrix_row_counts[i02] += 1;
77617766
}
77627767
}
7768+
} else {
7769+
// reset current_chunk
7770+
for (int cur_a = ith; cur_a < n_as; cur_a += nth) {
7771+
atomic_int * current_chunk_ctr = (atomic_int *)(params->threadpool->current_chunks + cur_a);
7772+
atomic_store_explicit(current_chunk_ctr, nth, memory_order_relaxed);
7773+
}
77637774
}
77647775

77657776
ggml_barrier(params->threadpool);
@@ -7801,6 +7812,8 @@ static void ggml_compute_forward_mul_mat_id(
78017812

78027813
int current_chunk = ith;
78037814

7815+
atomic_int * current_chunk_ctr = (atomic_int *)(params->threadpool->current_chunks + cur_a);
7816+
78047817
while (current_chunk < nchunk0 * nchunk1) {
78057818
const int64_t ith0 = current_chunk % nchunk0;
78067819
const int64_t ith1 = current_chunk / nchunk0;
@@ -7821,21 +7834,21 @@ static void ggml_compute_forward_mul_mat_id(
78217834
break;
78227835
}
78237836

7824-
current_chunk = atomic_fetch_add_explicit(&params->threadpool->current_chunk, 1, memory_order_relaxed);
7837+
current_chunk = atomic_fetch_add_explicit(current_chunk_ctr, 1, memory_order_relaxed);
78257838
}
78267839

7827-
if (rows_processed == rows_total) {
7828-
break;
7829-
}
7840+
//if (rows_processed == rows_total) {
7841+
// break;
7842+
//}
78307843

7831-
ggml_barrier(params->threadpool);
7844+
//ggml_barrier(params->threadpool);
78327845

7833-
if (ith == 0) {
7834-
// Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
7835-
atomic_store_explicit(&params->threadpool->current_chunk, nth, memory_order_relaxed);
7836-
}
7846+
//if (ith == 0) {
7847+
// // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
7848+
// atomic_store_explicit(&params->threadpool->current_chunk, nth, memory_order_relaxed);
7849+
//}
78377850

7838-
ggml_barrier(params->threadpool);
7851+
//ggml_barrier(params->threadpool);
78397852
}
78407853
}
78417854

@@ -14079,6 +14092,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
1407914092
threadpool->n_barrier = 0;
1408014093
threadpool->n_barrier_passed = 0;
1408114094
threadpool->current_chunk = 0;
14095+
memset(threadpool->current_chunks, 0, sizeof(threadpool->current_chunks));
1408214096
threadpool->stop = false;
1408314097
threadpool->pause = tpp->paused;
1408414098
threadpool->abort = -1;
@@ -14160,6 +14174,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
1416014174
threadpool->cgraph = cgraph;
1416114175
threadpool->cplan = cplan;
1416214176
threadpool->current_chunk = 0;
14177+
memset(threadpool->current_chunks, 0, sizeof(threadpool->current_chunks));
1416314178
threadpool->abort = -1;
1416414179
threadpool->ec = GGML_STATUS_SUCCESS;
1416514180
}

tests/test-backend-ops.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4324,12 +4324,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
43244324
for (int bs : {1, 2, 3, 4, 5, 8, 512}) {
43254325
for (ggml_type type_a : all_types) {
43264326
for (ggml_type type_b : {GGML_TYPE_F32}) {
4327-
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 4096, bs, 14336, {1, 1}, {1, 1}));
4327+
//test_cases.emplace_back(new test_mul_mat(type_a, type_b, 4096, bs, 14336, {1, 1}, {1, 1}));
43284328
}
43294329
}
43304330
}
43314331

4332-
#if 0
4332+
#if 1
43334333
for (int bs : {1, 64}) {
43344334
for (ggml_type type_a : {GGML_TYPE_Q4_0}) {
43354335
for (ggml_type type_b : {GGML_TYPE_F32}) {

0 commit comments

Comments
 (0)