Skip to content

Commit 45cede2

Browse files
committed
use a different atomic for each expert [wip]
1 parent 7fd0ae5 commit 45cede2

File tree

1 file changed

+30
-15
lines changed

1 file changed

+30
-15
lines changed

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 30 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1297,7 +1297,11 @@ struct ggml_threadpool {
12971297
atomic_int n_graph; // incremented when there is work to be done (i.e each graph)
12981298
atomic_int GGML_CACHE_ALIGN n_barrier;
12991299
atomic_int GGML_CACHE_ALIGN n_barrier_passed;
1300-
atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
1300+
atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
1301+
1302+
// store each counter in a separate cache line
1303+
char GGML_CACHE_ALIGN current_chunks[256][GGML_CACHE_LINE];
1304+
//atomic_int GGML_CACHE_ALIGN current_chunks[256];
13011305

13021306
// these are atomic as an annotation for thread-sanitizer
13031307
atomic_bool stop; // Used for stopping the threadpool altogether
@@ -7744,7 +7748,8 @@ static void ggml_compute_forward_mul_mat_id(
77447748

77457749
if (ith == 0) {
77467750
// Every thread starts at ith, so the first unprocessed chunk is nth. This saves a bit of coordination right at the start.
7747-
atomic_store_explicit(&params->threadpool->current_chunk, nth, memory_order_relaxed);
7751+
//atomic_store_explicit(&params->threadpool->current_chunk, nth, memory_order_relaxed);
7752+
//memset(params->threadpool->current_chunks, 0, n_as*sizeof(params->threadpool->current_chunks[0]));
77487753

77497754
// initialize matrix_row_counts
77507755
memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
@@ -7760,12 +7765,18 @@ static void ggml_compute_forward_mul_mat_id(
77607765
matrix_row_counts[i02] += 1;
77617766
}
77627767
}
7768+
} else {
7769+
// reset current_chunk
7770+
for (int cur_a = ith - 1; cur_a < n_as; cur_a += (nth - 1)) {
7771+
atomic_int * current_chunk_ctr = (atomic_int *)(params->threadpool->current_chunks + cur_a);
7772+
atomic_store_explicit(current_chunk_ctr, nth, memory_order_relaxed);
7773+
}
77637774
}
77647775

77657776
ggml_barrier(params->threadpool);
77667777

7767-
const int64_t rows_total = ggml_nelements(ids);
7768-
int64_t rows_processed = 0;
7778+
//const int64_t rows_total = ggml_nelements(ids);
7779+
//int64_t rows_processed = 0;
77697780

77707781
for (int cur_a = 0; cur_a < n_as; ++cur_a) {
77717782
const int64_t cne1 = matrix_row_counts[cur_a];
@@ -7774,7 +7785,7 @@ static void ggml_compute_forward_mul_mat_id(
77747785
continue;
77757786
}
77767787

7777-
rows_processed += cne1;
7788+
//rows_processed += cne1;
77787789

77797790
const char * src0_cur = (const char *) src0->data + cur_a * nb02;
77807791
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
@@ -7801,6 +7812,8 @@ static void ggml_compute_forward_mul_mat_id(
78017812

78027813
int current_chunk = ith;
78037814

7815+
atomic_int * current_chunk_ctr = (atomic_int *)(params->threadpool->current_chunks + cur_a);
7816+
78047817
while (current_chunk < nchunk0 * nchunk1) {
78057818
const int64_t ith0 = current_chunk % nchunk0;
78067819
const int64_t ith1 = current_chunk / nchunk0;
@@ -7821,21 +7834,21 @@ static void ggml_compute_forward_mul_mat_id(
78217834
break;
78227835
}
78237836

7824-
current_chunk = atomic_fetch_add_explicit(&params->threadpool->current_chunk, 1, memory_order_relaxed);
7837+
current_chunk = atomic_fetch_add_explicit(current_chunk_ctr, 1, memory_order_relaxed);
78257838
}
78267839

7827-
if (rows_processed == rows_total) {
7828-
break;
7829-
}
7840+
//if (rows_processed == rows_total) {
7841+
// break;
7842+
//}
78307843

7831-
ggml_barrier(params->threadpool);
7844+
//ggml_barrier(params->threadpool);
78327845

7833-
if (ith == 0) {
7834-
// Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
7835-
atomic_store_explicit(&params->threadpool->current_chunk, nth, memory_order_relaxed);
7836-
}
7846+
//if (ith == 0) {
7847+
// // Every thread starts at ith, so the first unprocessed chunk is nth. This saves a bit of coordination right at the start.
7848+
// atomic_store_explicit(&params->threadpool->current_chunk, nth, memory_order_relaxed);
7849+
//}
78377850

7838-
ggml_barrier(params->threadpool);
7851+
//ggml_barrier(params->threadpool);
78397852
}
78407853
}
78417854

@@ -14079,6 +14092,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
1407914092
threadpool->n_barrier = 0;
1408014093
threadpool->n_barrier_passed = 0;
1408114094
threadpool->current_chunk = 0;
14095+
memset(threadpool->current_chunks, 0, sizeof(threadpool->current_chunks));
1408214096
threadpool->stop = false;
1408314097
threadpool->pause = tpp->paused;
1408414098
threadpool->abort = -1;
@@ -14160,6 +14174,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
1416014174
threadpool->cgraph = cgraph;
1416114175
threadpool->cplan = cplan;
1416214176
threadpool->current_chunk = 0;
14177+
memset(threadpool->current_chunks, 0, sizeof(threadpool->current_chunks));
1416314178
threadpool->abort = -1;
1416414179
threadpool->ec = GGML_STATUS_SUCCESS;
1416514180
}

0 commit comments

Comments
 (0)