Use a separate atomic chunk counter for each expert in mul_mat_id [WIP]

slaren · slaren · commit b6bd4972850b · 2025-02-05T16:36:26.000+01:00
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1297,7 +1297,11 @@ struct ggml_threadpool {
     atomic_int n_graph;       // incremented when there is work to be done (i.e each graph)
     atomic_int GGML_CACHE_ALIGN n_barrier;
     atomic_int GGML_CACHE_ALIGN n_barrier_passed;
-    atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
+    atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
+
+    // store each counter in a separate cache line
+    char GGML_CACHE_ALIGN current_chunks[256][GGML_CACHE_LINE];
+    //atomic_int GGML_CACHE_ALIGN current_chunks[256];
 
     // these are atomic as an annotation for thread-sanitizer
     atomic_bool stop;         // Used for stopping the threadpool altogether
@@ -7744,7 +7748,8 @@ static void ggml_compute_forward_mul_mat_id(
 
     if (ith == 0) {
         // Every thread starts at ith, so the first unprocessed chunk is nth.  This save a bit of coordination right at the start.
-        atomic_store_explicit(&params->threadpool->current_chunk, nth, memory_order_relaxed);
+        //atomic_store_explicit(&params->threadpool->current_chunk, nth, memory_order_relaxed);
+        //memset(params->threadpool->current_chunks, 0, n_as*sizeof(params->threadpool->current_chunks[0]));
 
         // initialize matrix_row_counts
         memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
@@ -7760,6 +7765,12 @@ static void ggml_compute_forward_mul_mat_id(
                 matrix_row_counts[i02] += 1;
             }
         }
+    } else {
+        // reset current_chunk
+        for (int cur_a = ith; cur_a < n_as; cur_a += nth) {
+            atomic_int * current_chunk_ctr = (atomic_int *)(params->threadpool->current_chunks + cur_a);
+            atomic_store_explicit(current_chunk_ctr, nth, memory_order_relaxed);
+        }
     }
 
     ggml_barrier(params->threadpool);
@@ -7801,6 +7812,8 @@ static void ggml_compute_forward_mul_mat_id(
 
         int current_chunk = ith;
 
+        atomic_int * current_chunk_ctr = (atomic_int *)(params->threadpool->current_chunks + cur_a);
+
         while (current_chunk < nchunk0 * nchunk1) {
             const int64_t ith0 = current_chunk % nchunk0;
             const int64_t ith1 = current_chunk / nchunk0;
@@ -7821,21 +7834,21 @@ static void ggml_compute_forward_mul_mat_id(
                 break;
             }
 
-            current_chunk = atomic_fetch_add_explicit(&params->threadpool->current_chunk, 1, memory_order_relaxed);
+            current_chunk = atomic_fetch_add_explicit(current_chunk_ctr, 1, memory_order_relaxed);
         }
 
-        if (rows_processed == rows_total) {
-            break;
-        }
+        //if (rows_processed == rows_total) {
+        //    break;
+        //}
 
-        ggml_barrier(params->threadpool);
+        //ggml_barrier(params->threadpool);
 
-        if (ith == 0) {
-            // Every thread starts at ith, so the first unprocessed chunk is nth.  This save a bit of coordination right at the start.
-            atomic_store_explicit(&params->threadpool->current_chunk, nth, memory_order_relaxed);
-        }
+        //if (ith == 0) {
+        //    // Every thread starts at ith, so the first unprocessed chunk is nth.  This save a bit of coordination right at the start.
+        //    atomic_store_explicit(&params->threadpool->current_chunk, nth, memory_order_relaxed);
+        //}
 
-        ggml_barrier(params->threadpool);
+        //ggml_barrier(params->threadpool);
     }
 }
 
@@ -14079,6 +14092,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
         threadpool->n_barrier        = 0;
         threadpool->n_barrier_passed = 0;
         threadpool->current_chunk    = 0;
+        memset(threadpool->current_chunks, 0, sizeof(threadpool->current_chunks));
         threadpool->stop             = false;
         threadpool->pause            = tpp->paused;
         threadpool->abort            = -1;
@@ -14160,6 +14174,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         threadpool->cgraph           = cgraph;
         threadpool->cplan            = cplan;
         threadpool->current_chunk    = 0;
+        memset(threadpool->current_chunks, 0, sizeof(threadpool->current_chunks));
         threadpool->abort            = -1;
         threadpool->ec               = GGML_STATUS_SUCCESS;
     }
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
@@ -4324,12 +4324,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
     for (int bs : {1, 2, 3, 4, 5, 8, 512}) {
         for (ggml_type type_a : all_types) {
             for (ggml_type type_b : {GGML_TYPE_F32}) {
-                test_cases.emplace_back(new test_mul_mat(type_a, type_b, 4096, bs, 14336, {1,  1}, {1, 1}));
+                //test_cases.emplace_back(new test_mul_mat(type_a, type_b, 4096, bs, 14336, {1,  1}, {1, 1}));
             }
         }
     }
 
-#if 0
+#if 1
     for (int bs : {1, 64}) {
         for (ggml_type type_a : {GGML_TYPE_Q4_0}) {
             for (ggml_type type_b : {GGML_TYPE_F32}) {

Original file line number	Diff line number	Diff line change
`@@ -4324,12 +4324,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {`
`4324`	`4324`	`for (int bs : {1, 2, 3, 4, 5, 8, 512}) {`
`4325`	`4325`	`for (ggml_type type_a : all_types) {`
`4326`	`4326`	`for (ggml_type type_b : {GGML_TYPE_F32}) {`
`4327`		`- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 4096, bs, 14336, {1, 1}, {1, 1}));`
	`4327`	`+ //test_cases.emplace_back(new test_mul_mat(type_a, type_b, 4096, bs, 14336, {1, 1}, {1, 1}));`
`4328`	`4328`	`}`
`4329`	`4329`	`}`
`4330`	`4330`	`}`
`4331`	`4331`
`4332`		`-#if 0`
	`4332`	`+#if 1`
`4333`	`4333`	`for (int bs : {1, 64}) {`
`4334`	`4334`	`for (ggml_type type_a : {GGML_TYPE_Q4_0}) {`
`4335`	`4335`	`for (ggml_type type_b : {GGML_TYPE_F32}) {`