
Commit 23b4499

threadpool: reduce pause/resume/wakeup overhead in common cases

The threadpool is now started in the paused state only when two threadpools are in use (i.e. a separate batch threadpool has been created). Resume is implicit: submitting new work wakes a paused pool, which reduces locking and context-switch overhead in the common single-pool case.

1 parent 5ccc5ef commit 23b4499
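The core of the change is the wakeup path in ggml_graph_compute: when new work is published, a paused threadpool is resumed as part of the same notification instead of requiring a separate ggml_resume_threadpool() call. A condensed sketch of the intended flow, distilled from the ggml.c hunks below (not verbatim code):

    // non-polling pool: one lock, one broadcast
    ggml_mutex_lock(&threadpool->mutex);
    threadpool->new_work = true;
    if (threadpool->pause) {
        __ggml_resume_threadpool(threadpool);   // clears pause and broadcasts the cond
    } else {
        ggml_cond_broadcast(&threadpool->cond); // workers wake and see new_work
    }
    ggml_mutex_unlock(&threadpool->mutex);

    // polling pool: no lock needed unless we actually have to resume
    threadpool->new_work = true;
    if (threadpool->pause) {
        ggml_mutex_lock(&threadpool->mutex);
        __ggml_resume_threadpool(threadpool);
        ggml_mutex_unlock(&threadpool->mutex);
    }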

File tree: 5 files changed (+47, -25 lines)

common/common.cpp
Lines changed: 1 addition & 0 deletions

@@ -2602,6 +2602,7 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
     tpp.prio = params.priority;
     tpp.poll = params.poll;
     tpp.strict_cpu = params.strict_cpu;
+    tpp.paused = false;

     return tpp;
 }

examples/main/main.cpp
Lines changed: 14 additions & 11 deletions

@@ -230,17 +230,6 @@ int main(int argc, char ** argv) {
     struct ggml_threadpool_params tpp =
         ggml_threadpool_params_from_cpu_params(params.cpuparams);

-    struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp);
-    if (!threadpool) {
-        LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
-        exit(1);
-    }
-
-    llama_attach_threadpool(ctx, threadpool);
-    if (ctx_guidance) {
-        llama_attach_threadpool(ctx_guidance, threadpool);
-    }
-
     struct ggml_compute_threadpool * threadpool_batch = NULL;
     if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
         threadpool_batch = ggml_create_threadpool(&tpp_batch);
@@ -253,6 +242,20 @@ int main(int argc, char ** argv) {
         if (ctx_guidance) {
             llama_attach_batch_threadpool(ctx_guidance, threadpool_batch);
         }
+
+        // Start the non-batch threadpool in the paused state
+        tpp.paused = true;
+    }
+
+    struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp);
+    if (!threadpool) {
+        LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+        exit(1);
+    }
+
+    llama_attach_threadpool(ctx, threadpool);
+    if (ctx_guidance) {
+        llama_attach_threadpool(ctx_guidance, threadpool);
     }

     const int n_ctx_train = llama_n_ctx_train(model);
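The reordering in main.cpp is what makes the commit message's "only if we have two" condition concrete: tpp.paused must be decided before the non-batch pool is created, so creation now happens after the batch-pool check. A condensed view of the resulting control flow, using the same identifiers as the hunk above (error handling and the ctx_guidance attachments are omitted):

    struct ggml_compute_threadpool * threadpool_batch = NULL;
    if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
        threadpool_batch = ggml_create_threadpool(&tpp_batch); // separate pool for batch/prompt work
        tpp.paused = true;                                      // the generation pool can start parked
    }

    struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp);
    llama_attach_threadpool(ctx, threadpool);                   // first decode wakes it implicitly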

ggml/include/ggml.h
Lines changed: 1 addition & 0 deletions

@@ -633,6 +633,7 @@ extern "C" {
         int32_t prio;
         bool poll;
         bool strict_cpu;
+        bool paused;
     };

     struct ggml_compute_threadpool; // forward declaration, see ggml.c
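Callers that fill in ggml_threadpool_params by hand need to set the new field explicitly (a designated-initializer struct leaves it false, i.e. not paused). A minimal sketch, assuming only the fields visible in this diff and in the ggml.c initializer below; the thread count is illustrative:

    struct ggml_threadpool_params tpp = {
        .mask_specified = false,
        .n_threads = 8,      // illustrative
        .prio = 0,
        .poll = false,
        .strict_cpu = false,
        .paused = true,      // create the pool suspended; it wakes on the first graph compute
    };

    struct ggml_compute_threadpool * tp = ggml_create_threadpool(&tpp);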

ggml/src/ggml.c
Lines changed: 31 additions & 11 deletions

@@ -18872,14 +18872,27 @@ void ggml_release_threadpool(struct ggml_compute_threadpool* threadpool) {
     GGML_ALIGNED_FREE(threadpool);
 }

+#ifndef GGML_USE_OPENMP
+// pause/resume must be called under mutex
+static void __ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) {
+    GGML_PRINT_DEBUG("Pausing threadpool\n");
+    threadpool->pause = true;
+    ggml_cond_broadcast(&threadpool->cond);
+}
+
+static void __ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool) {
+    GGML_PRINT_DEBUG("Resuming threadpool\n");
+    threadpool->pause = false;
+    ggml_cond_broadcast(&threadpool->cond);
+}
+#endif
+
 void ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) {
 #ifndef GGML_USE_OPENMP
     GGML_ASSERT(!threadpool->disposable);
-    GGML_PRINT_DEBUG("Pausing threadpool\n");
     ggml_mutex_lock(&threadpool->mutex);
     if (!threadpool->pause) {
-        threadpool->pause = true;
-        ggml_cond_broadcast(&threadpool->cond);
+        __ggml_pause_threadpool(threadpool);
     }
     ggml_mutex_unlock(&threadpool->mutex);
 #else
@@ -18890,12 +18903,9 @@ void ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) {
 void ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool) {
 #ifndef GGML_USE_OPENMP
     GGML_ASSERT(!threadpool->disposable);
-    GGML_PRINT_DEBUG("Resuming threadpool\n");
-
     ggml_mutex_lock(&threadpool->mutex);
     if (threadpool->pause) {
-        threadpool->pause = false;
-        ggml_cond_broadcast(&threadpool->cond);
+        __ggml_resume_threadpool(threadpool);
     }
     ggml_mutex_unlock(&threadpool->mutex);
 #else
@@ -19237,7 +19247,7 @@ static struct ggml_compute_threadpool * ggml_create_threadpool_impl(
     threadpool->n_barrier_passed = 0;
     threadpool->current_chunk = 0;
     threadpool->stop = false;
-    threadpool->pause = disposable ? false : true;
+    threadpool->pause = disposable ? false : tpp->paused;
     threadpool->new_work = false;
     threadpool->workers = NULL;
     threadpool->n_threads_max = tpp->n_threads;
@@ -19327,9 +19337,10 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         struct ggml_threadpool_params ttp = {
             .mask_specified = false,
             .n_threads = n_threads,
-            .prio = 1,
+            .prio = 0,
             .poll = false,
-            .strict_cpu = false
+            .strict_cpu = false,
+            .paused = false
         };

         threadpool = ggml_create_threadpool_impl(&ttp, true, cgraph, cplan);
@@ -19383,10 +19394,19 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         if (!threadpool->poll) {
             ggml_mutex_lock(&threadpool->mutex);
             threadpool->new_work = true;
-            ggml_cond_broadcast(&threadpool->cond);
+            if (threadpool->pause) {
+                __ggml_resume_threadpool(threadpool);
+            } else {
+                ggml_cond_broadcast(&threadpool->cond);
+            }
             ggml_mutex_unlock(&threadpool->mutex);
         } else {
             threadpool->new_work = true;
+            if (threadpool->pause) {
+                ggml_mutex_lock(&threadpool->mutex);
+                __ggml_resume_threadpool(threadpool);
+                ggml_mutex_unlock(&threadpool->mutex);
+            }
         }
     }
     // this is a work thread too
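Taken together, the contract after this change is "pause explicitly, resume implicitly": ggml_resume_threadpool() still exists, but callers normally never need it because ggml_graph_compute() resumes a paused pool when it publishes new work. A hedged usage sketch using the calls that appear in this commit (llama_decode is assumed from the regular llama.cpp API; variable names are illustrative):

    struct ggml_compute_threadpool * tp = ggml_create_threadpool(&tpp); // tpp.paused may be true
    llama_attach_threadpool(ctx, tp);

    // No explicit resume: the first graph compute sets new_work and, if the pool
    // is still paused, resumes it under the threadpool mutex.
    llama_decode(ctx, batch);

    // Explicit pause remains the way to park a pool that will sit idle.
    ggml_pause_threadpool(tp);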

src/llama.cpp
Lines changed: 0 additions & 3 deletions

@@ -15539,17 +15539,14 @@ static std::pair<int32_t, ggml_compute_threadpool_t> llama_swap_threadpools(
         // Switch between the 2 threadpools as needed
         if (n_tokens > 1) {
             ggml_pause_threadpool(lctx.threadpool);
-            ggml_resume_threadpool(lctx.threadpool_batch);
             threadpool = lctx.threadpool_batch;
             n_threads = cparams.n_threads_batch;
         } else {
             ggml_pause_threadpool(lctx.threadpool_batch);
-            ggml_resume_threadpool(lctx.threadpool);
             threadpool = lctx.threadpool;
             n_threads = cparams.n_threads;
         }
     } else if (lctx.threadpool) {
-        ggml_resume_threadpool(lctx.threadpool);
         threadpool = lctx.threadpool;
         n_threads = cparams.n_threads;
     }
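With resume implicit, llama_swap_threadpools only has to park the pool it is switching away from; the pool it switches to is woken by the next graph compute. A condensed sketch of the resulting two-pool branch (mirrors the hunk above, locals as in the diff):

    if (n_tokens > 1) {
        ggml_pause_threadpool(lctx.threadpool);       // park the single-token pool
        threadpool = lctx.threadpool_batch;           // resumed implicitly on the next compute
        n_threads = cparams.n_threads_batch;
    } else {
        ggml_pause_threadpool(lctx.threadpool_batch); // park the batch pool
        threadpool = lctx.threadpool;
        n_threads = cparams.n_threads;
    }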
