
Commit 23b4499

threadpool: reduce pause/resume/wakeup overhead in common cases

The threadpool is now started in the paused state only when two threadpools are in use (i.e. a separate batch threadpool has been created). Resume is implicit: submitting new work wakes a paused pool, which reduces locking and context-switch overhead in the common single-pool case.

1 parent 5ccc5ef commit 23b4499
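The core of the change is the wakeup path in ggml_graph_compute: when new work is published, a paused threadpool is resumed as part of the same notification instead of requiring a separate ggml_resume_threadpool() call. A condensed sketch of the intended flow, distilled from the ggml.c hunks below (not verbatim code):

    // non-polling pool: one lock, one broadcast
    ggml_mutex_lock(&threadpool->mutex);
    threadpool->new_work = true;
    if (threadpool->pause) {
        __ggml_resume_threadpool(threadpool);   // clears pause and broadcasts the cond
    } else {
        ggml_cond_broadcast(&threadpool->cond); // workers wake and see new_work
    }
    ggml_mutex_unlock(&threadpool->mutex);

    // polling pool: no lock needed unless we actually have to resume
    threadpool->new_work = true;
    if (threadpool->pause) {
        ggml_mutex_lock(&threadpool->mutex);
        __ggml_resume_threadpool(threadpool);
        ggml_mutex_unlock(&threadpool->mutex);
    }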

File tree: 5 files changed (+47, -25 lines)

common/common.cpp
Lines changed: 1 addition & 0 deletions

@@ -2602,6 +2602,7 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
     tpp.prio = params.priority;
     tpp.poll = params.poll;
     tpp.strict_cpu = params.strict_cpu;
+    tpp.paused = false;

     return tpp;
 }

examples/main/main.cpp
Lines changed: 14 additions & 11 deletions

@@ -230,17 +230,6 @@ int main(int argc, char ** argv) {
     struct ggml_threadpool_params tpp =
         ggml_threadpool_params_from_cpu_params(params.cpuparams);

-    struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp);
-    if (!threadpool) {
-        LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
-        exit(1);
-    }
-
-    llama_attach_threadpool(ctx, threadpool);
-    if (ctx_guidance) {
-        llama_attach_threadpool(ctx_guidance, threadpool);
-    }
-
     struct ggml_compute_threadpool * threadpool_batch = NULL;
     if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
         threadpool_batch = ggml_create_threadpool(&tpp_batch);
@@ -253,6 +242,20 @@ int main(int argc, char ** argv) {
         if (ctx_guidance) {
             llama_attach_batch_threadpool(ctx_guidance, threadpool_batch);
         }
+
+        // Start the non-batch threadpool in the paused state
+        tpp.paused = true;
+    }
+
+    struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp);
+    if (!threadpool) {
+        LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+        exit(1);
+    }
+
+    llama_attach_threadpool(ctx, threadpool);
+    if (ctx_guidance) {
+        llama_attach_threadpool(ctx_guidance, threadpool);
     }

     const int n_ctx_train = llama_n_ctx_train(model);
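The reordering in main.cpp is what makes the commit message's "only if we have two" condition concrete: tpp.paused must be decided before the non-batch pool is created, so creation now happens after the batch-pool check. A condensed view of the resulting control flow, using the same identifiers as the hunk above (error handling and the ctx_guidance attachments are omitted):

    struct ggml_compute_threadpool * threadpool_batch = NULL;
    if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
        threadpool_batch = ggml_create_threadpool(&tpp_batch); // separate pool for batch/prompt work
        tpp.paused = true;                                      // the generation pool can start parked
    }

    struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp);
    llama_attach_threadpool(ctx, threadpool);                   // first decode wakes it implicitly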

ggml/include/ggml.h
Lines changed: 1 addition & 0 deletions

@@ -633,6 +633,7 @@ extern "C" {
         int32_t prio;
         bool poll;
         bool strict_cpu;
+        bool paused;
     };

     struct ggml_compute_threadpool; // forward declaration, see ggml.c
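Callers that fill in ggml_threadpool_params by hand need to set the new field explicitly (a designated-initializer struct leaves it false, i.e. not paused). A minimal sketch, assuming only the fields visible in this diff and in the ggml.c initializer below; the thread count is illustrative:

    struct ggml_threadpool_params tpp = {
        .mask_specified = false,
        .n_threads = 8,      // illustrative
        .prio = 0,
        .poll = false,
        .strict_cpu = false,
        .paused = true,      // create the pool suspended; it wakes on the first graph compute
    };

    struct ggml_compute_threadpool * tp = ggml_create_threadpool(&tpp);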

ggml/src/ggml.c
Lines changed: 31 additions & 11 deletions

@@ -18872,14 +18872,27 @@ void ggml_release_threadpool(struct ggml_compute_threadpool* threadpool) {
     GGML_ALIGNED_FREE(threadpool);
 }

+#ifndef GGML_USE_OPENMP
+// pause/resume must be called under mutex
+static void __ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) {
+    GGML_PRINT_DEBUG("Pausing threadpool\n");
+    threadpool->pause = true;
+    ggml_cond_broadcast(&threadpool->cond);
+}
+
+static void __ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool) {
+    GGML_PRINT_DEBUG("Resuming threadpool\n");
+    threadpool->pause = false;
+    ggml_cond_broadcast(&threadpool->cond);
+}
+#endif
+
 void ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) {
 #ifndef GGML_USE_OPENMP
     GGML_ASSERT(!threadpool->disposable);
-    GGML_PRINT_DEBUG("Pausing threadpool\n");
     ggml_mutex_lock(&threadpool->mutex);
     if (!threadpool->pause) {
-        threadpool->pause = true;
-        ggml_cond_broadcast(&threadpool->cond);
+        __ggml_pause_threadpool(threadpool);
     }
     ggml_mutex_unlock(&threadpool->mutex);
 #else
@@ -18890,12 +18903,9 @@ void ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) {
 void ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool) {
 #ifndef GGML_USE_OPENMP
     GGML_ASSERT(!threadpool->disposable);
-    GGML_PRINT_DEBUG("Resuming threadpool\n");
-
     ggml_mutex_lock(&threadpool->mutex);
     if (threadpool->pause) {
-        threadpool->pause = false;
-        ggml_cond_broadcast(&threadpool->cond);
+        __ggml_resume_threadpool(threadpool);
     }
     ggml_mutex_unlock(&threadpool->mutex);
 #else
@@ -19237,7 +19247,7 @@ static struct ggml_compute_threadpool * ggml_create_threadpool_impl(
     threadpool->n_barrier_passed = 0;
     threadpool->current_chunk = 0;
     threadpool->stop = false;
-    threadpool->pause = disposable ? false : true;
+    threadpool->pause = disposable ? false : tpp->paused;
     threadpool->new_work = false;
     threadpool->workers = NULL;
     threadpool->n_threads_max = tpp->n_threads;
@@ -19327,9 +19337,10 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         struct ggml_threadpool_params ttp = {
             .mask_specified = false,
             .n_threads = n_threads,
-            .prio = 1,
+            .prio = 0,
             .poll = false,
-            .strict_cpu = false
+            .strict_cpu = false,
+            .paused = false
         };

         threadpool = ggml_create_threadpool_impl(&ttp, true, cgraph, cplan);
@@ -19383,10 +19394,19 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         if (!threadpool->poll) {
             ggml_mutex_lock(&threadpool->mutex);
             threadpool->new_work = true;
-            ggml_cond_broadcast(&threadpool->cond);
+            if (threadpool->pause) {
+                __ggml_resume_threadpool(threadpool);
+            } else {
+                ggml_cond_broadcast(&threadpool->cond);
+            }
             ggml_mutex_unlock(&threadpool->mutex);
         } else {
             threadpool->new_work = true;
+            if (threadpool->pause) {
+                ggml_mutex_lock(&threadpool->mutex);
+                __ggml_resume_threadpool(threadpool);
+                ggml_mutex_unlock(&threadpool->mutex);
+            }
         }
     }
     // this is a work thread too
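Taken together, the contract after this change is "pause explicitly, resume implicitly": ggml_resume_threadpool() still exists, but callers normally never need it because ggml_graph_compute() resumes a paused pool when it publishes new work. A hedged usage sketch using the calls that appear in this commit (llama_decode is assumed from the regular llama.cpp API; variable names are illustrative):

    struct ggml_compute_threadpool * tp = ggml_create_threadpool(&tpp); // tpp.paused may be true
    llama_attach_threadpool(ctx, tp);

    // No explicit resume: the first graph compute sets new_work and, if the pool
    // is still paused, resumes it under the threadpool mutex.
    llama_decode(ctx, batch);

    // Explicit pause remains the way to park a pool that will sit idle.
    ggml_pause_threadpool(tp);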

src/llama.cpp
Lines changed: 0 additions & 3 deletions

@@ -15539,17 +15539,14 @@ static std::pair<int32_t, ggml_compute_threadpool_t> llama_swap_threadpools(
         // Switch between the 2 threadpools as needed
         if (n_tokens > 1) {
             ggml_pause_threadpool(lctx.threadpool);
-            ggml_resume_threadpool(lctx.threadpool_batch);
             threadpool = lctx.threadpool_batch;
             n_threads = cparams.n_threads_batch;
         } else {
             ggml_pause_threadpool(lctx.threadpool_batch);
-            ggml_resume_threadpool(lctx.threadpool);
             threadpool = lctx.threadpool;
             n_threads = cparams.n_threads;
         }
     } else if (lctx.threadpool) {
-        ggml_resume_threadpool(lctx.threadpool);
         threadpool = lctx.threadpool;
         n_threads = cparams.n_threads;
     }
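With resume implicit, llama_swap_threadpools only has to park the pool it is switching away from; the pool it switches to is woken by the next graph compute. A condensed sketch of the resulting two-pool branch (mirrors the hunk above, locals as in the diff):

    if (n_tokens > 1) {
        ggml_pause_threadpool(lctx.threadpool);       // park the single-token pool
        threadpool = lctx.threadpool_batch;           // resumed implicitly on the next compute
        n_threads = cparams.n_threads_batch;
    } else {
        ggml_pause_threadpool(lctx.threadpool_batch); // park the batch pool
        threadpool = lctx.threadpool;
        n_threads = cparams.n_threads;
    }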
