Skip to content

Commit 70ff905

Browse files
threadpool: skip polling for unused threads
Currently all threads do N polling rounds even if only one thread is active (n_threads_cur == 1). This commit adds a check to skip polling for unused threads (ith >= n_threads_cur). n_threads_cur is now an atomic_int to explicitly tell the thread sanitizer that it is written from one thread and read from other threads (i.e. not a race condition).
1 parent bd35cb0 commit 70ff905

File tree

1 file changed

+28
-13
lines changed

1 file changed

+28
-13
lines changed

ggml/src/ggml.c

Lines changed: 28 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2015,7 +2015,7 @@ struct ggml_threadpool {
20152015

20162016
struct ggml_compute_state * workers; // per thread state
20172017
int n_threads_max; // number of threads in the pool
2018-
int n_threads_cur; // number of threads used in the current graph
2018+
atomic_int n_threads_cur; // number of threads used in the current graph
20192019

20202020
int32_t prio; // Scheduling priority
20212021
uint32_t poll; // Polling level (0 - no polling)
@@ -19967,15 +19967,20 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
1996719967

1996819968
#ifndef GGML_USE_OPENMP
1996919969

19970-
static inline bool ggml_graph_compute_ready(struct ggml_compute_state * state) {
19970+
static inline bool ggml_graph_compute_thread_active(struct ggml_compute_state * state) {
19971+
struct ggml_threadpool * threadpool = state->threadpool;
19972+
return (state->ith < threadpool->n_threads_cur);
19973+
}
19974+
19975+
static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * state) {
1997119976
struct ggml_threadpool * threadpool = state->threadpool;
1997219977

1997319978
if (state->pending || threadpool->stop || threadpool->pause) { return true; }
1997419979

1997519980
// check for new graph/work
1997619981
int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
1997719982
if (new_graph != state->last_graph) {
19978-
state->pending = (state->ith < threadpool->n_threads_cur);
19983+
state->pending = ggml_graph_compute_thread_active(state);
1997919984
state->last_graph = new_graph;
1998019985
}
1998119986

@@ -19985,11 +19990,16 @@ static inline bool ggml_graph_compute_ready(struct ggml_compute_state * state) {
1998519990
static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
1998619991
struct ggml_threadpool * threadpool = state->threadpool;
1998719992

19993+
// Skip polling for unused threads
19994+
if (!ggml_graph_compute_thread_active(state)) {
19995+
return state->pending;
19996+
}
19997+
1998819998
// This seems to make 0 ... 100 a decent range for polling level across modern processors.
1998919999
// Perhaps, we can adjust it dynamically based on load and things.
1999020000
const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;
1999120001

19992-
for (uint64_t i=0; !ggml_graph_compute_ready(state) && i<n_rounds; i++) {
20002+
for (uint64_t i=0; !ggml_graph_compute_thread_ready(state) && i < n_rounds; i++) {
1999320003
// No new work. Keep polling.
1999420004
ggml_thread_cpu_relax();
1999520005
}
@@ -20005,9 +20015,9 @@ static inline bool ggml_graph_compute_check_for_work(struct ggml_compute_state *
2000520015
}
2000620016

2000720017
ggml_mutex_lock_shared(&threadpool->mutex);
20008-
while (!ggml_graph_compute_ready(state)) {
20018+
while (!ggml_graph_compute_thread_ready(state)) {
2000920019
// No new work. Wait for the signal.
20010-
GGML_PRINT_DEBUG("thread #%d waiting for work\n", state->ith);
20020+
GGML_PRINT_DEBUG("thread #%d waiting for work (sleeping)\n", state->ith);
2001120021
ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
2001220022
}
2001320023
ggml_mutex_unlock_shared(&threadpool->mutex);
@@ -20054,12 +20064,17 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
2005420064
}
2005520065

2005620066
// Start processing new graph
20057-
static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool)
20067+
static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int n_threads)
2005820068
{
2005920069
// always take the mutex here because the worker threads are doing hybrid poll/wait
2006020070

2006120071
ggml_mutex_lock(&threadpool->mutex);
2006220072

20073+
GGML_PRINT_DEBUG("threadpool: n_threads_cur %d n_threads %d\n", threadpool->n_threads_cur, n_threads);
20074+
20075+
// Update the number of active threads
20076+
threadpool->n_threads_cur = n_threads;
20077+
2006320078
atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_relaxed);
2006420079

2006520080
if (threadpool->pause) {
@@ -20194,15 +20209,10 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
2019420209
// No worker threads should be accessing the parameters below at this stage
2019520210
threadpool->cgraph = cgraph;
2019620211
threadpool->cplan = cplan;
20197-
threadpool->n_threads_cur = n_threads;
2019820212
threadpool->current_chunk = 0;
2019920213
threadpool->ec = GGML_STATUS_SUCCESS;
2020020214
}
2020120215

20202-
if (n_threads > threadpool->n_threads_max) {
20203-
GGML_PRINT("WARNING: cplan is requesting more threads than the threadpool contains. Expect a bad time!\n");
20204-
}
20205-
2020620216
#ifdef GGML_USE_OPENMP
2020720217
if (n_threads > 1) {
2020820218
#pragma omp parallel num_threads(n_threads)
@@ -20220,8 +20230,13 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
2022020230
ggml_graph_compute_thread(&threadpool->workers[0]);
2022120231
}
2022220232
#else
20233+
if (n_threads > threadpool->n_threads_max) {
20234+
GGML_PRINT("WARNING: cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max);
20235+
n_threads = threadpool->n_threads_max;
20236+
}
20237+
2022320238
// Kick all threads to start the new graph
20224-
ggml_graph_compute_kickoff(threadpool);
20239+
ggml_graph_compute_kickoff(threadpool, n_threads);
2022520240

2022620241
// This is a work thread too
2022720242
ggml_graph_compute_thread(&threadpool->workers[0]);

0 commit comments

Comments
 (0)