Skip to content

Commit 83eedea

Browse files
threadpool: add support for hybrid polling
poll params (--poll, ...) now specify a "polling level", i.e. how aggressively we poll before waiting on the cond-var. poll=0 means no polling, 1 means poll for 128K rounds then wait, 2 for 256K rounds, and so on. The default value of 50 (i.e. 50 x 128K rounds) seems like a decent default across modern platforms. We can tune this further as things evolve.
1 parent 23b4499 commit 83eedea

File tree

4 files changed

+57
-47
lines changed

4 files changed

+57
-47
lines changed

common/common.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1710,7 +1710,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
17101710
options.push_back({ "*", "-Cr, --cpu-range lo-hi", "range of CPUs for affinity. Complements --cpu-mask"});
17111711
options.push_back({ "*", " --cpu-strict <0|1>", "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu});
17121712
options.push_back({ "*", " --priority N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority});
1713-
options.push_back({ "*", " --poll <0|1>", "use polling to wait for work (default: %u)\n", (unsigned) params.cpuparams.poll});
1713+
options.push_back({ "*", " --poll <0...100>", "use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll});
17141714
options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" });
17151715
options.push_back({ "*", "-Cb, --cpu-mask-batch M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)"});
17161716
options.push_back({ "*", "-Crb, --cpu-range-batch lo-hi",

common/common.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ struct cpu_params {
7373
bool mask_valid = false; // Default: any CPU
7474
int32_t priority = 0; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
7575
bool strict_cpu = false; // Use strict CPU placement
76-
bool poll = true; // Use polling (busywait) to wait for work (default matches OpenMP)
76+
uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
7777
};
7878

7979
struct gpt_params {

ggml/include/ggml.h

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -627,13 +627,13 @@ extern "C" {
627627
typedef bool (*ggml_abort_callback)(void * data);
628628

629629
// Parameters used to create/configure a ggml compute threadpool.
struct ggml_threadpool_params {
    bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores a worker may run on
    bool mask_specified;              // true when cpumask is non-empty / should be applied
    int32_t n_threads;                // number of threads in the pool
    int32_t prio;                     // thread (scheduling) priority
    uint32_t poll;                    // hybrid-polling level before waiting on the cond-var
                                      // (0 - no polling, 100 - aggressive polling)
    bool strict_cpu;                  // use strict cpu placement
    bool paused;                      // start the threadpool in a paused state
};
638638

639639
struct ggml_compute_threadpool; // forward declaration, see ggml.c

ggml/src/ggml.c

Lines changed: 48 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1973,7 +1973,7 @@ struct ggml_compute_threadpool {
19731973

19741974
int32_t prio; // Scheduling priority
19751975
bool disposable; // Doesn't initialize a conv-var
1976-
bool poll; // Use polling (busywait) // TODO
1976+
uint32_t poll; // Polling level (0 - no polling)
19771977

19781978
ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
19791979
void * abort_callback_data;
@@ -19143,35 +19143,50 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
1914319143
return 0;
1914419144
}
1914519145

19146+
#ifndef GGML_USE_OPENMP
1914619147

19148+
static inline bool ggml_graph_compute_got_work(struct ggml_compute_state *state) {
19149+
struct ggml_compute_threadpool * threadpool = state->threadpool;
19150+
return (threadpool->new_work && state->ith < threadpool->n_threads_cur);
19151+
}
1914719152

19148-
#ifndef GGML_USE_OPENMP
19153+
static inline bool ggml_graph_compute_ready(struct ggml_compute_state * state) {
19154+
struct ggml_compute_threadpool * threadpool = state->threadpool;
19155+
if (threadpool->stop || threadpool->pause) return true;
19156+
return ggml_graph_compute_got_work(state);
19157+
}
19158+
19159+
static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
19160+
struct ggml_compute_threadpool * threadpool = state->threadpool;
19161+
19162+
// This seems to make 0 ... 100 a decent range for polling level across modern processors.
19163+
// Perhaps, we can adjust it dynamically based on load and things.
19164+
const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;
19165+
19166+
for (uint64_t i=0; !ggml_graph_compute_ready(state) && i<n_rounds; i++) {
19167+
// No new work. Keep polling.
19168+
__cpu_relax();
19169+
}
19170+
19171+
return ggml_graph_compute_got_work(state);
19172+
}
1914919173

1915019174
// Hybrid poll/wait: first spin for up to the configured number of polling
// rounds; if no work showed up, fall back to sleeping on the cond-var.
// Returns true when new work is available for this thread.
static bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) {
    struct ggml_compute_threadpool * threadpool = state->threadpool;

    // fast path: work arrived while polling, no mutex needed
    if (ggml_graph_compute_poll_for_work(state)) {
        return ggml_graph_compute_got_work(state);
    }

    // slow path: recheck readiness under the mutex to avoid a lost wakeup,
    // then block on the cond-var until signalled
    ggml_mutex_lock_shared(&threadpool->mutex);
    while (!ggml_graph_compute_ready(state)) {
        // No new work. Wait for the signal.
        GGML_PRINT_DEBUG("thread #%d waiting for work\n", state->ith);
        ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
    }
    ggml_mutex_unlock_shared(&threadpool->mutex);

    // may still be false if we woke for a stop/pause request rather than work
    return ggml_graph_compute_got_work(state);
}
1917619191

1917719192
static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
@@ -19391,24 +19406,19 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
1939119406
__thread_affinity(threadpool->workers[0].cpumask);
1939219407
}
1939319408

19394-
if (!threadpool->poll) {
19395-
ggml_mutex_lock(&threadpool->mutex);
19396-
threadpool->new_work = true;
19397-
if (threadpool->pause) {
19398-
__ggml_resume_threadpool(threadpool);
19399-
} else {
19400-
ggml_cond_broadcast(&threadpool->cond);
19401-
}
19402-
ggml_mutex_unlock(&threadpool->mutex);
19409+
// always take the mutex here because the worker threads are doing hybrid poll/wait
19410+
19411+
ggml_mutex_lock(&threadpool->mutex);
19412+
threadpool->new_work = true;
19413+
if (!threadpool->pause) {
19414+
ggml_cond_broadcast(&threadpool->cond);
1940319415
} else {
19404-
threadpool->new_work = true;
19405-
if (threadpool->pause) {
19406-
ggml_mutex_lock(&threadpool->mutex);
19407-
__ggml_resume_threadpool(threadpool);
19408-
ggml_mutex_unlock(&threadpool->mutex);
19409-
}
19416+
// resume does cond broadcast
19417+
__ggml_resume_threadpool(threadpool);
1941019418
}
19419+
ggml_mutex_unlock(&threadpool->mutex);
1941119420
}
19421+
1941219422
// this is a work thread too
1941319423
ggml_graph_compute_thread(&threadpool->workers[0]);
1941419424
#endif

0 commit comments

Comments
 (0)