@@ -2015,7 +2015,7 @@ struct ggml_threadpool {
2015
2015
2016
2016
struct ggml_compute_state * workers; // per thread state
2017
2017
int n_threads_max; // number of threads in the pool
2018
- int n_threads_cur; // number of threads used in the current graph
2018
+ atomic_int n_threads_cur; // number of threads used in the current graph
2019
2019
2020
2020
int32_t prio; // Scheduling priority
2021
2021
uint32_t poll; // Polling level (0 - no polling)
@@ -19967,15 +19967,20 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
19967
19967
19968
19968
#ifndef GGML_USE_OPENMP
19969
19969
19970
- static inline bool ggml_graph_compute_ready(struct ggml_compute_state * state) {
19970
+ static inline bool ggml_graph_compute_thread_active(struct ggml_compute_state * state) {
19971
+ struct ggml_threadpool * threadpool = state->threadpool;
19972
+ return (state->ith < threadpool->n_threads_cur);
19973
+ }
19974
+
19975
+ static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * state) {
19971
19976
struct ggml_threadpool * threadpool = state->threadpool;
19972
19977
19973
19978
if (state->pending || threadpool->stop || threadpool->pause) { return true; }
19974
19979
19975
19980
// check for new graph/work
19976
19981
int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
19977
19982
if (new_graph != state->last_graph) {
19978
- state->pending = (state->ith < threadpool->n_threads_cur );
19983
+ state->pending = ggml_graph_compute_thread_active (state);
19979
19984
state->last_graph = new_graph;
19980
19985
}
19981
19986
@@ -19985,11 +19990,16 @@ static inline bool ggml_graph_compute_ready(struct ggml_compute_state * state) {
19985
19990
static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
19986
19991
struct ggml_threadpool * threadpool = state->threadpool;
19987
19992
19993
+ // Skip polling for unused threads
19994
+ if (!ggml_graph_compute_thread_active(state)) {
19995
+ return state->pending;
19996
+ }
19997
+
19988
19998
// This seems to make 0 ... 100 a decent range for polling level across modern processors.
19989
19999
// Perhaps, we can adjust it dynamically based on load and things.
19990
20000
const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;
19991
20001
19992
- for (uint64_t i=0; !ggml_graph_compute_ready (state) && i< n_rounds; i++) {
20002
+ for (uint64_t i=0; !ggml_graph_compute_thread_ready (state) && i < n_rounds; i++) {
19993
20003
// No new work. Keep polling.
19994
20004
ggml_thread_cpu_relax();
19995
20005
}
@@ -20005,9 +20015,9 @@ static inline bool ggml_graph_compute_check_for_work(struct ggml_compute_state *
20005
20015
}
20006
20016
20007
20017
ggml_mutex_lock_shared(&threadpool->mutex);
20008
- while (!ggml_graph_compute_ready (state)) {
20018
+ while (!ggml_graph_compute_thread_ready (state)) {
20009
20019
// No new work. Wait for the signal.
20010
- GGML_PRINT_DEBUG("thread #%d waiting for work\n", state->ith);
20020
+ GGML_PRINT_DEBUG("thread #%d waiting for work (sleeping) \n", state->ith);
20011
20021
ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
20012
20022
}
20013
20023
ggml_mutex_unlock_shared(&threadpool->mutex);
@@ -20054,12 +20064,17 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
20054
20064
}
20055
20065
20056
20066
// Start processing new graph
20057
- static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool)
20067
+ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int n_threads )
20058
20068
{
20059
20069
// always take the mutex here because the worker threads are doing hybrid poll/wait
20060
20070
20061
20071
ggml_mutex_lock(&threadpool->mutex);
20062
20072
20073
+ GGML_PRINT_DEBUG("threadpool: n_threads_cur %d n_threads %d\n", threadpool->n_threads_cur, n_threads);
20074
+
20075
+ // Update the number of active threads
20076
+ threadpool->n_threads_cur = n_threads;
20077
+
20063
20078
atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_relaxed);
20064
20079
20065
20080
if (threadpool->pause) {
@@ -20194,15 +20209,10 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
20194
20209
// No worker threads should be accessing the parameters below at this stage
20195
20210
threadpool->cgraph = cgraph;
20196
20211
threadpool->cplan = cplan;
20197
- threadpool->n_threads_cur = n_threads;
20198
20212
threadpool->current_chunk = 0;
20199
20213
threadpool->ec = GGML_STATUS_SUCCESS;
20200
20214
}
20201
20215
20202
- if (n_threads > threadpool->n_threads_max) {
20203
- GGML_PRINT("WARNING: cplan is requesting more threads than the threadpool contains. Expect a bad time!\n");
20204
- }
20205
-
20206
20216
#ifdef GGML_USE_OPENMP
20207
20217
if (n_threads > 1) {
20208
20218
#pragma omp parallel num_threads(n_threads)
@@ -20220,8 +20230,13 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
20220
20230
ggml_graph_compute_thread(&threadpool->workers[0]);
20221
20231
}
20222
20232
#else
20233
+ if (n_threads > threadpool->n_threads_max) {
20234
+ GGML_PRINT("WARNING: cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max);
20235
+ n_threads = threadpool->n_threads_max;
20236
+ }
20237
+
20223
20238
// Kick all threads to start the new graph
20224
- ggml_graph_compute_kickoff(threadpool);
20239
+ ggml_graph_compute_kickoff(threadpool, n_threads );
20225
20240
20226
20241
// This is a work thread too
20227
20242
ggml_graph_compute_thread(&threadpool->workers[0]);
0 commit comments