
Commit e69b0a8

threadpool: reduce the number of barriers required
New work is now indicated by an atomic counter that is incremented for each new graph that needs to be computed. This removes the need for an extra barrier to clear the "new_work" flag, and removes the special case for trivial (single-node) graphs.
1 parent 83eedea commit e69b0a8
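
The pattern at the heart of the change is a monotonically increasing "graph generation" counter: the dispatcher bumps it once per graph, and each worker compares it against the last generation it handled. Below is a minimal standalone sketch of that idea; pool_t, worker_t, worker_ready() and dispatch_graph() are illustrative names, not the ggml API.

// Minimal standalone sketch of generation-counter dispatch; pool_t, worker_t,
// worker_ready() and dispatch_graph() are illustrative, not the ggml API.
// Build: cc -std=c11 sketch.c
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

typedef struct {
    atomic_int n_graph;   // incremented once per dispatched graph
} pool_t;

typedef struct {
    pool_t * pool;
    int      last_graph;  // last generation this worker acknowledged
    bool     pending;     // sticky "work is owed" flag
} worker_t;

// Worker side: there is new work iff the counter moved since we last looked.
static bool worker_ready(worker_t * w) {
    int g = atomic_load_explicit(&w->pool->n_graph, memory_order_relaxed);
    if (g != w->last_graph) {
        w->pending    = true;
        w->last_graph = g;
    }
    return w->pending;
}

// Dispatcher side: one increment announces one new graph.
static void dispatch_graph(pool_t * p) {
    atomic_fetch_add_explicit(&p->n_graph, 1, memory_order_relaxed);
}

int main(void) {
    pool_t   pool = { .n_graph = 0 };
    worker_t w    = { .pool = &pool, .last_graph = 0, .pending = false };

    printf("before dispatch: ready=%d\n", worker_ready(&w)); // ready=0
    dispatch_graph(&pool);
    printf("after dispatch:  ready=%d\n", worker_ready(&w)); // ready=1
    w.pending = false;                                       // work consumed
    printf("after consume:   ready=%d\n", worker_ready(&w)); // ready=0
    return 0;
}

Because the counter only ever moves forward, a worker either sees the value it last recorded (nothing to do) or sees that it has moved (work is pending); there is no flag to clear from the dispatcher's side, which is what makes the old "new_work" reset barrier unnecessary.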

File tree

1 file changed (+34, -44 lines)

ggml/src/ggml.c

Lines changed: 34 additions & 44 deletions
@@ -1959,13 +1959,13 @@ struct ggml_compute_threadpool {
     struct ggml_cplan * cplan;
 
     // synchronization primitives
+    atomic_int n_graph;       // incremented when there is work to be done (i.e each graph)
     atomic_int n_barrier;
     atomic_int n_barrier_passed;
     atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
 
     volatile bool stop;       // Used for stopping the threadpool altogether
     volatile bool pause;      // Used for pausing the threadpool or individual threads
-    volatile bool new_work;   // Set when there is work to be done, unset after it's done
 
     struct ggml_compute_state * workers; // per thread state
     int32_t n_threads_max;               // number of threads in the pool
@@ -1987,6 +1987,8 @@ struct ggml_compute_state {
     ggml_thread_t thrd;
     bool cpumask[GGML_MAX_N_THREADS];
     bool mask_specified;
+    int  last_graph;
+    bool pending;
 #endif
     struct ggml_compute_threadpool * threadpool;
     int ith;
@@ -19105,55 +19107,39 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         /*.threadpool=*/ state->threadpool,
     };
 
-    struct ggml_tensor * node = cgraph->nodes[0];
+    for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
+        struct ggml_tensor * node = cgraph->nodes[node_n];
 
-    ggml_compute_forward(&params, node);
-    if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
-        state->threadpool->ec = GGML_STATUS_ABORTED;
-    }
-
-    for (int node_n = 1; node_n < cgraph->n_nodes; node_n++) {
-        ggml_barrier(state->threadpool);
-
-        if (state->threadpool->ec != GGML_STATUS_SUCCESS) {
-            break;
-        }
-
-        node = cgraph->nodes[node_n];
         ggml_compute_forward(&params, node);
 
         if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
            state->threadpool->ec = GGML_STATUS_ABORTED;
        }
-    }
 
-    if (cgraph->n_nodes == 1) {
-        // We need a barrier before disabling new_work in case we have a trivial graph
        ggml_barrier(state->threadpool);
-    }
 
-    if (!state->threadpool->disposable && state->ith == 0) {
-        // Don't need a lock, because there is a barrier after this, and only after that
-        // do the secondary threads go into standby
-        state->threadpool->new_work = false;
+        if (state->threadpool->ec != GGML_STATUS_SUCCESS) {
+            break;
+        }
     }
 
-    ggml_barrier(state->threadpool);
-
     return 0;
 }
 
 #ifndef GGML_USE_OPENMP
 
-static inline bool ggml_graph_compute_got_work(struct ggml_compute_state *state) {
-    struct ggml_compute_threadpool * threadpool = state->threadpool;
-    return (threadpool->new_work && state->ith < threadpool->n_threads_cur);
-}
-
 static inline bool ggml_graph_compute_ready(struct ggml_compute_state * state) {
     struct ggml_compute_threadpool * threadpool = state->threadpool;
-    if (threadpool->stop || threadpool->pause) return true;
-    return ggml_graph_compute_got_work(state);
+    if (threadpool->stop || threadpool->pause || state->pending) { return true; }
+
+    // check for new graph/work
+    int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
+    if (new_graph != state->last_graph) {
+        state->pending    = (state->ith < threadpool->n_threads_cur);
+        state->last_graph = new_graph;
+    }
+
+    return state->pending;
 }
 
 static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
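
The restructured loop is worth spelling out: previously node 0 was computed before the loop and a barrier opened every subsequent iteration, which forced a special-case barrier for single-node graphs plus a final barrier after clearing new_work. Now every iteration is compute, then barrier, then status check, so a single-node graph needs one barrier instead of two and the trailing new_work bookkeeping disappears entirely. A toy sketch of the control flow; compute(), barrier() and run_graph() are stand-ins for ggml_compute_forward()/ggml_barrier(), not ggml functions:

// Toy control-flow sketch of the reworked per-node loop; compute(), barrier()
// and run_graph() are stand-ins, not the ggml functions themselves.
#include <stdbool.h>
#include <stdio.h>

static bool aborted = false;

static void compute(int node) { printf("  compute node %d\n", node); }
static void barrier(void)     { printf("  barrier\n"); }

static void run_graph(int n_nodes) {
    printf("graph with %d node(s):\n", n_nodes);
    for (int node_n = 0; node_n < n_nodes; node_n++) {
        compute(node_n);
        barrier();            // one barrier per node, no special cases
        if (aborted) {
            break;            // status is stable once the barrier has passed
        }
    }
}

int main(void) {
    run_graph(1); // trivial graph: 1 compute + 1 barrier, nothing extra
    run_graph(3); // n nodes -> exactly n barriers
    return 0;
}

The barrier after each compute also publishes any abort status set by thread 0, which is why the status check can safely come after it rather than before the next node.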
@@ -19168,14 +19154,14 @@ static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state *
         __cpu_relax();
     }
 
-    return ggml_graph_compute_got_work(state);
+    return state->pending;
 }
 
-static bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) {
+static inline bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) {
     struct ggml_compute_threadpool * threadpool = state->threadpool;
 
     if (ggml_graph_compute_poll_for_work(state)) {
-        return ggml_graph_compute_got_work(state);
+        return state->pending;
     }
 
     ggml_mutex_lock_shared(&threadpool->mutex);
@@ -19186,7 +19172,7 @@ static bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state)
     }
     ggml_mutex_unlock_shared(&threadpool->mutex);
 
-    return ggml_graph_compute_got_work(state);
+    return state->pending;
 }
 
 static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
@@ -19216,8 +19202,10 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
             // Check if there is new work
             // The main thread is the only one that can dispatch new work
 
-            bool new_work = ggml_graph_compute_check_for_work(state);
-            if (new_work) {
+            ggml_graph_compute_check_for_work(state);
+            if (state->pending) {
+                state->pending = false;
+
                 int64_t ret = (int64_t) ggml_graph_compute_thread(state);
                 if (ret == GGML_EXIT_ABORTED)
                     return (thread_ret_t) ret;
@@ -19258,12 +19246,12 @@ static struct ggml_compute_threadpool * ggml_create_threadpool_impl(
     {
        threadpool->cgraph           = cgraph;
        threadpool->cplan            = cplan;
+        threadpool->n_graph          = 0;
        threadpool->n_barrier        = 0;
        threadpool->n_barrier_passed = 0;
        threadpool->current_chunk    = 0;
        threadpool->stop             = false;
        threadpool->pause            = disposable ? false : tpp->paused;
-        threadpool->new_work         = false;
        threadpool->workers          = NULL;
        threadpool->n_threads_max    = tpp->n_threads;
        threadpool->n_threads_cur    = disposable ? tpp->n_threads : 0;
@@ -19306,7 +19294,9 @@ static struct ggml_compute_threadpool * ggml_create_threadpool_impl(
             .thrd           = 0,
             .mask_specified = tpp->mask_specified,
             .threadpool     = threadpool,
-            .ith            = j
+            .ith            = j,
+            .last_graph     = 0,
+            .pending        = false
         };
 
         if (tpp->mask_specified) {
@@ -19409,12 +19399,12 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         // always take the mutex here because the worker threads are doing hybrid poll/wait
 
         ggml_mutex_lock(&threadpool->mutex);
-        threadpool->new_work = true;
-        if (!threadpool->pause) {
-            ggml_cond_broadcast(&threadpool->cond);
-        } else {
+        atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_relaxed);
+        if (threadpool->pause) {
             // resume does cond broadcast
             __ggml_resume_threadpool(threadpool);
+        } else {
+            ggml_cond_broadcast(&threadpool->cond);
         }
         ggml_mutex_unlock(&threadpool->mutex);
     }
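
On the dispatch side the protocol is: take the mutex (the workers do a hybrid poll/wait, so the mutex must pair with a possible condition-variable wait), bump n_graph, and wake any sleepers. A relaxed increment suffices in this arrangement because the mutex acquire/release pair already orders the store against a sleeping worker's load, while polling workers merely need to eventually observe the new value. A simplified pthread sketch of the handoff; announce_new_graph() and wait_for_graph() are illustrative names, not ggml functions. Build with: cc -std=c11 sketch.c -lpthread

// Simplified sketch of the dispatch/wakeup handoff; the mutex, cond and
// counter mirror the roles of threadpool->mutex/cond/n_graph.
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond  = PTHREAD_COND_INITIALIZER;
static atomic_int      n_graph;

// Dispatcher: bump the generation under the mutex, then wake sleepers.
static void announce_new_graph(void) {
    pthread_mutex_lock(&mutex);
    atomic_fetch_add_explicit(&n_graph, 1, memory_order_relaxed);
    pthread_cond_broadcast(&cond);
    pthread_mutex_unlock(&mutex);
}

// Worker: sleep until the generation moves past the last one handled.
// Re-checking the counter under the mutex means a broadcast that fires
// before the wait starts cannot be lost.
static int wait_for_graph(int last_graph) {
    pthread_mutex_lock(&mutex);
    while (atomic_load_explicit(&n_graph, memory_order_relaxed) == last_graph) {
        pthread_cond_wait(&cond, &mutex);
    }
    int g = atomic_load_explicit(&n_graph, memory_order_relaxed);
    pthread_mutex_unlock(&mutex);
    return g;
}

static void * worker(void * arg) {
    (void) arg;
    int g = wait_for_graph(0); // blocks until the first dispatch
    printf("worker woke for graph %d\n", g);
    return NULL;
}

int main(void) {
    pthread_t t;
    pthread_create(&t, NULL, worker, NULL);
    announce_new_graph();
    pthread_join(&t, NULL);
    return 0;
}

A worker that only checks for work after the broadcast has fired still sees the bumped counter and returns immediately, so no wakeup is lost in either ordering.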
