
Commit c4452ed

max-krasnyansky authored and fmz committed

threadpool: add support for ggml_threadpool_params_default/init

Also removes the need for the explicit mask_specified param. An all-zero cpumask means use the default (usually inherited) CPU affinity mask.
1 parent 4a4d715 commit c4452ed
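
A minimal usage sketch of the pattern this commit enables, assuming only the API it introduces (ggml_threadpool_params_default, ggml_create_threadpool, ggml_release_threadpool); the thread count and pinned cores are arbitrary illustration values:

```cpp
#include "ggml.h"

int main() {
    // Populate the defaults: prio 0, poll 50, strict_cpu false, paused false,
    // and an all-zero cpumask (i.e. default, usually inherited, affinity).
    struct ggml_threadpool_params tpp = ggml_threadpool_params_default(/*n_threads=*/8);

    // Optionally pin threads to specific cores. Any non-zero mask now
    // replaces the old mask_specified flag.
    tpp.cpumask[0] = true;
    tpp.cpumask[1] = true;

    struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp);
    if (!threadpool) {
        return 1;
    }

    // ... attach the threadpool and run graph computations ...

    ggml_release_threadpool(threadpool);
    return 0;
}
```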

File tree

4 files changed: +41 −43 lines

common/common.cpp

Lines changed: 3 additions & 10 deletions

```diff
@@ -295,13 +295,7 @@ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model)
         }
     }
 
-    if (n_set == 0) {
-        // You hit the jackpot!
-        memset(&cpuparams.cpumask[0], 1, GGML_MAX_N_THREADS);
-        n_set = GGML_MAX_N_THREADS;
-    }
-
-    if (n_set < cpuparams.n_threads) {
+    if (n_set && n_set < cpuparams.n_threads) {
         // Not enough set bits, may experience performance issues.
         fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
     }
@@ -2606,16 +2600,15 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_params
 struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
     struct ggml_threadpool_params tpp;
 
-    tpp.mask_specified = params.mask_valid;
+    ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults
+
     if (params.mask_valid) {
         std::memcpy(&tpp.cpumask, &params.cpumask, GGML_MAX_N_THREADS);
     }
 
-    tpp.n_threads  = params.n_threads;
     tpp.prio       = params.priority;
     tpp.poll       = params.poll;
     tpp.strict_cpu = params.strict_cpu;
-    tpp.paused     = false;
 
     return tpp;
 }
```

examples/llama-bench/llama-bench.cpp

Lines changed: 4 additions & 5 deletions

```diff
@@ -1462,14 +1462,13 @@ int main(int argc, char ** argv) {
 
         llama_kv_cache_clear(ctx);
 
-        struct ggml_threadpool_params tpp;
-        tpp.n_threads      = t.n_threads;
-        tpp.mask_specified = params.cpuparams.mask_valid;
+        struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
         tpp.strict_cpu = params.cpuparams.strict_cpu;
         tpp.prio       = params.cpuparams.priority;
         tpp.poll       = params.cpuparams.poll;
-
-        std::memcpy(&tpp.cpumask[0], &params.cpuparams.cpumask[0], GGML_MAX_N_THREADS);
+        if (params.cpuparams.mask_valid) {
+            std::memcpy(&tpp.cpumask[0], &params.cpuparams.cpumask[0], GGML_MAX_N_THREADS);
+        }
 
         struct ggml_compute_threadpool* threadpool = ggml_create_threadpool(&tpp);
         if (!threadpool) {
```

ggml/include/ggml.h

Lines changed: 5 additions & 2 deletions

```diff
@@ -626,9 +626,10 @@ extern "C" {
     // If it returns true, the computation is aborted
     typedef bool (*ggml_abort_callback)(void * data);
 
+    // Threadpool params
+    // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
     struct ggml_threadpool_params {
-        bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores
-        bool mask_specified;              // mask is non-empty
+        bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
         int      n_threads;               // number of threads
         int32_t  prio;                    // thread priority
         uint32_t poll;                    // polling level (0 - no polling, 100 - aggressive polling)
@@ -2025,6 +2026,8 @@ extern "C" {
     GGML_API size_t ggml_graph_overhead(void);
     GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
 
+    GGML_API struct ggml_threadpool_params    ggml_threadpool_params_default(int n_threads);
+    GGML_API void                             ggml_threadpool_params_init   (struct ggml_threadpool_params *p, int n_threads);
     GGML_API bool                             ggml_threadpool_params_match  (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1);
     GGML_API struct ggml_compute_threadpool * ggml_create_threadpool        (struct ggml_threadpool_params * params);
     GGML_API void                             ggml_release_threadpool       (struct ggml_compute_threadpool * threadpool);
```
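
A hedged sketch of how the two new entry points differ, assuming only the declarations above: ggml_threadpool_params_init fills a caller-owned struct in place (handy when it is embedded in a larger object), while ggml_threadpool_params_default returns one by value. The worker_config wrapper here is hypothetical, for illustration only:

```cpp
#include "ggml.h"

// Hypothetical wrapper struct, not part of the ggml API.
struct worker_config {
    struct ggml_threadpool_params tpp;
    int gpu_layers;
};

void worker_config_setup(struct worker_config * cfg, int n_threads) {
    // In-place variant: initializes the embedded struct without a copy.
    ggml_threadpool_params_init(&cfg->tpp, n_threads);
    cfg->gpu_layers = 0;
}

struct ggml_threadpool_params make_params(int n_threads) {
    // By-value variant: convenient for locals and one-liners.
    return ggml_threadpool_params_default(n_threads);
}
```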

ggml/src/ggml.c

Lines changed: 29 additions & 26 deletions

```diff
@@ -1987,7 +1987,6 @@ struct ggml_compute_state {
 #ifndef GGML_USE_OPENMP
     ggml_thread_t thrd;
     bool cpumask[GGML_MAX_N_THREADS];
-    bool mask_specified;
     int  last_graph;
     bool pending;
 #endif
@@ -18828,11 +18827,14 @@ static bool ggml_thread_apply_thread_priority(int32_t prio) {
 
 #endif
 
-static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask, bool strict, int32_t* iter) {
-    if (!global_mask) {
-        memset(local_mask, 1, GGML_MAX_N_THREADS);
-        return;
+static bool ggml_thread_cpumask_is_valid(const bool * mask) {
+    for (int i = 0; i < GGML_MAX_N_THREADS; i++) {
+        if (mask[i]) { return true; }
     }
+    return false;
+}
+
+static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask, bool strict, int32_t* iter) {
     if (!strict) {
         memcpy(local_mask, global_mask, GGML_MAX_N_THREADS);
         return;
@@ -19189,8 +19191,10 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
     struct ggml_compute_threadpool * threadpool = state->threadpool;
 
     ggml_thread_apply_thread_priority(threadpool->prio);
-    if (state->mask_specified)
+
+    if (ggml_thread_cpumask_is_valid(state->cpumask)) {
         ggml_thread_apply_affinity(state->cpumask);
+    }
 
     while (true) {
         // Check if we need to sleep
@@ -19249,17 +19253,27 @@ static void ggml_graph_compute_kickoff(struct ggml_compute_threadpool * threadpool)
 
 #endif // GGML_USE_OPENMP
 
+void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) {
+    p->n_threads  = n_threads;
+    p->prio       = 0;     // default priority (usually means normal or inherited)
+    p->poll       = 50;    // hybrid-polling enabled
+    p->strict_cpu = false; // no strict placement (all threads share same cpumask)
+    p->paused     = false; // threads are ready to go
+    memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited)
+}
+
+struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
+    struct ggml_threadpool_params p;
+    ggml_threadpool_params_init(&p, n_threads);
+    return p;
+}
+
 bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
     if (p0->n_threads  != p1->n_threads )  return false;
     if (p0->prio       != p1->prio      )  return false;
     if (p0->poll       != p1->poll      )  return false;
     if (p0->strict_cpu != p1->strict_cpu)  return false;
-    if (p0->mask_specified != p1->mask_specified) return false;
-    if (p0->mask_specified) {
-        return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
-    }
-
-    return true;
+    return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
 }
 
 static struct ggml_compute_threadpool * ggml_create_threadpool_impl(
@@ -19312,16 +19326,13 @@ static struct ggml_compute_threadpool * ggml_create_threadpool_impl(
         for (int j = 0; j < tpp->n_threads; j++) {
             workers[j] = (struct ggml_compute_state) {
                 .thrd       = 0,
-                .mask_specified = tpp->mask_specified,
                 .threadpool = threadpool,
                 .ith        = j,
                 .last_graph = 0,
                 .pending    = false
             };
 
-            if (tpp->mask_specified) {
-                ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter);
-            }
+            ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter);
 
             // Spin threads for all secondary workers
             if (j > 0) {
@@ -19357,15 +19368,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
         disposable_threadpool = true;
 
-        struct ggml_threadpool_params ttp = {
-            .mask_specified = false,
-            .n_threads      = n_threads,
-            .prio           = 0,
-            .poll           = 1,
-            .strict_cpu     = false,
-            .paused         = false
-        };
-
+        struct ggml_threadpool_params ttp = ggml_threadpool_params_default(n_threads);
         threadpool = ggml_create_threadpool_impl(&ttp, cgraph, cplan);
     } else {
         // Reset some of the parameters that need resetting
@@ -19407,7 +19410,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
     }
 #else
     // Update main thread affinity to match the current threadpool
-    if (threadpool->workers[0].mask_specified) {
+    if (!ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) {
         ggml_thread_apply_affinity(threadpool->workers[0].cpumask);
     }
 
```
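
With mask_specified gone, ggml_threadpool_params_match now unconditionally compares the cpumask bytes; two default-initialized params compare equal because both masks are all-zero. A small sketch using only the functions added above:

```cpp
#include "ggml.h"
#include <assert.h>

int main() {
    struct ggml_threadpool_params a = ggml_threadpool_params_default(4);
    struct ggml_threadpool_params b = ggml_threadpool_params_default(4);

    assert(ggml_threadpool_params_match(&a, &b));  // identical defaults match

    b.cpumask[2] = true;                           // request core 2 in one of them
    assert(!ggml_threadpool_params_match(&a, &b)); // masks now differ byte-wise

    return 0;
}
```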
