Skip to content

Commit 2bf6c55

Browse files
threadpool: consistent use of int type for n_threads params
1 parent 3f8325a commit 2bf6c55

File tree

7 files changed: 27 additions and 27 deletions

common/common.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ enum dimre_method {
6868
};
6969

7070
struct cpu_params {
71-
int32_t n_threads = -1;
71+
int n_threads = -1;
7272
bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
7373
bool mask_valid = false; // Default: any CPU
7474
int32_t priority = 0; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
@@ -214,7 +214,7 @@ struct gpt_params {
214214
int32_t port = 8080; // server listens on this network port
215215
int32_t timeout_read = 600; // http read timeout in seconds
216216
int32_t timeout_write = timeout_read; // http write timeout in seconds
217-
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
217+
int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
218218

219219
std::string hostname = "127.0.0.1";
220220
std::string public_path = "";

examples/benchmark/benchmark-matmult.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ static void tensor_dump(const ggml_tensor * tensor, const char * name) {
5454
#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)
5555

5656
struct benchmark_params_struct {
57-
int32_t n_threads = 1;
57+
int n_threads = 1;
5858
int32_t n_iterations = 10;
5959
};
6060

examples/main/main.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,7 @@ int main(int argc, char ** argv) {
223223

224224
LOG("%s: llama threadpool init = n_threads = %d\n",
225225
__func__,
226-
(int32_t) params.cpuparams.n_threads
226+
(int) params.cpuparams.n_threads
227227
);
228228
struct ggml_threadpool_params tpp_batch =
229229
ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);

ggml/include/ggml.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -629,7 +629,7 @@ extern "C" {
629629
struct ggml_threadpool_params {
630630
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores
631631
bool mask_specified; // mask is non-empty
632-
int32_t n_threads; // number of threads
632+
int n_threads; // number of threads
633633
int32_t prio; // thread priority
634634
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
635635
bool strict_cpu; // strict cpu placement
@@ -2027,7 +2027,7 @@ extern "C" {
20272027
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1);
20282028
GGML_API struct ggml_compute_threadpool* ggml_create_threadpool (struct ggml_threadpool_params * params);
20292029
GGML_API void ggml_release_threadpool (struct ggml_compute_threadpool * threadpool);
2030-
GGML_API int32_t ggml_threadpool_get_n_threads(struct ggml_compute_threadpool * threadpool);
2030+
GGML_API int ggml_threadpool_get_n_threads(struct ggml_compute_threadpool * threadpool);
20312031
GGML_API void ggml_pause_threadpool (struct ggml_compute_threadpool * threadpool);
20322032
GGML_API void ggml_resume_threadpool (struct ggml_compute_threadpool * threadpool);
20332033

ggml/src/ggml.c

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1973,8 +1973,8 @@ struct ggml_compute_threadpool {
19731973
atomic_bool pause; // Used for pausing the threadpool or individual threads
19741974

19751975
struct ggml_compute_state * workers; // per thread state
1976-
int32_t n_threads_max; // number of threads in the pool
1977-
int32_t n_threads_cur; // number of threads used in the current graph
1976+
int n_threads_max; // number of threads in the pool
1977+
int n_threads_cur; // number of threads used in the current graph
19781978

19791979
int32_t prio; // Scheduling priority
19801980
uint32_t poll; // Polling level (0 - no polling)
@@ -18846,7 +18846,7 @@ void ggml_release_threadpool(struct ggml_compute_threadpool* threadpool) {
1884618846

1884718847
#ifndef GGML_USE_OPENMP
1884818848
struct ggml_compute_state* workers = threadpool->workers;
18849-
const int32_t n_threads = threadpool->n_threads_max;
18849+
const int n_threads = threadpool->n_threads_max;
1885018850

1885118851
ggml_mutex_lock(&threadpool->mutex);
1885218852

@@ -18856,7 +18856,7 @@ void ggml_release_threadpool(struct ggml_compute_threadpool* threadpool) {
1885618856
ggml_cond_broadcast(&threadpool->cond);
1885718857
ggml_mutex_unlock(&threadpool->mutex);
1885818858

18859-
for (int32_t j = 1; j < n_threads; j++) {
18859+
for (int j = 1; j < n_threads; j++) {
1886018860
int32_t rc = ggml_thread_join(workers[j].thrd, NULL);
1886118861
GGML_ASSERT(rc == GGML_EXIT_SUCCESS || rc == GGML_EXIT_ABORTED);
1886218862
UNUSED(rc);
@@ -18912,11 +18912,11 @@ void ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool) {
1891218912

1891318913
struct ggml_cplan ggml_graph_plan(
1891418914
const struct ggml_cgraph * cgraph,
18915-
int32_t n_threads,
18915+
int n_threads,
1891618916
struct ggml_compute_threadpool * threadpool) {
1891718917

1891818918
if (threadpool == NULL) {
18919-
GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %u\n", n_threads);
18919+
GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
1892018920
}
1892118921
if (n_threads <= 0) {
1892218922
n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS;
@@ -19335,13 +19335,13 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
1933519335
GGML_ASSERT(cplan->n_threads > 0);
1933619336
GGML_ASSERT(cplan->work_size == 0 || cplan->work_data != NULL);
1933719337

19338-
int32_t n_threads = cplan->n_threads;
19338+
int n_threads = cplan->n_threads;
1933919339
struct ggml_compute_threadpool * threadpool = cplan->threadpool;
1934019340

1934119341
bool disposable_threadpool = false;
1934219342

1934319343
if (threadpool == NULL) {
19344-
GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %u\n", n_threads);
19344+
GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
1934519345
disposable_threadpool = true;
1934619346

1934719347
struct ggml_threadpool_params ttp = {

include/llama.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -304,8 +304,8 @@ extern "C" {
304304
uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
305305
uint32_t n_ubatch; // physical maximum batch size
306306
uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
307-
uint32_t n_threads; // number of threads to use for generation
308-
uint32_t n_threads_batch; // number of threads to use for batch processing
307+
int n_threads; // number of threads to use for generation
308+
int n_threads_batch; // number of threads to use for batch processing
309309

310310
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
311311
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
@@ -851,13 +851,13 @@ extern "C" {
851851
// Set the number of threads used for decoding
852852
// n_threads is the number of threads used for generation (single token)
853853
// n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
854-
LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
854+
LLAMA_API void llama_set_n_threads(struct llama_context * ctx, int n_threads, int n_threads_batch);
855855

856856
// Get the number of threads used for generation of a single token.
857-
LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
857+
LLAMA_API int llama_n_threads(struct llama_context * ctx);
858858

859859
// Get the number of threads used for prompt and batch processing (multiple token).
860-
LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
860+
LLAMA_API int llama_n_threads_batch(struct llama_context * ctx);
861861

862862
// Set whether the model is in embeddings mode or not
863863
// If true, embeddings will be returned but logits will not

src/llama.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2373,8 +2373,8 @@ struct llama_cparams {
23732373
uint32_t n_batch;
23742374
uint32_t n_ubatch;
23752375
uint32_t n_seq_max;
2376-
uint32_t n_threads; // number of threads to use for generation
2377-
uint32_t n_threads_batch; // number of threads to use for batch processing
2376+
int n_threads; // number of threads to use for generation
2377+
int n_threads_batch; // number of threads to use for batch processing
23782378

23792379
float rope_freq_base;
23802380
float rope_freq_scale;
@@ -15528,7 +15528,7 @@ static std::pair<int32_t, ggml_compute_threadpool_t> llama_swap_threadpools(
1552815528
int32_t n_tokens) {
1552915529

1553015530
const auto & cparams = lctx.cparams;
15531-
int32_t n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
15531+
int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
1553215532

1553315533
ggml_compute_threadpool_t threadpool = nullptr; // nullptr -> disposable threadpool
1553415534

@@ -15663,7 +15663,7 @@ static int llama_decode_internal(
1566315663
std::pair<int32_t, ggml_compute_threadpool_t> threads =
1566415664
llama_swap_threadpools(lctx, n_tokens);
1566515665

15666-
int32_t n_threads = threads.first;
15666+
int n_threads = threads.first;
1566715667
ggml_compute_threadpool_t threadpool = threads.second;
1566815668

1566915669
GGML_ASSERT(n_threads > 0);
@@ -15907,7 +15907,7 @@ static int llama_encode_internal(
1590715907
std::pair<int32_t, ggml_compute_threadpool_t> threads =
1590815908
llama_swap_threadpools(lctx, n_tokens);
1590915909

15910-
int32_t n_threads = threads.first;
15910+
int n_threads = threads.first;
1591115911
ggml_compute_threadpool_t threadpool = threads.second;
1591215912
GGML_ASSERT(n_threads > 0);
1591315913

@@ -19451,16 +19451,16 @@ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepa
1945119451
}
1945219452
}
1945319453

19454-
void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
19454+
void llama_set_n_threads(struct llama_context * ctx, int n_threads, int n_threads_batch) {
1945519455
ctx->cparams.n_threads = n_threads;
1945619456
ctx->cparams.n_threads_batch = n_threads_batch;
1945719457
}
1945819458

19459-
uint32_t llama_n_threads(struct llama_context * ctx) {
19459+
int llama_n_threads(struct llama_context * ctx) {
1946019460
return ctx->cparams.n_threads;
1946119461
}
1946219462

19463-
uint32_t llama_n_threads_batch(struct llama_context * ctx) {
19463+
int llama_n_threads_batch(struct llama_context * ctx) {
1946419464
return ctx->cparams.n_threads_batch;
1946519465
}
1946619466

Comments (0)