fix: use persistent thread pool

DarkSharpness · DarkSharpness · commit 1da52cba1063 · 2025-09-19T16:24:18.000+08:00
diff --git a/cpp/earley_parser.cc b/cpp/earley_parser.cc
@@ -713,9 +713,7 @@ bool RepeatDetector::IsVisited(const ParserState& state) const {
 
 void RepeatDetector::Insert(const ParserState& state) {
   if (size_ == transition_threshold_) {
-    for (const auto& s : visited_vector_) {
-      visited_set_.insert(s);
-    }
+    visited_set_.insert(visited_vector_.begin(), visited_vector_.begin() + size_);
   }
   size_++;
   if (size_ > transition_threshold_) {
diff --git a/cpp/grammar_compiler.cc b/cpp/grammar_compiler.cc
@@ -20,6 +20,7 @@
 #include "grammar_functor.h"
 #include "grammar_impl.h"
 #include "support/logging.h"
+#include "support/reflection.h"
 #include "support/thread_pool.h"
 #include "support/thread_safe_cache.h"
 #include "support/utils.h"
@@ -544,7 +545,12 @@ AdaptiveTokenMask GrammarMatcherForTokenMaskCache::GetAdaptiveTokenMask(
 class GrammarCompilerNoCache {
  public:
   GrammarCompilerNoCache(const TokenizerInfo& tokenizer_info, int max_threads)
-      : tokenizer_info_(tokenizer_info), max_threads_(max_threads) {}
+      : tokenizer_info_(tokenizer_info), thread_pool_() {
+    if (max_threads > 1) {  // otherwise thread_pool_ stays empty (no worker threads are created)
+      // NOTE: we could instead accept max_threads = 1 and let 0 mean "no extra thread".
+      thread_pool_.emplace(max_threads);
+    }
+  }
 
   CompiledGrammar CompileBuiltinJSONGrammar();
 
@@ -571,8 +577,9 @@ class GrammarCompilerNoCache {
 
   /*! \brief The vocabulary associated with this storage class. */
   const TokenizerInfo tokenizer_info_;
-  /*! \brief The maximum number of threads to use. */
-  const int max_threads_;
+
+  /*! \brief The persistent thread pool for multi-threading. */
+  std::optional<ThreadPool> thread_pool_;
 };
 
 CompiledGrammar GrammarCompilerNoCache::MultiThreadCompileGrammar(Grammar grammar) {
@@ -597,12 +604,9 @@ CompiledGrammar GrammarCompilerNoCache::MultiThreadCompileGrammar(Grammar gramma
   // TODO(Charlie): Figure out how to support ThreadPool and std::mutex in WebAssembly.
   // Only declare ThreadPool and mutex if max_threads > 1, so when max_threads = 1, we do
   // not need ThreadPool or std::mutex, which throws error in runtime in WebAssembly.
-  std::optional<ThreadPool> thread_pool;
-  std::optional<std::mutex> adaptive_token_mask_cache_mutex;
-
-  if (max_threads_ > 1) {
-    thread_pool.emplace(max_threads_);
-    adaptive_token_mask_cache_mutex.emplace();
+  std::optional<TaskCounter> task_counter;
+  if (thread_pool_) {
+    task_counter.emplace();
   }
 
   auto add_adaptive_token_mask = [&](const ParserState& state, bool is_root_rule) {
@@ -613,18 +617,20 @@ CompiledGrammar GrammarCompilerNoCache::MultiThreadCompileGrammar(Grammar gramma
         tokenizer_info_.GetTrieSubtreeNodesRange(),
         is_root_rule
     );
-    if (max_threads_ > 1) {
-      std::lock_guard<std::mutex> lock(adaptive_token_mask_cache_mutex.value());
-      compiled_grammar_impl->adaptive_token_mask_cache[state] = cur_adaptive_token_mask_cache;
+    if (thread_pool_) {
+      task_counter->CompleteOne([&] {
+        compiled_grammar_impl->adaptive_token_mask_cache[state] = cur_adaptive_token_mask_cache;
+      });
     } else {
       compiled_grammar_impl->adaptive_token_mask_cache[state] = cur_adaptive_token_mask_cache;
     }
   };
 
   auto add_task_adaptive_token_mask = [&](const ParserState& state, bool is_root_rule) {
     // Execute depending on whether we use thread_pool
-    if (max_threads_ > 1) {
-      thread_pool->Execute([add_adaptive_token_mask, state, is_root_rule]() {
+    if (thread_pool_) {
+      task_counter->AddOne();
+      thread_pool_->Execute([add_adaptive_token_mask, state, is_root_rule] {
         add_adaptive_token_mask(state, is_root_rule);
       });
     } else {
@@ -685,8 +691,8 @@ CompiledGrammar GrammarCompilerNoCache::MultiThreadCompileGrammar(Grammar gramma
     }
   }
 
-  if (max_threads_ > 1) {
-    thread_pool->Join();
+  if (thread_pool_) {
+    task_counter->Wait();
   }
 
   return CompiledGrammar(compiled_grammar_impl);
@@ -916,7 +922,7 @@ CompiledGrammar GrammarCompiler::Impl::Compute(const UnionKey& key) {
         } else if constexpr (std::is_same_v<KeyType, BuiltinJSONGrammarKey>) {
           return this->no_cache_compiler_.CompileBuiltinJSONGrammar();
         } else {
-          XGRAMMAR_UNREACHABLE();
+          static_assert(detail::reflection::false_v<KeyType>, "non-exhaustive visitor!");
         }
       },
       key
diff --git a/cpp/support/thread_pool.h b/cpp/support/thread_pool.h
@@ -6,13 +6,13 @@
 #ifndef XGRAMMAR_SUPPORT_THREAD_POOL_H_
 #define XGRAMMAR_SUPPORT_THREAD_POOL_H_
 
+#include <atomic>
 #include <condition_variable>
+#include <cstddef>
 #include <functional>
-#include <future>
 #include <mutex>
 #include <queue>
 #include <thread>
-#include <type_traits>
 #include <vector>
 
 #include "logging.h"
@@ -35,8 +35,9 @@ class ThreadPool {
    */
   ThreadPool(size_t num_threads = std::thread::hardware_concurrency()) {
     // Initialize thread pool with num_threads threads
-    for (size_t i = 0; i < num_threads; ++i) {
-      workers_.emplace_back([this] {
+    workers_.resize(num_threads);
+    for (auto& worker : workers_) {
+      worker = std::thread([this] {
         while (true) {
           std::function<void()> task;
           {
@@ -58,38 +59,6 @@ class ThreadPool {
     }
   }
 
-  /*!
-   * \brief Add a new task to be executed by the thread pool.
-   * \tparam F Type of the function to execute
-   * \tparam Args Types of the arguments to pass to the function
-   * \param f Function to execute
-   * \param args Arguments to pass to the function
-   * \return std::shared_future containing the result of the function call
-   * \note Tasks are executed in FIFO order but may complete in any order.
-   */
-  template <class F, class... Args>
-  auto Submit(F&& f, Args&&... args) -> std::shared_future<std::invoke_result_t<F, Args...>> {
-    using return_type = std::invoke_result_t<F, Args...>;
-
-    // Package the task with its arguments into a shared pointer
-    auto task = std::make_shared<std::packaged_task<return_type()>>(
-        std::bind(std::forward<F>(f), std::forward<Args>(args)...)
-    );
-
-    std::shared_future<return_type> res = task->get_future().share();
-
-    {
-      std::unique_lock<std::mutex> lock(queue_mutex_);
-      XGRAMMAR_CHECK(!shutdown_) << "Cannot submit task to stopped ThreadPool";
-      ++unfinished_task_count_;  // Increment task count
-
-      // Directly add the task without wrapping
-      task_queue_.emplace([task]() { (*task)(); });
-    }
-    queue_condition_.notify_one();
-    return res;
-  }
-
   /*!
    * \brief Add a new task to be executed by the thread pool without returning a future.
    * \tparam F Type of the function to execute
@@ -98,21 +67,20 @@ class ThreadPool {
    * \param args Arguments to pass to the function
    * \note Tasks are executed asynchronously by the worker threads.
    */
-  template <class F, class... Args>
-  void Execute(F&& f, Args&&... args) {
+  void Execute(std::function<void()> f) {  // NOTE(review): doc above still mentions F and args
     {
       std::unique_lock<std::mutex> lock(queue_mutex_);
       XGRAMMAR_CHECK(!shutdown_) << "Cannot execute task in stopped ThreadPool";
       ++unfinished_task_count_;  // Increment task count
 
       // Directly add the task without wrapping
-      task_queue_.emplace(std::bind(std::forward<F>(f), std::forward<Args>(args)...));
+      task_queue_.emplace(std::move(f));  // f is a by-value parameter, so it is moved into the queue
     }
     queue_condition_.notify_one();
   }
 
   void Wait() {
-    std::unique_lock<std::mutex> lock(queue_mutex_);
+    auto lock = std::unique_lock{queue_mutex_};  // handed to tasks_done_condition_.wait() below
     tasks_done_condition_.wait(lock, [this] { return unfinished_task_count_ == 0; });
   }
 
@@ -147,6 +115,8 @@ class ThreadPool {
   ThreadPool& operator=(const ThreadPool&) = delete;
   ThreadPool& operator=(ThreadPool&&) = delete;
 
+  std::size_t NumThreads() const { return workers_.size(); }  // Size of the workers_ vector.
+
  private:
   void TaskComplete() {
     std::unique_lock<std::mutex> lock(queue_mutex_);
@@ -172,6 +142,34 @@ class ThreadPool {
   int unfinished_task_count_ = 0;
 };
 
+class TaskCounter {  // Tracks tasks registered by AddOne() but not yet finished by CompleteOne().
+ public:
+  template <typename F>
+  void CompleteOne(F&& f) {  // Runs f while holding mutex_, then marks one task as finished.
+    const auto lock = std::lock_guard{mutex_};
+    std::forward<F>(f)();  // NOTE(review): std::forward is from <utility>; confirm it is included
+    const auto working = working_.fetch_sub(1, std::memory_order_relaxed) - 1;
+    if (working == 0 && waiting_ > 0) cv_.notify_all();  // notify only if a Wait() is pending
+  }
+
+  // Registers one task and returns the new count. Other threads may call this, hence the atomic.
+  // No happens-before relationship is relied on here, so relaxed ordering is used.
+  std::size_t AddOne() { return working_.fetch_add(1, std::memory_order_relaxed) + 1; }
+
+  void Wait() {  // Blocks the caller until the task count is zero.
+    auto lock = std::unique_lock{mutex_};
+    ++waiting_;  // tells CompleteOne() that a notification is needed
+    cv_.wait(lock, [this] { return working_.load(std::memory_order_relaxed) == 0; });
+    --waiting_;
+  }
+
+ private:
+  std::mutex mutex_;  // Held by CompleteOne() (including its callback) and by Wait().
+  std::condition_variable cv_;  // Notified by CompleteOne() when the count hits zero.
+  std::size_t waiting_ = 0;  // Number of callers currently inside Wait(); changed under mutex_.
+  std::atomic_size_t working_ = 0;  // Number of tasks added but not yet completed.
+};
+
 inline void ParallelFor(int low, int high, int num_threads, std::function<void(int)> f) {
   if (high - low == 1) {
     f(low);