
Commit c8fba1a

refactor: optimize 'set_device' calls to avoid setting the device on each step. (#321)
Signed-off-by: Tao Peng <[email protected]>
1 parent f74f283 commit c8fba1a

File tree

7 files changed: +47 −35 lines changed

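The theme of the change: device_.set_device() binds the calling thread to the accelerator device (the usual semantics of such calls), and it was previously invoked inside every step() and RPC handler. The commit moves that call into one-time, per-thread initialization instead. A minimal self-contained sketch of the pattern follows; the Device struct is a stand-in for the xllm device handle, not the real class:

#include <iostream>
#include <thread>
#include <vector>

// Stand-in for the xllm device handle; set_device() binds the calling
// thread to the accelerator device.
struct Device {
  void set_device() { std::cout << "thread bound to device\n"; }
};

int main() {
  Device device;
  std::vector<std::thread> workers;

  // The commit's pattern: rather than calling set_device() at the top of
  // every step()/handler, each pool thread binds itself once at startup
  // and then processes tasks without rebinding.
  for (int i = 0; i < 4; ++i) {
    workers.emplace_back([&device] {
      device.set_device();  // one-time per-thread initialization
      // ... loop: dequeue tasks and run them ...
    });
  }
  for (auto& t : workers) t.join();
  return 0;
}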

xllm/core/distributed_runtime/worker_service.cpp

Lines changed: 22 additions & 20 deletions

@@ -40,6 +40,8 @@ WorkerService::WorkerService(runtime::Options options,
   device_.set_device();
   device_.init_device_context();
   stream_ = device_.get_stream_from_pool();
+  threadpool_ = std::make_unique<ThreadPool>(
+      4, [this]() mutable { device_.set_device(); });
 }
 
 WorkerService::WorkerService(runtime::Options options,
@@ -52,6 +54,8 @@ WorkerService::WorkerService(runtime::Options options,
   device_.set_device();
   device_.init_device_context();
   stream_ = device_.get_stream_from_pool();
+  threadpool_ = std::make_unique<ThreadPool>(
+      4, [this]() mutable { device_.set_device(); });
 }
 
 WorkerService::~WorkerService() = default;
@@ -72,7 +76,6 @@ void WorkerService::step(BatchedForwardInputs& batched_fwd_inputs,
                          torch::Tensor& src_seq_idxes,
                          torch::Tensor& out_tokens,
                          torch::Tensor& out_logprobs) {
-  device_.set_device();
   // execute model
   auto future = worker_->step_async(batched_fwd_inputs);
 
@@ -250,7 +253,7 @@ void WorkerService::InitModel(::google::protobuf::RpcController* controller,
                              const proto::ModelPath* request,
                              proto::Status* response,
                              ::google::protobuf::Closure* done) {
-  threadpool_.schedule([this, controller, request, response, done]() mutable {
+  threadpool_->schedule([this, controller, request, response, done]() mutable {
     brpc::ClosureGuard done_guard(done);
     auto model_weights_path = request->model_weights_path();
     auto init_future = worker_->init_model_async(model_weights_path);
@@ -270,7 +273,7 @@ void WorkerService::ProcessGroupTest(
     const proto::Empty* request,
     proto::Status* response,
     ::google::protobuf::Closure* done) {
-  threadpool_.schedule([this, controller, request, response, done]() mutable {
+  threadpool_->schedule([this, controller, request, response, done]() mutable {
     brpc::ClosureGuard done_guard(done);
     auto future = worker_->process_group_test_async();
     std::move(future).get();
@@ -284,7 +287,7 @@ void WorkerService::ProfileDeviceMemory(
     const proto::Empty* request,
     proto::DeviceMemory* response,
    ::google::protobuf::Closure* done) {
-  threadpool_.schedule([this, controller, request, response, done]() mutable {
+  threadpool_->schedule([this, controller, request, response, done]() mutable {
     brpc::ClosureGuard done_guard(done);
     auto future = worker_->estimate_kv_cache_capacity_async();
     std::tuple<int64_t, int64_t> result = std::move(future).get();
@@ -299,7 +302,7 @@ void WorkerService::AllocateKVCache(
     const proto::KVCacheShape* request,
     proto::Status* response,
     ::google::protobuf::Closure* done) {
-  threadpool_.schedule([this, controller, request, response, done]() mutable {
+  threadpool_->schedule([this, controller, request, response, done]() mutable {
     brpc::ClosureGuard done_guard(done);
     std::vector<std::vector<int64_t>> kv_cache_shape;
     kv_cache_shape.reserve(2);
@@ -319,7 +322,7 @@ void WorkerService::AllocateContinuousKVCache(
     const proto::XTensorOptionsVec* request,
     proto::Status* response,
     ::google::protobuf::Closure* done) {
-  threadpool_.schedule([this, controller, request, response, done]() mutable {
+  threadpool_->schedule([this, controller, request, response, done]() mutable {
     brpc::ClosureGuard done_guard(done);
     XTensor::Options key_options;
     XTensor::Options value_options;
@@ -350,7 +353,7 @@ void WorkerService::AllocateKVCacheWithTransfer(
     const proto::AllocateKVCacheWithTransferRequest* req,
     proto::Status* resp,
     ::google::protobuf::Closure* done) {
-  threadpool_.schedule([this, controller, req, resp, done]() mutable {
+  threadpool_->schedule([this, controller, req, resp, done]() mutable {
     brpc::ClosureGuard done_guard(done);
     uint64_t kv_cache_size = req->kv_cache_size();
     std::vector<std::vector<int64_t>> kv_cache_shape;
@@ -373,7 +376,7 @@ void WorkerService::GetCacheInfo(::google::protobuf::RpcController* controller,
                                  const proto::Empty* req,
                                  proto::CacheInfo* resp,
                                  ::google::protobuf::Closure* done) {
-  threadpool_.schedule([this, controller, req, resp, done]() mutable {
+  threadpool_->schedule([this, controller, req, resp, done]() mutable {
     brpc::ClosureGuard done_guard(done);
     uint64_t cluster_id;
     std::string addr;
@@ -392,7 +395,7 @@ void WorkerService::PullKVCache(::google::protobuf::RpcController* controller,
                                 const proto::PullKVCacheRequest* req,
                                 proto::Status* resp,
                                 ::google::protobuf::Closure* done) {
-  threadpool_.schedule([this, controller, req, resp, done]() mutable {
+  threadpool_->schedule([this, controller, req, resp, done]() mutable {
     brpc::ClosureGuard done_guard(done);
     uint64_t src_cluster_id = req->cluster_id();
     std::string addr = req->addr();
@@ -433,7 +436,7 @@ void WorkerService::GetDeviceInfo(::google::protobuf::RpcController* controller,
                                   const proto::Empty* req,
                                   proto::DeviceInfo* resp,
                                   ::google::protobuf::Closure* done) {
-  threadpool_.schedule([this, controller, req, resp, done]() mutable {
+  threadpool_->schedule([this, controller, req, resp, done]() mutable {
     brpc::ClosureGuard done_guard(done);
     std::string device_ip;
     uint16_t listen_port;
@@ -448,7 +451,7 @@ void WorkerService::LinkCluster(::google::protobuf::RpcController* controller,
                                 const proto::ClusterInfo* req,
                                 proto::Status* resp,
                                 ::google::protobuf::Closure* done) {
-  threadpool_.schedule([this, controller, req, resp, done]() mutable {
+  threadpool_->schedule([this, controller, req, resp, done]() mutable {
     brpc::ClosureGuard done_guard(done);
     std::vector<uint64_t> cluster_ids(req->cluster_ids().begin(),
                                       req->cluster_ids().end());
@@ -467,7 +470,7 @@ void WorkerService::UnlinkCluster(::google::protobuf::RpcController* controller,
                                   const proto::ClusterInfo* req,
                                   proto::Status* resp,
                                   ::google::protobuf::Closure* done) {
-  threadpool_.schedule([this, controller, req, resp, done]() mutable {
+  threadpool_->schedule([this, controller, req, resp, done]() mutable {
     brpc::ClosureGuard done_guard(done);
     std::vector<uint64_t> cluster_ids(req->cluster_ids().begin(),
                                       req->cluster_ids().end());
@@ -488,11 +491,11 @@ void WorkerService::ExecuteModel(
     const proto::BatchedForwardInputs* pb_batched_fwd_inputs,
     proto::ForwardOutput* pb_forward_output,
     ::google::protobuf::Closure* done) {
-  threadpool_.schedule([this,
-                        controller,
-                        pb_batched_fwd_inputs,
-                        pb_forward_output,
-                        done]() mutable {
+  threadpool_->schedule([this,
+                         controller,
+                         pb_batched_fwd_inputs,
+                         pb_forward_output,
+                         done]() mutable {
     brpc::ClosureGuard done_guard(done);
     Timer timer;
     // convert proto::BatchedForwardInputs to BatchedForwardInputs
@@ -574,9 +577,8 @@ void WorkerService::GetLastStepResult(
     const proto::Empty* req,
     proto::ForwardOutput* pb_forward_output,
     ::google::protobuf::Closure* done) {
-  threadpool_.schedule(
+  threadpool_->schedule(
       [this, controller, req, pb_forward_output, done]() mutable {
-        device_.set_device();
         brpc::ClosureGuard done_guard(done);
 
         auto future = worker_->get_last_step_result_async();
@@ -642,7 +644,7 @@ void WorkerService::GetActiveActivationMemory(
     const proto::Empty* req,
     proto::ActivationMemory* resp,
     ::google::protobuf::Closure* done) {
-  threadpool_.schedule([this, controller, req, resp, done]() mutable {
+  threadpool_->schedule([this, controller, req, resp, done]() mutable {
     brpc::ClosureGuard done_guard(done);
     auto future = worker_->get_active_activation_memory_async();
     int64_t active_activation_memory = std::move(future).get();

xllm/core/distributed_runtime/worker_service.h

Lines changed: 1 addition & 1 deletion

@@ -149,7 +149,7 @@ class WorkerService : public proto::DistributeWorker {
 
   std::unique_ptr<std::thread> polling_thread_;
 
-  ThreadPool threadpool_{4};
+  std::unique_ptr<ThreadPool> threadpool_;
 };
 
 }  // namespace xllm
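A likely reason for switching the member from ThreadPool threadpool_{4}; to a std::unique_ptr<ThreadPool> (an inference from the diff, not stated in the commit): the default member initializer runs before the constructor body, too early to hand the pool an init functor that touches device_. Holding the pool behind a pointer defers construction until the device is set up. A stub sketch of the idiom, with simplified stand-in types:

#include <cstddef>
#include <functional>
#include <memory>
#include <utility>

// Simplified stand-ins for the xllm types (sketch only).
struct Device {
  void set_device() {}
};
struct ThreadPool {
  using Runnable = std::function<void()>;
  ThreadPool(std::size_t /*num_threads*/, Runnable init_func) {
    if (init_func) init_func();  // the real pool runs this on its worker threads
  }
};

class Service {
 public:
  explicit Service(Device device) : device_(std::move(device)) {
    // Deferred construction: device_ is fully initialized before the pool
    // threads start, so the init functor can safely capture `this`.
    threadpool_ = std::make_unique<ThreadPool>(
        4, [this] { device_.set_device(); });
  }

 private:
  Device device_;
  std::unique_ptr<ThreadPool> threadpool_;
};

int main() { Service service{Device{}}; }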

xllm/core/runtime/dit_worker.cpp

Lines changed: 3 additions & 3 deletions

@@ -41,11 +41,12 @@ namespace xllm {
 DiTWorker::DiTWorker(const ParallelArgs& parallel_args,
                      const torch::Device& device,
                      const runtime::Options& options)
-    : device_(device), options_(options), parallel_args_(parallel_args) {}
+    : device_(device), options_(options), parallel_args_(parallel_args) {
+  device_.set_device();
+}
 
 bool DiTWorker::init_model(const std::string& model_weights_path) {
   CHECK(dit_model_ == nullptr) << "Model is already initialized.";
-  device_.set_device();
 
   auto loader = std::make_unique<DiTModelLoader>(model_weights_path);
   dtype_ = util::parse_dtype(loader->get_torch_dtype(), device_);
@@ -80,7 +81,6 @@ bool DiTWorker::init_model(const std::string& model_weights_path) {
 }
 
 std::optional<DiTForwardOutput> DiTWorker::step(const DiTForwardInput& inputs) {
-  device_.set_device();
   Timer timer;
 
   auto output = dit_model_executor_->forward(inputs.to(device_, dtype_));

xllm/core/runtime/llm_worker_impl.cpp

Lines changed: 3 additions & 3 deletions

@@ -41,11 +41,12 @@ namespace xllm {
 LLMWorkerImpl::LLMWorkerImpl(const ParallelArgs& parallel_args,
                              const torch::Device& device,
                              const runtime::Options& options)
-    : WorkerImpl(parallel_args, device, options) {}
+    : WorkerImpl(parallel_args, device, options) {
+  device_.set_device();
+}
 
 bool LLMWorkerImpl::init_model(ModelContext& context) {
   CHECK(model_ == nullptr) << "Model is already initialized.";
-  device_.set_device();
 
   // Try to create a causal LM model
   model_ = create_llm_model(context);
@@ -67,7 +68,6 @@ bool LLMWorkerImpl::init_model(ModelContext& context) {
 
 std::optional<ForwardOutput> LLMWorkerImpl::step(
     const BatchedForwardInputs& inputs) {
-  device_.set_device();
   Timer timer;
   std::vector<torch::Tensor> flatten_tokens_micro_batches;
   std::vector<torch::Tensor> flatten_positions_micro_batches;

xllm/core/runtime/vlm_worker_impl.cpp

Lines changed: 3 additions & 4 deletions

@@ -38,13 +38,13 @@ namespace xllm {
 VLMWorkerImpl::VLMWorkerImpl(const ParallelArgs& parallel_args,
                              const torch::Device& device,
                              const runtime::Options& options)
-    : WorkerImpl(parallel_args, device, options) {}
+    : WorkerImpl(parallel_args, device, options) {
+  device_.set_device();
+}
 
 bool VLMWorkerImpl::init_model(ModelContext& context) {
   CHECK(model_ == nullptr) << "Model is already initialized.";
 
-  device_.set_device();
-
   // initialize model
   context.set_image_embedding_mode(false);
   model_ = create_vlm_model(context);
@@ -56,7 +56,6 @@ bool VLMWorkerImpl::init_model(ModelContext& context) {
 
 std::optional<ForwardOutput> VLMWorkerImpl::step(
     const BatchedForwardInputs& inputs) {
-  device_.set_device();
   Timer timer;
   // TODO guojinrong, to adapt multi stream parallel later
   // all tensors should be on the same device as model
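For the worker classes (DiTWorker, LLMWorkerImpl, VLMWorkerImpl) the binding moves into the constructor rather than a pool init functor, which implicitly assumes the worker is constructed on the same thread that later calls init_model() and step(). A small sketch of that variant, with stand-in types; the assertion just makes the single-thread assumption explicit:

#include <cassert>
#include <thread>
#include <utility>

// Stand-in device type (sketch only).
struct Device {
  void set_device() { bound_thread = std::this_thread::get_id(); }
  std::thread::id bound_thread{};
};

class Worker {
 public:
  explicit Worker(Device device) : device_(std::move(device)) {
    device_.set_device();  // was previously repeated in init_model()/step()
  }

  void step() {
    // No rebinding here anymore; verify we are still on the bound thread.
    assert(std::this_thread::get_id() == device_.bound_thread);
  }

 private:
  Device device_;
};

int main() {
  Worker worker{Device{}};
  worker.step();
  return 0;
}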

xllm/core/util/threadpool.cpp

Lines changed: 13 additions & 3 deletions

@@ -18,9 +18,15 @@ limitations under the License.
 #include <thread>
 
 namespace xllm {
-ThreadPool::ThreadPool(size_t num_threads) : queues_(num_threads) {
+ThreadPool::ThreadPool(size_t num_threads) : ThreadPool(num_threads, nullptr) {}
+
+ThreadPool::ThreadPool(size_t num_threads, Runnable init_func)
+    : queues_(num_threads) {
   for (size_t i = 0; i < num_threads; ++i) {
-    threads_.emplace_back([this, i]() { internal_loop(i); });
+    threads_.emplace_back(
+        [this, i, init_func = std::move(init_func)]() mutable {
+          internal_loop(i, std::move(init_func));
+        });
   }
 }
 
@@ -60,7 +66,11 @@ void ThreadPool::schedule_with_tid(Runnable runnable, size_t tid) {
   queues_[tid].enqueue(std::move(runnable));
 }
 
-void ThreadPool::internal_loop(size_t index) {
+void ThreadPool::internal_loop(size_t index, Runnable&& init_func) {
+  if (init_func != nullptr) {
+    init_func();
+  }
+
   while (true) {
     Runnable runnable;
     queues_[index].wait_dequeue(runnable);
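Call sites use the new two-argument constructor the same way the WorkerService constructors above do; a hedged usage sketch (assuming Runnable is a std::function<void()>-style alias, as the != nullptr check suggests):

#include "threadpool.h"  // xllm/core/util/threadpool.h

#include <cstdio>

int main() {
  // Each worker thread runs init_func on startup (see internal_loop)
  // before entering its dequeue loop.
  xllm::ThreadPool pool(4, [] { std::puts("per-thread init"); });

  // Scheduled tasks then run on already-initialized threads with no
  // per-task setup cost. The shutdown path is not part of this diff; the
  // destructor is assumed to stop and join the workers.
  pool.schedule([] { std::puts("task"); });
  return 0;
}

One caveat when reading the constructor: init_func is move-captured inside the thread-creation loop, so lambdas after the first capture a moved-from callable; whether every thread actually receives the functor depends on the Runnable type's moved-from behavior.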

xllm/core/util/threadpool.h

Lines changed: 2 additions & 1 deletion

@@ -40,6 +40,7 @@ class ThreadPool final {
   ThreadPool& operator=(ThreadPool&&) = delete;
 
   explicit ThreadPool(size_t num_threads);
+  explicit ThreadPool(size_t num_threads, Runnable init_func);
 
   // schedule a runnable to be executed
   int32_t schedule(Runnable runnable);
@@ -55,7 +56,7 @@ class ThreadPool final {
   size_t size() { return threads_.size(); }
 
  private:
-  void internal_loop(size_t tid);
+  void internal_loop(size_t tid, Runnable&& init_func);
 
   std::vector<std::thread> threads_;
   std::vector<moodycamel::BlockingConcurrentQueue<Runnable>> queues_;
