jd-opensource
diff --git a/‎xllm/core/distributed_runtime/comm_channel.cpp‎
Lines changed: 34 additions & 21 deletions b/‎xllm/core/distributed_runtime/comm_channel.cpp‎
Lines changed: 34 additions & 21 deletions
diff --git a/‎xllm/core/distributed_runtime/comm_channel.h‎
Lines changed: 10 additions & 2 deletions b/‎xllm/core/distributed_runtime/comm_channel.h‎
Lines changed: 10 additions & 2 deletions
diff --git a/‎xllm/core/distributed_runtime/remote_worker.cpp‎
Lines changed: 19 additions & 7 deletions b/‎xllm/core/distributed_runtime/remote_worker.cpp‎
Lines changed: 19 additions & 7 deletions
diff --git a/‎xllm/core/distributed_runtime/remote_worker.h‎
Lines changed: 6 additions & 2 deletions b/‎xllm/core/distributed_runtime/remote_worker.h‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎xllm/core/distributed_runtime/worker_service.cpp‎
Lines changed: 11 additions & 8 deletions b/‎xllm/core/distributed_runtime/worker_service.cpp‎
Lines changed: 11 additions & 8 deletions
diff --git a/‎xllm/core/distributed_runtime/worker_service.h‎
Lines changed: 4 additions & 5 deletions b/‎xllm/core/distributed_runtime/worker_service.h‎
Lines changed: 4 additions & 5 deletions
diff --git a/‎xllm/core/framework/batch/batch.cpp‎
Lines changed: 4 additions & 6 deletions b/‎xllm/core/framework/batch/batch.cpp‎
Lines changed: 4 additions & 6 deletions
diff --git a/‎xllm/core/framework/batch/batch.h‎
Lines changed: 13 additions & 13 deletions b/‎xllm/core/framework/batch/batch.h‎
Lines changed: 13 additions & 13 deletions
diff --git a/‎xllm/core/framework/batch/batch_factory.cpp‎
Lines changed: 5 additions & 16 deletions b/‎xllm/core/framework/batch/batch_factory.cpp‎
Lines changed: 5 additions & 16 deletions
diff --git a/‎xllm/core/framework/batch/batch_factory.h‎
Lines changed: 1 addition & 7 deletions b/‎xllm/core/framework/batch/batch_factory.h‎
Lines changed: 1 addition & 7 deletions
@@ -306,21 +306,33 @@ bool CommChannel::allocate_kv_cache_with_transfer(
   return true;
 }
 
-bool CommChannel::load_kv_blocks_from_store_async(
-    const std::vector<CacheBlockInfo>& cache_block_info,
+void CommChannel::transfer_kv_blocks(  // promise-based overload; the result arrives via `promise`
+    const std::vector<BlockTransferInfo>& block_transfer_info,
     folly::Promise<uint32_t>& promise) {
-  proto::CacheBlockInfos pb_cache_block_info;
-  if (!cache_block_info_to_proto(cache_block_info, &pb_cache_block_info)) {
+  proto::BlockTransferInfos pb_block_transfer_info;
+  if (!block_transfer_info_to_proto(
+          0x0, block_transfer_info, &pb_block_transfer_info)) {  // 0x0 is the batch id WorkerService::TransferBlocks tests for
     promise.setValue(0);
-    return false;
+    return;  // conversion failed: promise already resolved with 0, no RPC is issued
   }
 
-  auto done = new LoadKVCacheFromStoreClosure();
+  auto done = new TransferBlocksClosure();  // deleted by TransferBlocksClosure::Run() through its self guard
   done->promise = std::move(promise);
-  stub_->LoadKVCacheFromStore(
-      &done->cntl, &pb_cache_block_info, &done->response, done);
+  stub_->TransferBlocks(  // `done` is passed as the completion closure; Run() fulfils the moved promise
+      &done->cntl, &pb_block_transfer_info, &done->response, done);
+}
 
-  return true;
+void CommChannel::transfer_kv_blocks(  // batch-tagged overload; reports nothing back to the caller
+    const uint64_t batch_id,
+    const std::vector<BlockTransferInfo>& block_transfer_info) {
+  proto::BlockTransferInfos pb_block_transfer_info;
+  if (!block_transfer_info_to_proto(
+          batch_id, block_transfer_info, &pb_block_transfer_info)) {
+    return;  // conversion failed: no RPC is issued and no error is surfaced
+  }
+  brpc::Controller cntl;  // NOTE(review): cntl.Failed() is never checked after the call
+  proto::TransferStatus response;  // NOTE(review): response is never read
+  stub_->TransferBlocks(&cntl, &pb_block_transfer_info, &response, nullptr);  // no completion closure is supplied
 }
 
 bool CommChannel::get_last_step_result_async(
@@ -397,18 +409,6 @@ bool CommChannel::execute_model_with_brpc(
   return true;
 }
 
-void LoadKVCacheFromStoreClosure::Run() {
-  std::unique_ptr<LoadKVCacheFromStoreClosure> self_guard(this);
-
-  bool success = !cntl.Failed();
-  if (!success) {
-    promise.setValue(0);
-  } else {
-    promise.setValue(response.success_cnt());
-  }
-  return;
-}
-
 void ExecuteModelClosure::Run() {
   std::unique_ptr<ExecuteModelClosure> self_guard(this);
 
@@ -437,4 +437,17 @@ void InitModelClosure::Run() {
 
   return;
 }
+
+// Completion callback for the TransferBlocks RPC. Fulfils `promise` with
+// response.success_cnt(), or with 0 when the controller reports a failure.
+void TransferBlocksClosure::Run() {
+  // Take ownership of this closure so it is deleted when Run() returns.
+  std::unique_ptr<TransferBlocksClosure> owned_self(this);
+  if (cntl.Failed()) {
+    promise.setValue(0);
+    return;
+  }
+  promise.setValue(response.success_cnt());
+}
+
 }  // namespace xllm
@@ -91,6 +91,14 @@ class CommChannel {
       const std::vector<CacheBlockInfo>& cache_block_info,
       folly::Promise<uint32_t>& promise);
 
+  virtual void transfer_kv_blocks(  // result is delivered through `promise`
+      const std::vector<BlockTransferInfo>& block_transfer_info,
+      folly::Promise<uint32_t>& promise);
+
+  virtual void transfer_kv_blocks(  // batch-tagged overload; no result is returned
+      const uint64_t batch_id,
+      const std::vector<BlockTransferInfo>& block_transfer_info);
+
   virtual bool get_last_step_result_async(
       folly::Promise<std::optional<RawForwardOutput>>& promise);
 
@@ -128,11 +136,11 @@ class ExecuteModelClosure : public google::protobuf::Closure {
   folly::Promise<std::optional<RawForwardOutput>> promise;
 };
 
-class LoadKVCacheFromStoreClosure : public google::protobuf::Closure {
+class TransferBlocksClosure : public google::protobuf::Closure {  // completion closure for the TransferBlocks RPC
  public:
   void Run();
 
-  proto::StoreResponse response;
+  proto::TransferStatus response;  // RPC reply; Run() reads success_cnt() from it
   brpc::Controller cntl;
   folly::Promise<uint32_t> promise;
 };
 
@@ -282,18 +282,30 @@ folly::SemiFuture<bool> RemoteWorker::pull_kv_blocks_async(
   return future;
 }
 
-folly::SemiFuture<uint32_t> RemoteWorker::load_kv_blocks_from_store_async(
-    const std::vector<CacheBlockInfo> cache_block_info) {
+folly::SemiFuture<uint32_t> RemoteWorker::transfer_kv_blocks(  // schedules the channel call on general_threadpool_
+    const std::vector<BlockTransferInfo>& block_transfer_info) {
   folly::Promise<uint32_t> promise;
   auto future = promise.getSemiFuture();
-  general_threadpool_.schedule([this,
-                                cache_block_info = std::move(cache_block_info),
-                                promise = std::move(promise)]() mutable {
-    channel_->load_kv_blocks_from_store_async(cache_block_info, promise);
-  });
+  general_threadpool_.schedule(
+      [this,
+       block_transfer_info = block_transfer_info,  // explicit copy: std::move on a const& only copied anyway
+       promise = std::move(promise)]() mutable {  // mutable: the promise is handed on by non-const reference
+        channel_->transfer_kv_blocks(block_transfer_info, promise);
+      });
   return future;
 }
 
+void RemoteWorker::transfer_kv_blocks(  // batch-tagged overload: schedules the channel call, returns nothing
+    const uint64_t batch_id,
+    const std::vector<BlockTransferInfo>& block_transfer_info) {
+  general_threadpool_.schedule(
+      [this,
+       batch_id = batch_id,
+       block_transfer_info = block_transfer_info]() mutable {  // explicit copy: std::move on a const& only copied anyway
+        channel_->transfer_kv_blocks(batch_id, block_transfer_info);
+      });
+}
+
 const torch::Device& RemoteWorker::device() const {
   LOG(ERROR) << "RemoteWorker Method device is UnImplemented.";
 }
 
@@ -110,8 +110,12 @@ class RemoteWorker : public WorkerClient {
       const std::vector<uint64_t>& src_blocks,
       const std::vector<uint64_t>& dst_blocks);
 
-  virtual folly::SemiFuture<uint32_t> load_kv_blocks_from_store_async(
-      const std::vector<CacheBlockInfo> cache_block_info);
+  virtual folly::SemiFuture<uint32_t> transfer_kv_blocks(  // returns a future of the uint32_t result
+      const std::vector<BlockTransferInfo>& block_transfer_info) override;
+
+  virtual void transfer_kv_blocks(  // batch-tagged overload; returns nothing
+      const uint64_t batch_id,
+      const std::vector<BlockTransferInfo>& block_transfer_info) override;
 
   // Run the model and return the output.
   virtual folly::SemiFuture<std::optional<ForwardOutput>> step_async(
 
@@ -417,18 +417,21 @@ void WorkerService::PullKVCache(::google::protobuf::RpcController* controller,
   return;
 }
 
-void WorkerService::LoadKVCacheFromStore(
+void WorkerService::TransferBlocks(  // RPC handler: decodes the request and forwards it to worker_
     ::google::protobuf::RpcController* controller,
-    const ::xllm::proto::CacheBlockInfos* req,
-    ::xllm::proto::StoreResponse* resp,
+    const ::xllm::proto::BlockTransferInfos* req,
+    ::xllm::proto::TransferStatus* resp,
     ::google::protobuf::Closure* done) {
   brpc::ClosureGuard done_guard(done);
-  std::vector<CacheBlockInfo> dst_blocks;
-  proto_to_cache_block_info(*req, dst_blocks);
+  std::vector<BlockTransferInfo> block_transfer_info;
+  uint64_t batch_id = 0x0;  // initialised so the branch below never reads an indeterminate value
+  proto_to_block_transfer_info(*req, batch_id, block_transfer_info);
 
-  auto future = worker_->load_kv_blocks_from_store_async(dst_blocks);
-
-  resp->set_success_cnt(std::move(future).get());
+  if (batch_id == 0x0) {  // 0x0: request without a batch id; the result is reported in the response
+    resp->set_success_cnt(worker_->transfer_kv_blocks(block_transfer_info));
+  } else {
+    worker_->transfer_kv_blocks(batch_id, std::move(block_transfer_info));  // success_cnt is not set on this path
+  }
   return;
 }
 
 
@@ -80,11 +80,10 @@ class WorkerService : public proto::DistributeWorker {
                    proto::Status* resp,
                    ::google::protobuf::Closure* done) override;
 
-  virtual void LoadKVCacheFromStore(
-      ::google::protobuf::RpcController* controller,
-      const ::xllm::proto::CacheBlockInfos* req,
-      ::xllm::proto::StoreResponse* resp,
-      ::google::protobuf::Closure* done) override;
+  virtual void TransferBlocks(::google::protobuf::RpcController* controller,
+                              const ::xllm::proto::BlockTransferInfos* req,
+                              ::xllm::proto::TransferStatus* resp,
+                              ::google::protobuf::Closure* done) override;  // handler for the TransferBlocks RPC
 
   void GetDeviceInfo(::google::protobuf::RpcController* controller,
                      const proto::Empty* req,
 
@@ -73,9 +73,8 @@ ForwardInput Batch::prepare_forward_input(uint32_t num_decoding_tokens,
                             allowed_max_tokens_,
                             input_embeddings_vec_,
                             mm_data_vec_,
-                            copy_in_cache_block_infos_,
-                            copy_out_cache_block_infos_,
-                            swap_cache_block_infos_,
+                            swap_block_transfer_infos_,
+                            batch_id_,
                             &args);
   return builder.build_forward_input(num_decoding_tokens,
                                      min_decoding_batch_size);
@@ -88,9 +87,8 @@ RawForwardInput Batch::prepare_forward_input(uint32_t start_idx,
                             allowed_max_tokens_,
                             input_embeddings_vec_,
                             mm_data_vec_,
-                            copy_in_cache_block_infos_,
-                            copy_out_cache_block_infos_,
-                            swap_cache_block_infos_,
+                            swap_block_transfer_infos_,
+                            batch_id_,
                             nullptr,
                             thread_pool);
   return builder.build_raw_forward_input(start_idx, end_idx);
 
@@ -16,6 +16,8 @@ limitations under the License.
 
 #pragma once
 
+#include <absl/time/clock.h>
+#include <absl/time/time.h>
 #include <torch/torch.h>
 
 #include <limits>
@@ -48,20 +50,18 @@ class Batch {
     sequence_groups_.push_back(sequence_group);
   }
 
-  void set_copy_in_cache_block_infos(
-      std::vector<CacheBlockInfo>* copy_in_cache_block_infos) {
-    copy_in_cache_block_infos_ = copy_in_cache_block_infos;
+  void set_swap_block_transfer_infos(  // stores the pointer only; the vector itself is not copied
+      std::vector<BlockTransferInfo>* swap_block_transfer_infos) {
+    swap_block_transfer_infos_ = swap_block_transfer_infos;
   }
 
-  void set_copy_out_cache_block_infos(
-      std::vector<CacheBlockInfo>* copy_out_cache_block_infos) {
-    copy_out_cache_block_infos_ = copy_out_cache_block_infos;
+  void set_batch_id() {  // assigns an id once; later calls keep the existing value
+    if (batch_id_ == 0x0) {  // 0x0 is the member's default, i.e. no id assigned yet
+      batch_id_ = absl::ToUnixMicros(absl::Now());  // NOTE(review): timestamp-based id; nothing here guarantees uniqueness across batches — confirm
+    }
   }
 
-  void set_swap_cache_block_infos(
-      std::vector<CacheBlockInfo>* swap_cache_block_infos) {
-    swap_cache_block_infos_ = swap_cache_block_infos;
-  }
+  uint64_t batch_id() const { return batch_id_; }  // 0x0 until set_batch_id() has assigned a value
 
   // get the number of sequences in the batch
   size_t size() const { return sequences_.size(); }
@@ -123,9 +123,7 @@ class Batch {
 
   std::vector<Sequence*> sequences_;
   std::vector<SequencesGroup*> sequence_groups_;
-  std::vector<CacheBlockInfo>* copy_in_cache_block_infos_ = nullptr;
-  std::vector<CacheBlockInfo>* copy_out_cache_block_infos_ = nullptr;
-  std::vector<CacheBlockInfo>* swap_cache_block_infos_ = nullptr;
+  std::vector<BlockTransferInfo>* swap_block_transfer_infos_ = nullptr;
 
   // max number of tokens to process for each sequence
   // default to max value
@@ -138,6 +136,8 @@ class Batch {
 
   // all sequences in this batch are in prefill stage
   bool all_seqs_in_prefill_ = false;
+
+  uint64_t batch_id_ = 0x0;
 };
 
 }  // namespace xllm
@@ -33,9 +33,7 @@ std::vector<Batch> BatchFactory::create_batches(
     const std::vector<std::shared_ptr<Request>>& running_requests,
     const std::vector<Sequence*>& running_sequences,
     const std::vector<size_t>& running_sequences_budgets,
-    std::vector<std::vector<CacheBlockInfo>>* copy_in_cache_block_infos,
-    std::vector<std::vector<CacheBlockInfo>>* copy_out_cache_block_infos,
-    std::vector<std::vector<CacheBlockInfo>>* swap_cache_block_infos) {
+    std::vector<std::vector<BlockTransferInfo>>* swap_block_transfer_infos) {
   size_t num_prompt_tokens = 0;
   size_t num_generated_tokens = 0;
   std::vector<Batch> batches(dp_size_);
@@ -74,19 +72,10 @@ std::vector<Batch> BatchFactory::create_batches(
 
   for (int i = 0; i < dp_size_; i++) {
     if (!batches[i].empty()) {
-      if (copy_in_cache_block_infos != nullptr &&
-          copy_in_cache_block_infos->size() == dp_size_) {
-        batches[i].set_copy_in_cache_block_infos(
-            &(copy_in_cache_block_infos->at(i)));
-      }
-      if (copy_out_cache_block_infos != nullptr &&
-          copy_out_cache_block_infos->size() == dp_size_) {
-        batches[i].set_copy_out_cache_block_infos(
-            &(copy_out_cache_block_infos->at(i)));
-      }
-      if (swap_cache_block_infos != nullptr &&
-          swap_cache_block_infos->size() == dp_size_) {
-        batches[i].set_swap_cache_block_infos(&(swap_cache_block_infos->at(i)));
+      if (swap_block_transfer_infos != nullptr &&
+          swap_block_transfer_infos->size() == dp_size_) {
+        batches[i].set_swap_block_transfer_infos(
+            &(swap_block_transfer_infos->at(i)));
       }
     }
   }
 
@@ -31,14 +31,8 @@ class BatchFactory {
       const std::vector<std::shared_ptr<Request>>& running_requests,
       const std::vector<Sequence*>& running_sequences,
       const std::vector<size_t>& running_sequences_budgets,
-      // for global kv cache copy block from host to device
-      std::vector<std::vector<CacheBlockInfo>>* copy_in_cache_block_infos =
-          nullptr,
-      // for global kv cache copy block from device to host
-      std::vector<std::vector<CacheBlockInfo>>* copy_out_cache_block_infos =
-          nullptr,
       // for beam-search
-      std::vector<std::vector<CacheBlockInfo>>* swap_cache_block_infos =
+      std::vector<std::vector<BlockTransferInfo>>* swap_block_transfer_infos =
           nullptr);
 
  private: