Commit 11cd05a

feat: support beam search for llm model[1/N]. (jd-opensource#135)
1 parent 1c81533 commit 11cd05a

36 files changed: +604 -27 lines changed

xllm/core/framework/batch/CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -15,6 +15,7 @@ cc_library(
     batch_factory.cpp
     batch_input_builder.cpp
     mposition.cpp
+    beam_search.h
   DEPS
     :request
     :runtime

xllm/core/framework/batch/batch.cpp

Lines changed: 10 additions & 0 deletions

@@ -75,6 +75,7 @@ ForwardInput Batch::prepare_forward_input(uint32_t num_decoding_tokens,
       mm_data_vec_,
       copy_in_cache_block_infos_,
       copy_out_cache_block_infos_,
+      swap_cache_block_infos_,
       &args);
   return builder.build_forward_input(num_decoding_tokens,
                                      min_decoding_batch_size);
@@ -88,6 +89,7 @@ RawForwardInput Batch::prepare_forward_input(uint32_t start_idx,
       mm_data_vec_,
       copy_in_cache_block_infos_,
       copy_out_cache_block_infos_,
+      swap_cache_block_infos_,
       nullptr);
   return builder.build_raw_forward_input(start_idx, end_idx);
 }
@@ -134,6 +136,7 @@ void Batch::process_sample_output(const RawForwardOutput& raw_output,
     }
   }
   CHECK_EQ(output_idx, num_seqs);
+  process_beam_search();
 }

 void Batch::process_sample_output(const SampleOutput& sample_output,
@@ -175,6 +178,7 @@ void Batch::process_sample_output(const SampleOutput& sample_output,
     append_token_for_sequence(seq, token, 0, enable_schedule_overlap);
   }
   CHECK_EQ(output_idx, num_seqs);
+  process_beam_search();
 }

 bool Batch::update_sequence_state(Sequence* seq, bool enable_schedule_overlap) {
@@ -246,4 +250,10 @@ void Batch::process_embedding_output(const torch::Tensor& output_embedding) {
     }
   }
 }
+
+void Batch::process_beam_search() {
+  for (auto* sequence_group : sequence_groups_) {
+    sequence_group->process_beam_search();
+  }
+}
 }  // namespace xllm
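
Note that process_beam_search() above only delegates to each SequencesGroup; the group-level logic lives outside this excerpt. As an orientation aid, the following standalone C++ sketch shows the standard per-step beam update such a method performs conceptually: expand each live beam with its sampled candidates, accumulate log-probabilities, and keep the best beam_width paths. All names and values here are illustrative and are not xllm's API.

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <utility>
#include <vector>

struct Beam {
  std::vector<int> tokens;
  float logprob_sum = 0.0f;  // cumulative log-probability of the path
};

int main() {
  const std::size_t beam_width = 2;

  // Two live beams and, for each, the (token, logprob) candidates sampled
  // for it at the current decode step.
  std::vector<Beam> beams = {{{5}, -0.5f}, {{7}, -0.8f}};
  std::vector<std::vector<std::pair<int, float>>> step = {
      {{11, -0.2f}, {12, -1.0f}},   // candidates extending beams[0]
      {{13, -0.1f}, {14, -0.9f}}};  // candidates extending beams[1]

  // Expand every beam with every candidate sampled for it.
  std::vector<Beam> expanded;
  for (std::size_t i = 0; i < beams.size(); ++i) {
    for (const auto& [token, logprob] : step[i]) {
      Beam b = beams[i];
      b.tokens.push_back(token);
      b.logprob_sum += logprob;
      expanded.push_back(std::move(b));
    }
  }

  // Keep only the beam_width best paths as the live beams for the next step.
  std::sort(expanded.begin(), expanded.end(),
            [](const Beam& a, const Beam& b) {
              return a.logprob_sum > b.logprob_sum;
            });
  if (expanded.size() > beam_width) expanded.resize(beam_width);

  for (const auto& b : expanded) {
    std::printf("beam ending in token %d, logprob_sum = %.2f\n",
                b.tokens.back(), b.logprob_sum);
  }
  return 0;
}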

xllm/core/framework/batch/batch.h

Lines changed: 15 additions & 0 deletions

@@ -22,7 +22,9 @@ limitations under the License.
 #include <vector>

 #include "framework/request/mm_data.h"
+#include "framework/request/request.h"
 #include "framework/request/sequence.h"
+#include "framework/request/sequences_group.h"
 #include "runtime/forward_params.h"

 namespace xllm {
@@ -41,6 +43,10 @@ class Batch {

   void add(const std::vector<Sequence*>& sequences);

+  void add(SequencesGroup* sequence_group) {
+    sequence_groups_.push_back(sequence_group);
+  };
+
   void set_copy_in_cache_block_infos(
       std::vector<CacheBlockInfo>* copy_in_cache_block_infos) {
     copy_in_cache_block_infos_ = copy_in_cache_block_infos;
@@ -51,6 +57,11 @@ class Batch {
     copy_out_cache_block_infos_ = copy_out_cache_block_infos;
   }

+  void set_swap_cache_block_infos(
+      std::vector<CacheBlockInfo>* swap_cache_block_infos) {
+    swap_cache_block_infos_ = swap_cache_block_infos;
+  }
+
   // get the number of sequences in the batch
   size_t size() const { return sequences_.size(); }
   bool empty() const { return sequences_.empty(); }
@@ -93,9 +104,13 @@ class Batch {
                                  int token_idx,
                                  bool enable_schedule_overlap);

+  void process_beam_search();
+
   std::vector<Sequence*> sequences_;
+  std::vector<SequencesGroup*> sequence_groups_;
   std::vector<CacheBlockInfo>* copy_in_cache_block_infos_ = nullptr;
   std::vector<CacheBlockInfo>* copy_out_cache_block_infos_ = nullptr;
+  std::vector<CacheBlockInfo>* swap_cache_block_infos_ = nullptr;

   // max number of tokens to process for each sequence
   // default to max value

xllm/core/framework/batch/batch_factory.cpp

Lines changed: 27 additions & 1 deletion

@@ -17,11 +17,25 @@ limitations under the License.

 namespace xllm {

+namespace {
+
+bool is_beam_search(const std::vector<std::shared_ptr<Request>>& requests) {
+  for (const auto& request : requests) {
+    if (request->check_beam_search()) {
+      return true;
+    }
+  }
+  return false;
+}
+}  // namespace
+
 std::vector<Batch> BatchFactory::create_batches(
+    const std::vector<std::shared_ptr<Request>>& running_requests,
     const std::vector<Sequence*>& running_sequences,
     const std::vector<size_t>& running_sequences_budgets,
     std::vector<std::vector<CacheBlockInfo>>* copy_in_cache_block_infos,
-    std::vector<std::vector<CacheBlockInfo>>* copy_out_cache_block_infos) {
+    std::vector<std::vector<CacheBlockInfo>>* copy_out_cache_block_infos,
+    std::vector<std::vector<CacheBlockInfo>>* swap_cache_block_infos) {
   size_t num_prompt_tokens = 0;
   size_t num_generated_tokens = 0;
   std::vector<Batch> batches(dp_size_);
@@ -50,6 +64,14 @@ std::vector<Batch> BatchFactory::create_batches(
     }
   }

+  if (is_beam_search(running_requests)) {
+    for (const auto& request : running_requests) {
+      auto seq_group = request->sequence_group();
+      int32_t dp_rank = seq_group->dp_rank();
+      batches[dp_rank].add(seq_group);
+    }
+  }
+
   for (int i = 0; i < dp_size_; i++) {
     if (!batches[i].empty()) {
       if (copy_in_cache_block_infos != nullptr &&
@@ -62,6 +84,10 @@ std::vector<Batch> BatchFactory::create_batches(
         batches[i].set_copy_out_cache_block_infos(
             &(copy_out_cache_block_infos->at(i)));
       }
+      if (swap_cache_block_infos != nullptr &&
+          swap_cache_block_infos->size() == dp_size_) {
+        batches[i].set_swap_cache_block_infos(&(swap_cache_block_infos->at(i)));
+      }
     }
   }

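
The new block above routes each request's sequence group to the batch owned by its data-parallel rank. A minimal standalone sketch of that routing pattern, using generic stand-in types rather than xllm's Request/SequencesGroup:

#include <cstdio>
#include <vector>

// Stand-in for a sequence group, reduced to what the routing needs:
// every group knows which data-parallel rank owns its KV cache.
struct Group {
  int dp_rank;
  int request_id;
};

int main() {
  const int dp_size = 2;
  std::vector<std::vector<Group*>> batches(dp_size);

  std::vector<Group> groups = {{0, 100}, {1, 101}, {0, 102}};
  for (auto& g : groups) {
    batches[g.dp_rank].push_back(&g);  // each group joins its rank's batch
  }

  for (int i = 0; i < dp_size; ++i) {
    std::printf("dp rank %d: %zu group(s)\n", i, batches[i].size());
  }
  return 0;
}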

xllm/core/framework/batch/batch_factory.h

Lines changed: 3 additions & 0 deletions

@@ -28,11 +28,14 @@ class BatchFactory {
   }

   std::vector<Batch> create_batches(
+      const std::vector<std::shared_ptr<Request>>& running_requests,
       const std::vector<Sequence*>& running_sequences,
       const std::vector<size_t>& running_sequences_budgets,
       std::vector<std::vector<CacheBlockInfo>>* copy_in_cache_block_infos =
           nullptr,
       std::vector<std::vector<CacheBlockInfo>>* copy_out_cache_block_infos =
+          nullptr,
+      std::vector<std::vector<CacheBlockInfo>>* swap_cache_block_infos =
           nullptr);

  private:

xllm/core/framework/batch/batch_input_builder.cpp

Lines changed: 16 additions & 1 deletion

@@ -57,6 +57,7 @@ BatchInputBuilder::BatchInputBuilder(
     const std::vector<MMData>& mm_data_vec,
     const std::vector<CacheBlockInfo>* copy_in_cache_block_infos,
     const std::vector<CacheBlockInfo>* copy_out_cache_block_infos,
+    const std::vector<CacheBlockInfo>* swap_cache_block_infos,
     const ModelArgs* args)
     : sequences_(sequences),
       allowed_max_tokens_(allowed_max_tokens),
@@ -65,7 +66,8 @@ BatchInputBuilder::BatchInputBuilder(
       args_(args),
       num_sequences_(static_cast<int32_t>(sequences.size())),
       copy_in_cache_block_infos_(copy_in_cache_block_infos),
-      copy_out_cache_block_infos_(copy_out_cache_block_infos) {
+      copy_out_cache_block_infos_(copy_out_cache_block_infos),
+      swap_cache_block_infos_(swap_cache_block_infos) {
   // Reserve space for better performance
   state_.flatten_tokens_vec.reserve(1000);
   state_.flatten_positions_vec.reserve(1000);
@@ -348,6 +350,13 @@ ForwardInput BatchInputBuilder::state_to_forward_input() {
     input_params.input_embedding = torch::cat(input_embeddings_vec_);
   }

+  if (swap_cache_block_infos_ != nullptr &&
+      swap_cache_block_infos_->size() > 0) {
+    input_params.swap_blocks.insert(input_params.swap_blocks.end(),
+                                    swap_cache_block_infos_->begin(),
+                                    swap_cache_block_infos_->end());
+  }
+
   CHECK_EQ(state_.sampling_params.size(), state_.selected_token_idxes.size());
   // Setup sampling parameters
   if (!state_.selected_token_idxes.empty()) {
@@ -427,6 +436,12 @@ RawForwardInput BatchInputBuilder::state_to_raw_forward_input() {
                                      copy_in_cache_block_infos_->begin(),
                                      copy_in_cache_block_infos_->end());
   }
+  if (swap_cache_block_infos_ != nullptr &&
+      swap_cache_block_infos_->size() > 0) {
+    raw_forward_input.swap_blocks.insert(raw_forward_input.swap_blocks.end(),
+                                         swap_cache_block_infos_->begin(),
+                                         swap_cache_block_infos_->end());
+  }

   split_copy_out_blocks(raw_forward_input, write_block_ids_);


xllm/core/framework/batch/batch_input_builder.h

Lines changed: 2 additions & 0 deletions

@@ -38,6 +38,7 @@ class BatchInputBuilder {
       const std::vector<MMData>& mm_data_vec,
       const std::vector<CacheBlockInfo>* copy_in_cache_block_infos,
       const std::vector<CacheBlockInfo>* copy_out_cache_block_infos,
+      const std::vector<CacheBlockInfo>* swap_cache_block_infos,
       const ModelArgs* args);

   ForwardInput build_forward_input(uint32_t num_decoding_tokens,
@@ -125,6 +126,7 @@ class BatchInputBuilder {
   std::unordered_set<int32_t> write_block_ids_;
   const std::vector<CacheBlockInfo>* copy_in_cache_block_infos_ = nullptr;
   const std::vector<CacheBlockInfo>* copy_out_cache_block_infos_ = nullptr;
+  const std::vector<CacheBlockInfo>* swap_cache_block_infos_ = nullptr;
 };

 }  // namespace xllm
xllm/core/framework/batch/beam_search.h

Lines changed: 130 additions & 0 deletions
@@ -0,0 +1,130 @@
+/* Copyright 2025 The xLLM Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://github.com/jd-opensource/xllm/blob/main/LICENSE
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#pragma once
+
+namespace xllm {
+
+// BeamCandidate structure for beam search sorting
+struct BeamCandidate {
+  size_t seq_index;
+  float logprob_sum;
+  std::vector<int32_t> token_ids;
+  std::vector<std::optional<float>> logprobs;
+
+  BeamCandidate() = default;
+
+  BeamCandidate(size_t seq_idx,
+                float logprob,
+                std::vector<int32_t>& token_ids,
+                std::vector<std::optional<float>>& logprobs)
+      : seq_index(seq_idx),
+        logprob_sum(logprob),
+        token_ids(std::move(token_ids)),
+        logprobs(std::move(logprobs)) {}
+
+  bool operator<(const BeamCandidate& other) const {
+    return logprob_sum > other.logprob_sum;
+  }
+};
+
+template <typename CandidateType>
+class SimpleTopKOptimizer {
+ private:
+  std::priority_queue<CandidateType> min_heap_;
+  size_t k_;
+
+ public:
+  explicit SimpleTopKOptimizer(size_t k) : k_(k) {}
+
+  void clear() {
+    while (!min_heap_.empty()) {
+      min_heap_.pop();
+    }
+  }
+
+  void insert(const CandidateType& candidate) {
+    if (min_heap_.size() < k_) {
+      min_heap_.push(candidate);
+    } else if (candidate.logprob_sum > min_heap_.top().logprob_sum) {
+      min_heap_.pop();
+      min_heap_.push(candidate);
+    }
+  }
+
+  void insert(CandidateType&& candidate) {
+    if (min_heap_.size() < k_) {
+      min_heap_.push(std::move(candidate));
+    } else if (candidate.logprob_sum > min_heap_.top().logprob_sum) {
+      min_heap_.pop();
+      min_heap_.push(std::move(candidate));
+    }
+  }
+
+  void insert_batch(const std::vector<CandidateType>& candidates) {
+    for (const auto& candidate : candidates) {
+      insert(candidate);
+    }
+  }
+
+  std::vector<CandidateType> getTopK() {
+    std::vector<CandidateType> result;
+    result.reserve(min_heap_.size());
+
+    while (!min_heap_.empty()) {
+      result.emplace_back(
+          std::move(const_cast<CandidateType&>(min_heap_.top())));
+      min_heap_.pop();
+    }
+
+    return result;
+  }
+
+  std::vector<CandidateType>&& getTopKMove() {
+    std::vector<CandidateType> result;
+    result.reserve(min_heap_.size());
+
+    while (!min_heap_.empty()) {
+      result.emplace_back(
+          std::move(const_cast<CandidateType&>(min_heap_.top())));
+      min_heap_.pop();
+    }
+
+    return std::move(result);
+  }
+
+  std::vector<CandidateType> getTopKSorted() {
+    std::vector<CandidateType> result = getTopK();
+    std::reverse(result.begin(), result.end());
+    return result;
+  }
+
+  size_t size() const { return min_heap_.size(); }
+
+  bool empty() const { return min_heap_.empty(); }
+
+  bool worthInserting(float logprob_sum) const {
+    return min_heap_.size() < k_ || logprob_sum > min_heap_.top().logprob_sum;
+  }
+
+  float getMinLogprob() const {
+    return min_heap_.empty() ? -std::numeric_limits<float>::infinity()
+                             : min_heap_.top().logprob_sum;
+  }
+};
+
+using SimpleTopKOptimizerBeamCandidate = SimpleTopKOptimizer<BeamCandidate>;
+
+}  // namespace xllm
