support chunked prefill

shihaobai · shihaobai · commit c0b11465b788 · 2025-02-20T21:17:26.000+08:00
diff --git a/lightllm/server/router/req_queue/chunked_prefill/impl.py b/lightllm/server/router/req_queue/chunked_prefill/impl.py
@@ -56,7 +56,7 @@ def _can_add_new_req(self, req: Req, is_busy, new_batch_first_router_need_tokens
             return False, new_batch_first_router_need_tokens
 
     # @calculate_time(show=True, min_cost_ms=10)
-    def generate_new_batch(self, current_batch: Batch):
+    def generate_new_batch(self, current_batch: Batch, from_req_list):
 
         # 如果当前已经被调度的请求数量超过了上限，直接不调度新的请求了。
         exist_req_num = self.get_batch_dp_req_size(current_batch) + len(self.pause_req_dict)
@@ -74,7 +74,7 @@ def generate_new_batch(self, current_batch: Batch):
         can_run_list = []
         abort_req_list = []
         aborted_count = 0
-        for req in self.waiting_req_list:
+        for req in from_req_list:
             if req.is_aborted and not req.is_paused:
                 # 由于管理的复杂性，只有没有被调度运行过的请求可以因为abort直接在队列中忽略掉.
                 # 暂停的请求需要恢复后，由 router manager 部分来过滤。暂时保持这种处理方法, 否则会导致管理token的泄漏
@@ -97,9 +97,11 @@ def generate_new_batch(self, current_batch: Batch):
             for req in abort_req_list:
                 self.router.shm_req_manager.put_back_req_obj(req)
             self.waiting_req_list = self.waiting_req_list[len(can_run_list) + aborted_count :]
-            return new_batch
+            poped_req_list = from_req_list[: len(can_run_list) + aborted_count]
+            remain_req_list = from_req_list[len(can_run_list) + aborted_count :]
+            return new_batch, poped_req_list, remain_req_list
         else:
-            return None
+            return None, [], from_req_list
 
     def _calcu_batch_token_load_batch_not_none(self, current_batch: Batch):
         is_busy = self.is_busy()