
Commit 112656b

Commit message: fix
1 parent af6f547 · commit 112656b

File tree: 7 files changed, +33 -275 lines


lightllm/server/api_start.py

Lines changed: 1 addition & 0 deletions
@@ -159,6 +159,7 @@ def normal_or_p_d_start(args):
     args.visual_nccl_ports = args.visual_nccl_ports[: args.visual_dp]
 
     if args.disable_chunked_prefill:
+        args.chunked_prefill_size = args.max_req_total_len
         # normal mode
         if args.batch_max_tokens is None:
            args.batch_max_tokens = args.max_req_total_len
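
The effect of this one-line fix: when chunked prefill is disabled, chunked_prefill_size is pinned to max_req_total_len, so no admissible prompt can ever span more than one chunk. A minimal sketch of why that degenerates to whole-prompt prefill (the split_prefill helper below is hypothetical, for illustration only, not lightllm code):

```python
# Hypothetical helper -- not part of lightllm. Shows why
# chunked_prefill_size = max_req_total_len means "one chunk per prompt".
def split_prefill(prompt_len: int, chunked_prefill_size: int) -> list:
    """Split a prompt of prompt_len tokens into prefill chunks."""
    chunks = []
    while prompt_len > 0:
        step = min(prompt_len, chunked_prefill_size)
        chunks.append(step)
        prompt_len -= step
    return chunks

max_req_total_len = 8192  # example value; no request may exceed this length
print(split_prefill(4096, 1024))               # [1024, 1024, 1024, 1024]
print(split_prefill(4096, max_req_total_len))  # [4096]: whole prompt, one chunk
```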

lightllm/server/router/req_queue/__init__.py

Lines changed: 5 additions & 5 deletions
@@ -1,6 +1,4 @@
-from .continues_batch.impl import ContinuesBatchQueue
-from .continues_batch.impl_for_pd_decode import QueueForPDDecode
-from .chunked_prefill.impl_for_pd_prefill import QueueForPDChunkedPrefill
+from .chunked_prefill.impl_for_pd_decode import QueueForPDDecode
 from .chunked_prefill.impl import ChunkedPrefillQueue
 from .chunked_prefill.beam_impl import ChunkedBeamContinuesBatchQueue
 from .dp_base_queue import DpQueue
@@ -18,10 +16,12 @@ def _get_req_queue_class(args, router, dp_size_in_node: int):
     if args.run_mode == "decode":
         return QueueForPDDecode
     if args.run_mode == "prefill":
-        return QueueForPDChunkedPrefill
+        return ChunkedPrefillQueue
 
     if args.disable_chunked_prefill:
-        return ContinuesBatchQueue
+        # The chunked prefill queue is used here too, but because args.chunked_prefill_size = args.max_req_total_len
+        # its scheduling behaves like the old continuous-batch scheduler, so the two schedulers share one implementation to reduce code duplication.
+        return ChunkedPrefillQueue
     else:
         return ChunkedPrefillQueue
 
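
Read together with the api_start.py hunk above, the selection logic now collapses to a single queue class for every non-decode path. A condensed, paraphrased sketch of the post-commit behavior (stub classes stand in for the real imports so the sketch runs on its own; beam-search and DP paths are omitted):

```python
from types import SimpleNamespace

# Stand-in stubs for illustration; the real classes come from the imports above.
class QueueForPDDecode: ...
class ChunkedPrefillQueue: ...

def _get_req_queue_class(args, router=None, dp_size_in_node=None):
    # Condensed paraphrase of the post-commit selection logic.
    if args.run_mode == "decode":
        return QueueForPDDecode
    if args.run_mode == "prefill":
        return ChunkedPrefillQueue
    # With or without --disable_chunked_prefill, the same queue is returned; when
    # chunking is disabled, chunked_prefill_size == max_req_total_len makes this
    # queue behave like the removed ContinuesBatchQueue, so one implementation
    # covers both paths.
    return ChunkedPrefillQueue

args = SimpleNamespace(run_mode="normal", disable_chunked_prefill=True)
print(_get_req_queue_class(args).__name__)  # ChunkedPrefillQueue
```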

lightllm/server/router/req_queue/chunked_prefill/impl.py

Lines changed: 27 additions & 23 deletions
@@ -2,6 +2,7 @@
 import numpy as np
 from ...batch import Batch, Req
 from lightllm.server.router.req_queue.base_queue import BaseQueue
+from lightllm.common.basemodel.infer_lock import g_router_lock
 
 
 class ChunkedPrefillQueue(BaseQueue):
@@ -25,31 +26,32 @@ def _can_add_new_req(self, req: Req, is_busy, new_batch_first_router_need_tokens
         self.cache_len_list.sort(key=lambda x: -x[1])
 
         left_out_len_array = np.array([e[1] for e in self.cache_len_list])
-        # assert left_out_len_array.min() >= 0
         has_run_len_array = np.array([e[0] for e in self.cache_len_list])
         cum_run_len_array = np.cumsum(has_run_len_array)
         size_array = np.arange(1, len(self.cache_len_list) + 1, 1)
 
         need_max_token_num = (left_out_len_array * size_array + cum_run_len_array).max()
-        ok_token_num = (
-            need_max_token_num + self.router.shared_token_load.get_frozened_token_count(self.dp_index)
-            < self.max_total_tokens
-        )
+        with g_router_lock.obj:
+            ok_token_num = (
+                need_max_token_num + self.router.shared_token_load.get_frozened_token_count(self.dp_index)
+                < self.max_total_tokens
+            )
 
-        ok_req_num = len(self.cache_len_list) <= self.running_max_req_size
-        new_batch_first_router_need_tokens += req.get_first_router_need_tokens()
-        ok_prefill = new_batch_first_router_need_tokens <= self.batch_max_tokens
+        ok_req_num = len(self.cache_len_list) <= self.running_max_req_size
 
-        if ok_token_num and ok_req_num and ok_prefill:
-            self.router.shared_token_load.set_estimated_peak_token_count(need_max_token_num, self.dp_index)
-            self.router.shared_token_load.set_dynamic_max_load(
-                (need_max_token_num + self.router.shared_token_load.get_frozened_token_count(self.dp_index))
-                / self.max_total_tokens,
-                self.dp_index,
-            )
-            return True, new_batch_first_router_need_tokens
-        else:
-            return False, new_batch_first_router_need_tokens
+        new_batch_first_router_need_tokens += req.get_first_router_need_tokens()
+        ok_prefill = new_batch_first_router_need_tokens <= self.batch_max_tokens
+
+        if ok_token_num and ok_req_num and ok_prefill:
+            self.router.shared_token_load.set_estimated_peak_token_count(need_max_token_num, self.dp_index)
+            self.router.shared_token_load.set_dynamic_max_load(
+                (need_max_token_num + self.router.shared_token_load.get_frozened_token_count(self.dp_index))
+                / self.max_total_tokens,
+                self.dp_index,
+            )
+            return True, new_batch_first_router_need_tokens
+        else:
+            return False, new_batch_first_router_need_tokens
 
     # @calculate_time(show=True, min_cost_ms=10)
     def generate_new_batch(self, current_batch: Batch, limit_router_queue_length: int = None):
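
The admission check in the hunk above estimates the worst-case KV-cache footprint if every running request decodes to completion. After sorting by remaining output length in descending order, the request at 1-based position i is the first of the top-i requests to finish, so while all i are still alive the peak is roughly left_out[i] * i + sum(has_run[:i]); the maximum over i is need_max_token_num. A small worked example with illustrative numbers (not taken from the repo):

```python
import numpy as np

# (tokens already used, remaining output tokens) per request; made-up numbers
cache_len_list = [(300, 50), (100, 200), (700, 10)]
cache_len_list.sort(key=lambda x: -x[1])  # sort by remaining length, descending
# -> [(100, 200), (300, 50), (700, 10)]

left_out = np.array([e[1] for e in cache_len_list])  # [200,  50,  10]
has_run = np.array([e[0] for e in cache_len_list])   # [100, 300, 700]
cum_run = np.cumsum(has_run)                         # [100, 400, 1100]
size = np.arange(1, len(cache_len_list) + 1)         # [1, 2, 3]

# Peak tokens at the moment the i-th longest-remaining request finishes,
# while the i-1 longer ones are still generating alongside it.
peaks = left_out * size + cum_run                    # [300, 500, 1130]
need_max_token_num = peaks.max()                     # 1130
print(need_max_token_num)
```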
@@ -114,8 +116,10 @@ def _calcu_batch_token_load_batch_not_none(self, current_batch: Batch):
             need_max_token_num = (left_out_len_array * size_array + cum_run_len_array).max()
         else:
             need_max_token_num = 0
-        return (
-            need_max_token_num,
-            (need_max_token_num + self.router.shared_token_load.get_frozened_token_count(self.dp_index))
-            / self.max_total_tokens,
-        )
+
+        with g_router_lock.obj:
+            return (
+                need_max_token_num,
+                (need_max_token_num + self.router.shared_token_load.get_frozened_token_count(self.dp_index))
+                / self.max_total_tokens,
+            )
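
Both changed methods now read get_frozened_token_count under g_router_lock.obj, so the frozen-token count cannot change between the read and the decision that depends on it. A generic sketch of the same read-under-lock pattern, assuming nothing about lightllm's internals (multiprocessing primitives chosen purely for illustration; the real lock lives in lightllm.common.basemodel.infer_lock):

```python
import multiprocessing as mp

frozen_tokens = mp.Value("q", 0)  # shared counter; stand-in for shared_token_load
lock = mp.Lock()                  # stand-in for g_router_lock.obj

def can_admit(need_max_token_num: int, max_total_tokens: int) -> bool:
    # Hold the lock across the read so another process cannot freeze more
    # tokens between our read and our admission decision.
    with lock:
        return need_max_token_num + frozen_tokens.value < max_total_tokens

print(can_admit(1130, 16384))  # True with the example numbers above
```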

lightllm/server/router/req_queue/continues_batch/impl_for_pd_decode.py renamed to lightllm/server/router/req_queue/chunked_prefill/impl_for_pd_decode.py

File renamed without changes.

lightllm/server/router/req_queue/chunked_prefill/impl_for_pd_prefill.py

Lines changed: 0 additions & 126 deletions
This file was deleted.

lightllm/server/router/req_queue/continues_batch/__init__.py

Whitespace-only changes.

lightllm/server/router/req_queue/continues_batch/impl.py

Lines changed: 0 additions & 121 deletions
This file was deleted.
