
Commit 112656b

Commit message: fix
1 parent af6f547 · commit 112656b

File tree: 7 files changed, +33 -275 lines


lightllm/server/api_start.py

Lines changed: 1 addition & 0 deletions
@@ -159,6 +159,7 @@ def normal_or_p_d_start(args):
     args.visual_nccl_ports = args.visual_nccl_ports[: args.visual_dp]
 
     if args.disable_chunked_prefill:
+        args.chunked_prefill_size = args.max_req_total_len
         # normal mode
         if args.batch_max_tokens is None:
            args.batch_max_tokens = args.max_req_total_len
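
The effect of this one-line fix: when chunked prefill is disabled, chunked_prefill_size is pinned to max_req_total_len, so no admissible prompt can ever span more than one chunk. A minimal sketch of why that degenerates to whole-prompt prefill (the split_prefill helper below is hypothetical, for illustration only, not lightllm code):

```python
# Hypothetical helper -- not part of lightllm. Shows why
# chunked_prefill_size = max_req_total_len means "one chunk per prompt".
def split_prefill(prompt_len: int, chunked_prefill_size: int) -> list:
    """Split a prompt of prompt_len tokens into prefill chunks."""
    chunks = []
    while prompt_len > 0:
        step = min(prompt_len, chunked_prefill_size)
        chunks.append(step)
        prompt_len -= step
    return chunks

max_req_total_len = 8192  # example value; no request may exceed this length
print(split_prefill(4096, 1024))               # [1024, 1024, 1024, 1024]
print(split_prefill(4096, max_req_total_len))  # [4096]: whole prompt, one chunk
```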

lightllm/server/router/req_queue/__init__.py

Lines changed: 5 additions & 5 deletions
@@ -1,6 +1,4 @@
-from .continues_batch.impl import ContinuesBatchQueue
-from .continues_batch.impl_for_pd_decode import QueueForPDDecode
-from .chunked_prefill.impl_for_pd_prefill import QueueForPDChunkedPrefill
+from .chunked_prefill.impl_for_pd_decode import QueueForPDDecode
 from .chunked_prefill.impl import ChunkedPrefillQueue
 from .chunked_prefill.beam_impl import ChunkedBeamContinuesBatchQueue
 from .dp_base_queue import DpQueue
@@ -18,10 +16,12 @@ def _get_req_queue_class(args, router, dp_size_in_node: int):
     if args.run_mode == "decode":
         return QueueForPDDecode
     if args.run_mode == "prefill":
-        return QueueForPDChunkedPrefill
+        return ChunkedPrefillQueue
 
     if args.disable_chunked_prefill:
-        return ContinuesBatchQueue
+        # The chunked prefill queue is used here too, but because args.chunked_prefill_size = args.max_req_total_len
+        # its scheduling behaves like the old continuous-batch scheduler, so the two schedulers share one implementation to reduce code duplication.
+        return ChunkedPrefillQueue
     else:
         return ChunkedPrefillQueue
 
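
Read together with the api_start.py hunk above, the selection logic now collapses to a single queue class for every non-decode path. A condensed, paraphrased sketch of the post-commit behavior (stub classes stand in for the real imports so the sketch runs on its own; beam-search and DP paths are omitted):

```python
from types import SimpleNamespace

# Stand-in stubs for illustration; the real classes come from the imports above.
class QueueForPDDecode: ...
class ChunkedPrefillQueue: ...

def _get_req_queue_class(args, router=None, dp_size_in_node=None):
    # Condensed paraphrase of the post-commit selection logic.
    if args.run_mode == "decode":
        return QueueForPDDecode
    if args.run_mode == "prefill":
        return ChunkedPrefillQueue
    # With or without --disable_chunked_prefill, the same queue is returned; when
    # chunking is disabled, chunked_prefill_size == max_req_total_len makes this
    # queue behave like the removed ContinuesBatchQueue, so one implementation
    # covers both paths.
    return ChunkedPrefillQueue

args = SimpleNamespace(run_mode="normal", disable_chunked_prefill=True)
print(_get_req_queue_class(args).__name__)  # ChunkedPrefillQueue
```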

lightllm/server/router/req_queue/chunked_prefill/impl.py

Lines changed: 27 additions & 23 deletions
@@ -2,6 +2,7 @@
 import numpy as np
 from ...batch import Batch, Req
 from lightllm.server.router.req_queue.base_queue import BaseQueue
+from lightllm.common.basemodel.infer_lock import g_router_lock
 
 
 class ChunkedPrefillQueue(BaseQueue):
@@ -25,31 +26,32 @@ def _can_add_new_req(self, req: Req, is_busy, new_batch_first_router_need_tokens
         self.cache_len_list.sort(key=lambda x: -x[1])
 
         left_out_len_array = np.array([e[1] for e in self.cache_len_list])
-        # assert left_out_len_array.min() >= 0
         has_run_len_array = np.array([e[0] for e in self.cache_len_list])
         cum_run_len_array = np.cumsum(has_run_len_array)
         size_array = np.arange(1, len(self.cache_len_list) + 1, 1)
 
         need_max_token_num = (left_out_len_array * size_array + cum_run_len_array).max()
-        ok_token_num = (
-            need_max_token_num + self.router.shared_token_load.get_frozened_token_count(self.dp_index)
-            < self.max_total_tokens
-        )
+        with g_router_lock.obj:
+            ok_token_num = (
+                need_max_token_num + self.router.shared_token_load.get_frozened_token_count(self.dp_index)
+                < self.max_total_tokens
+            )
 
-        ok_req_num = len(self.cache_len_list) <= self.running_max_req_size
-        new_batch_first_router_need_tokens += req.get_first_router_need_tokens()
-        ok_prefill = new_batch_first_router_need_tokens <= self.batch_max_tokens
+        ok_req_num = len(self.cache_len_list) <= self.running_max_req_size
 
-        if ok_token_num and ok_req_num and ok_prefill:
-            self.router.shared_token_load.set_estimated_peak_token_count(need_max_token_num, self.dp_index)
-            self.router.shared_token_load.set_dynamic_max_load(
-                (need_max_token_num + self.router.shared_token_load.get_frozened_token_count(self.dp_index))
-                / self.max_total_tokens,
-                self.dp_index,
-            )
-            return True, new_batch_first_router_need_tokens
-        else:
-            return False, new_batch_first_router_need_tokens
+        new_batch_first_router_need_tokens += req.get_first_router_need_tokens()
+        ok_prefill = new_batch_first_router_need_tokens <= self.batch_max_tokens
+
+        if ok_token_num and ok_req_num and ok_prefill:
+            self.router.shared_token_load.set_estimated_peak_token_count(need_max_token_num, self.dp_index)
+            self.router.shared_token_load.set_dynamic_max_load(
+                (need_max_token_num + self.router.shared_token_load.get_frozened_token_count(self.dp_index))
+                / self.max_total_tokens,
+                self.dp_index,
+            )
+            return True, new_batch_first_router_need_tokens
+        else:
+            return False, new_batch_first_router_need_tokens
 
     # @calculate_time(show=True, min_cost_ms=10)
     def generate_new_batch(self, current_batch: Batch, limit_router_queue_length: int = None):
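
The admission check in the hunk above estimates the worst-case KV-cache footprint if every running request decodes to completion. After sorting by remaining output length in descending order, the request at 1-based position i is the first of the top-i requests to finish, so while all i are still alive the peak is roughly left_out[i] * i + sum(has_run[:i]); the maximum over i is need_max_token_num. A small worked example with illustrative numbers (not taken from the repo):

```python
import numpy as np

# (tokens already used, remaining output tokens) per request; made-up numbers
cache_len_list = [(300, 50), (100, 200), (700, 10)]
cache_len_list.sort(key=lambda x: -x[1])  # sort by remaining length, descending
# -> [(100, 200), (300, 50), (700, 10)]

left_out = np.array([e[1] for e in cache_len_list])  # [200,  50,  10]
has_run = np.array([e[0] for e in cache_len_list])   # [100, 300, 700]
cum_run = np.cumsum(has_run)                         # [100, 400, 1100]
size = np.arange(1, len(cache_len_list) + 1)         # [1, 2, 3]

# Peak tokens at the moment the i-th longest-remaining request finishes,
# while the i-1 longer ones are still generating alongside it.
peaks = left_out * size + cum_run                    # [300, 500, 1130]
need_max_token_num = peaks.max()                     # 1130
print(need_max_token_num)
```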
@@ -114,8 +116,10 @@ def _calcu_batch_token_load_batch_not_none(self, current_batch: Batch):
             need_max_token_num = (left_out_len_array * size_array + cum_run_len_array).max()
         else:
             need_max_token_num = 0
-        return (
-            need_max_token_num,
-            (need_max_token_num + self.router.shared_token_load.get_frozened_token_count(self.dp_index))
-            / self.max_total_tokens,
-        )
+
+        with g_router_lock.obj:
+            return (
+                need_max_token_num,
+                (need_max_token_num + self.router.shared_token_load.get_frozened_token_count(self.dp_index))
+                / self.max_total_tokens,
+            )
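
Both changed methods now read get_frozened_token_count under g_router_lock.obj, so the frozen-token count cannot change between the read and the decision that depends on it. A generic sketch of the same read-under-lock pattern, assuming nothing about lightllm's internals (multiprocessing primitives chosen purely for illustration; the real lock lives in lightllm.common.basemodel.infer_lock):

```python
import multiprocessing as mp

frozen_tokens = mp.Value("q", 0)  # shared counter; stand-in for shared_token_load
lock = mp.Lock()                  # stand-in for g_router_lock.obj

def can_admit(need_max_token_num: int, max_total_tokens: int) -> bool:
    # Hold the lock across the read so another process cannot freeze more
    # tokens between our read and our admission decision.
    with lock:
        return need_max_token_num + frozen_tokens.value < max_total_tokens

print(can_admit(1130, 16384))  # True with the example numbers above
```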

lightllm/server/router/req_queue/continues_batch/impl_for_pd_decode.py renamed to lightllm/server/router/req_queue/chunked_prefill/impl_for_pd_decode.py

File renamed without changes.

lightllm/server/router/req_queue/chunked_prefill/impl_for_pd_prefill.py

Lines changed: 0 additions & 126 deletions
This file was deleted.

lightllm/server/router/req_queue/continues_batch/__init__.py

Whitespace-only changes.

lightllm/server/router/req_queue/continues_batch/impl.py

Lines changed: 0 additions & 121 deletions
This file was deleted.
