Skip to content

Commit b75b3c1

Browse files
committed
fix: remove the unused dp_prefill_wait_step option and its DP chunked-prefill wait-step pacing logic
1 parent 08cc488 commit b75b3c1

File tree

4 files changed

+7
-45
lines changed

4 files changed

+7
-45
lines changed

lightllm/server/api_cli.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -241,14 +241,6 @@ def make_argument_parser() -> argparse.ArgumentParser:
241241
help="""aggressive schedule can lead to frequent prefill interruptions during decode.
242242
disabling it allows the router_max_wait_tokens parameter to work more effectively.""",
243243
)
244-
parser.add_argument(
245-
"--dp_prefill_wait_step",
246-
type=int,
247-
default=0,
248-
help="""dp_prefill_wait_step is used to control the pacing of dp chunked prefill mode, aiming to reduce
249-
computational waste during prefill. However, higher values can negatively impact the
250-
first token latency. It is generally recommended to set this value between 0 and 12.""",
251-
)
252244

253245
parser.add_argument(
254246
"--use_dynamic_prompt_cache", action="store_true", help="This argument is deprecated and no longer in use."

lightllm/server/core/objs/req.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -310,7 +310,7 @@ def get_tuple_tokens(self, is_busy, router_max_new_token_len):
310310
# 就是通过模拟加长其输出token长度,来延长其在估计阶段的生命周期。max_waiting_token
311311
# 的计算是保守的,每次chuncked prefill 延迟的最大步数为两种模式之合,因为
312312
# 这个并不会导致预估的token占用量大幅增加,所以可以放心使用。
313-
max_waiting_token = args.router_max_wait_tokens + args.dp_prefill_wait_step
313+
max_waiting_token = args.router_max_wait_tokens
314314
has_out_len = self.shm_cur_output_len
315315
if self.sample_params.ignore_eos:
316316
cur_max_new_token_len = self.sample_params.max_new_tokens

lightllm/server/core/objs/start_args_type.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,6 @@ class StartArgs:
4949
router_token_ratio: float = field(default=0.0)
5050
router_max_new_token_len: int = field(default=1024)
5151
router_max_wait_tokens: int = field(default=1)
52-
dp_prefill_wait_step: int = field(default=0)
5352
disable_aggressive_schedule: bool = field(default=False)
5453
disable_dynamic_prompt_cache: bool = field(default=False)
5554
chunked_prefill_size: int = field(default=8192)

lightllm/server/router/model_infer/mode_backend/dp_backend/control_state.py

Lines changed: 6 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,6 @@ def __init__(self, backend: ModeBackend):
1616
self.left_decode_num = self.decode_max_step
1717

1818
self.step_count = 0
19-
20-
# dp prefill 配平调度的延迟参数。
21-
self.dp_prefill_wait_step = 0
22-
self.dp_prefill_wait_max_step = get_env_start_args().dp_prefill_wait_step
2319
return
2420

2521
def select_run_way(
@@ -71,48 +67,23 @@ def _normal_way(
7167
prefill_reqs: List[InferReq],
7268
decode_reqs: List[InferReq],
7369
):
74-
"""
75-
_normal_way 接口用于控制 DP 模式下进行chuncked prefill时,需要考虑各个DP的真实运行请求数量:
76-
考虑 8 个 dp 的场景,如果每个 dp 执行 prefill 的请求的数量分别为: [1, 1, 0, 0, 0, 0, 0, 0], 则在运行
77-
的过程中,请求数量为0的dp会pad一个fake req来参与计算,但是这会导致这些dp因为一些通信同步的原因,造成大量
78-
算力浪费,实际有效率很低。
79-
解决方法:
80-
在判断是否可以进行 prefill 的时候,需要先考虑所有dp的请求数量是否均衡,浪费率是否在可以接受的范围,如果无法
81-
接受这么高的浪费率,则可以延迟 prefill 的执行时机,直到所有dp的浪费率较低时再进行prefill, 不过延迟执行的极限
82-
等待时间,受到 dp_prefill_wait_step 参数的控制。
83-
"""
84-
use_ratio = np.count_nonzero(dp_prefill_req_nums) / dp_prefill_req_nums.shape[0]
70+
# use_ratio = np.count_nonzero(dp_prefill_req_nums) / dp_prefill_req_nums.shape[0]
8571
max_decode_num = np.max(dp_decode_req_nums)
8672
max_prefill_num = np.max(dp_prefill_req_nums)
8773

8874
if self.left_decode_num > 0 and max_decode_num > 0:
8975
self.left_decode_num -= 1
9076
return RunWay.DECODE
9177

92-
if use_ratio < 0.6:
93-
if max_prefill_num > 0:
94-
self.dp_prefill_wait_step += 1
95-
if self.dp_prefill_wait_step > self.dp_prefill_wait_max_step:
96-
# prefill 一次允许进行几次 decode 操作。
97-
self.left_decode_num = self.decode_max_step
98-
self.dp_prefill_wait_step = max(0, (self.dp_prefill_wait_step - self.decode_max_step))
99-
return RunWay.PREFILL
100-
78+
if max_prefill_num > 0:
79+
# prefill 一次允许进行几次 decode 操作。
80+
self.left_decode_num = self.decode_max_step
81+
return RunWay.PREFILL
82+
else:
10183
if max_decode_num > 0:
10284
return RunWay.DECODE
10385
else:
10486
return RunWay.PASS
105-
else:
106-
if max_prefill_num > 0:
107-
self.dp_prefill_wait_step = 0
108-
# prefill 一次允许进行几次 decode 操作。
109-
self.left_decode_num = self.decode_max_step
110-
return RunWay.PREFILL
111-
else:
112-
if max_decode_num > 0:
113-
return RunWay.DECODE
114-
else:
115-
return RunWay.PASS
11687

11788
def try_recover_paused_reqs(self) -> bool:
11889
return self.step_count % 100 == 0

0 commit comments

Comments (0)