
Commit 94ca166

Merge branch 'dp_balancer' into fused_moe_improve
2 parents: 6430851 + ea0ada4

File tree

24 files changed: +484 additions, -203 deletions

docs/CN/source/getting_started/installation.rst

Lines changed: 10 additions & 3 deletions
@@ -23,9 +23,16 @@ Lightllm is an inference framework developed in pure Python, with its kernels implemented in Triton
     $ # Pull the official image
     $ docker pull ghcr.io/modeltc/lightllm:main
     $
-    $ # Run
+    $ # Run the service. Note: the current LightLLM service relies heavily on shared memory; before
+    $ # starting, make sure your Docker settings allocate enough shared memory, otherwise the
+    $ # service may fail to start properly.
+    $ # 1. For text-only services, allocate at least 2GB of shared memory; if you have plenty of RAM, 16GB or more is recommended.
+    $ # 2. For multimodal services, allocate at least 16GB of shared memory, adjusted to your actual workload.
+    $ # If you do not have enough shared memory, try lowering the --running_max_req_size parameter at startup; this reduces
+    $ # the number of concurrent requests but also reduces shared-memory usage. For multimodal services, you can also lower
+    $ # the --cache_capacity parameter to reduce shared-memory usage.
     $ docker run -it --gpus all -p 8080:8080 \
-    $         --shm-size 1g -v your_local_path:/data/ \
+    $         --shm-size 2g -v your_local_path:/data/ \
     $ ghcr.io/modeltc/lightllm:main /bin/bash

 You can also manually build the image from source and run it:
@@ -37,7 +44,7 @@ Lightllm is an inference framework developed in pure Python, with its kernels implemented in Triton
     $
     $ # Run
     $ docker run -it --gpus all -p 8080:8080 \
-    $         --shm-size 1g -v your_local_path:/data/ \
+    $         --shm-size 2g -v your_local_path:/data/ \
     $ <image_name> /bin/bash

 Or you can directly use the script to launch the image and run it with one click:

docs/CN/source/tutorial/api_server_args_zh.rst

Lines changed: 4 additions & 0 deletions
@@ -236,6 +236,10 @@ Attention type selection parameters

 Multi-result output mode

+.. option:: --schedule_time_interval
+
+    Scheduling time interval, default is ``0.03``, in seconds
+

 Output constraint parameters
 -----------

docs/EN/source/getting_started/installation.rst

Lines changed: 15 additions & 4 deletions
@@ -23,9 +23,20 @@ The easiest way to install Lightllm is using the official image. You can directl
     $ # Pull the official image
     $ docker pull ghcr.io/modeltc/lightllm:main
     $
-    $ # Run
+    $ # Run. The current LightLLM service relies heavily on shared memory.
+    $ # Before starting, please make sure that you have allocated enough shared memory
+    $ # in your Docker settings; otherwise, the service may fail to start properly.
+    $ #
+    $ # 1. For text-only services, it is recommended to allocate more than 2GB of shared memory.
+    $ #    If your system has sufficient RAM, allocating 16GB or more is recommended.
+    $ # 2. For multimodal services, it is recommended to allocate 16GB or more of shared memory.
+    $ #    You can adjust this value according to your specific requirements.
+    $ #
+    $ # If you do not have enough shared memory available, you can try lowering
+    $ # the --running_max_req_size parameter when starting the service.
+    $ # This will reduce the number of concurrent requests, but also decrease shared memory usage.
     $ docker run -it --gpus all -p 8080:8080 \
-    $         --shm-size 1g -v your_local_path:/data/ \
+    $         --shm-size 2g -v your_local_path:/data/ \
     $ ghcr.io/modeltc/lightllm:main /bin/bash

 You can also manually build the image from source and run it:
@@ -35,9 +46,9 @@ You can also manually build the image from source and run it:
     $ # Manually build the image
     $ docker build -t <image_name> .
     $
-    $ # Run
+    $ # Run,
     $ docker run -it --gpus all -p 8080:8080 \
-    $         --shm-size 1g -v your_local_path:/data/ \
+    $         --shm-size 2g -v your_local_path:/data/ \
     $ <image_name> /bin/bash

 Or you can directly use the script to launch the image and run it with one click:

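As a quick way to act on the shared-memory guidance above, the allocation can be checked from inside the container before launching the server. The snippet below is an illustration only, not part of this commit; it assumes the container exposes shared memory at the usual /dev/shm mount.

import shutil

REQUIRED_GB = 2  # per the docs: 2GB+ for text-only serving, 16GB+ for multimodal serving

# --shm-size controls the size of the tmpfs mounted at /dev/shm inside the container.
total_gb = shutil.disk_usage("/dev/shm").total / 1024 ** 3
if total_gb < REQUIRED_GB:
    raise SystemExit(
        f"/dev/shm is only {total_gb:.1f} GB; restart the container with a larger --shm-size"
    )
print(f"/dev/shm size: {total_gb:.1f} GB, OK")
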
docs/EN/source/tutorial/api_server_args_zh.rst

Lines changed: 4 additions & 0 deletions
@@ -236,6 +236,10 @@ Scheduling Parameters

 Multi-result output mode

+.. option:: --schedule_time_interval
+
+    Schedule time interval, default is ``0.03``, unit is seconds
+
 Output Constraint Parameters
 ---------------------------


lightllm/common/basemodel/triton_kernel/gen_sampling_params.py

Lines changed: 27 additions & 10 deletions
@@ -121,37 +121,54 @@ def _token_id_counter_update_kernel(
     counter_stride_m,
     counter_stride_n,
     next_token_ids_ptr,
+    mask_ptr,
     batch_size,
+    HAS_MASK: tl.constexpr,
     BLOCK: tl.constexpr,
 ):

     block_start_index = tl.program_id(0) * BLOCK
     offs = block_start_index + tl.arange(0, BLOCK)
-    mask = offs < batch_size
-
-    req_idx = tl.load(b_req_idx_ptr + offs, mask=mask, other=0)
-    token_ids = tl.load(next_token_ids_ptr + offs, mask=mask, other=0)
-
-    tl.atomic_add(
-        req_to_out_token_id_counter_ptr + req_idx * counter_stride_m + token_ids * counter_stride_n, 1, mask=mask
-    )
+    loc_mask = offs < batch_size
+
+    req_idx = tl.load(b_req_idx_ptr + offs, mask=loc_mask, other=0)
+    token_ids = tl.load(next_token_ids_ptr + offs, mask=loc_mask, other=0)
+
+    if HAS_MASK:
+        mask = tl.load(mask_ptr + offs, mask=loc_mask, other=False)
+        tl.atomic_add(
+            req_to_out_token_id_counter_ptr + req_idx * counter_stride_m + token_ids * counter_stride_n,
+            1,
+            mask=loc_mask & mask,
+        )
+    else:
+        tl.atomic_add(
+            req_to_out_token_id_counter_ptr + req_idx * counter_stride_m + token_ids * counter_stride_n,
+            1,
+            mask=loc_mask,
+        )
     return


 @torch.no_grad()
 def update_req_to_token_id_counter(
-    b_req_idx: torch.Tensor, next_token_ids: torch.Tensor, req_to_out_token_id_counter: torch.Tensor
+    b_req_idx: torch.Tensor,
+    next_token_ids: torch.Tensor,
+    req_to_out_token_id_counter: torch.Tensor,
+    mask: torch.Tensor = None,
 ):
     batch_size = b_req_idx.shape[0]
     BLOCK = 256
-
+    has_mask = mask is not None
     _token_id_counter_update_kernel[(triton.cdiv(batch_size, BLOCK),)](
         b_req_idx_ptr=b_req_idx,
         req_to_out_token_id_counter_ptr=req_to_out_token_id_counter,
         counter_stride_m=req_to_out_token_id_counter.stride(0),
         counter_stride_n=req_to_out_token_id_counter.stride(1),
         next_token_ids_ptr=next_token_ids,
+        mask_ptr=mask,
         batch_size=batch_size,
+        HAS_MASK=has_mask,
         BLOCK=BLOCK,
         num_warps=1,
     )

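For readers who want to see what the new HAS_MASK path computes, the kernel's effect can be mirrored with a small PyTorch reference. This is a sketch for illustration, not code from the commit: the shapes and values are made up, and index_put_ with accumulate=True plays the role of the kernel's tl.atomic_add.

import torch


def counter_update_reference(b_req_idx, next_token_ids, counter, mask=None):
    # counter: [max_req_num, vocab_size]; each row is a per-request output-token histogram.
    keep = torch.ones_like(b_req_idx, dtype=torch.bool) if mask is None else mask.bool()
    rows = b_req_idx[keep].long()
    cols = next_token_ids[keep].long()
    # accumulate=True adds 1 per (row, col) pair, matching the atomic add in the kernel.
    counter.index_put_((rows, cols), torch.ones_like(rows, dtype=counter.dtype), accumulate=True)
    return counter


counter = torch.zeros(4, 16, dtype=torch.int32)
b_req_idx = torch.tensor([0, 2, 3], dtype=torch.int32)
next_token_ids = torch.tensor([5, 7, 9], dtype=torch.int32)
mask = torch.tensor([True, True, False])  # the third entry is skipped, as with HAS_MASK
counter_update_reference(b_req_idx, next_token_ids, counter, mask)
assert counter[0, 5].item() == 1 and counter[2, 7].item() == 1 and counter[3, 9].item() == 0
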
lightllm/common/req_manager.py

Lines changed: 23 additions & 17 deletions
@@ -155,36 +155,42 @@ def init_req_sampling_params(self, req):
         else:
             self.req_to_out_token_id_counter[req.req_idx].fill_(0)
             if req.sampling_param.shm_param.input_penalty and req.need_out_token_id_statistics:
-                prompt_ids = torch.from_numpy(req.shm_req.get_prompt_ids()).pin_memory().cuda(non_blocking=True)
+                prompt_ids = torch.from_numpy(req.shm_req.get_prompt_ids_numpy()).pin_memory().cuda(non_blocking=True)
                 token_id_counter(
                     prompt_ids=prompt_ids, out_token_id_counter=self.req_to_out_token_id_counter[req.req_idx]
                 )

         return

+    def update_reqs_out_token_counter_gpu(
+        self, b_req_idx: torch.Tensor, next_token_ids: torch.Tensor, mask: torch.Tensor = None
+    ):
+        if self.penalty_counter_mode not in ["gpu_counter", "pin_mem_counter"]:
+            return
+
+        assert b_req_idx.is_cuda and next_token_ids.is_cuda and b_req_idx.shape[0] == next_token_ids.shape[0]
+
+        update_req_to_token_id_counter(
+            b_req_idx=b_req_idx,
+            next_token_ids=next_token_ids,
+            req_to_out_token_id_counter=self.req_to_out_token_id_counter,
+            mask=mask,
+        )
+        return
+
     def update_reqs_token_counter(
         self, req_objs: List, next_token_ids: List[int], accept_mark: Optional[List[List[bool]]] = None
    ):
         from lightllm.server.router.model_infer.infer_batch import InferReq

         req_objs: List[InferReq] = req_objs

-        if self.penalty_counter_mode == "cpu_counter":
-            for req_obj, next_token_id in zip(req_objs, next_token_ids):
-                if req_obj.need_out_token_id_statistics and req_obj.cur_output_len > 0:
-                    req_obj.out_token_id_count[next_token_id] += 1
-        else:
-            b_req_idx = torch.tensor(
-                [req.req_idx for req in req_objs], dtype=torch.int32, device="cpu", pin_memory=True
-            ).cuda(non_blocking=True)
-            next_token_ids = (
-                torch.tensor(next_token_ids, dtype=torch.int32, device="cpu").pin_memory().cuda(non_blocking=True)
-            )
-            update_req_to_token_id_counter(
-                b_req_idx=b_req_idx,
-                next_token_ids=next_token_ids,
-                req_to_out_token_id_counter=self.req_to_out_token_id_counter,
-            )
+        if self.penalty_counter_mode != "cpu_counter":
+            return
+
+        for req_obj, next_token_id in zip(req_objs, next_token_ids):
+            if req_obj.need_out_token_id_statistics and req_obj.cur_output_len > 0:
+                req_obj.out_token_id_count[next_token_id] += 1
         return

     def gen_cpu_out_token_counter_sampling_params(self, req_objs: List):

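The refactor splits token counting into two mutually exclusive paths: update_reqs_token_counter now only handles the cpu_counter mode, while the new update_reqs_out_token_counter_gpu consumes CUDA tensors directly in the gpu_counter / pin_mem_counter modes. A hypothetical caller (apply_token_counting is not a function from this commit) might dispatch between them roughly like this:

import torch


def apply_token_counting(req_manager, req_objs, next_token_ids_gpu: torch.Tensor):
    # GPU path: reuse the sampled token ids that already live on the device.
    if req_manager.penalty_counter_mode in ["gpu_counter", "pin_mem_counter"]:
        b_req_idx = torch.tensor(
            [r.req_idx for r in req_objs], dtype=torch.int32, pin_memory=True
        ).cuda(non_blocking=True)
        req_manager.update_reqs_out_token_counter_gpu(b_req_idx, next_token_ids_gpu)
    else:
        # CPU path: fall back to the per-request Python counters.
        req_manager.update_reqs_token_counter(req_objs, next_token_ids_gpu.tolist())
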
lightllm/server/api_cli.py

Lines changed: 13 additions & 1 deletion
@@ -112,7 +112,7 @@ def make_argument_parser() -> argparse.ArgumentParser:
         help="tool call parser type",
     )
     parser.add_argument(
-        "--running_max_req_size", type=int, default=1000, help="the max size for forward requests in the same time"
+        "--running_max_req_size", type=int, default=2048, help="the max size for forward requests in the same time"
     )
     parser.add_argument("--nnodes", type=int, default=1, help="the number of nodes")
     parser.add_argument("--node_rank", type=int, default=0, help="the rank of the current node")
@@ -137,6 +137,12 @@ def make_argument_parser() -> argparse.ArgumentParser:
         using the deepseekv2 model, set dp to be equal to the tp parameter. In other cases, please
         do not set it and keep the default value as 1.""",
     )
+    parser.add_argument(
+        "--dp_balancer",
+        type=str,
+        default="round_robin",
+        help="the dp balancer type, default is round_robin",
+    )
     parser.add_argument(
         "--max_req_total_len", type=int, default=16384, help="the max value for req_input_len + req_output_len"
     )
@@ -476,4 +482,10 @@ def make_argument_parser() -> argparse.ArgumentParser:
         default=None,
         help="""Path of the kv quant calibration config. It can be used for llama and qwen model.""",
     )
+    parser.add_argument(
+        "--schedule_time_interval",
+        type=float,
+        default=0.03,
+        help="""The interval of the schedule time, default is 30ms.""",
+    )
     return parser

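A quick sanity check of the new flags, as an illustrative sketch rather than part of the commit; it assumes make_argument_parser can be invoked without any required arguments, so only the new options are supplied.

from lightllm.server.api_cli import make_argument_parser

args = make_argument_parser().parse_args(
    ["--dp_balancer", "round_robin", "--schedule_time_interval", "0.05"]
)
print(args.dp_balancer)             # "round_robin"
print(args.schedule_time_interval)  # 0.05
print(args.running_max_req_size)    # new default: 2048
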
lightllm/server/core/objs/req.py

Lines changed: 3 additions & 0 deletions
@@ -188,6 +188,9 @@ def link_logprobs_shm_array(self):
     def get_prompt_ids(self):
         return self.shm_prompt_ids.arr[: self.input_len].tolist()

+    def get_prompt_ids_numpy(self):
+        return self.shm_prompt_ids.arr[: self.input_len]
+
     def to_router_rpc_obj(self):
         if hasattr(self, "multimodal_params"):
             return (

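The new accessor exists so callers can hand the shared-memory prompt buffer to torch.from_numpy without first materializing a Python list (see the req_manager.py change above). A rough illustration of the difference, with a plain NumPy array standing in for shm_prompt_ids.arr:

import numpy as np
import torch

# Hypothetical stand-in for the shared-memory prompt buffer.
prompt_arr = np.array([10, 11, 12, 13], dtype=np.int64)

# Old path: get_prompt_ids() returns a Python list, which torch must copy element by element.
t_copy = torch.tensor(prompt_arr.tolist())

# New path: get_prompt_ids_numpy() returns the array slice, so torch.from_numpy shares the
# underlying buffer and only the later pin_memory()/cuda() transfer performs a copy.
t_shared = torch.from_numpy(prompt_arr)
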
lightllm/server/httpserver/manager.py

Lines changed: 46 additions & 42 deletions
@@ -645,50 +645,54 @@ async def handle_loop(self):
             except asyncio.TimeoutError:
                 pass

-            for group_req_id_ in list(self.req_id_to_out_inf.keys()):
-                req_status = self.req_id_to_out_inf.get(group_req_id_, None)
-                if req_status is None:
-                    continue
+            try:
+                for group_req_id_ in list(self.req_id_to_out_inf.keys()):
+                    req_status = self.req_id_to_out_inf.get(group_req_id_, None)
+                    if req_status is None:
+                        continue

-                token_list = []
-                for req in req_status.group_req_objs.shm_req_objs:
-                    req_id = req.request_id
-                    read_token_count = 1
-                    if req.out_tokens_queue.is_full():
-                        read_token_count = LIGHTLLM_OUT_TOKEN_QUEUE_SIZE
-
-                    for _ in range(read_token_count):
-                        if not req.out_tokens_queue.is_empty():
-
-                            text, src_index, special, count_output_tokens = req.out_tokens_queue.peek()
-                            req.cumlogprob += float(req.shm_logprobs.arr[src_index])
-                            metadata = {
-                                "id": int(req.shm_prompt_ids.arr[src_index]),
-                                "logprob": float(req.shm_logprobs.arr[src_index]),
-                                "cumlogprob": float(req.cumlogprob) / count_output_tokens,
-                                "special": special,
-                                "count_output_tokens": count_output_tokens,
-                                "prompt_cache_len": req.prompt_cache_len,
-                                "mtp_accepted_token_num": req.mtp_accepted_token_num,
-                            }
-                            if self.args.return_all_prompt_logprobs:
-                                metadata.update(req.get_all_prompt_metadata())
-                            if self.args.use_reward_model:
-                                metadata["score"] = float(req.reward_score)
-
-                            req.out_tokens_queue.pop_no_ret()
-
-                            if req.finish_token_index != src_index:
-                                token_list.append((req_id, text, metadata, FinishStatus()))
+                    token_list = []
+                    for req in req_status.group_req_objs.shm_req_objs:
+                        req_id = req.request_id
+                        read_token_count = 1
+                        if req.out_tokens_queue.is_full():
+                            read_token_count = LIGHTLLM_OUT_TOKEN_QUEUE_SIZE
+
+                        for _ in range(read_token_count):
+                            if not req.out_tokens_queue.is_empty():
+
+                                text, src_index, special, count_output_tokens = req.out_tokens_queue.peek()
+                                req.cumlogprob += float(req.shm_logprobs.arr[src_index])
+                                metadata = {
+                                    "id": int(req.shm_prompt_ids.arr[src_index]),
+                                    "logprob": float(req.shm_logprobs.arr[src_index]),
+                                    "cumlogprob": float(req.cumlogprob) / count_output_tokens,
+                                    "special": special,
+                                    "count_output_tokens": count_output_tokens,
+                                    "prompt_cache_len": req.prompt_cache_len,
+                                    "mtp_accepted_token_num": req.mtp_accepted_token_num,
+                                }
+                                if self.args.return_all_prompt_logprobs:
+                                    metadata.update(req.get_all_prompt_metadata())
+                                if self.args.use_reward_model:
+                                    metadata["score"] = float(req.reward_score)
+
+                                req.out_tokens_queue.pop_no_ret()
+
+                                if req.finish_token_index != src_index:
+                                    token_list.append((req_id, text, metadata, FinishStatus()))
+                                else:
+                                    finish_status = FinishStatus(req.finish_status.status)
+                                    token_list.append((req_id, text, metadata, finish_status))
                             else:
-                                finish_status = FinishStatus(req.finish_status.status)
-                                token_list.append((req_id, text, metadata, finish_status))
-                        else:
-                            break
-
-                async with req_status.lock:
-                    req_status.out_token_info_list.extend(token_list)
-                    req_status.event.set()
+                                break
+
+                    async with req_status.lock:
+                        req_status.out_token_info_list.extend(token_list)
+                        req_status.event.set()
+            except BaseException as e:
+                logger.exception(str(e))
+                raise e

             self.recycle_event.set()
             return

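The only behavioral change here is that the token-draining loop is now wrapped in a try/except that logs the full traceback and re-raises, so a failure in one iteration no longer kills the coroutine silently. A minimal standalone sketch of that pattern (illustrative names only, not code from LightLLM):

import asyncio
import logging

logger = logging.getLogger("drain_loop_sketch")


async def drain_loop(queues):
    while queues:
        await asyncio.sleep(0.02)  # stand-in for waiting on an event with a timeout
        try:
            for name in list(queues.keys()):
                while queues[name]:
                    print(name, queues[name].pop(0))
                del queues[name]
        except BaseException as e:
            # Log with traceback, then propagate so the failure is not swallowed.
            logger.exception(str(e))
            raise e


asyncio.run(drain_loop({"req_0": ["Hello", ", world"]}))
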
lightllm/server/router/batch.py

Lines changed: 9 additions & 0 deletions
@@ -40,6 +40,15 @@ def get_req_list_for_dp(self, dp_index: int):
                 req_list.append(req)
         return req_list

+    def get_all_dp_req_num(self) -> List[int]:
+        if self.dp_size_in_node == 1:
+            return [len(self.reqs)]
+
+        all_dp_req_num = [0 for _ in range(self.dp_size_in_node)]
+        for req in self.reqs:
+            all_dp_req_num[req.sample_params.suggested_dp_index] += 1
+        return all_dp_req_num
+
     def filter_out_finished_req(self, shm_req_manager: ShmReqManager):
         unfinished_req_ids = []
         for req in self.reqs:

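Finally, get_all_dp_req_num gives the router a per-rank view of how many requests each data-parallel rank is currently holding. The commit's actual dp_balancer implementation is not part of this diff view; purely as a hypothetical illustration, such counts could feed a least-loaded choice with round-robin tie-breaking (pick_dp_index is an invented name):

from typing import List


def pick_dp_index(all_dp_req_num: List[int], rr_cursor: int) -> int:
    # Prefer the least-loaded dp rank; rotate among ties so rank 0 is not always chosen.
    min_load = min(all_dp_req_num)
    candidates = [i for i, n in enumerate(all_dp_req_num) if n == min_load]
    return candidates[rr_cursor % len(candidates)]


print(pick_dp_index([3, 1, 1, 4], rr_cursor=0))  # -> 1
print(pick_dp_index([3, 1, 1, 4], rr_cursor=1))  # -> 2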