
Commit 112a28e

merge main branch and fix conflict

2 parents: fe6b167 + 74ed914

129 files changed: +13331 additions, -3283 deletions


.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion

@@ -11,4 +11,4 @@ repos:
     hooks:
       - id: flake8
         additional_dependencies: [flake8-typing-imports==1.9.0]
-        args: ['--config=.flake8', '--max-line-length=120', '--ignore=TYP001, E722, C901, E203, E266, E402, E302, E241, E902, E731, F403, E701, F405, F401, W292, W293, W503, W606']
+        args: ['--config=.flake8', '--max-line-length=120', '--ignore=TYP001, E722, C901, E203, E266, E402, E302, E241, E902, E731, F403, E701, F405, F401, W292, W293, W503, W606, E231']

README.md

Lines changed: 1 addition & 1 deletion

@@ -21,7 +21,7 @@ LightLLM is a Python-based LLM (Large Language Model) inference and serving fram
 [English Docs](https://lightllm-en.readthedocs.io/en/latest/) | [中文文档](https://lightllm-cn.readthedocs.io/en/latest/) | [Blogs](https://modeltc.github.io/lightllm-blog/)

 ## News
-- [2025/05] LightLLM paper on constrained decoding accepted by [ACL25](https://openreview.net/pdf?id=g1aBeiyZEi) (Pre $^3$: Enabling Deterministic Pushdown Automata for Faster Structured LLM Generation)
+- [2025/05] LightLLM paper on constrained decoding accepted by [ACL25](https://arxiv.org/pdf/2506.03887) (Pre $^3$: Enabling Deterministic Pushdown Automata for Faster Structured LLM Generation). For a more accessible overview of the research with key insights and examples, check out our blog post: [LightLLM Blog](https://www.light-ai.top/lightllm-blog/2025/06/15/pre3.html)
 - [2025/04] LightLLM paper on request scheduler published in [ASPLOS’25](https://dl.acm.org/doi/10.1145/3676641.3716011) (Past-Future Scheduler for LLM Serving under SLA Guarantees)
 - [2025/02] 🔥 LightLLM v1.0.0 release, achieving the **fastest DeepSeek-R1** serving performance on single H200 machine.

docs/CN/source/tutorial/api_server_args_zh.rst

Lines changed: 0 additions & 4 deletions

@@ -274,10 +274,6 @@ Attention type selection parameters

    Cache server capacity for multimodal resources, default is ``200``

-.. option:: --cache_reserved_ratio
-
-   Reserved capacity ratio after cache server cleanup, default is ``0.5``
-
 .. option:: --visual_infer_batch_size

    Number of images processed in each inference batch, default is ``1``

docs/EN/source/tutorial/api_server_args_zh.rst

Lines changed: 0 additions & 4 deletions

@@ -273,10 +273,6 @@ Multimodal Parameters

    Cache server capacity for multimodal resources, default is ``200``

-.. option:: --cache_reserved_ratio
-
-   Reserved capacity ratio after cache server cleanup, default is ``0.5``
-
 .. option:: --visual_infer_batch_size

    Number of images processed in each inference batch, default is ``1``

lightllm/common/basemodel/basemodel.py

Lines changed: 36 additions & 1 deletion

@@ -18,12 +18,14 @@
 from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
 from lightllm.common.basemodel.cuda_graph import CudaGraph
 from lightllm.common.quantization import Quantcfg
+from lightllm.common.basemodel.triton_kernel.gather_token_id import gather_token
 from lightllm.utils.log_utils import init_logger
 from lightllm.utils.dist_utils import get_dp_world_size, get_global_world_size, get_global_rank
 from lightllm.utils.envs_utils import get_env_start_args
-from lightllm.distributed.communication_op import CustomProcessGroup, dist_group_manager
+from lightllm.distributed.communication_op import dist_group_manager
 from lightllm.common.basemodel.batch_objs import ModelInput, ModelOutput
 from lightllm.utils.custom_kernel_utis import pad2dim_tensor_to_new_batch
+from lightllm.utils.envs_utils import set_model_init_status


 logger = init_logger(__name__)

@@ -104,6 +106,7 @@ def __init__(self, kvargs):
         self._init_cudagraph()
         self._check_max_len_infer()
         torch.cuda.empty_cache()
+        set_model_init_status(True)
         return

     def _init_config(self):

@@ -236,6 +239,7 @@ def _init_custom(self):

     @torch.no_grad()
     def forward(self, model_input: ModelInput):
+        model_input.to_cuda()
         assert model_input.mem_indexes.is_cuda

         if model_input.is_prefill:

@@ -345,6 +349,14 @@ def _decode(
         self,
         model_input: ModelInput,
     ) -> ModelOutput:
+        # for overlap mode
+        if model_input.input_ids is None:
+            model_input.input_ids = gather_token(
+                self.req_manager.req_sampling_params_manager.req_to_next_token_ids,
+                model_input.b_req_idx,
+                model_input.b_mtp_index,
+            )
+
         # collect global max batch_size
         world_size = get_global_world_size()
         rank = get_global_rank()

@@ -466,6 +478,9 @@ def _token_forward(self, input_ids, infer_state: InferStateInfo):

     @torch.no_grad()
     def microbatch_overlap_prefill(self, model_input0: ModelInput, model_input1: ModelInput):
+        model_input0.to_cuda()
+        model_input1.to_cuda()
+
         assert model_input0.mem_indexes.is_cuda
         assert model_input1.mem_indexes.is_cuda
         input_ids0, input_ids1 = model_input0.input_ids, model_input1.input_ids

@@ -503,6 +518,22 @@ def microbatch_overlap_prefill(self, model_input0: ModelInput, model_input1: Mod

     @torch.no_grad()
     def microbatch_overlap_decode(self, model_input0: ModelInput, model_input1: ModelInput):
+        model_input0.to_cuda()
+        model_input1.to_cuda()
+
+        if model_input0.input_ids is None:
+            model_input0.input_ids = gather_token(
+                self.req_manager.req_sampling_params_manager.req_to_next_token_ids,
+                model_input0.b_req_idx,
+                model_input0.b_mtp_index,
+            )
+        if model_input1.input_ids is None:
+            model_input1.input_ids = gather_token(
+                self.req_manager.req_sampling_params_manager.req_to_next_token_ids,
+                model_input1.b_req_idx,
+                model_input1.b_mtp_index,
+            )
+
         assert model_input0.batch_size == model_input1.batch_size
         assert model_input0.mem_indexes.is_cuda
         assert model_input1.mem_indexes.is_cuda

@@ -686,6 +717,7 @@ def _check_max_len_infer(self):
         b_seq_len[:] = self.batch_max_tokens
         b_ready_cache_len = torch.zeros(1, dtype=torch.int32, device="cuda")
         total_token_num = self.batch_max_tokens
+        b_mtp_index = torch.zeros(1, dtype=torch.int32, device="cuda")
         model_input = ModelInput(
             batch_size=1,
             total_token_num=total_token_num,

@@ -694,6 +726,7 @@ def _check_max_len_infer(self):
             mem_indexes=mem_indexes,
             b_req_idx=b_req_idx,
             b_seq_len=b_seq_len,
+            b_mtp_index=b_mtp_index,
             is_prefill=True,
             b_ready_cache_len=b_ready_cache_len,
         )

@@ -741,13 +774,15 @@ def _init_padded_req(self):
         b_seq_len = torch.ones(batch_size, dtype=torch.int32, device="cuda")
         b_ready_cache_len = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
         total_token_num = prefill_input_len * batch_size
+        b_mtp_index = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
         model_input = ModelInput(
             batch_size=batch_size,
             total_token_num=total_token_num,
             max_len_in_batch=prefill_input_len,
             input_ids=dummy_input_ids,
             mem_indexes=mem_indexes,
             b_req_idx=b_req_idx,
+            b_mtp_index=b_mtp_index,
             b_seq_len=b_seq_len,
             b_ready_cache_len=b_ready_cache_len,
             is_prefill=True,
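
Note on the overlap-mode change above: in _decode and microbatch_overlap_decode, a ModelInput may now arrive with input_ids set to None, in which case gather_token reads each request's next token id straight from the GPU-resident sampling table instead of round-tripping through the host. Below is a minimal pure-PyTorch sketch of the indexing that kernel is assumed to perform; the table layout (one row per request slot, one column per MTP position) is an assumption for illustration, not the actual Triton implementation.

import torch

def gather_token_sketch(req_to_next_token_ids: torch.Tensor,
                        b_req_idx: torch.Tensor,
                        b_mtp_index: torch.Tensor) -> torch.Tensor:
    # Assumed layout: row = request slot, column = MTP draft position.
    # Returns the next input token id for every request in the batch.
    return req_to_next_token_ids[b_req_idx.long(), b_mtp_index.long()]

# Hypothetical usage mirroring the _decode path:
# if model_input.input_ids is None:
#     model_input.input_ids = gather_token_sketch(
#         req_to_next_token_ids, model_input.b_req_idx, model_input.b_mtp_index
#     )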

lightllm/common/basemodel/batch_objs.py

Lines changed: 20 additions & 1 deletion

@@ -1,6 +1,7 @@
 import torch
 from dataclasses import dataclass, field
 from typing import Optional
+from typing import List


 @dataclass

@@ -10,20 +11,38 @@ class ModelInput:
     total_token_num: int
     max_len_in_batch: int
     input_ids: torch.Tensor
-    mem_indexes: torch.Tensor
     b_req_idx: torch.Tensor
+    b_mtp_index: torch.Tensor
     b_seq_len: torch.Tensor
+    mem_indexes: torch.Tensor = None
     is_prefill: bool = False
     b_ready_cache_len: torch.Tensor = None
     multimodal_params: list = field(default_factory=list)

+    # CPU-side variables
+    mem_indexes_cpu: torch.Tensor = None
+    # Parameters used during the prefill stage; they are not inputs to inference
+    # itself, but variables for resource management outside of inference.
+    b_prefill_has_output_cpu: List[bool] = None  # marks whether each prefill request has output
+
     # Dedicated variables used by some special models, in special modes, to pass
     # special input variables. They are only used and take effect in those model modes.

     # deepseekv3_mtp_draft_input_hiddens is the input to the draft model in the
     # mtp mode of the deepseekv3 model
     deepseekv3_mtp_draft_input_hiddens: Optional[torch.Tensor] = None

+    def to_cuda(self):
+        if self.input_ids is not None:
+            self.input_ids = self.input_ids.cuda(non_blocking=True)
+        if self.mem_indexes is None:
+            self.mem_indexes = self.mem_indexes_cpu.cuda(non_blocking=True)
+        self.b_req_idx = self.b_req_idx.cuda(non_blocking=True)
+        self.b_seq_len = self.b_seq_len.cuda(non_blocking=True)
+        self.b_mtp_index = self.b_mtp_index.cuda(non_blocking=True)
+        if self.b_ready_cache_len is not None:
+            self.b_ready_cache_len = self.b_ready_cache_len.cuda(non_blocking=True)
+

 @dataclass
 class ModelOutput:
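
To illustrate the new CPU/GPU split in ModelInput: tensors can now be built on the host (with mem_indexes left unset and mem_indexes_cpu carrying the slot indexes), and to_cuda() promotes everything to the GPU with non-blocking copies right before the forward pass. A hedged usage sketch with illustrative shapes and values (not taken from the repository):

import torch
from lightllm.common.basemodel.batch_objs import ModelInput

batch_size = 4
model_input = ModelInput(
    batch_size=batch_size,
    total_token_num=batch_size,
    max_len_in_batch=1,
    input_ids=torch.randint(0, 32000, (batch_size,), dtype=torch.int64),  # host tensor
    b_req_idx=torch.arange(batch_size, dtype=torch.int32),
    b_mtp_index=torch.zeros(batch_size, dtype=torch.int32),
    b_seq_len=torch.ones(batch_size, dtype=torch.int32),
    mem_indexes_cpu=torch.arange(batch_size, dtype=torch.int64),  # promoted to mem_indexes by to_cuda()
    is_prefill=False,
)
model_input.to_cuda()  # non-blocking host-to-device copies
assert model_input.mem_indexes.is_cuda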

lightllm/common/basemodel/cuda_graph.py

Lines changed: 4 additions & 0 deletions

@@ -221,6 +221,7 @@ def warmup(self, model):
             )
             b_seq_len = torch.empty(batch_size, dtype=torch.int32, device="cuda")
             b_seq_len.fill_(seq_len)
+            b_mtp_index = torch.zeros(batch_size, dtype=torch.int32, device="cuda")

             model_input = ModelInput(
                 batch_size=batch_size,

@@ -230,6 +231,7 @@ def warmup(self, model):
                 mem_indexes=mem_indexes,
                 b_req_idx=b_req_idx,
                 b_seq_len=b_seq_len,
+                b_mtp_index=b_mtp_index,
                 is_prefill=False,
                 **model._gen_special_model_input(batch_size),
             )

@@ -275,13 +277,15 @@ def warmup_overlap(self, model):
             )
             b_seq_len = torch.empty(batch_size, dtype=torch.int32, device="cuda")
             b_seq_len.fill_(seq_len)
+            b_mtp_index = torch.zeros(batch_size, dtype=torch.int32, device="cuda")

             micro_batch = ModelInput(
                 is_prefill=False,
                 batch_size=batch_size,
                 total_token_num=total_token_num,
                 max_len_in_batch=max_len_in_batch,
                 input_ids=input_ids,
+                b_mtp_index=b_mtp_index,
                 mem_indexes=mem_indexes,
                 b_req_idx=b_req_idx,
                 b_seq_len=b_seq_len,
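
The warmup changes above mean every ModelInput construction site now supplies a b_mtp_index tensor: one int32 entry per request, zero-filled here because no MTP draft position is in play during plain decode warmup. A minimal sketch of the placeholder (the meaning of non-zero indexes is an assumption based on the deepseekv3 MTP fields elsewhere in this commit):

import torch

batch_size = 8  # illustrative
# One entry per request; zeros serve as the default/placeholder MTP position,
# matching how warmup and _check_max_len_infer build their dummy inputs.
b_mtp_index = torch.zeros(batch_size, dtype=torch.int32, device="cuda")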

lightllm/common/basemodel/infer_struct.py

Lines changed: 27 additions & 3 deletions

@@ -5,6 +5,8 @@
 from typing import Tuple, Any, Optional
 from .triton_kernel.gen_prefill_params import gen_prefill_params
 from .triton_kernel.gen_decode_params import gen_decode_params
+from .triton_kernel.multimodal_emb import mark_multimodal_obj
+from .batch_objs import ModelInput


 class InferStateInfo:

@@ -86,9 +88,10 @@ def init_some_extra_state(self, model, input_ids: torch.Tensor):
                 self.b_kv_seq_len,
                 self.b1_cu_kv_seq_len,
                 self.position_ids,
-                self.max_q_seq_len,
-                self.max_kv_seq_len,
-            ) = gen_decode_params(b_seq_len=self.b_seq_len)
+            ) = gen_decode_params(self.b_seq_len)
+            self.max_q_seq_len = 1
+            # TODO: check the correctness
+            self.max_kv_seq_len = self.max_len_in_batch
             self.b_start_loc = self.b1_cu_kv_seq_len[0:-1]

     def copy_for_cuda_graph(self, new_infer_state: "InferStateInfo"):

@@ -98,3 +101,24 @@ def copy_for_cuda_graph(self, new_infer_state: "InferStateInfo"):
             if attr_ is not None and attr_.data_ptr() != attr_value.data_ptr():
                 attr_.copy_(attr_value, non_blocking=True)
         return
+
+    def mark_multimodal_objs_for_prefill(self, input_ids: torch.Tensor):
+        """
+        Utility that marks which multimodal objects' tokens need to take part in the
+        computation during chunked prefill. Because the input is split into chunks,
+        not every multimodal object's tokens need to participate.
+        """
+        multi_objs = []
+        for _, p in enumerate(self.multimodal_params):
+            for obj in p["images"] + p["audios"]:
+                multi_objs.append(obj)
+
+        if multi_objs:
+            obj_start_ids = torch.tensor([e["token_id"] for e in multi_objs], dtype=torch.int64, device="cuda")
+            obj_token_lens = torch.tensor([e["token_num"] for e in multi_objs], dtype=torch.int64, device="cuda")
+            marks = mark_multimodal_obj(
+                obj_start_token_ids=obj_start_ids, obj_token_lens=obj_token_lens, input_ids=input_ids
+            )
+            marks_array = marks.detach().cpu().numpy()
+            for mark, obj in zip(marks_array, multi_objs):
+                obj["_prefill_"] = mark > 0
+        return
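
For readers unfamiliar with the mark_multimodal_obj Triton kernel used by the new helper, here is a rough pure-PyTorch sketch of what it is assumed to compute: for every multimodal object, report whether any of its placeholder token ids (token_id through token_id + token_num - 1) occur in the current chunk's input_ids. This illustrates the assumed semantics only; it is not the kernel's actual code.

import torch

def mark_multimodal_obj_sketch(obj_start_token_ids: torch.Tensor,
                               obj_token_lens: torch.Tensor,
                               input_ids: torch.Tensor) -> torch.Tensor:
    # marks[i] == 1 iff any token id in [start_i, start_i + len_i) appears in input_ids.
    starts = obj_start_token_ids.view(-1, 1)   # (num_objs, 1)
    ends = starts + obj_token_lens.view(-1, 1)  # (num_objs, 1)
    ids = input_ids.view(1, -1)                 # (1, num_tokens)
    hit = ((ids >= starts) & (ids < ends)).any(dim=1)
    return hit.to(torch.int64)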

lightllm/common/basemodel/layer_infer/cache_tensor_manager.py

Lines changed: 11 additions & 0 deletions

@@ -93,6 +93,10 @@ def __init__(self):
         self.cuda_graph_cur_batch_size = None
         self.is_cuda_graph = False
         self.managed_total_tensor_bytes = 0
+        # Flag added to guard against misuse that could leak GPU memory.
+        # If the caller has not properly invoked cache_env_in and cache_env_out,
+        # a call to alloc_tensor degrades to plain torch.empty allocation.
+        self.cache_env_ok = False

     def cache_env_in(
         self, is_cuda_graph: bool = False, cur_batch_size: int = 0, cuda_graph_max_batch_size: int = 0

@@ -107,6 +111,7 @@ def cache_env_in(
             assert self.inner_cuda_graph_manager.cuda_graph_max_batch_size == cuda_graph_max_batch_size
             self.cuda_graph_cur_batch_size = cur_batch_size
             assert cur_batch_size != 0
+        self.cache_env_ok = True
         return

     def cache_env_out(self):

@@ -115,6 +120,7 @@ def cache_env_out(self):
         self.free_shape_dtype_to_bufs.clear()
         self.calcu_shape_cache.clear()
         self.changed_ptr.clear()
+        self.cache_env_ok = False
         return

     def alloc_tensor(

@@ -129,6 +135,11 @@ def alloc_tensor(
         # convert the shape type
         if isinstance(shape, list):
             shape = torch.Size(shape)
+
+        # fall back when the cache manager is not being used properly
+        if not self.cache_env_ok:
+            return torch.empty(shape, dtype=data_type, device=device, requires_grad=False)
+
         # under cuda graph, allocation is taken over by the cuda graph manager
         if self.is_cuda_graph:
             return self.inner_cuda_graph_manager.alloc_tensor_for_cuda_graph(
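
The cache_env_ok flag makes alloc_tensor safe to call outside the manager's enter/exit protocol: instead of handing out (and potentially leaking) cached buffers, it degrades to a plain torch.empty. A hedged usage sketch of the intended pairing; the keyword names follow what is visible in this diff, and the exact alloc_tensor signature beyond that is an assumption:

import torch
from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager

# Inside a managed forward pass, allocations are served (and reused) by the cache manager.
g_cache_manager.cache_env_in()
buf = g_cache_manager.alloc_tensor([8, 4096], data_type=torch.float16, device="cuda")
g_cache_manager.cache_env_out()

# Outside a cache_env_in/cache_env_out pair, the same call now falls back to torch.empty
# instead of risking a GPU-memory leak through the cache.
stray = g_cache_manager.alloc_tensor([8, 4096], data_type=torch.float16, device="cuda")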
