
Commit 965cdae

overlap mtp

1 parent: ac47e1f

File tree

6 files changed: +319 -36 lines

lightllm/common/basemodel/batch_objs.py

Lines changed: 6 additions & 2 deletions
@@ -10,14 +10,17 @@ class ModelInput:
     total_token_num: int
     max_len_in_batch: int
     input_ids: torch.Tensor
-    mem_indexes: torch.Tensor
     b_req_idx: torch.Tensor
     b_mtp_index: torch.Tensor
     b_seq_len: torch.Tensor
+    mem_indexes: torch.Tensor = None
     is_prefill: bool = False
     b_ready_cache_len: torch.Tensor = None
     multimodal_params: list = field(default_factory=list)
 
+    # cpu-side variables
+    mem_indexes_cpu: torch.Tensor = None
+
     # Dedicated variables, used by some special models in special modes to pass
     # extra inputs. They are only used and take effect in those model modes.
 
@@ -28,7 +31,8 @@ class ModelInput:
     def to_cuda(self):
         if self.input_ids is not None:
             self.input_ids = self.input_ids.cuda(non_blocking=True)
-        self.mem_indexes = self.mem_indexes.cuda(non_blocking=True)
+        if self.mem_indexes is None:
+            self.mem_indexes = self.mem_indexes_cpu.cuda(non_blocking=True)
         self.b_req_idx = self.b_req_idx.cuda(non_blocking=True)
         self.b_seq_len = self.b_seq_len.cuda(non_blocking=True)
         self.b_mtp_index = self.b_mtp_index.cuda(non_blocking=True)
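
Note on the change above: mem_indexes can now stay on the host (as mem_indexes_cpu) until to_cuda() runs, presumably so the host-to-device copy is issued asynchronously and can overlap with other work (the commit title is "overlap mtp"). Below is a minimal, self-contained sketch of that pattern; it is illustrative only, and the _TinyInput class plus the pinned-memory staging are assumptions, not code from this commit.

# Illustrative sketch only -- _TinyInput is a made-up stand-in for ModelInput.
import torch
from dataclasses import dataclass


@dataclass
class _TinyInput:
    input_ids: torch.Tensor
    mem_indexes: torch.Tensor = None      # GPU tensor, filled lazily
    mem_indexes_cpu: torch.Tensor = None  # staged on the host (ideally pinned)

    def to_cuda(self):
        self.input_ids = self.input_ids.cuda(non_blocking=True)
        if self.mem_indexes is None:
            # with pinned host memory, non_blocking=True lets this H2D copy
            # overlap with host-side work instead of stalling the CPU
            self.mem_indexes = self.mem_indexes_cpu.cuda(non_blocking=True)


inp = _TinyInput(
    input_ids=torch.arange(8),
    mem_indexes_cpu=torch.arange(8).pin_memory(),
)
inp.to_cuda()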
lightllm/common/basemodel/triton_kernel/mtp_verify.py (new file)

Lines changed: 224 additions & 0 deletions
@@ -0,0 +1,224 @@
+import triton
+import triton.language as tl
+import torch
+
+
+@triton.jit
+def _fwd_kernel_mtp_verify(
+    req_to_next_token_ids,
+    req_to_next_token_ids_stride,
+    new_next_token_ids,
+    mtp_accept_len,
+    b_req_mtp_start_loc,
+    b_req_idx,
+    b_mtp_index,
+    accepted_index,
+    batch_size: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    cur_index = tl.program_id(0)
+    req_start_loc = tl.load(b_req_mtp_start_loc + cur_index)
+    cur_req_idx = tl.load(b_req_idx + req_start_loc)
+    offset = tl.arange(0, BLOCK_SIZE)
+    req_offset = req_start_loc + offset
+    cur_mtp_index = tl.load(b_mtp_index + req_offset, mask=req_offset < batch_size)
+
+    mask = cur_mtp_index == offset
+
+    cur_next_token_id = tl.load(
+        req_to_next_token_ids + cur_req_idx * req_to_next_token_ids_stride + offset + 1, mask=mask, other=-1
+    )
+    cur_new_next_token_id = tl.load(new_next_token_ids + req_offset, mask=mask, other=-2)
+
+    match_mask = cur_next_token_id == cur_new_next_token_id
+
+    first_false = tl.where(~match_mask, offset, BLOCK_SIZE - 1)
+    accept_len = tl.min(first_false)
+    tl.store(mtp_accept_len + cur_index, accept_len)
+    accept_flags = tl.where((offset < accept_len + 1), 1, 0)
+    tl.store(accepted_index + req_offset, accept_flags, mask=mask)
+    return
+
+
+def mtp_verify(
+    req_to_next_token_ids: torch.Tensor,
+    b_req_mtp_start_loc: torch.Tensor,
+    new_next_token_ids: torch.Tensor,
+    b_req_idx: torch.Tensor,
+    b_mtp_index: torch.Tensor,
+):
+    """
+    Verify the draft tokens and compute accept_len for each request.
+    Args:
+        req_to_next_token_ids: (max_req_num, max_mtp_step)
+        b_req_mtp_start_loc: (num_reqs,)
+        new_next_token_ids: (batch_size,)
+        b_req_idx: (batch_size,)
+        b_mtp_index: (batch_size,)
+    Returns:
+        mtp_accept_len: (num_reqs,)
+        accepted_index: (batch_size,)
+        accepted_index, e.g. [1, 0, 1, 1, 0]: 1 means the token is accepted, 0 means it is not.
+    """
+    max_mtp_step = req_to_next_token_ids.shape[1]
+    BLOCK_SIZE = 16
+    assert max_mtp_step <= BLOCK_SIZE, f"max_mtp_step must not exceed {BLOCK_SIZE}"
+    num_reqs = b_req_mtp_start_loc.shape[0]
+    batch_size = b_req_idx.shape[0]
+    mtp_accept_len = torch.empty((num_reqs,), dtype=torch.int32, device=req_to_next_token_ids.device)
+    accepted_index = torch.empty((batch_size,), dtype=torch.int32, device=req_to_next_token_ids.device)
+
+    grid = (num_reqs,)
+    num_warps = 1
+    _fwd_kernel_mtp_verify[grid](
+        req_to_next_token_ids,
+        req_to_next_token_ids.stride(0),
+        new_next_token_ids,
+        mtp_accept_len,
+        b_req_mtp_start_loc,
+        b_req_idx,
+        b_mtp_index,
+        accepted_index,
+        batch_size,
+        BLOCK_SIZE,
+        num_warps=num_warps,
+        num_stages=1,
+    )
+    return mtp_accept_len, accepted_index
+
+
+@triton.jit
+def _fwd_kernel_mtp_scatter_next_token_ids(
+    req_to_next_token_ids,
+    req_to_next_token_ids_stride,
+    all_next_token_ids,
+    all_next_token_ids_stride,
+    mtp_accept_len,
+    b_req_mtp_start_loc,
+    b_req_idx,
+    b_mtp_index,
+    mtp_step: tl.constexpr,
+    batch_size: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+
+    cur_index = tl.program_id(0)
+    req_start_loc = tl.load(b_req_mtp_start_loc + cur_index)
+    accept_len = tl.load(mtp_accept_len + cur_index)
+    cur_req_idx = tl.load(b_req_idx + req_start_loc)
+    offset = tl.arange(0, BLOCK_SIZE)
+    req_offset = req_start_loc + offset
+    cur_mtp_index = tl.load(b_mtp_index + req_offset, mask=req_offset < batch_size)
+
+    mask = cur_mtp_index == offset
+    scatter_next_token_ids = tl.load(
+        all_next_token_ids + (req_start_loc + accept_len) * all_next_token_ids_stride + offset,
+        mask=offset < mtp_step,
+        other=0,
+    )
+    scatter_next_token_ids = tl.where(mask, scatter_next_token_ids, -1)
+    tl.store(
+        req_to_next_token_ids + cur_req_idx * req_to_next_token_ids_stride + offset,
+        scatter_next_token_ids,
+        mask=offset < mtp_step,
+    )
+    return
+
+
+def mtp_scatter_next_token_ids(
+    req_to_next_token_ids: torch.Tensor,
+    b_req_mtp_start_loc: torch.Tensor,
+    all_next_token_ids: torch.Tensor,
+    b_req_idx: torch.Tensor,
+    b_mtp_index: torch.Tensor,
+    mtp_accept_len: torch.Tensor,
+):
+    max_mtp_step = req_to_next_token_ids.shape[1]
+    BLOCK_SIZE = 16
+    assert max_mtp_step <= BLOCK_SIZE, f"max_mtp_step must not exceed {BLOCK_SIZE}"
+    num_reqs = b_req_mtp_start_loc.shape[0]
+    batch_size = b_req_idx.shape[0]
+    mtp_step = all_next_token_ids.shape[1]
+    grid = (num_reqs,)
+    num_warps = 1
+    _fwd_kernel_mtp_scatter_next_token_ids[grid](
+        req_to_next_token_ids,
+        req_to_next_token_ids.stride(0),
+        all_next_token_ids,
+        all_next_token_ids.stride(0),
+        mtp_accept_len,
+        b_req_mtp_start_loc,
+        b_req_idx,
+        b_mtp_index,
+        mtp_step,
+        batch_size,
+        BLOCK_SIZE,
+        num_warps=num_warps,
+        num_stages=1,
+    )
+
+
+@triton.jit
+def _fwd_kernel_gen_b_req_mtp_start_loc(
+    b_mtp_index,
+    b_req_mtp_start_loc,
+    num_reqs: tl.constexpr,
+    batch_size: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    offset = tl.arange(0, BLOCK_SIZE)
+    cur_mtp_index = tl.load(b_mtp_index + offset, mask=offset < batch_size, other=-1)
+    non_zero_mask = tl.where(cur_mtp_index == 0, 1, 0)  # e.g. 1 0 1 0 0
+    output_offset = tl.cumsum(non_zero_mask) - 1
+    tl.store(b_req_mtp_start_loc + output_offset, offset, mask=non_zero_mask == 1)
+    return
+
+
+def gen_b_req_mtp_start_loc(b_mtp_index: torch.Tensor, num_reqs: int):
+    b_req_mtp_start_loc = torch.empty((num_reqs,), dtype=torch.int32, device=b_mtp_index.device)
+    BLOCK_SIZE = triton.next_power_of_2(b_mtp_index.shape[0])
+    batch_size = b_mtp_index.shape[0]
+    grid = (1,)
+    _fwd_kernel_gen_b_req_mtp_start_loc[grid](
+        b_mtp_index=b_mtp_index,
+        b_req_mtp_start_loc=b_req_mtp_start_loc,
+        num_reqs=num_reqs,
+        batch_size=batch_size,
+        BLOCK_SIZE=BLOCK_SIZE,
+        num_warps=8,
+    )
+    return b_req_mtp_start_loc
+
+
+def test_mtp_verify():
+    req_to_next_token_ids = torch.tensor(
+        [[1, 2, -2, -1, -1], [1, 2, 0, -1, -1], [1, 3, 4, 4, 5]], dtype=torch.int32, device="cuda"
+    )
+    b_req_idx = torch.tensor([0, 0, 2, 2, 2], dtype=torch.int32, device="cuda")
+    b_mtp_index = torch.tensor([0, 1, 0, 1, 2], dtype=torch.int32, device="cuda")
+    b_req_mtp_start_loc = torch.tensor([0, 2], dtype=torch.int32, device="cuda")
+    new_next_token_ids = torch.tensor([1, 4, 3, 4, 13], dtype=torch.int32, device="cuda")
+    all_next_token_ids = torch.tensor(
+        [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12], [13, 14, 15]], dtype=torch.int32, device="cuda"
+    )
+    mtp_accept_len, accepted_index = mtp_verify(
+        req_to_next_token_ids, b_req_mtp_start_loc, new_next_token_ids, b_req_idx, b_mtp_index
+    )
+    mtp_scatter_next_token_ids(
+        req_to_next_token_ids, b_req_mtp_start_loc, all_next_token_ids, b_req_idx, b_mtp_index, mtp_accept_len
+    )
+    print(mtp_accept_len)
+    print(req_to_next_token_ids)
+    print(accepted_index)
+
+
+def test_gen_b_req_mtp_start_loc():
+    b_mtp_index = torch.tensor([0, 1, 0, 1, 2], dtype=torch.int32, device="cuda")
+    gt_output = torch.where(b_mtp_index == 0)[0]
+    b_req_mtp_start_loc = gen_b_req_mtp_start_loc(b_mtp_index, 2)
+    print(b_req_mtp_start_loc, gt_output)
+
+
+if __name__ == "__main__":
+    # test_mtp_verify()
+    test_gen_b_req_mtp_start_loc()
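
For readers who would rather not trace the Triton masking, here is a rough pure-PyTorch reference of the semantics the verify kernel appears to implement. It is an illustrative sketch, not part of the commit, and mtp_verify_reference is a made-up name. As I read the kernel, on the test_mtp_verify data above it yields mtp_accept_len = [0, 2] and accepted_index = [1, 0, 1, 1, 1]: request 0's first draft token (2) already differs from the freshly sampled token (1), while request 2's drafts [3, 4] match before the mismatch at the third position.

# Illustrative reference only (not in the commit); CPU torch, no Triton required.
import torch


def mtp_verify_reference(req_to_next_token_ids, b_req_mtp_start_loc, new_next_token_ids, b_req_idx, b_mtp_index):
    num_reqs = b_req_mtp_start_loc.shape[0]
    batch_size = b_req_idx.shape[0]
    mtp_accept_len = torch.zeros(num_reqs, dtype=torch.int32)
    accepted_index = torch.zeros(batch_size, dtype=torch.int32)
    for i in range(num_reqs):
        start = int(b_req_mtp_start_loc[i])
        end = int(b_req_mtp_start_loc[i + 1]) if i + 1 < num_reqs else batch_size
        req = int(b_req_idx[start])
        accept_len = 0
        for j in range(end - start):
            draft = int(req_to_next_token_ids[req, j + 1])  # draft token j of this request
            sampled = int(new_next_token_ids[start + j])    # main model's token at the same position
            if draft != sampled:
                break
            accept_len += 1
        mtp_accept_len[i] = accept_len
        # the matched drafts plus the first non-matching (or bonus) sample count as accepted
        accepted_index[start : min(start + accept_len + 1, end)] = 1
    return mtp_accept_len, accepted_index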

lightllm/server/router/model_infer/infer_batch.py

Lines changed: 4 additions & 6 deletions
@@ -298,11 +298,9 @@ def __init__(
         self.need_out_token_id_statistics = True
         self.out_token_id_count: Dict[int, int] = None
 
-        # mtp_gen_token_ids supports pre-generating extra tokens for a request through
-        # mtp. When the mtp feature is not enabled, this member has no practical use.
-        # When it is enabled, mtp_gen_token_ids holds the extra generated token ids,
-        # which still need to be re-verified in later steps.
-        self.mtp_gen_token_ids: List[int] = []
+        # mtp_step records how many tokens the draft model generates per step for a request.
+        # In normal mode it is 0; in mtp mode it equals the draft model's tokens per step.
+        self.mtp_step: int = get_env_start_args().mtp_step
 
         self._init_all_state()
         if init_prefix_cache:
@@ -417,7 +415,7 @@ def prefill_need_token_num(self, is_chuncked_prefill: bool):
         return input_token_len
 
     def decode_need_token_num(self):
-        return 1 + len(self.mtp_gen_token_ids)
+        return 1 + self.mtp_step
 
 
 class InferReqGroup:
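
As a quick sanity check on the token budgeting above (illustrative arithmetic, not code from the commit): with mtp_step set to 2, each decode step has to reserve kv-cache slots for the main-model token plus the two draft tokens.

# Illustrative only: kv-cache slots needed per request per decode step.
mtp_step = 2                            # draft tokens per step (0 when mtp is disabled)
decode_need_token_num = 1 + mtp_step    # mirrors InferReq.decode_need_token_num()
assert decode_need_token_num == 3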

lightllm/server/router/model_infer/mode_backend/base_backend.py

Lines changed: 35 additions & 2 deletions
@@ -14,7 +14,8 @@
 from lightllm.server.router.token_load import TokenLoad
 from lightllm.common.basemodel.infer_lock import g_infer_state_lock, InferStateLock
 from lightllm.common.basemodel.basemodel import TpPartBaseModel
-from lightllm.common.basemodel.batch_objs import ModelOutput
+from lightllm.common.basemodel.batch_objs import ModelOutput, ModelInput
+from lightllm.common.basemodel.triton_kernel.mtp_verify import mtp_verify
 from lightllm.utils.dist_utils import init_distributed_env
 from lightllm.utils.envs_utils import get_unique_server_name
 from lightllm.server.core.objs import ShmReqManager, StartArgs
@@ -253,7 +254,7 @@ def init_mtp_draft_model(self, main_kvargs: dict):
 
     def _save_next_token_ids_and_logprobs(self, next_token_ids: torch.Tensor, next_token_logprobs: torch.Tensor):
         """
-        This function saves the next token ids and logprobs into pinned memory and returns a sync event,
+        This function saves the next token ids and logprobs into pinned memory,
         so that the post_handle function is guaranteed to read the correct output results.
         """
         next_token_ids_cpu = g_pin_mem_manager.alloc_pin_tensor(
@@ -521,6 +522,38 @@ def _filter_reqs(self, reqs: List[InferReq]):
     def _trans_req_ids_to_req_objs(self, req_ids: List[int]) -> List[InferReq]:
         return [g_infer_context.requests_mapping[req_id] for req_id in req_ids]
 
+    def _verify_mtp_v2(
+        self, new_next_token_ids: torch.Tensor, model_input: ModelInput, b_req_mtp_start_loc: torch.Tensor
+    ):
+        b_mtp_index = model_input.b_mtp_index
+        mtp_accept_len, accepted_index = mtp_verify(
+            req_to_next_token_ids=self.model.req_manager.req_sampling_params_manager.req_to_next_token_ids,
+            b_req_mtp_start_loc=b_req_mtp_start_loc,
+            new_next_token_ids=new_next_token_ids,
+            b_req_idx=model_input.b_req_idx,
+            b_mtp_index=b_mtp_index,
+        )
+        return mtp_accept_len, accepted_index
+
+    def _get_need_free_mem_indexes(
+        self,
+        run_reqs: List[InferReq],
+        accepted_index_cpu: torch.Tensor,
+        mtp_accept_len_cpu: torch.Tensor,
+        mem_indexes_cpu: torch.Tensor,
+    ) -> List[int]:
+        need_free_mem_indexes = []
+        start_idx = 0
+        for i in range(mtp_accept_len_cpu.shape[0]):
+            req = run_reqs[start_idx]
+            accept_len = mtp_accept_len_cpu[i]
+            end_idx = start_idx + req.mtp_step + 1
+            need_free_mem_indexes.extend(mem_indexes_cpu[start_idx + accept_len + 1 : end_idx])
+            start_idx = end_idx
+            if self.is_master_in_dp:
+                req.update_mtp_accepted_token_num(accept_token_num=accept_len)
+        return need_free_mem_indexes
+
     # Verify and filter requests in mtp mode: keep the requests that pass verification, free unused kv mem_index entries.
     def _verify_mtp(self, run_reqs: List[InferReq], next_token_ids_cpu: np.ndarray, input_mem_indexes_cpu: np.ndarray):
         verify_ok_reqs = []
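
To make the bookkeeping in _get_need_free_mem_indexes concrete, here is a tiny standalone illustration (an assumption-laden sketch, not commit code): each request owns mtp_step + 1 contiguous entries of mem_indexes_cpu, the accepted prefix keeps accept_len + 1 of them, and the remainder is returned to the kv pool.

# Illustrative only: which mem indexes get freed for two requests with mtp_step = 2.
import torch

mtp_step = 2                               # draft tokens per step
mem_indexes_cpu = torch.arange(6)          # two requests * (mtp_step + 1) slots
mtp_accept_len_cpu = torch.tensor([0, 2])  # as in the mtp_verify test above

need_free = []
start = 0
for accept_len in mtp_accept_len_cpu.tolist():
    end = start + mtp_step + 1
    need_free.extend(mem_indexes_cpu[start + accept_len + 1 : end].tolist())
    start = end
print(need_free)  # [1, 2] -> request 0 keeps 1 slot, request 1 keeps all 3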
