
Commit ef35cf6

add mtp index

1 parent a7fbb15

6 files changed: +61 −23 lines changed

lightllm/common/basemodel/basemodel.py

Lines changed: 5 additions & 0 deletions

@@ -353,6 +353,7 @@ def _decode(
         model_input.input_ids = gather_token(
             self.req_manager.req_sampling_params_manager.req_to_next_token_ids,
             model_input.b_req_idx,
+            model_input.b_mtp_index,
         )

         if self.graph is not None and self.graph.can_run(model_input.batch_size, model_input.max_len_in_batch):
@@ -668,6 +669,7 @@ def _check_max_len_infer(self):
         b_seq_len[:] = self.batch_max_tokens
         b_ready_cache_len = torch.zeros(1, dtype=torch.int32, device="cuda")
         total_token_num = self.batch_max_tokens
+        b_mtp_index = torch.zeros(1, dtype=torch.int32, device="cuda")
         model_input = ModelInput(
             batch_size=1,
             total_token_num=total_token_num,
@@ -676,6 +678,7 @@ def _check_max_len_infer(self):
             mem_indexes=mem_indexes,
             b_req_idx=b_req_idx,
             b_seq_len=b_seq_len,
+            b_mtp_index=b_mtp_index,
             is_prefill=True,
             b_ready_cache_len=b_ready_cache_len,
         )
@@ -723,13 +726,15 @@ def _init_padded_req(self):
         b_seq_len = torch.ones(batch_size, dtype=torch.int32, device="cuda")
         b_ready_cache_len = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
         total_token_num = prefill_input_len * batch_size
+        b_mtp_index = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
         model_input = ModelInput(
             batch_size=batch_size,
             total_token_num=total_token_num,
             max_len_in_batch=prefill_input_len,
             input_ids=dummy_input_ids,
             mem_indexes=mem_indexes,
             b_req_idx=b_req_idx,
+            b_mtp_index=b_mtp_index,
             b_seq_len=b_seq_len,
             b_ready_cache_len=b_ready_cache_len,
             is_prefill=True,
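
Note: with this change the decode path stops treating req_to_next_token_ids as a flat per-request slot and instead reads the id stored at row b_req_idx[i], column b_mtp_index[i]. A minimal pure-PyTorch sketch of that indexing, as a reference for the Triton kernel further down (illustrative only, not code from this commit):

import torch

def gather_token_reference(req_to_next_token_ids: torch.Tensor,
                           b_req_idx: torch.Tensor,
                           b_mtp_index: torch.Tensor) -> torch.Tensor:
    # req_to_next_token_ids: (max_req_num, max_mtp_step)
    # b_req_idx, b_mtp_index: (batch_size,)
    return req_to_next_token_ids[b_req_idx.long(), b_mtp_index.long()]

# e.g. a 4-request table with 2 MTP slots per request
table = torch.tensor([[11, 12], [21, 22], [31, 32], [41, 42]])
print(gather_token_reference(table, torch.tensor([1, 1, 3]), torch.tensor([0, 1, 0])))
# tensor([21, 22, 41])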

lightllm/common/basemodel/batch_objs.py

Lines changed: 2 additions & 0 deletions

@@ -12,6 +12,7 @@ class ModelInput:
     input_ids: torch.Tensor
     mem_indexes: torch.Tensor
     b_req_idx: torch.Tensor
+    b_mtp_index: torch.Tensor
     b_seq_len: torch.Tensor
     is_prefill: bool = False
     b_ready_cache_len: torch.Tensor = None
@@ -30,6 +31,7 @@ def to_cuda(self):
         self.mem_indexes = self.mem_indexes.cuda(non_blocking=True)
         self.b_req_idx = self.b_req_idx.cuda(non_blocking=True)
         self.b_seq_len = self.b_seq_len.cuda(non_blocking=True)
+        self.b_mtp_index = self.b_mtp_index.cuda(non_blocking=True)
         if self.b_ready_cache_len is not None:
             self.b_ready_cache_len = self.b_ready_cache_len.cuda(non_blocking=True)
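
Note: b_mtp_index is declared without a default, so every site that builds a ModelInput now has to pass it, and to_cuda() moves it to the GPU together with the other per-request tensors. A hedged construction sketch, assuming ModelInput is imported from this module and using only keyword arguments that appear in this commit (the tensor dtypes and sizes here are illustrative):

import torch
from lightllm.common.basemodel.batch_objs import ModelInput

batch_size = 4
model_input = ModelInput(
    batch_size=batch_size,
    total_token_num=batch_size * 32,
    max_len_in_batch=32,
    input_ids=torch.zeros(batch_size, dtype=torch.int64),
    mem_indexes=torch.zeros(batch_size, dtype=torch.int64),
    b_req_idx=torch.arange(batch_size, dtype=torch.int32),
    b_mtp_index=torch.zeros(batch_size, dtype=torch.int32),  # 0 = the main decode slot
    b_seq_len=torch.full((batch_size,), 32, dtype=torch.int32),
    is_prefill=False,
)
model_input.to_cuda()  # b_mtp_index is moved alongside b_req_idx and b_seq_len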

lightllm/common/basemodel/cuda_graph.py

Lines changed: 2 additions & 0 deletions

@@ -202,6 +202,7 @@ def warmup(self, model):
             )
             b_seq_len = torch.empty(batch_size, dtype=torch.int32, device="cuda")
             b_seq_len.fill_(seq_len)
+            b_mtp_index = torch.zeros(batch_size, dtype=torch.int32, device="cuda")

             model_input = ModelInput(
                 batch_size=batch_size,
@@ -211,6 +212,7 @@ def warmup(self, model):
                 mem_indexes=mem_indexes,
                 b_req_idx=b_req_idx,
                 b_seq_len=b_seq_len,
+                b_mtp_index=b_mtp_index,
                 is_prefill=False,
                 **model._gen_special_model_input(batch_size),
             )
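
Note: the warmup change matters because a captured CUDA graph replays fixed kernel launches that read their inputs by address, so b_mtp_index has to exist as a stable buffer at capture time and be refilled in place before each replay. A generic torch.cuda.CUDAGraph sketch of that pattern (illustrative only, not lightllm's graph wrapper):

import torch

table = torch.arange(16, dtype=torch.int64, device="cuda").reshape(8, 2)
rows = torch.arange(8, device="cuda")
static_mtp_index = torch.zeros(8, dtype=torch.int64, device="cuda")  # placeholder input
static_out = torch.empty(8, dtype=torch.int64, device="cuda")

# eager warmup on a side stream, as the torch CUDA-graphs docs recommend
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    static_out.copy_(table[rows, static_mtp_index])
torch.cuda.current_stream().wait_stream(s)

g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    static_out.copy_(table[rows, static_mtp_index])  # captured work reads the buffer by address

static_mtp_index.fill_(1)  # update the placeholder in place for the next batch ...
g.replay()                 # ... and replay; static_out now holds column 1 of each row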

lightllm/common/basemodel/triton_kernel/gather_token_id.py

Lines changed: 41 additions & 22 deletions

@@ -72,37 +72,48 @@ def gather_and_scatter_token_to_cpu(

 @triton.jit
 def _fwd_kernel_scatter(
-    token_info,
-    req_to_token_info,
+    next_token_ids,
+    req_to_next_token_ids,
     b_req_idx,
-    req_to_token_info_stride,
+    b_mtp_index,
+    req_to_next_token_ids_stride,
+    req_to_next_token_ids_stride_1,
 ):
     cur_index = tl.program_id(0)
     cur_req_idx = tl.load(b_req_idx + cur_index)
-    cur_token_info = tl.load(token_info + cur_index)
-    tl.store(req_to_token_info + cur_req_idx * req_to_token_info_stride, cur_token_info)
+    cur_mtp_index = tl.load(b_mtp_index + cur_index)
+    cur_next_token_id = tl.load(next_token_ids + cur_index)
+    tl.store(req_to_next_token_ids + cur_req_idx * req_to_next_token_ids_stride + cur_mtp_index, cur_next_token_id)
     return


 @torch.no_grad()
-def scatter_token(token_info: torch.Tensor, req_to_token_info: torch.Tensor, b_req_idx: torch.Tensor):
+def scatter_token(
+    next_token_ids: torch.Tensor,
+    req_to_next_token_ids: torch.Tensor,
+    b_req_idx: torch.Tensor,
+    b_mtp_index: torch.Tensor,
+):
     """
     This function is used to scatter the token_info(GPU tensor) to the req_to_token_info(CPU tensor).
     Args:
-        token_info: (batch_size, vocab_size)
-        req_to_token_info: (max_req_num,)
+        next_token_ids: (batch_size,)
+        req_to_next_token_ids: (max_req_num, max_mtp_step)
         b_req_idx: (batch_size,)
+        b_mtp_index: (batch_size,)
     """
-    assert token_info.shape[0] == b_req_idx.shape[0]
+    assert next_token_ids.shape[0] == b_req_idx.shape[0]
     batch_size = b_req_idx.shape[0]
     grid = (batch_size,)
     num_warps = 1

     _fwd_kernel_scatter[grid](
-        token_info,
-        req_to_token_info,
+        next_token_ids,
+        req_to_next_token_ids,
         b_req_idx,
-        req_to_token_info.stride(0),
+        b_mtp_index,
+        req_to_next_token_ids.stride(0),
+        req_to_next_token_ids.stride(1),
         num_warps=num_warps,
         num_stages=1,
     )
@@ -111,24 +122,28 @@ def scatter_token(token_info: torch.Tensor, req_to_token_info: torch.Tensor, b_r

 @triton.jit
 def _fwd_kernel_gather(
-    req_to_token_info,
-    req_to_token_info_stride,
+    req_to_next_token_ids,
+    req_to_next_token_ids_stride,
+    req_to_next_token_ids_stride_1,
     output,
     b_req_idx,
+    b_mtp_index,
 ):
     cur_index = tl.program_id(0)
     cur_req_idx = tl.load(b_req_idx + cur_index)
-    cur_token_info = tl.load(req_to_token_info + cur_req_idx * req_to_token_info_stride)
-    tl.store(output + cur_index, cur_token_info)
+    cur_mtp_index = tl.load(b_mtp_index + cur_index)
+    cur_next_token_id = tl.load(req_to_next_token_ids + cur_req_idx * req_to_next_token_ids_stride + cur_mtp_index)
+    tl.store(output + cur_index, cur_next_token_id)
     return


-def gather_token(req_to_token_info: torch.Tensor, b_req_idx: torch.Tensor):
+def gather_token(req_to_next_token_ids: torch.Tensor, b_req_idx: torch.Tensor, b_mtp_index: torch.Tensor):
     """
     This function is used to gather the token_info(CPU tensor) to the token_info(GPU tensor).
     Args:
         req_to_token_info: (max_req_num, max_mtp_step)
         b_req_idx: (batch_size,)
+        b_mtp_index: (batch_size,)
     Returns:
         output: (batch_size,)
     """
@@ -137,10 +152,12 @@ def gather_token(req_to_token_info: torch.Tensor, b_req_idx: torch.Tensor):
     grid = (batch_size,)
     num_warps = 1
     _fwd_kernel_gather[grid](
-        req_to_token_info,
-        req_to_token_info.stride(0),
+        req_to_next_token_ids,
+        req_to_next_token_ids.stride(0),
+        req_to_next_token_ids.stride(1),
         output,
         b_req_idx,
+        b_mtp_index,
         num_warps=num_warps,
         num_stages=1,
     )
@@ -187,7 +204,8 @@ def test_scatter_token_to_cpu():
     req_to_token_info = torch.zeros((1000,), dtype=torch.float32, pin_memory=True)
     token_info = torch.randn((batch_size,)).cuda()
     req_ids = torch.arange(20, 20 + batch_size, dtype=torch.int32).cuda()
-    scatter_token(token_info, req_to_token_info, req_ids)
+    mtp_index = torch.zeros((batch_size,), dtype=torch.int32).cuda()
+    scatter_token(token_info, req_to_token_info, req_ids, mtp_index)
     diff = (req_to_token_info[20 : 20 + batch_size].cuda() - token_info).abs().max()
     assert diff < 1e-6
     print("test_scatter_token_to_cpu passed")
@@ -198,8 +216,9 @@ def test_gather_token():
     req_to_token_info = torch.zeros((1000,), dtype=torch.int32, pin_memory=True)
     token_info = torch.randn((batch_size,)).cuda()
     req_ids = torch.arange(20, 20 + batch_size, dtype=torch.int32).cuda()
-    scatter_token(token_info, req_to_token_info, req_ids)
-    output = gather_token(req_to_token_info, req_ids)
+    mtp_index = torch.zeros((batch_size,), dtype=torch.int32).cuda()
+    scatter_token(token_info, req_to_token_info, req_ids, mtp_index)
+    output = gather_token(req_to_token_info, req_ids, mtp_index)
     diff = (token_info - output).abs().max()
     assert diff < 1e-6
     print("test_gather_token passed")

lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl.py

Lines changed: 2 additions & 0 deletions

@@ -102,6 +102,7 @@ def prefill_normal(
             next_token_ids,
             self.model.req_manager.req_sampling_params_manager.req_to_next_token_ids,
             model_input.b_req_idx,
+            model_input.b_mtp_index,
         )
         next_token_ids_cpu = g_pin_mem_manager.alloc_pin_tensor(
             "next_token_ids", next_token_ids.shape[0], next_token_ids.dtype
@@ -149,6 +150,7 @@ def decode_normal(
             next_token_ids,
             self.model.req_manager.req_sampling_params_manager.req_to_next_token_ids,
             model_input.b_req_idx,
+            model_input.b_mtp_index,
         )
         next_token_ids_cpu = g_pin_mem_manager.alloc_pin_tensor(
             "next_token_ids", next_token_ids.shape[0], next_token_ids.dtype

lightllm/server/router/model_infer/mode_backend/generic_pre_process.py

Lines changed: 9 additions & 1 deletion

@@ -17,6 +17,7 @@ def prepare_prefill_inputs(
     b_seq_len = []
     batch_multimodal_params = []
     b_ready_cache_len = []
+    b_mtp_index = []
     for req in req_objs:
         run_reqs.append(req)
         batch_multimodal_params.append(req.multimodal_params)
@@ -37,12 +38,14 @@ def prepare_prefill_inputs(
         total_token_num += seq_len
         max_len_in_batch = max(max_len_in_batch, input_token_len)
         b_ready_cache_len.append(req.cur_kv_len)
+        b_mtp_index.append(0)

     input_ids = np.concatenate(input_ids, dtype=np.int64)

     input_ids = torch.tensor(input_ids, dtype=torch.int64, device="cpu")
     b_req_idx = torch.tensor(b_req_idx, dtype=torch.int32, device="cpu")
     b_seq_len = torch.tensor(b_seq_len, dtype=torch.int32, device="cpu")
+    b_mtp_index = torch.tensor(b_mtp_index, dtype=torch.int32, device="cpu")
     b_ready_cache_len = torch.tensor(b_ready_cache_len, dtype=torch.int32, device="cpu")

     # dynamic prompt cache 准备 token
@@ -59,6 +62,7 @@ def prepare_prefill_inputs(
         input_ids=input_ids,
         mem_indexes=mem_indexes,
         b_req_idx=b_req_idx,
+        b_mtp_index=b_mtp_index,
         b_seq_len=b_seq_len,
         b_ready_cache_len=b_ready_cache_len,
         is_prefill=True,
@@ -74,6 +78,7 @@ def prepare_decode_inputs(req_objs: List[InferReq]) -> Tuple[ModelInput, List[In
     total_token_num = 0
     max_len_in_batch = 0
     b_req_idx = []
+    b_mtp_index = []
     b_seq_len = []
     for req in req_objs:
         run_reqs.append(req)
@@ -83,7 +88,7 @@ def prepare_decode_inputs(req_objs: List[InferReq]) -> Tuple[ModelInput, List[In
         b_seq_len.append(seq_len)
         total_token_num += seq_len
         max_len_in_batch = max(max_len_in_batch, seq_len)
-
+        b_mtp_index.append(0)
         # process the draft tokens.
         for step in range(len(req.mtp_gen_token_ids)):
             run_reqs.append(req)
@@ -92,9 +97,11 @@ def prepare_decode_inputs(req_objs: List[InferReq]) -> Tuple[ModelInput, List[In
             b_seq_len.append(seq_len)
             total_token_num += seq_len
             max_len_in_batch = max(max_len_in_batch, seq_len)
+            b_mtp_index.append(step + 1)

     b_req_idx = torch.tensor(b_req_idx, dtype=torch.int32, device="cpu")
     b_seq_len = torch.tensor(b_seq_len, dtype=torch.int32, device="cpu")
+    b_mtp_index = torch.tensor(b_mtp_index, dtype=torch.int32, device="cpu")

     # dynamic prompt cache 准备 token
     g_infer_state_lock.acquire()
@@ -110,6 +117,7 @@ def prepare_decode_inputs(req_objs: List[InferReq]) -> Tuple[ModelInput, List[In
         input_ids=None,
         mem_indexes=mem_indexes,
         b_req_idx=b_req_idx,
+        b_mtp_index=b_mtp_index,
         b_seq_len=b_seq_len,
         is_prefill=False,
     )
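
Note: in prepare_decode_inputs each request now contributes one batch slot for the verified next token (mtp index 0) followed by one slot per draft token (mtp index step + 1), so b_mtp_index lines up with the flattened batch. A small worked example of that layout, assuming a request object that only needs a mtp_gen_token_ids list for this purpose:

from dataclasses import dataclass, field
from typing import List

@dataclass
class FakeReq:  # stand-in for InferReq; only the field this layout needs
    mtp_gen_token_ids: List[int] = field(default_factory=list)

def build_b_mtp_index(req_objs):
    b_mtp_index = []
    for req in req_objs:
        b_mtp_index.append(0)                        # main decode slot
        for step in range(len(req.mtp_gen_token_ids)):
            b_mtp_index.append(step + 1)             # one slot per draft token
    return b_mtp_index

# two drafts for the first request, none for the second, one for the third
print(build_b_mtp_index([FakeReq([7, 8]), FakeReq(), FakeReq([9])]))
# [0, 1, 2, 0, 0, 1]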
