multi-step MTP and dynamic_prompt cache for MTP

shihaobai · shihaobai · commit 1a0d132b0300 · 2025-05-24T01:25:56.000+08:00
diff --git a/lightllm/common/basemodel/basemodel.py b/lightllm/common/basemodel/basemodel.py
@@ -450,9 +450,12 @@ def _context_forward(self, input_ids, infer_state: InferStateInfo):
         predict_logits = post_method(input_embs, infer_state, self.pre_post_weight)
 
         g_cache_manager.cache_env_out()
+        is_return_hidden_states = self.spec_algo.is_mtp() or (
+            self.spec_algo.is_mtp_module() and not self.last_mtp_module
+        )
         return ModelOutput(
             logits=predict_logits,
-            hidden_states=input_embs if self.spec_algo.is_mtp() else None,
+            hidden_states=input_embs if is_return_hidden_states else None,
         )
 
     @final
@@ -475,9 +478,12 @@ def _token_forward(self, input_ids, infer_state: InferStateInfo):
         predict_logits = post_method(input_embs, infer_state, self.pre_post_weight)
 
         g_cache_manager.cache_env_out()
+        is_return_hidden_states = self.spec_algo.is_mtp() or (
+            self.spec_algo.is_mtp_module() and not self.last_mtp_module
+        )
         return ModelOutput(
             logits=predict_logits,
-            hidden_states=input_embs if self.spec_algo.is_mtp() else None,
+            hidden_states=input_embs if is_return_hidden_states else None,
         )
 
     @final
diff --git a/lightllm/models/deepseek_mtp/model.py b/lightllm/models/deepseek_mtp/model.py
@@ -22,12 +22,12 @@ class Deepseek3MTPModel(Deepseek2TpPartModel):
     def __init__(self, kvargs):
+        """Build an MTP draft module that reuses the main model's request manager.
+
+        ``kvargs`` must contain ``"main_model"`` and may contain ``"last_mtp_module"``
+        (default ``False``). Both keys are popped before ``kvargs`` is forwarded to
+        the parent constructor.
+        """
         self.main_model = kvargs.pop("main_model")
+        # Set before super().__init__ so that _init_req_manager sees the attribute and
+        # returns early (presumably it is invoked from the parent constructor - confirm).
         self.req_manager = self.main_model.req_manager
+        # NOTE(review): looks like the flag the base model forward paths read to decide
+        # whether hidden states are returned - confirm against basemodel.py.
+        self.last_mtp_module = kvargs.pop("last_mtp_module", False)
         super().__init__(kvargs)
 
     def _init_req_manager(self):
         # draft model shares the same req_manager with the main model
         if hasattr(self, "req_manager"):
-            print("SKIP INIT REQ!!!!!!!!")
             return
         create_max_seq_len = 0
 
diff --git a/lightllm/server/core/objs/req.py b/lightllm/server/core/objs/req.py
@@ -94,6 +94,8 @@ class Req(ctypes.Structure):
         ("reward_score", ctypes.c_float),
         # 请求回复累计概率和
         ("cumlogprob", ctypes.c_float),
+        # mtp draft model 接受长度
+        ("mtp_accepted_len", ctypes.c_int),
     ]
 
     def get_str(self):
@@ -145,6 +147,7 @@ def init(
         self.create_prompt_ids_shm_array()
         self.chunked_prefill_size = chunked_prefill_size
         self.shm_prompt_ids.arr[0 : len(prompt_ids)] = prompt_ids
+        self.mtp_accepted_len = 0
 
         self.post_init()
 
diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
@@ -540,6 +540,8 @@ async def _wait_to_token_package(
                         x_request_id = request.headers.get("X-Request-Id", "") if request is not None else ""
                         x_session_id = request.headers.get("X-Session-Id", "") if request is not None else ""
                         prompt_cache_ratio = prompt_cache_len / prompt_tokens
+
+                        avg_token_per_step = out_token_counter / (out_token_counter - metadata["mtp_accepted_len"])
                         format_start_time = datetime.datetime.fromtimestamp(start_time).strftime("%Y-%m-%d %H:%M:%S")
                         logger.info(
                             f"X-Request-Id:{x_request_id} "
@@ -550,6 +552,7 @@ async def _wait_to_token_package(
                             f"prompt_token_num:{prompt_tokens} "
                             f"prompt_cache_len:{prompt_cache_len} "
                             f"prompt_cache_ratio:{prompt_cache_ratio} "
+                            f"avg_token_per_step:{avg_token_per_step} "
                         )
                         if group_request_id < 0:
                             # health 探测请求，不记录日志和监控
@@ -652,6 +655,7 @@ async def handle_loop(self):
                             "special": special,
                             "count_output_tokens": count_output_tokens,
                             "prompt_cache_len": req.prompt_cache_len,
+                            "mtp_accepted_len": req.mtp_accepted_len,
                         }
                         if self.args.return_all_prompt_logprobs:
                             metadata.update(req.get_all_prompt_metadata())
diff --git a/lightllm/server/router/model_infer/infer_batch.py b/lightllm/server/router/model_infer/infer_batch.py
@@ -257,6 +257,7 @@ def __init__(
         self.vocab_size = vocab_size
         self.initialized = False
         self.paused = False
+        self.cur_accepted_len = 0  # for mtp forward
 
     def init_all(self):
         if self.initialized is False:
@@ -319,6 +320,13 @@ def get_chuncked_input_token_ids(self):
         chunked_end = min(self.get_cur_total_len(), chunked_start + self.shm_req.chunked_prefill_size)
         return self.shm_req.shm_prompt_ids.arr[0:chunked_end]
 
+    def get_chunked_input_token_ids_shift(self, shift=-1):
+        input_ids = self.get_input_token_ids()
+        shift_input_ids = np.roll(input_ids, shift)
+        chunked_start = self.cur_kv_len
+        chunked_end = min(self.get_cur_total_len(), chunked_start + self.shm_req.chunked_prefill_size)
+        return shift_input_ids[shift:chunked_end]
+
     def get_chuncked_input_token_len(self):
         chunked_start = self.cur_kv_len
         chunked_end = min(self.get_cur_total_len(), chunked_start + self.shm_req.chunked_prefill_size)
@@ -330,6 +338,9 @@ def set_next_gen_token_id(self, next_token_id: int, logprob: float):
         self.shm_req.shm_logprobs.arr[index] = logprob
         return
 
+    def set_total_accepted_len(self):
+        self.shm_req.mtp_accepted_len += self.cur_accepted_len
+
     def get_last_gen_token(self):
+        """Return the id at index ``input_len + cur_output_len - 1`` of the shared prompt-id array."""
         return self.shm_req.shm_prompt_ids.arr[self.shm_req.input_len + self.cur_output_len - 1]
 
diff --git a/lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl_mtp.py b/lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl_mtp.py
@@ -0,0 +1,154 @@
+import torch
+import numpy as np
+from typing import List, Tuple
+from lightllm.server.router.model_infer.mode_backend.base_backend import ModeBackend
+from lightllm.utils.infer_utils import calculate_time, mark_start, mark_end
+from lightllm.utils.log_utils import init_logger
+from lightllm.server.router.model_infer.infer_batch import g_infer_context
+from lightllm.server.router.model_infer.mode_backend.generic_pre_process import (
+    prepare_prefill_inputs,
+)
+from lightllm.server.router.model_infer.mode_backend.mtp_pre_process import (
+    prepare_mtp_prefill_inputs,
+    prepare_draft_main_model_decode_inputs,
+)
+from lightllm.server.router.model_infer.mode_backend.generic_post_process import sample
+import os
+from lightllm.common.basemodel.infer_lock import g_infer_state_lock
+from lightllm.server.router.model_infer.infer_batch import InferReq
+from lightllm.server.router.model_infer.mode_backend.continues_batch.impl_mtp import ContinuesBatchWithMTPBackend
+import copy
+from lightllm.utils.dist_utils import device0_print
+
+
+logger = init_logger(__name__)
+
+
+class ChunkedPrefillWithMTPBackend(ContinuesBatchWithMTPBackend):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def decode(self):
+        uninit_reqs, aborted_reqs, ok_finished_reqs, prefill_reqs, decode_reqs = self._get_classed_reqs(
+            g_infer_context.infer_req_ids
+        )
+
+        if aborted_reqs:
+            g_infer_context.filter_reqs(aborted_reqs)
+
+        if prefill_reqs:
+            model_input, run_reqs = prepare_prefill_inputs(
+                prefill_reqs, is_chuncked_mode=False, is_multimodal=self.is_multimodal
+            )
+            model_output = self.model.forward(model_input)
+
+            self._overlap_req_init_and_filter(
+                uninit_reqs=uninit_reqs, ok_finished_reqs=ok_finished_reqs, clear_list=True
+            )
+
+            next_token_ids, next_token_probs = sample(model_output.logits, run_reqs, self.eos_id)
+            next_token_ids = next_token_ids.detach().cpu().numpy()
+            next_token_logprobs = torch.log(next_token_probs).detach().cpu().numpy()
+            self._post_handle(
+                run_reqs, next_token_ids, next_token_logprobs, is_chuncked_mode=False, do_filter_finished_reqs=False
+            )
+            # spec prefill: MTP
+            last_input_ids_cpu = None
+            draft_model_input = model_input
+            last_hidden_states = model_output.hidden_states
+            for draft_model_idx in range(self.spec_step):
+                device0_print(f"main {draft_model_input}")
+                draft_model_input, last_input_ids_cpu = prepare_mtp_prefill_inputs(
+                    prefill_reqs, model_input, last_hidden_states, next_token_ids, last_input_ids_cpu
+                )
+                device0_print(f"draft_model_input {draft_model_input}")
+                draft_model_output = self.draft_models[draft_model_idx].forward(draft_model_input)
+                draft_next_token_ids, _ = sample(draft_model_output.logits, run_reqs, self.eos_id)
+                draft_next_token_ids = draft_next_token_ids.detach().cpu().numpy()
+
+                last_hidden_states = draft_model_output.hidden_states
+                next_token_ids = draft_next_token_ids
+                self._save_draft_token_ids(draft_next_token_ids, run_reqs, draft_model_idx)
+
+        if decode_reqs:
+            model_input, run_reqs, mem_indexes_cpu = prepare_draft_main_model_decode_inputs(
+                decode_reqs, self.draft_token_id_map
+            )
+            model_output = self.model.forward(model_input)
+            assert model_output.logits.shape[0] % self.spec_stride == 0
+
+            self._overlap_req_init_and_filter(
+                uninit_reqs=uninit_reqs, ok_finished_reqs=ok_finished_reqs, clear_list=True
+            )
+
+            next_token_ids_cuda, next_token_probs = sample(model_output.logits, run_reqs, self.eos_id)
+            next_token_ids = next_token_ids_cuda.detach().cpu().numpy()
+            next_token_logprobs = torch.log(next_token_probs).detach().cpu().numpy()
+
+            # verify
+            accepted_reqs, accepted_index, need_free_mem_indexes = self.verify(
+                next_token_ids, run_reqs, mem_indexes_cpu
+            )
+            self._post_handle(
+                accepted_reqs,
+                next_token_ids[accepted_index],
+                next_token_logprobs[accepted_index],
+                is_chuncked_mode=False,
+                do_filter_finished_reqs=False,
+            )
+            # share some inference info with the main model
+            draft_model_input = model_input
+            draft_model_input.input_ids = next_token_ids_cuda
+            draft_model_input.hidden_states = model_output.hidden_states
+            # process the draft model output
+            for draft_model_idx in range(self.spec_step):
+                # spec decode: MTP
+                draft_model_output = self.draft_models[draft_model_idx].forward(draft_model_input)
+                draft_next_token_ids, _ = sample(draft_model_output.logits, run_reqs, self.eos_id)
+                # prepare inputs for the next draft model
+                draft_model_input.input_ids = draft_next_token_ids
+                draft_model_input.hidden_states = draft_model_output.hidden_states
+                draft_next_token_ids_numpy = draft_next_token_ids.detach().cpu().numpy()
+                self._save_draft_token_ids(draft_next_token_ids_numpy, run_reqs, draft_model_idx)
+
+            if need_free_mem_indexes:
+                g_infer_state_lock.acquire()
+                g_infer_context.req_manager.mem_manager.free(need_free_mem_indexes)
+                g_infer_state_lock.release()
+
+        self._overlap_req_init_and_filter(uninit_reqs=uninit_reqs, ok_finished_reqs=ok_finished_reqs, clear_list=True)
+        return
+
+    def verify(self, next_token_ids, run_reqs, draft_mem_indexes):
+        accepted_reqs = []
+        accepted_index = []
+        need_free_mem_indexes = []
+        assert next_token_ids.shape[0] % self.spec_stride == 0
+
+        for i, req in enumerate(run_reqs):
+            # main model output
+            if i % self.spec_stride == 0:
+                accepted_reqs.append(req)
+                accepted_index.append(i)
+                continue
+            draft_model_idx = i % self.spec_stride - 1
+            if (
+                self.draft_token_id_map[req.req_idx][draft_model_idx] == next_token_ids[i - 1]
+                and req.cur_accepted_len == draft_model_idx
+            ):
+                accepted_reqs.append(req)
+                accepted_index.append(i)
+                req.cur_accepted_len += 1
+                device0_print(f"req {req.req_idx} accepted, cur_accepted_len {req.cur_accepted_len}")
+            else:
+                need_free_mem_indexes.append(draft_mem_indexes[i])
+        return accepted_reqs, accepted_index, need_free_mem_indexes
+
+    def _save_draft_token_ids(self, draft_next_token_ids, run_reqs, draft_model_idx):
+        batch_size = len(run_reqs) // self.spec_stride
+        for i in range(batch_size):
+            req = run_reqs[self.spec_stride * i]
+            self.draft_token_id_map[req.req_idx][draft_model_idx] = draft_next_token_ids[i + req.cur_accepted_len]
+            #  reset the cur_accepted_len
+            if draft_model_idx == self.spec_step - 1:
+                req.cur_accepted_len = 0
diff --git a/lightllm/server/router/model_infer/mode_backend/continues_batch/impl_mtp.py b/lightllm/server/router/model_infer/mode_backend/continues_batch/impl_mtp.py
diff --git a/lightllm/server/router/model_infer/mode_backend/mtp_pre_process.py b/lightllm/server/router/model_infer/mode_backend/mtp_pre_process.py