@@ -333,6 +333,7 @@ def __init__(
         self.method = specdecode_config.method
         self.model_config = specdecode_config.model_config
         self.cache_config = specdecode_config.cache_config
+        self.num_spec_tokens = specdecode_config.num_speculative_tokens
         self.backend_config = backend_config
         self.device = device

@@ -365,12 +366,17 @@ def build_graph_runner(self):
     def build_cache_engine(self, cache_stream: torch.cuda.Stream):
         """Build cache engine."""
         if self.cache_config is not None:
-            self.cache_engine = CacheEngine(self.cache_config, self.model_config, rank=0, tp_rank=0, world_size=1, cache_stream=cache_stream)
+            self.cache_engine = CacheEngine(self.cache_config,
+                                            self.model_config,
+                                            rank=0,
+                                            tp_rank=0,
+                                            world_size=1,
+                                            cache_stream=cache_stream)

     def _forward_impl(self, inputs: ModelInputs, swap_in_map: SwapMap, swap_out_map: SwapMap):
         """Forward impl."""
         cache_swapping(self.cache_engine, swap_in_map=swap_in_map, swap_out_map=swap_out_map)
-        output = self.proposer.propose(inputs, cache_engine=self.cache_engine, stream=self.stream)
+        output = self.proposer._forward(inputs, cache_engine=self.cache_engine, stream=self.stream)
        return output

     async def async_forward(self, inputs: ModelInputs, swap_in_map: SwapMap, swap_out_map: SwapMap):
@@ -385,32 +391,122 @@ async def async_forward(self, inputs: ModelInputs, swap_in_map: SwapMap, swap_out_map: SwapMap):
         await asyncio.sleep(0)
         return output

+    async def _async_model_forward(self, inputs: ModelInputs, swap_in_map: SwapMap, swap_out_map: SwapMap):
+        """Model forward.
+
+        Args:
+            inputs (ModelInputs): The input data produced by _make_inputs.
+            swap_in_map (SwapMap): Cache maps to swap in.
+            swap_out_map (SwapMap): Cache maps to swap out.
+        """
+        max_prefill_token_num = self.cache_config.max_prefill_token_num
+        swap_done = False
+
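+        # the swap maps are consumed by the first forward only; later chunked
+        # prefill steps pass empty maps so the cache is not swapped twice.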
+        async def __forward(inputs):
+            """forward."""
+            nonlocal swap_done, swap_in_map, swap_out_map
+            if swap_done:
+                return await self.async_forward(inputs, swap_in_map=dict(), swap_out_map=dict())
+            else:
+                swap_done = True
+                return await self.async_forward(inputs, swap_in_map=swap_in_map, swap_out_map=swap_out_map)
+
+        async def __long_context_single_forward(new_inputs):
+            """One large sequence."""
+            model_metas = new_inputs[0].model_metas
+            for inp in new_inputs:
+                inp.model_metas = model_metas
+                output = await __forward(inp)
+                model_metas = output.get('model_metas')
+            return output
+
+        # make long context inputs
+        is_long_context = inputs.input_ids.numel() > max_prefill_token_num and not inputs.is_decoding
+
+        if is_long_context:
+            seq_len = inputs.seq_length
+            batch_size = seq_len.size(0)
+            assert batch_size == 1, 'Do not support batched long context.'
+            inputs_li = inputs.split(max_prefill_token_num)
+            outputs = await __long_context_single_forward(inputs_li)
+        else:
+            outputs = await __forward(inputs)
+
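+        # autoregressive draft loop: the forward above already yields the first
+        # draft token, so run the proposer num_spec_tokens - 1 more times.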
+        loop_count = self.num_spec_tokens - 1
+        draft_token_ids, model_metas, target_hidden_states = self.proposer.get_outputs(outputs, inputs)
+        draft_tokens_li = [draft_token_ids]
+        if loop_count > 0:
+            inputs = self.proposer.update_inputs_decoding(inputs, draft_token_ids.transpose(0, 1), target_hidden_states,
+                                                          model_metas)
+        for loop_idx in range(loop_count):
+            outputs = await self.async_forward(inputs, swap_in_map=dict(), swap_out_map=dict())
+            draft_token_ids, model_metas, target_hidden_states = self.proposer.get_outputs(outputs, inputs)
+            draft_tokens_li.append(draft_token_ids)
+            if loop_idx < loop_count - 1:
+                inputs.update(draft_token_ids.transpose(0, 1))
+                inputs.model_metas = model_metas
+                inputs.target_hidden_states = target_hidden_states
+                if inputs.target_position_ids is not None:
+                    inputs.target_position_ids += 1
+
+        return torch.cat(draft_tokens_li, dim=-1)
+
     async def async_model_forward(self,
                                   model_inputs: ModelInputs,
                                   spec_inputs: SpecDecodeInputs,
                                   swap_in_map: SwapMap = dict(),
                                   swap_out_map: SwapMap = dict()):
         """Draft model forward."""
-        if model_inputs.spec_metadata.draft_token_ids is not None:
-            spec_metadata = model_inputs.spec_metadata
-            output_token_ids, num_rejected_tokens, last_token_ids = self.rejection_sampler(
-                spec_inputs.target_logits, spec_metadata.draft_token_ids, spec_inputs.bonus_token_ids,
-                spec_metadata.num_draft_tokens, spec_metadata.max_spec_len)
-            spec_inputs.num_rejected_tokens = num_rejected_tokens
-            spec_inputs.reject_sample_tokens = output_token_ids
-            spec_inputs.next_token_ids = last_token_ids
-        else:
-            spec_inputs.next_token_ids = spec_inputs.bonus_token_ids
-            output_token_ids = spec_inputs.next_token_ids.unsqueeze(-1)
+        with torch.cuda.stream(self.stream):
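+            # verify the draft tokens from the previous step against the target
+            # model logits with rejection sampling.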
+            if model_inputs.spec_metadata.draft_token_ids is not None:
+                spec_metadata = model_inputs.spec_metadata
+                output_token_ids, num_rejected_tokens, last_token_ids = self.rejection_sampler(
+                    spec_inputs.target_logits, spec_metadata.draft_token_ids, spec_inputs.bonus_token_ids,
+                    spec_metadata.num_draft_tokens, spec_metadata.max_spec_len)
+                spec_inputs.num_rejected_tokens = num_rejected_tokens
+                spec_inputs.reject_sample_tokens = output_token_ids
+                spec_inputs.next_token_ids = last_token_ids
+            else:
+                spec_inputs.next_token_ids = spec_inputs.bonus_token_ids
+                output_token_ids = spec_inputs.next_token_ids.unsqueeze(-1)

-        with record_function('draft_prepare_inputs'):
-            draft_model_inputs = self.proposer.prepare_inputs(model_inputs, spec_inputs)
+            with record_function('draft_prepare_inputs'):
+                draft_model_inputs = self.proposer.prepare_inputs(model_inputs, spec_inputs)

-        new_draft_tokens = await self.async_forward(draft_model_inputs,
-                                                    swap_in_map=swap_in_map,
-                                                    swap_out_map=swap_out_map)
-        outputs = dict(output_token_ids=output_token_ids, spec_token_ids=new_draft_tokens)
-        return outputs
+            new_draft_tokens = await self._async_model_forward(draft_model_inputs,
+                                                               swap_in_map=swap_in_map,
+                                                               swap_out_map=swap_out_map)
+            outputs = dict(output_token_ids=output_token_ids, spec_token_ids=new_draft_tokens)
+            return outputs
+
+    def warmup(self, max_batches: int, target_model_config: ModelConfig):
+        """warmup."""
+        target_hidden_size = self.proposer.get_target_hidden_size(target_model_config)
+
+        # warmup prefill
+        inputs = ModelInputs.make_dummy(max_batches,
+                                        is_decoding=False,
+                                        device='cuda',
+                                        vocab_size=self.model_config.vocab_size)
+        inputs.target_hidden_states = torch.randn((1, max_batches, target_hidden_size),
+                                                  dtype=self.model_config.dtype,
+                                                  device='cuda')
+        self._forward_impl(inputs, swap_in_map=dict(), swap_out_map=dict())
+
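+        # warmup decoding for each capture batch size of the graph runner, largest first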
+        capture_batch_sizes = self.proposer.model.get_capture_batch_sizes()
+        capture_batch_sizes = sorted(capture_batch_sizes, reverse=True)
+
+        for batch_size in capture_batch_sizes:
+            inputs = ModelInputs.make_dummy(
+                batch_size,
+                is_decoding=True,
+                device='cuda',
+                vocab_size=self.model_config.vocab_size,
+            )
+            inputs.target_hidden_states = torch.randn((1, batch_size, self.model_config.hidden_size),
+                                                      dtype=self.model_config.dtype,
+                                                      device='cuda')
+            self._forward_impl(inputs, swap_in_map=dict(), swap_out_map=dict())


 class BaseModelAgent:
@@ -525,8 +621,9 @@ def get_free_mem(self):
     def warmup(self):
         """warmup."""
         # TODO: disable for now, do not remove the comments.
-        with self.all_context():
+        with self.all_context(), torch.cuda.stream(self.stream), torch.inference_mode():
             max_batches = self.cache_config.max_batches
+
             num_tokens = max_batches

             # warmup prefill
@@ -546,6 +643,10 @@ def warmup(self):
                                             vocab_size=self.model_config.vocab_size)
             self._forward_impl(inputs, swap_in_map=dict(), swap_out_map=dict())

+            # warmup draft model
+            if self.spec_agent is not None:
+                self.spec_agent.warmup(max_batches, self.model_config)
+
     async def _async_model_forward(
         self,
         inputs: ModelInputs,
@@ -639,8 +740,8 @@ async def __long_context_single_forward(new_inputs, max_seqlen: int):
             return tmp_out

         # make long context inputs
-        is_long_context = inputs.input_ids.numel(
-        ) > max_prefill_token_num and not inputs.is_decoding and inputs.seq_length[0] == 1
+        is_long_context = inputs.input_ids.numel() > max_prefill_token_num and not inputs.is_decoding
+
         max_seqlen = 0
         if is_long_context:
             seq_len = inputs.seq_length
@@ -1165,7 +1266,7 @@ def _forward_impl(self, inputs: ModelInputs, swap_in_map: SwapMap, swap_out_map: SwapMap):
             inputs,
             self.cache_engine,
             stream=self.stream,
-            output_position_ids=self.spec_agent is not None)
+            output_position_ids=False)
         return output

     async def async_forward(self, inputs: ModelInputs, swap_in_map: SwapMap, swap_out_map: SwapMap):
@@ -1194,6 +1295,10 @@ def reset_graph_runner(self):
         if hasattr(self.patched_model, 'reset'):
             self.patched_model.reset()

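+        # reset the draft proposer's graph runner as well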
+        if self.spec_agent is not None:
+            if self.spec_agent.proposer.model is not None and hasattr(self.spec_agent.proposer.model, 'reset'):
+                self.spec_agent.proposer.model.reset()
+
     @torch.inference_mode()
     def update_params(self, request: UpdateParamsRequest):
         """Update params."""