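update scheduler

Drop the spec-decoding early return in do_prefill_default, apply the
running threshold in prefetch_next_inputs when spec decoding is enabled,
make get_output return an (output, None) pair when logits are not
requested, and stop counting num_spec_tokens toward the prefill token
count in _schedule_prefill.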

RunningLeon · RunningLeon · commit 28ddcc1de557 · 2025-09-10T21:00:10.000+08:00
diff --git a/lmdeploy/pytorch/engine/engine.py b/lmdeploy/pytorch/engine/engine.py
@@ -250,9 +250,6 @@ def do_prefill_dp(self):
         return ret
 
     def do_prefill_default(self):
-        if self.spec_decoding:
-            return True
-
         # decoding if no waiting
         scheduler = self.scheduler
         if not scheduler.has_waiting():
@@ -298,7 +295,7 @@ async def prefetch_next_inputs(self):
         else:
             num_running = scheduler.num_running()
             is_decoding = self.forward_inputs['inputs'].is_decoding
-            running_threshold = (self.scheduler_config.max_batches // 4) if is_decoding else 0
+            running_threshold = (self.scheduler_config.max_batches // 4) if is_decoding or self.spec_decoding else 0
 
             if num_running > running_threshold:
                 enable = True
@@ -1269,7 +1266,6 @@ async def _async_loop_main(
                 if idx == num_loops - 1:
                     scheduler.collect_migration_done()
                     forward_inputs, next_running = await inputs_maker.prefetch_next_inputs()
-
                 # send output
                 out = await self.executor.get_output_async()
                 if out is not None:
diff --git a/lmdeploy/pytorch/engine/model_agent.py b/lmdeploy/pytorch/engine/model_agent.py
@@ -706,7 +706,7 @@ def gather(self, output):
             def get_output(self):
                 """Get tmp_output."""
                 if not return_logits:
-                    return self._output[:, -1:]
+                    return self._output[:, -1:], None
                 torch.cuda.synchronize()
                 return self._output, self._aux_output
 
diff --git a/lmdeploy/pytorch/paging/scheduler.py b/lmdeploy/pytorch/paging/scheduler.py
@@ -189,14 +189,14 @@ def _schedule_prefill(self):
         copy_map: Dict[int, int] = dict()
         running: SeqList = []
         token_count = 0
+        prealloc_size = self.num_spec_tokens or self.num_spec_tokens - 1
 
         def _to_running(seq: SchedulerSequence):
             """To running."""
             seq.status = MessageStatus.RUNNING
             running.append(seq)
             nonlocal token_count
             token_count += seq.num_token_ids
-            token_count += self.num_spec_tokens
             token_count += len(seq.spec_token_ids)
 
         def __evict_for_seq(seq: SchedulerSequence, waiting):
@@ -205,7 +205,7 @@ def __evict_for_seq(seq: SchedulerSequence, waiting):
             hanging = reversed(self.hanging)
             waiting = reversed(waiting)
             evictable = list(chain(hanging, waiting))
-            return eviction_helper.evict_for_seq(seq, evictable, prealloc_size=self.num_spec_tokens)
+            return eviction_helper.evict_for_seq(seq, evictable, prealloc_size=prealloc_size)
 
         def _reorder_waiting():
             """Reorder waiting."""
@@ -218,7 +218,7 @@ def _reorder_waiting():
         waiting = _reorder_waiting()
         while len(waiting) > 0 and len(running) < max_batches:
             seq = waiting.pop(0)
-            cur_token_count = token_count + seq.num_token_ids + self.num_spec_tokens + len(seq.spec_token_ids)
+            cur_token_count = token_count + seq.num_token_ids + len(seq.spec_token_ids)
             if (len(running) > 0 and cur_token_count > self.cache_config.max_prefill_token_num):
                 break
 
@@ -228,7 +228,7 @@ def _reorder_waiting():
                 break
 
             # allocate session memory
-            self.block_manager.allocate(seq, prealloc_size=self.num_spec_tokens)
+            self.block_manager.allocate(seq, prealloc_size=prealloc_size)
             _to_running(seq)
 
             seq.record_event(EventType.SCHEDULED)