[Mamba][KVCacheManager] Simplify kv cache manage logic for mamba + MTP (vllm-project#25119)

heheda12345 · web-flow · commit 3d5f1c864051 · 2025-10-02T18:48:31.000Z
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py
@@ -565,35 +565,14 @@ def get_num_common_prefix_blocks(self, request_id: str,
     def get_num_blocks_to_allocate(
             self, request_id: str, num_tokens: int,
             new_computed_blocks: list[KVCacheBlock]) -> int:
-        """
-        Get the number of blocks needed to be allocated for the request.
-
-        Args:
-            request_id: The request ID.
-            num_tokens: The total number of tokens that need a slot (including
-                tokens that are already allocated).
-            new_computed_blocks: The new computed blocks just hitting the
-                prefix caching.
-
-        Returns:
-            The number of blocks
-        """
-
+        # Allocate extra `num_speculative_blocks` blocks for
+        # speculative decoding (MTP/EAGLE) with linear attention.
         assert isinstance(self.kv_cache_spec, MambaSpec)
         if self.kv_cache_spec.num_speculative_blocks > 0:
             num_tokens += (self.kv_cache_spec.block_size *
                            self.kv_cache_spec.num_speculative_blocks)
-        num_required_blocks = cdiv(num_tokens, self.block_size)
-        num_new_blocks = (num_required_blocks - len(new_computed_blocks) -
-                          len(self.req_to_blocks[request_id]))
-        # If a computed block of a request is an eviction candidate (in the
-        # free queue and ref_cnt == 0), it will be changed from a free block
-        # to a computed block when the request is allocated, so we also count
-        # it as needed to be allocated.
-        num_evictable_computed_blocks = sum(
-            blk.ref_cnt == 0 and not blk.is_null
-            for blk in new_computed_blocks)
-        return num_new_blocks + num_evictable_computed_blocks
+        return super().get_num_blocks_to_allocate(request_id, num_tokens,
+                                                  new_computed_blocks)
 
     def allocate_new_blocks(self, request_id: str,
                             num_tokens: int) -> list[KVCacheBlock]: