
Commit 8edbf78

Reimplement on top of PR vllm-project#21329
Signed-off-by: Jialin Ouyang <[email protected]>
1 parent c3752dc commit 8edbf78

File tree

7 files changed: 279 additions & 259 deletions

vllm/v1/core/block_pool.py

Lines changed: 3 additions & 4 deletions
@@ -8,10 +8,9 @@
                                         BlockStored, KVCacheEvent)
 from vllm.logger import init_logger
 from vllm.v1.core.kv_cache_utils import (BlockHash, BlockHashWithGroupId,
-                                         FreeKVCacheBlockQueue, KVCacheBlock,
-                                         generate_block_hash_extra_keys,
-                                         hash_block_tokens)
-from vllm.v1.request import Request
+                                         FreeKVCacheBlockQueue, KVCacheBlock)
+from vllm.v1.request import (Request, generate_block_hash_extra_keys,
+                             hash_block_tokens)

 logger = init_logger(__name__)

vllm/v1/core/kv_cache_common.py

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""KV-Cache Types."""
+from typing import Any, NamedTuple, Optional
+
+
+class BlockHash(NamedTuple):
+    """Hash value of a block (int), the token IDs in the block, and extra keys.
+    We keep a tuple of token IDs and extra keys to reduce the likelihood of
+    hash collisions when the hash value is the same. By using SHA256 however,
+    hash collisions are practically impossible.
+    """
+    # Hash value of the block in an integer.
+    hash_value: int
+    # Token IDs in the block.
+    token_ids: tuple[int, ...]
+    # Extra keys for the block.
+    extra_keys: Optional[Any] = None
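
Note: this new module lets kv_cache_utils.py, kv_cache_manager.py, and request.py share the BlockHash type without a circular dependency (kv_cache_utils.py previously imported Request, and request.py now hosts the hashing helpers). A minimal sketch of how the tuple chains for prefix caching, using Python's builtin hash as a stand-in for the configured hash function (the real callers pass caching_hash_fn):

    from typing import Any, NamedTuple, Optional

    class BlockHash(NamedTuple):
        hash_value: int
        token_ids: tuple[int, ...]
        extra_keys: Optional[Any] = None

    # Each block's hash folds in its parent's hash_value, so two requests can
    # share a BlockHash chain only if they share the same token prefix.
    root = BlockHash(hash((None, (1, 2, 3, 4), None)), (1, 2, 3, 4))
    child = BlockHash(hash((root.hash_value, (5, 6, 7, 8), None)), (5, 6, 7, 8))
    print(child.token_ids)  # (5, 6, 7, 8)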

vllm/v1/core/kv_cache_manager.py

Lines changed: 5 additions & 5 deletions
@@ -9,11 +9,11 @@
 from vllm.logger import init_logger
 from vllm.utils import sha256, sha256_cbor_64bit
 from vllm.v1.core.kv_cache_coordinator import get_kv_cache_coordinator
-from vllm.v1.core.kv_cache_utils import (BlockHash, KVCacheBlock,
-                                         hash_request_tokens, init_none_hash)
+from vllm.v1.core.kv_cache_utils import BlockHash, KVCacheBlock
 from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.metrics.stats import PrefixCacheStats
-from vllm.v1.request import Request, RequestStatus
+from vllm.v1.request import (Request, RequestStatus, hash_request_tokens,
+                             init_none_hash)

 logger = init_logger(__name__)

@@ -166,8 +166,8 @@ def get_computed_blocks(self,
         block_hashes = self.req_to_block_hashes[request.request_id]
         if not block_hashes:
             assert self.block_size is not None
-            block_hashes = hash_request_tokens(self.caching_hash_fn,
-                                               self.block_size, request)
+            block_hashes = request.precomputed_block_hashes if request.precomputed_block_hashes is not None else hash_request_tokens(
+                self.caching_hash_fn, self.block_size, request)
         self.req_to_block_hashes[request.request_id] = block_hashes

         if self.log_stats:
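
The net effect: the manager now prefers hashes precomputed during request preprocessing and hashes on the scheduler path only as a fallback. A self-contained sketch of that selection logic (illustrative names, not vLLM APIs):

    from typing import Callable, Optional

    def resolve_block_hashes(precomputed: Optional[list],
                             compute: Callable[[], list]) -> list:
        # Prefer hashes attached during preprocessing; fall back to hashing
        # on the scheduler path, mirroring the conditional in the diff above.
        return precomputed if precomputed is not None else compute()

    print(resolve_block_hashes(None, lambda: [1, 2, 3]))    # computed: [1, 2, 3]
    print(resolve_block_hashes([9, 8], lambda: [1, 2, 3]))  # reused: [9, 8]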

vllm/v1/core/kv_cache_utils.py

Lines changed: 4 additions & 246 deletions
@@ -2,39 +2,24 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """KV-Cache Utilities."""

-import os
 from collections import defaultdict, deque
-from collections.abc import Iterable, Sequence
+from collections.abc import Iterable
 from dataclasses import dataclass
-from typing import Any, Callable, NamedTuple, Optional
+from typing import NamedTuple, Optional

 from vllm.config import VllmConfig
 from vllm.logger import init_logger
-from vllm.utils import GiB_bytes, cdiv, sha256_cbor_64bit
+from vllm.utils import GiB_bytes, cdiv
+from vllm.v1.core.kv_cache_common import BlockHash
 from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec,
                                         FullAttentionSpec, KVCacheConfig,
                                         KVCacheGroupSpec, KVCacheSpec,
                                         KVCacheTensor, SlidingWindowSpec)
 from vllm.v1.metrics.stats import PrefixCacheStats
-from vllm.v1.request import Request

 logger = init_logger(__name__)


-class BlockHash(NamedTuple):
-    """Hash value of a block (int), the token IDs in the block, and extra keys.
-    We keep a tuple of token IDs and extra keys to reduce the likelihood of
-    hash collisions when the hash value is the same. By using SHA256 however,
-    hash collisions are practically impossible.
-    """
-    # Hash value of the block in an integer.
-    hash_value: int
-    # Token IDs in the block.
-    token_ids: tuple[int, ...]
-    # Extra keys for the block.
-    extra_keys: Optional[Any] = None
-
-
 class BlockHashWithGroupId(NamedTuple):
     # The hash value for the contents (e.g., token_ids) of a block without group
     # ID. The value is the same for blocks representing the same tokens but for

@@ -47,32 +32,6 @@ def get_hash_value(self) -> int:
         return self.block_hash.hash_value


-# The hash seed for the first block of any prefix block sequence.
-#
-# We use a random value to avoid hash collisions or PYTHONHASHSEED environment
-# variable if set such that processes can share the seed if needed.
-# This aligns with the behavior of Python's hash() function, which also uses
-# a random seed if PYTHONHASHSEED is not set.
-#
-# The function `init_none_hash` initializes this variable globally.
-NONE_HASH: int
-
-
-def init_none_hash(hash_fn: Callable):
-    global NONE_HASH
-
-    hash_seed = os.getenv("PYTHONHASHSEED")
-    if hash_seed is None and hash_fn is sha256_cbor_64bit:
-        logger.warning(
-            "PYTHONHASHSEED is not set. This will lead to non-reproducible "
-            "block-hashes when using sha256_cbor_64bit as the hash function."
-            "Consider setting PYTHONHASHSEED to a fixed value for "
-            "reproducibility.")
-
-    NONE_HASH = (int.from_bytes(os.urandom(32), byteorder="big")
-                 if hash_seed is None else hash_fn(hash_seed))
-
-
 class PrefixCachingMetrics:
     """Metrics for prefix caching with a hit rate of the max recent N requests.

@@ -335,207 +294,6 @@ def get_all_free_blocks(self) -> list[KVCacheBlock]:
         return ret


-def need_extra_keys(request: Request) -> bool:
-    """Check whether the blocks allocated to this request need extra hash keys.
-
-    Args:
-        request (Request): The request.
-
-    Returns:
-        bool: Whether blocks allocated to this request need extra hash keys.
-    """
-
-    # Multimodal requests need to include the MM hash.
-    # LoRA requests need to include the LoRA ID.
-    # Request with provided cache salt need to include the salt.
-    return bool(request.mm_positions) or (request.lora_request
-                                          is not None) or (request.cache_salt
-                                                           is not None)
-
-
-def _gen_mm_extra_hash_keys(request: Request, start_token_idx: int,
-                            end_token_idx: int,
-                            start_mm_idx: int) -> tuple[list[Any], int]:
-    """Generate extra keys related to MultiModal request for block hash
-    computation. For multi-modal inputs, the extra keys are
-    (mm_hash, start_offset) that indicate a mm input contained in the
-    block and its starting offset in the block tokens.
-
-    Args:
-        request: The request object.
-        start_token_idx: The start token index of the block.
-        end_token_idx: The end token index of the block.
-        start_mm_idx: The start multi-modal index of the block.
-
-    Returns:
-        A tuple of extra keys and the next multi-modal index.
-    """
-    extra_keys: list[Any] = []
-
-    mm_positions, mm_hashes = request.mm_positions, request.mm_hashes
-    if not mm_positions:
-        return extra_keys, start_mm_idx
-
-    if mm_positions and len(mm_positions) != len(mm_hashes):
-        raise ValueError(
-            "The number of multi-modal positions and hashes must match. This "
-            "is likely because you do not enable MM preprocessor hashing. "
-            "Please set disable_mm_preprocessor_cache=False.")
-
-    # Note that we assume mm_positions is sorted by offset.
-    # We do not need to check all mm inputs if the start token index is out of
-    # range. This usually happens in the late prefill phase and decoding phase.
-    if mm_positions[-1].offset + mm_positions[-1].length < start_token_idx:
-        return extra_keys, start_mm_idx
-
-    # Support start_mm_idx == -1 to indicate the last mm input.
-    if start_mm_idx < 0:
-        assert -start_mm_idx <= len(mm_positions)
-        start_mm_idx = len(mm_positions) + start_mm_idx
-
-    curr_mm_idx = start_mm_idx
-    while mm_positions and curr_mm_idx < len(mm_positions):
-        assert mm_hashes[curr_mm_idx] is not None
-        offset = mm_positions[curr_mm_idx].offset
-        length = mm_positions[curr_mm_idx].length
-        if end_token_idx > offset:
-            if start_token_idx > offset + length:
-                # This block has passed the current mm input.
-                curr_mm_idx += 1
-                continue
-
-            # The block contains the current mm input.
-            extra_keys.append(mm_hashes[curr_mm_idx])
-
-            if end_token_idx >= offset + length:
-                # If this block contains the end of the current mm input,
-                # move to the next mm input as this block may also contain
-                # the next mm input.
-                curr_mm_idx += 1
-            else:
-                # Otherwise this block is done with mm inputs.
-                break
-        else:
-            # This block has not reached the current mm input.
-            break
-    return extra_keys, curr_mm_idx
-
-
-def _gen_lora_extra_hash_keys(request: Request) -> list[int]:
-    """Generate extra keys related to LoRA for block hash computation.
-
-    Args:
-        request: The request object.
-
-    Returns:
-        Return LoRA id of the request if it is a LoRA request. Return empty
-        list otherwise.
-    """
-    if not request.lora_request:
-        return []
-    return [request.lora_request.lora_int_id]
-
-
-def generate_block_hash_extra_keys(
-        request: Request, start_token_idx: int, end_token_idx: int,
-        start_mm_idx: int) -> tuple[Optional[tuple[Any, ...]], int]:
-    """Generate extra keys for the block hash. The extra keys can come from
-    the multi-modal inputs and request specific metadata (e.g., LoRA ID).
-
-    Args:
-        request: The request object.
-        start_token_idx: The start token index of the block.
-        end_token_idx: The end token index of the block.
-        start_mm_idx: The start multi-modal index of the block.
-
-    Returns:
-        A tuple of extra keys and the next multi-modal index.
-    """
-    mm_extra_keys: list[Any]
-    mm_extra_keys, new_start_mm_idx = _gen_mm_extra_hash_keys(
-        request, start_token_idx, end_token_idx, start_mm_idx)
-    lora_extra_keys: list[int] = _gen_lora_extra_hash_keys(request)
-    cache_salt_keys: list[str] = [request.cache_salt] if (
-        start_token_idx == 0 and request.cache_salt) else []
-
-    extra_keys: list[Any] = lora_extra_keys + mm_extra_keys + cache_salt_keys
-
-    if not extra_keys:
-        return None, new_start_mm_idx
-
-    return tuple(extra_keys), new_start_mm_idx
-
-
-def hash_block_tokens(
-        hash_function: Callable,
-        parent_block_hash: Optional[int],
-        curr_block_token_ids: Sequence[int],
-        extra_keys: Optional[tuple[Any, ...]] = None) -> BlockHash:
-    """Computes a hash value corresponding to the contents of a block and
-    the contents of the preceding block(s). The hash value is used for
-    prefix caching. We use LRU cache for this function to avoid recomputing
-    hash values for the same block contents.
-
-    Args:
-        parent_block_hash: The hash of the parent block. None
-            if this is the first block.
-        curr_block_token_ids: A list of token ids in the current
-            block. The current block is assumed to be full.
-        extra_keys: Extra keys for the block.
-
-    Returns:
-        The hash value of the block and the token ids in the block.
-        The entire tuple is used as the hash key of the block.
-    """
-    if not parent_block_hash:
-        parent_block_hash = NONE_HASH
-
-    curr_block_token_ids_tuple = tuple(curr_block_token_ids)
-    return BlockHash(
-        hash_function(
-            (parent_block_hash, curr_block_token_ids_tuple, extra_keys)),
-        curr_block_token_ids_tuple, extra_keys)
-
-
-def hash_request_tokens(hash_function: Any, block_size: int,
-                        request: Request) -> list[BlockHash]:
-    """Computes hash values of a chain of blocks given a sequence of
-    token IDs. The hash value is used for prefix caching.
-
-    Args:
-        block_size: The size of each block.
-        request: The request object.
-
-    Returns:
-        The list of computed hash values.
-    """
-    token_ids = request.all_token_ids
-
-    req_need_extra_keys = need_extra_keys(request)
-    req_extra_keys = None
-    curr_mm_idx = 0
-
-    ret = []
-    parent_block_hash_value = None
-    for start in range(0, len(token_ids), block_size):
-        end = start + block_size
-        block_token_ids = token_ids[start:end]
-        # Do not hash the block if it is not full.
-        if len(block_token_ids) < block_size:
-            break
-
-        if req_need_extra_keys:
-            # MM and LoRA requests need extra keys for block-hash computation.
-            req_extra_keys, curr_mm_idx = generate_block_hash_extra_keys(
-                request, start, end, curr_mm_idx)
-
-        block_hash = hash_block_tokens(hash_function, parent_block_hash_value,
-                                       block_token_ids, req_extra_keys)
-        ret.append(block_hash)
-        parent_block_hash_value = block_hash.hash_value
-    return ret
-
-
 def max_memory_usage_bytes(vllm_config: VllmConfig,
                            kv_cache_specs: Iterable[KVCacheSpec]) -> int:
     """

vllm/v1/engine/async_llm.py

Lines changed: 6 additions & 0 deletions
@@ -39,6 +39,7 @@
 from vllm.v1.metrics.loggers import StatLoggerFactory, StatLoggerManager
 from vllm.v1.metrics.prometheus import shutdown_prometheus
 from vllm.v1.metrics.stats import IterationStats
+from vllm.v1.request import init_none_hash

 logger = init_logger(__name__)

@@ -131,6 +132,11 @@ def __init__(
             self.logger_manager.log_engine_initialized()

         self.output_handler: Optional[asyncio.Task] = None
+
+        # logger.info("===jialino init_none_hash")
+        # TODO(Jialin): Extract the right hash function from vllm_config @nocommit
+        init_none_hash(hash)
+
         try:
             # Start output handler eagerly if we are in the asyncio eventloop.
             asyncio.get_running_loop()
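
init_none_hash, now imported from vllm.v1.request, seeds the root of every block-hash chain; the TODO notes that the builtin hash passed here is a placeholder until the hash function is read from vllm_config. A sketch of the seeding contract, mirroring the version removed from kv_cache_utils.py above:

    import os

    def none_hash_seed(hash_fn) -> int:
        # With PYTHONHASHSEED set, every process derives the same root seed,
        # so block hashes are reproducible across processes; otherwise a
        # random 256-bit value is drawn per process.
        hash_seed = os.getenv("PYTHONHASHSEED")
        if hash_seed is None:
            return int.from_bytes(os.urandom(32), byteorder="big")
        return hash_fn(hash_seed)

    print(none_hash_seed(hash) == none_hash_seed(hash))  # True only if seed set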

vllm/v1/engine/core.py

Lines changed: 7 additions & 2 deletions
@@ -42,7 +42,7 @@
 from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.metrics.stats import SchedulerStats
 from vllm.v1.outputs import ModelRunnerOutput
-from vllm.v1.request import Request, RequestStatus
+from vllm.v1.request import Request, RequestStatus, hash_request_tokens
 from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder
 from vllm.v1.structured_output import StructuredOutputManager
 from vllm.version import __version__ as VLLM_VERSION

@@ -396,7 +396,12 @@ def _preprocess_add_request(self, request: EngineCoreRequest) -> Request:

        This function could be directly used in input processing thread to allow
        request initialization running in parallel with Model forward"""
-        return Request.from_engine_core_request(request)
+        converted_request = Request.from_engine_core_request(request)
+        # TODO(Jialin): Use the right hash function here
+        # TODO(Jialin): Use the right block size here
+        converted_request.precomputed_block_hashes = hash_request_tokens(
+            hash, 16, converted_request)
+        return converted_request


 class EngineCoreProc(EngineCore):
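
Since _preprocess_add_request is designed to run on the input-processing thread, precomputing block hashes there overlaps the per-token hashing cost with the model forward pass; the TODOs flag that the builtin hash and block size 16 are placeholders. A toy illustration of the overlap pattern (hypothetical names, not vLLM APIs):

    from concurrent.futures import ThreadPoolExecutor

    def preprocess(token_ids: list[int]) -> list[int]:
        # Stand-in for _preprocess_add_request: hash full 16-token blocks up
        # front so the scheduler finds them already attached to the request.
        return [hash(tuple(token_ids[i:i + 16]))
                for i in range(0, len(token_ids) - 15, 16)]

    with ThreadPoolExecutor(max_workers=1) as pool:
        future = pool.submit(preprocess, list(range(64)))
        # ... the model forward for the previous batch would run here ...
        print(len(future.result()))  # 4 full blocks of 16 tokens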
