Address comments

sarckk · sarckk · commit 24ded7e069d8 · 2025-08-25T12:24:59.000-07:00
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py
@@ -47,8 +47,7 @@
     default_weight_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
-from vllm.v1.attention.backends.utils import (
-    KVSharingFastPrefillAttentionMetadata)
+from vllm.v1.attention.backends.utils import KVSharingFastPrefillMetadata
 
 from .interfaces import SupportsQuant
 from .utils import (AutoWeightsLoader, extract_layer_index,
@@ -866,8 +865,7 @@ def fast_prefill_forward(
             # Last layer is a KV sharing layer
             layer_attn_metadata = attn_metadata[
                 self.layers[-1].self_attn.attn.layer_name]
-            if (isinstance(layer_attn_metadata,
-                           KVSharingFastPrefillAttentionMetadata)):
+            if (isinstance(layer_attn_metadata, KVSharingFastPrefillMetadata)):
                 logits_indices_padded = (
                     layer_attn_metadata.logits_indices_padded)
                 num_logits_indices = layer_attn_metadata.num_logits_indices
diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py
@@ -4,7 +4,6 @@
 import enum
 import functools
 from abc import abstractmethod
-from collections.abc import Hashable
 from dataclasses import dataclass, fields, make_dataclass
 from typing import (TYPE_CHECKING, Any, ClassVar, Generic, Optional, Protocol,
                     TypeVar)
@@ -67,11 +66,12 @@ class CommonAttentionMetadata:
     block_table_tensor: torch.Tensor
     slot_mapping: torch.Tensor
 
+    causal: bool = True
+
+    # Needed by FastPrefillAttentionBuilder
     logits_indices_padded: Optional[torch.Tensor] = None
     num_logits_indices: Optional[int] = None
 
-    causal: bool = True
-
 
 @dataclass
 class UbatchSlice:
@@ -557,9 +557,8 @@ def make_kv_sharing_fast_prefill_common_attn_metadata(
         # Skip computing fast prefill path
         return common_attn_metadata
 
-    if (common_attn_metadata.logits_indices_padded is None
-            or common_attn_metadata.num_logits_indices is None):
-        return common_attn_metadata
+    assert common_attn_metadata.logits_indices_padded is not None
+    assert common_attn_metadata.num_logits_indices is not None
 
     logits_indices_padded = common_attn_metadata.logits_indices_padded
     num_logits_indices = common_attn_metadata.num_logits_indices
@@ -750,59 +749,12 @@ def subclass_attention_metadata(
     return Wrapped
 
 
-@functools.lru_cache
-def make_kv_sharing_fast_prefill_attention_metadata(
-    metadata_cls: Hashable, ) -> Any:
-    """
-    Return a new subclass of `metadata_cls` for fast prefill
-    """
-    attn_metadata_dataclass = subclass_attention_metadata(
-        name_prefix="KVSharingFastPrefill",
-        metadata_cls=metadata_cls,
-        fields=KV_SHARING_FAST_PREFILL_METADATA_FIELDS,
-    )
-    # Make attention metadata type inherit
-    # KVSharingFastPrefillAttentionMetadata type
-    fast_prefill_metadata_type = type(
-        attn_metadata_dataclass.__name__,
-        (
-            attn_metadata_dataclass,
-            KVSharingFastPrefillAttentionMetadata,
-        ),
-        {},
-    )
-    return fast_prefill_metadata_type
-
-
 @runtime_checkable
-class KVSharingFastPrefillAttentionMetadata(Protocol):
+class KVSharingFastPrefillMetadata(Protocol):
     logits_indices_padded: torch.Tensor
     num_logits_indices: int
 
 
-def create_kv_sharing_fast_prefill_attn_metadata_subclass(
-    metadata: Any,
-    common_attn_metadata: CommonAttentionMetadata,
-) -> Any:
-    # Dynamically create a a dataclass type that inherits
-    # from attention metadata type but includes additional
-    # fields logits_indices_padded and num_logits_indices
-    # which are required for prefill truncation
-    fast_prefill_metadata_type = (
-        make_kv_sharing_fast_prefill_attention_metadata(
-            metadata_cls=type(metadata), ))  # type: ignore
-    # Avoid deepcopy caused by dict.asdict
-    attn_metadata_fields = {}
-    for field in fields(metadata.__class__):
-        attn_metadata_fields[field.name] = getattr(metadata, field.name)
-    attn_metadata_i = fast_prefill_metadata_type(
-        **attn_metadata_fields,
-        logits_indices_padded=common_attn_metadata.logits_indices_padded,
-        num_logits_indices=common_attn_metadata.num_logits_indices,
-    )
-    return attn_metadata_i
-
-
 def create_fast_prefill_custom_backend(
     prefix: str,
     underlying_attn_backend: AttentionBackend,
@@ -820,7 +772,27 @@ def build(self,
             make_kv_sharing_fast_prefill_common_attn_metadata(common_attn_metadata)
             metadata = super().build(common_prefix_len,
                                      new_common_attn_metadata, fast_build)
-            return create_kv_sharing_fast_prefill_attn_metadata_subclass(
+
+            class KVSharingFastPrefillAttentionMetadata(
+                    metadata.__class__, KVSharingFastPrefillMetadata):
+                """Backend metadata with fast-prefill logits fields."""

+                def __init__(self, metadata, common_attn_metadata):
+                    # Shallow copy all fields in metadata cls
+                    for field in fields(metadata.__class__):
+                        setattr(self, field.name,
+                                getattr(metadata, field.name))

+                    # Set additional fields that will be used in model code,
+                    # via the constructor argument, not the enclosing scope
+                    padded = common_attn_metadata.logits_indices_padded
+                    count = common_attn_metadata.num_logits_indices
+                    # Both are Optional on CommonAttentionMetadata
+                    assert padded is not None and count is not None
+                    self.logits_indices_padded = padded
+                    self.num_logits_indices = count
+
+            return KVSharingFastPrefillAttentionMetadata(
                 metadata, common_attn_metadata)
 
     attn_backend = subclass_attention_backend(
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
@@ -338,7 +338,9 @@ async def generate(
         if (self.vllm_config.cache_config.kv_sharing_fast_prefill
                 and sampling_params.prompt_logprobs):
             raise ValueError(
-                "Fast prefill produces incorrect logprobs for prompt tokens")
+                "--kv-sharing-fast-prefill produces incorrect logprobs for "
+                "prompt tokens, please disable it when the requests need "
+                "prompt logprobs")
 
         try:
             # We start the output_handler on the first call to generate() so
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
@@ -1498,6 +1498,12 @@ def execute_model(
             return self.kv_connector_no_forward(scheduler_output,
                                                 self.vllm_config)
 
+        if self.cache_config.kv_sharing_fast_prefill:
+            assert not self.input_batch.num_prompt_logprobs, (
+                "--kv-sharing-fast-prefill produces incorrect logprobs for "
+                "prompt tokens, please disable it when the requests "
+                "need prompt logprobs")
+
         # Prepare the decoder inputs.
         (attn_metadata, logits_indices, spec_decode_metadata,
          num_scheduled_tokens_np, spec_decode_common_attn_metadata,
@@ -3136,6 +3142,19 @@ def maybe_add_kv_sharing_layers_to_kv_cache_groups(
             self.runner_only_attn_layers,
         )
 
+        if self.cache_config.kv_sharing_fast_prefill:
+            # In You Only Cache Once (https://arxiv.org/abs/2405.05254) or other
+            # similar KV sharing setups, only the layers that generate KV caches
+            # are involved in the prefill phase, enabling prefill to early exit.
+            attn_layers = get_layers_from_vllm_config(self.vllm_config,
+                                                      Attention)
+            for layer_name in reversed(attn_layers):
+                if layer_name in self.shared_kv_cache_layers:
+                    self.kv_sharing_fast_prefill_eligible_layers.add(
+                        layer_name)
+                else:
+                    break
+
     def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
         """
         Initialize KV cache based on `kv_cache_config`.
@@ -3144,8 +3163,6 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
             cache size of each layer
         """
         kv_cache_config = deepcopy(kv_cache_config)
-        attn_layers = get_layers_from_vllm_config(self.vllm_config, Attention)
-        self.maybe_add_kv_sharing_fast_prefill_layers(attn_layers)
         self.kv_cache_config = kv_cache_config
         self.may_reinitialize_input_batch(kv_cache_config)
         self.may_add_encoder_only_layers_to_kv_cache_config()
@@ -3189,26 +3206,6 @@ def may_add_encoder_only_layers_to_kv_cache_config(self) -> None:
             self.kv_cache_config.kv_cache_groups.append(
                 KVCacheGroupSpec(layer_names=layer_names, kv_cache_spec=spec))
 
-    def maybe_add_kv_sharing_fast_prefill_layers(self,
-                                                 attn_layers: dict[str,
-                                                                   Attention]):
-        """
-        In You Only Cache Once (https://arxiv.org/abs/2405.05254), or other 
-        similar KV sharing setups, the layers that re-use the shared KV cache 
-        (cross-decoder layers) can skip prefill, as only the earlier layers 
-        that generate KV caches are involved in the prefill phase.
-        """
-        if not self.cache_config.kv_sharing_fast_prefill:
-            # Optimization disabled, return
-            return
-
-        # Iterate in reversed order and add layers that re-use KV cache
-        for layer_name in reversed(attn_layers):
-            if layer_name in self.shared_kv_cache_layers:
-                self.kv_sharing_fast_prefill_eligible_layers.add(layer_name)
-            else:
-                break
-
     def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
         """
         Generates the KVCacheSpec by parsing the kv cache format from each