Commit 9ef64fd

Fix rebase conflicts
Signed-off-by: Yong Hoon Shin <[email protected]>
1 parent 2a54824 commit 9ef64fd

4 files changed: +25 -73 lines

vllm/attention/layers/chunked_local_attention.py

Lines changed: 1 addition & 2 deletions
@@ -11,8 +11,7 @@
 from vllm.attention.selector import get_attn_backend
 from vllm.config import CacheConfig, QuantizationConfig
 from vllm.v1.attention.backends.utils import (
-    CommonAttentionMetadata,
-    make_local_attention_virtual_batches,
+    CommonAttentionMetadata, make_local_attention_virtual_batches,
     subclass_attention_backend)
 
 from ..layer import Attention

vllm/v1/attention/backends/utils.py

Lines changed: 16 additions & 45 deletions
@@ -6,8 +6,8 @@
 from abc import abstractmethod
 from collections.abc import Hashable
 from dataclasses import dataclass, fields, make_dataclass
-from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Generic, Optional,
-                    Protocol, TypeVar)
+from typing import (TYPE_CHECKING, Any, ClassVar, Generic, Optional, Protocol,
+                    TypeVar)
 
 import numpy as np
 import torch

@@ -613,29 +613,6 @@ def make_kv_sharing_fast_prefill_common_attn_metadata(
     return common_attn_metadata
 
 
-def subclass_attention_metadata_builder(
-        name_prefix: str,
-        builder_cls: type[AttentionMetadataBuilder[M]],
-        build: Callable[
-            [AttentionMetadataBuilder[M], int, CommonAttentionMetadata, bool],
-            AttentionMetadata,
-        ],
-) -> type[AttentionMetadataBuilder[M]]:
-    """
-    Return a new subclass of `builder_cls` whose .build(...) method
-    is monkey patched to a custom build function.
-    """
-    name: str = name_prefix + builder_cls.__name__  # type: ignore
-
-    Wrapped = type(
-        name,
-        (builder_cls, ),  # inherit from the original
-        {
-            "build": build,
-        })
-    return Wrapped  # type: ignore
-
-
 def subclass_attention_backend(
         name_prefix: str, attention_backend_cls: type[AttentionBackend],
         builder_cls: type[AttentionMetadataBuilder[M]]

@@ -826,35 +803,29 @@ def create_kv_sharing_fast_prefill_attn_metadata_subclass(
     return attn_metadata_i
 
 
-@functools.lru_cache
 def create_fast_prefill_custom_backend(
     prefix: str,
     underlying_attn_backend: AttentionBackend,
 ) -> type[AttentionBackend]:
 
-    def build(self,
-              common_prefix_len: int,
-              common_attn_metadata: CommonAttentionMetadata,
-              fast_build: bool = False) -> AttentionMetadata:
-        new_common_attn_metadata =\
-            make_kv_sharing_fast_prefill_common_attn_metadata(common_attn_metadata)
-        metadata = super(self.__class__,
-                         self).build(common_prefix_len,
+    underlying_builder = underlying_attn_backend.get_builder_cls()
+
+    class FastPrefillAttentionBuilder(underlying_builder):  # type: ignore
+
+        def build(self,
+                  common_prefix_len: int,
+                  common_attn_metadata: CommonAttentionMetadata,
+                  fast_build: bool = False) -> AttentionMetadata:
+            new_common_attn_metadata =\
+                make_kv_sharing_fast_prefill_common_attn_metadata(common_attn_metadata)
+            metadata = super().build(common_prefix_len,
                                      new_common_attn_metadata, fast_build)
-        return create_kv_sharing_fast_prefill_attn_metadata_subclass(
-            metadata, common_attn_metadata)
+            return create_kv_sharing_fast_prefill_attn_metadata_subclass(
+                metadata, common_attn_metadata)
 
-    # Dynamically create a new attention backend that wraps the
-    # underlying attention backend but applies
-    # `build_preproces_fn` before calling `build(...)`
-    builder_cls = subclass_attention_metadata_builder(
-        name_prefix=prefix,
-        builder_cls=underlying_attn_backend.get_builder_cls(),
-        build=build,
-    )
     attn_backend = subclass_attention_backend(
         name_prefix=prefix,
         attention_backend_cls=underlying_attn_backend,
-        builder_cls=builder_cls)
+        builder_cls=FastPrefillAttentionBuilder)
 
     return attn_backend

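Note on the change above: the dynamically generated builder (built with type() and a monkey-patched build, which is why the old code needed super(self.__class__, self)) is replaced by an ordinary subclass defined inside create_fast_prefill_custom_backend, where a plain super() call resolves normally. A minimal standalone sketch of the two patterns, using placeholder names (BaseBuilder, FastPrefillBuilder) rather than the real vLLM classes:

# Minimal sketch (placeholder classes, not vLLM APIs) contrasting the old
# type()-based wrapper with the new plain subclass used in the diff above.
class BaseBuilder:

    def build(self, common_prefix_len: int) -> str:
        return f"base(prefix={common_prefix_len})"


# Old pattern: create the subclass dynamically and monkey-patch `build`.
# Zero-argument super() is unavailable in a free function, hence
# super(type(self), self).
def _patched_build(self, common_prefix_len: int) -> str:
    return "fast_prefill+" + super(type(self), self).build(common_prefix_len)


WrappedBuilder = type("FastPrefillBaseBuilder", (BaseBuilder, ),
                      {"build": _patched_build})


# New pattern: an ordinary subclass; super() resolves normally and the class
# can be handed to subclass_attention_backend() like any builder class.
class FastPrefillBuilder(BaseBuilder):

    def build(self, common_prefix_len: int) -> str:
        return "fast_prefill+" + super().build(common_prefix_len)


assert WrappedBuilder().build(1) == FastPrefillBuilder().build(1)
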
vllm/v1/worker/gpu_model_runner.py

Lines changed: 8 additions & 23 deletions
@@ -2993,9 +2993,7 @@ def _reshape_kv_cache_tensors(
         for kv_cache_spec, group in self._kv_cache_spec_attn_group_iterator():
             attn_backend = group.backend
             for layer_name in group.layer_names:
-                if (
-                    layer_name in self.runner_only_attn_layers
-                ):
+                if layer_name in self.runner_only_attn_layers:
                     continue
                 raw_tensor = kv_cache_raw_tensors[layer_name]
                 assert raw_tensor.numel() % kv_cache_spec.page_size_bytes == 0

@@ -3110,26 +3108,12 @@ def initialize_kv_cache_tensors(
         kv_caches = self._reshape_kv_cache_tensors(kv_cache_config,
                                                    kv_cache_raw_tensors)
 
-        # Setup `kv_cache_config` and `kv_caches` for models
-        # with cross-layer KV sharing
-        if self.shared_kv_cache_layers:
-            initialize_kv_cache_for_kv_sharing(
-                self.shared_kv_cache_layers,
-                kv_cache_config.kv_cache_groups,
-                kv_caches,
-                self.attn_groups,
-                self.runner_only_attn_layers,
-            )
-            attn_layers = get_layers_from_vllm_config(self.vllm_config,
-                                                      Attention)
-            # Iterate in reversed order and add layers that re-use KV cache
-            # e.g. in YOCO-like KV sharing setups (e.g. Gemma3n)
-            for layer_name in reversed(attn_layers):
-                if layer_name in self.shared_kv_cache_layers:
-                    self.kv_sharing_fast_prefill_eligible_layers.add(
-                        layer_name)
-                else:
-                    break
+        # Set up cross-layer KV cache sharing
+        for layer_name, target_layer_name in self.shared_kv_cache_layers.items(
+        ):
+            logger.debug("%s reuses KV cache of %s", layer_name,
+                         target_layer_name)
+            kv_caches[layer_name] = kv_caches[target_layer_name]
 
         bind_kv_cache(kv_caches,
                       self.compilation_config.static_forward_context,

@@ -3149,6 +3133,7 @@ def maybe_add_kv_sharing_layers_to_kv_cache_groups(
         add_kv_sharing_layers_to_kv_cache_groups(
             self.shared_kv_cache_layers,
             kv_cache_config.kv_cache_groups,
+            self.runner_only_attn_layers,
         )
 
     def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:

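Note on the runner change above: the removed initialize_kv_cache_for_kv_sharing call is replaced by a direct loop that aliases each consumer layer's KV cache entry to the tensor of the layer it shares with. A small self-contained sketch of that aliasing (layer names and the tensor shape are made up for illustration):

# Standalone sketch of the cross-layer KV sharing setup: consumer layers
# simply reuse (alias) the cache tensor of the layer they share with.
# Layer names and the tensor shape below are illustrative only.
import torch

kv_caches = {
    "model.layers.0.attn": torch.zeros(2, 16, 8),  # producer owns the storage
}
shared_kv_cache_layers = {
    # consumer layer name -> layer whose KV cache it reuses
    "model.layers.1.attn": "model.layers.0.attn",
}

for layer_name, target_layer_name in shared_kv_cache_layers.items():
    # No extra allocation: both dict entries point at the same tensor.
    kv_caches[layer_name] = kv_caches[target_layer_name]

assert (kv_caches["model.layers.1.attn"].data_ptr() ==
        kv_caches["model.layers.0.attn"].data_ptr())
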
vllm/v1/worker/utils.py

Lines changed: 0 additions & 3 deletions
@@ -201,9 +201,6 @@ def gather_mm_placeholders(
 def add_kv_sharing_layers_to_kv_cache_groups(
     shared_kv_cache_layers: dict[str, str],
     kv_cache_groups: list[KVCacheGroupSpec],
-    kv_caches: dict[str, torch.Tensor],
-    # Optional for now to avoid breaking TPU
-    attn_groups: Optional[list[list[AttentionGroup]]] = None,
     runner_only_attn_layers: Optional[set[str]] = None,
 ) -> None:
     """

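For context on the signature change above: the kv_caches and attn_groups parameters are no longer needed because cache aliasing now happens in gpu_model_runner.py (see the hunk above), leaving the helper responsible only for placing KV-sharing layers into cache groups. The rough sketch below illustrates that idea under simplified assumptions: groups are modeled as plain lists of layer names instead of KVCacheGroupSpec, and the skip rule for runner_only_attn_layers is an assumption, not the real implementation.

# Rough sketch only, not the actual vLLM helper.
from typing import Optional


def add_kv_sharing_layers_to_kv_cache_groups_sketch(
    shared_kv_cache_layers: dict[str, str],
    kv_cache_groups: list[list[str]],
    runner_only_attn_layers: Optional[set[str]] = None,
) -> None:
    for layer_name, target_layer_name in shared_kv_cache_layers.items():
        if runner_only_attn_layers and layer_name in runner_only_attn_layers:
            # Assumed rule: layers handled entirely inside the runner stay
            # out of the cache groups.
            continue
        for group in kv_cache_groups:
            if target_layer_name in group:
                # The consumer joins the same group as the layer it reuses.
                group.append(layer_name)
                break


groups = [["model.layers.0.attn"]]
add_kv_sharing_layers_to_kv_cache_groups_sketch(
    {"model.layers.1.attn": "model.layers.0.attn"}, groups)
assert groups == [["model.layers.0.attn", "model.layers.1.attn"]]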