
Commit f87a8e2
fixes
1 parent 9ba8734

File tree: 15 files changed, +60 −77 lines changed

examples/offline_inference.py

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
 # Create an LLM.
-llm = LLM(model="facebook/opt-125m")
+llm = LLM(model="state-spaces/mamba-370m-hf")
 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
 outputs = llm.generate(prompts, sampling_params)
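
Note: taken together with the unchanged lines of the example, the updated script amounts to roughly the sketch below. The prompt list is an assumption added for illustration; only the SamplingParams, LLM, and generate calls are visible in this diff.

from vllm import LLM, SamplingParams

# Illustrative prompts; the real list lives earlier in examples/offline_inference.py.
prompts = ["Hello, my name is", "The future of AI is"]

# Sampling settings matching the example.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM backed by the Mamba checkpoint used after this commit.
llm = LLM(model="state-spaces/mamba-370m-hf")

# Generate texts from the prompts; each RequestOutput carries the prompt,
# the generated text, and other metadata.
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    print(output.prompt, output.outputs[0].text)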

vllm/attention/backends/placeholder_attn.py

Lines changed: 25 additions & 9 deletions
@@ -6,13 +6,13 @@
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionMetadata,
                                               AttentionMetadataBuilder)
+from vllm.attention.backends.utils import CommonAttentionState
 
 if TYPE_CHECKING:
     from vllm.worker.model_runner import ModelInputForGPUBuilder
 
-# Placeholder attention backend for models like Mamba that don't have attention.
-# Mainly exists to sidestep get_attn_backend.
-# The attention metadata is still needed for Mamba.
+# Placeholder attention backend for models like Mamba and embedding models that
+# lack attention.
 
 
 class PlaceholderAttentionBackend(AttentionBackend):
@@ -34,6 +34,10 @@ def get_builder_cls() -> Type["PlaceholderAttentionMetadataBuilder"]:
     def get_metadata_cls() -> Type["PlaceholderAttentionMetadata"]:
         return PlaceholderAttentionMetadata
 
+    @staticmethod
+    def get_state_cls() -> Type["CommonAttentionState"]:
+        return CommonAttentionState
+
     @staticmethod
     def get_kv_cache_shape(
         num_blocks: int,
@@ -118,11 +122,15 @@ def prefill_metadata(self) -> Optional["PlaceholderAttentionMetadata"]:
         assert self.context_lens_tensor is not None
         assert self.seq_start_loc is not None
 
+        # Placeholders
+        slot_mapping = torch.empty(0)
+        block_tables = torch.empty(0)
+
         self._cached_prefill_metadata = PlaceholderAttentionMetadata(
             num_prefills=self.num_prefills,
             num_prefill_tokens=self.num_prefill_tokens,
             num_decode_tokens=0,
-            slot_mapping=None,
+            slot_mapping=slot_mapping,
             seq_lens=self.seq_lens[:self.num_prefills],
             seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills],
             max_query_len=self.max_query_len,
@@ -131,7 +139,7 @@ def prefill_metadata(self) -> Optional["PlaceholderAttentionMetadata"]:
             query_start_loc=self.query_start_loc[:self.num_prefills + 1],
             seq_start_loc=self.seq_start_loc[:self.num_prefills + 1],
             context_lens_tensor=self.context_lens_tensor[:self.num_prefills],
-            block_tables=None,
+            block_tables=block_tables,
             use_cuda_graph=False,
         )
         return self._cached_prefill_metadata
@@ -145,11 +153,15 @@ def decode_metadata(self) -> Optional["PlaceholderAttentionMetadata"]:
             return self._cached_decode_metadata
         assert self.seq_lens_tensor is not None
 
+        # Placeholders
+        slot_mapping = torch.empty(0)
+        block_tables = torch.empty(0)
+
         self._cached_decode_metadata = PlaceholderAttentionMetadata(
             num_prefills=0,
             num_prefill_tokens=0,
             num_decode_tokens=self.num_decode_tokens,
-            slot_mapping=None,
+            slot_mapping=slot_mapping,
             seq_lens=None,
             seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:],
             max_query_len=None,
@@ -158,7 +170,7 @@ def decode_metadata(self) -> Optional["PlaceholderAttentionMetadata"]:
             query_start_loc=None,
             seq_start_loc=None,
             context_lens_tensor=None,
-            block_tables=None,
+            block_tables=block_tables,
             use_cuda_graph=self.use_cuda_graph,
         )
         return self._cached_decode_metadata
@@ -266,9 +278,13 @@ def build(self, seq_lens: List[int], query_lens: List[int],
                      dtype=query_start_loc.dtype,
                      out=query_start_loc[1:])
 
+        # Placeholders
+        slot_mapping = torch.empty(0)
+        block_tables = torch.empty(0)
+
         return PlaceholderAttentionMetadata(
             num_prefills=self.num_prefills,
-            slot_mapping=None,
+            slot_mapping=slot_mapping,
             num_prefill_tokens=self.num_prefill_tokens,
             num_decode_tokens=num_decode_tokens,
             seq_lens=seq_lens,
@@ -279,7 +295,7 @@ def build(self, seq_lens: List[int], query_lens: List[int],
             query_start_loc=query_start_loc,
             seq_start_loc=seq_start_loc,
             context_lens_tensor=context_lens_tensor,
-            block_tables=None,
+            block_tables=block_tables,
             use_cuda_graph=use_captured_graph,
         )
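
Note: a minimal sketch (not part of the diff) of why the metadata now carries empty tensors rather than None: downstream code can move or inspect slot_mapping and block_tables without special-casing a missing value.

import torch

# Placeholders as used above: real tensors with zero elements.
slot_mapping = torch.empty(0)   # no real slots for an attention-free model
block_tables = torch.empty(0)   # no paged KV-cache blocks either

assert slot_mapping.numel() == 0 and block_tables.numel() == 0
slot_mapping = slot_mapping.to("cpu")   # works; None.to(...) would raise
print(block_tables.shape)               # torch.Size([0])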

vllm/attention/layer.py

Lines changed: 4 additions & 4 deletions
@@ -78,10 +78,10 @@ def __init__(
         # During model initialization, the default dtype is set as the model
         # weight and activation dtype.
         dtype = torch.get_default_dtype()
-        attn_backend = get_attn_backend(num_heads, head_size, num_kv_heads,
-                                        sliding_window, dtype, kv_cache_dtype,
-                                        block_size, is_attention_free,
-                                        blocksparse_params is not None)
+        attn_backend = get_attn_backend(head_size, sliding_window, dtype,
+                                        kv_cache_dtype, block_size,
+                                        is_attention_free, blocksparse_params
+                                        is not None)
         impl_cls = attn_backend.get_impl_cls()
         self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads,
                              alibi_slopes, sliding_window, kv_cache_dtype,

vllm/attention/selector.py

Lines changed: 3 additions & 8 deletions
@@ -89,14 +89,12 @@ def get_global_forced_attn_backend() -> Optional[_Backend]:
 
 @lru_cache(maxsize=None)
 def get_attn_backend(
-    num_heads: int,
     head_size: int,
-    num_kv_heads: int,
     sliding_window: Optional[int],
     dtype: torch.dtype,
     kv_cache_dtype: Optional[str],
     block_size: int,
-    is_attention_free: bool, #TODO: pass in from all users
+    is_attention_free: bool,
     is_blocksparse: bool = False,
 ) -> Type[AttentionBackend]:
     """Selects which attention backend to use and lazily imports it."""
@@ -107,9 +105,8 @@ def get_attn_backend(
             BlocksparseFlashAttentionBackend)
         return BlocksparseFlashAttentionBackend
 
-    backend = which_attn_to_use(num_heads, head_size, num_kv_heads,
-                                sliding_window, dtype, kv_cache_dtype,
-                                block_size, is_attention_free)
+    backend = which_attn_to_use(head_size, sliding_window, dtype,
+                                kv_cache_dtype, block_size, is_attention_free)
     if backend == _Backend.FLASH_ATTN:
         from vllm.attention.backends.flash_attn import (  # noqa: F401
             FlashAttentionBackend)
@@ -157,9 +154,7 @@ def get_attn_backend(
 
 
 def which_attn_to_use(
-    num_heads: int,
     head_size: int,
-    num_kv_heads: int,
     sliding_window: Optional[int],
     dtype: torch.dtype,
     kv_cache_dtype: Optional[str],
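
Note: a hypothetical call of the slimmed-down selector after this change. The head_size, block_size, and dtype values below are assumptions for illustration, not taken from any vLLM config in this commit.

import torch
from vllm.attention.selector import get_attn_backend

# num_heads / num_kv_heads are no longer passed; only these arguments remain.
backend_cls = get_attn_backend(
    head_size=64,             # illustrative value
    sliding_window=None,
    dtype=torch.float16,
    kv_cache_dtype=None,
    block_size=16,            # illustrative value
    is_attention_free=True,   # e.g. a Mamba model
)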

vllm/model_executor/models/jamba.py

Lines changed: 5 additions & 5 deletions
@@ -616,10 +616,9 @@ def forward(self,
             num_mamba_layers = sum(
                 [layer_type == "mamba" for layer_type in layers_type])
 
-            self.mamba_cache = MambaCacheManager(self.lm_head.weight.dtype,
-                                                 num_mamba_layers,
-                                                 max_batch_size,
-                                                 *self._get_mamba_cache_shape())
+            self.mamba_cache = MambaCacheManager(
+                self.lm_head.weight.dtype, num_mamba_layers, max_batch_size,
+                *self._get_mamba_cache_shape())
 
         if "seqlen_agnostic_capture_inputs" not in kwargs:
             # We get here only on Prefill/Eager mode runs
@@ -645,7 +644,8 @@ def forward(self,
                                               mamba_cache_tensors[1])
         return hidden_states
 
-    def _get_mamba_cache_shape(self) -> Tuple[Tuple[int, int], Tuple[int, int]]:
+    def _get_mamba_cache_shape(
+            self) -> Tuple[Tuple[int, int], Tuple[int, int]]:
         world_size = get_tensor_model_parallel_world_size()
         hidden_size = self.config.hidden_size
         conv_state_shape = (

vllm/model_executor/models/mamba.py

Lines changed: 5 additions & 5 deletions
@@ -440,10 +440,9 @@ def forward(self,
             max_batch_size = (_get_graph_batch_size(
                 self.scheduler_config.max_num_seqs) if self.scheduler_config
                               else max(_BATCH_SIZES_TO_CAPTURE) + 2)
-            self.mamba_cache = MambaCacheManager(self.lm_head.weight.dtype,
-                                                 self.config.num_hidden_layers,
-                                                 max_batch_size,
-                                                 *self._get_mamba_cache_shape())
+            self.mamba_cache = MambaCacheManager(
+                self.lm_head.weight.dtype, self.config.num_hidden_layers,
+                max_batch_size, *self._get_mamba_cache_shape())
 
         if "seqlen_agnostic_capture_inputs" not in kwargs:
             # We get here only on Prefill/Eager mode runs
@@ -471,7 +470,8 @@ def forward(self,
 
         return hidden_states
 
-    def _get_mamba_cache_shape(self) -> Tuple[Tuple[int, int], Tuple[int, int]]:
+    def _get_mamba_cache_shape(
+            self) -> Tuple[Tuple[int, int], Tuple[int, int]]:
         world_size = get_tensor_model_parallel_world_size()
         conv_state_shape = (
             self.config.intermediate_size // world_size,

vllm/model_executor/models/mamba_cache.py

Lines changed: 2 additions & 4 deletions
@@ -1,14 +1,12 @@
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional
 
 import torch
 
-from vllm.distributed import get_tensor_model_parallel_world_size
-
 
 class MambaCacheManager:
 
     def __init__(self, dtype, num_mamba_layers, max_batch_size,
-                 conv_state_shape, temporal_state_shape):
+                 conv_state_shape, temporal_state_shape):
 
         conv_state = torch.empty(size=(num_mamba_layers, max_batch_size) +
                                  conv_state_shape,
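
Note: a sketch of constructing MambaCacheManager with this signature. The shape and layer-count values below are assumptions for illustration; in practice they come from _get_mamba_cache_shape() in mamba.py / jamba.py and depend on the model config and tensor-parallel size.

import torch
from vllm.model_executor.models.mamba_cache import MambaCacheManager

# Illustrative (conv, temporal) state shapes per layer and per sequence.
conv_state_shape = (1024, 3)
temporal_state_shape = (1024, 16)

# The manager allocates (num_mamba_layers, max_batch_size, *shape) tensors internally.
mamba_cache = MambaCacheManager(torch.float16,          # dtype
                                num_mamba_layers=24,    # assumed layer count
                                max_batch_size=8,       # assumed batch size
                                conv_state_shape=conv_state_shape,
                                temporal_state_shape=temporal_state_shape)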

vllm/worker/cache_engine.py

Lines changed: 6 additions & 6 deletions
@@ -52,12 +52,12 @@ def __init__(
         self.dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype]
 
         # Get attention backend.
-        self.attn_backend = get_attn_backend(
-            model_config.get_num_attention_heads(parallel_config),
-            self.head_size, self.num_kv_heads,
-            model_config.get_sliding_window(), model_config.dtype,
-            cache_config.cache_dtype, self.block_size,
-            model_config.is_attention_free())
+        self.attn_backend = get_attn_backend(self.head_size,
+                                             model_config.get_sliding_window(),
+                                             model_config.dtype,
+                                             cache_config.cache_dtype,
+                                             self.block_size,
+                                             model_config.is_attention_free())
 
         # Initialize the cache.
         self.gpu_cache = self._allocate_kv_cache(

vllm/worker/cpu_model_runner.py

Lines changed: 1 addition & 2 deletions
@@ -103,13 +103,12 @@ def __init__(
         self.sliding_window = model_config.get_sliding_window()
         self.block_size = cache_config.block_size
         self.attn_backend = get_attn_backend(
-            self.model_config.get_num_attention_heads(self.parallel_config),
             self.model_config.get_head_size(),
-            self.model_config.get_num_kv_heads(self.parallel_config),
             self.model_config.get_sliding_window(),
             self.model_config.dtype,
             self.kv_cache_dtype,
             self.block_size,
+            self.model_config.is_attention_free(),
         )
 
         # Multi-modal data support

vllm/worker/cpu_worker.py

Lines changed: 1 addition & 2 deletions
@@ -55,13 +55,12 @@ def __init__(self, cache_config: CacheConfig, model_config: ModelConfig,
 
         # Get attention backend.
         self.attn_backend = get_attn_backend(
-            self.model_config.get_num_attention_heads(self.parallel_config),
             self.model_config.get_head_size(),
-            self.model_config.get_num_kv_heads(self.parallel_config),
             self.model_config.get_sliding_window(),
             self.model_config.dtype,
             cache_config.cache_dtype,
             self.block_size,
+            self.model_config.is_attention_free(),
         )
 
         # Initialize the cache.
