
Commit c6ac725

add KVTransferParams class back to base.py and change block handling in nixl connector
1 parent: 188ae95

File tree

2 files changed: +57 -8 lines

vllm/distributed/kv_transfer/kv_connector/v1/base.py

Lines changed: 39 additions & 7 deletions
@@ -32,7 +32,7 @@
 
 import enum
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any, Callable, Literal, Optional
 
 import torch
 
@@ -46,6 +46,12 @@
     from vllm.v1.core.kv_cache_manager import KVCacheBlocks
     from vllm.v1.request import Request
 
+# s_tensor_list, d_tensor_list, s_indices, d_indices, direction
+CopyBlocksOp = Callable[[
+    dict[str, torch.Tensor], dict[
+        str, torch.Tensor], list[int], list[int], Literal["h2d", "d2h"]
+], None]
+
 logger = init_logger(__name__)
 
 
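Note on the CopyBlocksOp alias added above: it names the signature of a host/device block-copy callback. As a rough, hypothetical sketch (not part of this commit; the per-block indexing on dim 0 and the pinned-host assumption are mine), a conforming function could look like:

from typing import Literal

import torch


def copy_kv_blocks(
    src_caches: dict[str, torch.Tensor],
    dst_caches: dict[str, torch.Tensor],
    src_block_ids: list[int],
    dst_block_ids: list[int],
    direction: Literal["h2d", "d2h"],
) -> None:
    """Hypothetical CopyBlocksOp: copy selected blocks for every layer."""
    # direction ("h2d" or "d2h") is informational in this sketch; the src/dst
    # tensors already determine where the data moves.
    for layer_name, src in src_caches.items():
        dst = dst_caches[layer_name]
        # Assumes each cache tensor is indexed by block id on dim 0.
        blocks = src[src_block_ids]
        # non_blocking copies are only safe if the host buffers are pinned.
        dst[dst_block_ids] = blocks.to(dst.device, non_blocking=True)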
@@ -60,7 +66,7 @@ class KVTransferParams:
     """
     Abstract KVTransferParams used to send KVTransfer
     parameters between instances of vLLM.
-
+
     Specific instances of KVConnector customize this
     method for serializing / deserializing msgs sent
     via the HTTP protocol.
@@ -72,7 +78,7 @@ def from_raw_dict(
                                Any]]) -> Optional["KVTransferParams"]:
         return None
 
-class KVConnectorMetadata:
+class KVConnectorMetadata(ABC):  # noqa: B024
     """
     Abstract Metadata used to communicate between the
     Scheduler KVConnector and Worker KVConnector.
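For context on the two classes touched above: concrete connectors subclass KVTransferParams and override from_raw_dict to parse the dict arriving over HTTP. A minimal hypothetical sketch (the field names are invented; only the base class and from_raw_dict come from this file):

from dataclasses import dataclass
from typing import Any, Optional

from vllm.distributed.kv_transfer.kv_connector.v1.base import KVTransferParams


@dataclass
class MyTransferParams(KVTransferParams):
    """Hypothetical per-request transfer parameters."""
    remote_engine_id: Optional[str] = None
    remote_block_ids: Optional[list[int]] = None

    @staticmethod
    def from_raw_dict(
            raw_dict: Optional[dict[str, Any]]
    ) -> Optional["MyTransferParams"]:
        # Tolerate a missing dict and ignore unknown keys sent over HTTP.
        if raw_dict is None:
            return None
        return MyTransferParams(
            remote_engine_id=raw_dict.get("remote_engine_id"),
            remote_block_ids=raw_dict.get("remote_block_ids"),
        )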
@@ -87,7 +93,7 @@ def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
         logger.warning(
             "Initializing KVConnectorBase_V1. This API is experimental and "
             "subject to change in the future as we iterate the design.")
-        self._connector_metadata = KVConnectorMetadata()
+        self._connector_metadata: Optional[KVConnectorMetadata] = None
         self._vllm_config = vllm_config
         self._role = role
 
@@ -118,7 +124,7 @@ def clear_connector_metadata(self) -> None:
         This function should be called by the model runner every time
         after the model execution.
         """
-        self._connector_metadata = KVConnectorMetadata()
+        self._connector_metadata = None
 
     def _get_connector_metadata(self) -> KVConnectorMetadata:
         """Get the connector metadata.
@@ -128,6 +134,9 @@ def _get_connector_metadata(self) -> KVConnectorMetadata:
         Returns:
             ConnectorMetadata: the connector metadata.
         """
+
+        # Should only be called while set to valid metadata.
+        assert self._connector_metadata is not None
     return self._connector_metadata
 
     def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
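On the metadata handling changed above: _connector_metadata now starts as None and is reset to None by clear_connector_metadata(), so _get_connector_metadata() asserts it is only read while metadata is bound for the current step. A connector's metadata is just a plain container subclassing the (now abstract) KVConnectorMetadata; a minimal hypothetical example:

from dataclasses import dataclass, field

from vllm.distributed.kv_transfer.kv_connector.v1.base import (
    KVConnectorMetadata)


@dataclass
class MyConnectorMetadata(KVConnectorMetadata):
    """Hypothetical payload from scheduler-side to worker-side connector."""
    # Request id -> block ids whose KV should be transferred this step.
    reqs_to_transfer: dict[str, list[int]] = field(default_factory=dict)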
@@ -140,6 +149,13 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
         """
         return
 
+    def set_host_xfer_buffer_ops(self, copy_operation: CopyBlocksOp):
+        """
+        Set the xPU-specific ops for copying KV between host and device.
+        Needed when host buffer is used for kv transfer (e.g., in NixlConnector)
+        """
+        return
+
     @abstractmethod
     def start_load_kv(self, forward_context: "ForwardContext",
                       **kwargs) -> None:
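The new set_host_xfer_buffer_ops hook above lets the model runner inject a platform-specific CopyBlocksOp when KV is staged through a host buffer. A rough illustration of how a worker-side connector might hold and use it (the class, attribute, and method bodies below are invented, not from this commit):

from typing import Optional

import torch

# CopyBlocksOp is the alias added to base.py in this commit.
from vllm.distributed.kv_transfer.kv_connector.v1.base import CopyBlocksOp


class HostBufferWorkerSketch:
    """Illustrative fragment, not a real connector implementation."""

    def __init__(self):
        self._copy_blocks: Optional[CopyBlocksOp] = None
        self.device_kv_caches: dict[str, torch.Tensor] = {}
        self.host_xfer_buffers: dict[str, torch.Tensor] = {}

    def set_host_xfer_buffer_ops(self, copy_operation: CopyBlocksOp):
        # Injected by the model runner on xPU platforms.
        self._copy_blocks = copy_operation

    def save_blocks_to_host(self, block_ids: list[int]) -> None:
        # Device -> host staging before handing blocks to the transfer engine.
        if self._copy_blocks is not None:
            self._copy_blocks(self.device_kv_caches, self.host_xfer_buffers,
                              block_ids, block_ids, "d2h")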
@@ -206,7 +222,9 @@ def get_finished(
     ) -> tuple[Optional[set[str]], Optional[set[str]]]:
         """
         Notifies worker-side connector ids of requests that have
-        finished generating tokens.
+        finished generating tokens on the worker.
+        The scheduler process (via the Executors) will use this output
+        to track which workers are done.
 
         Returns:
             ids of requests that have finished asynchronous transfer
@@ -226,7 +244,7 @@ def set_kv_transfer_params(self, request: "Request"):
         kv_transfer_params = self._KVTransferParams.from_raw_dict(
             request.raw_kv_transfer_params)
         request.kv_transfer_params = kv_transfer_params
-
+
     @abstractmethod
     def get_num_new_matched_tokens(
         self,
@@ -303,3 +321,17 @@ def request_finished(
             returned by the engine.
         """
         return False, None
+
+    @classmethod
+    def get_required_kvcache_layout(
+            cls, vllm_config: "VllmConfig") -> Optional[str]:
+        """
+        Get the required KV cache layout for this connector.
+        Args:
+            vllm_config (VllmConfig): the vllm config.
+
+        Returns:
+            str: the required KV cache layout. e.g. HND, or NHD.
+            None if the connector does not require a specific layout.
+        """
+        return None
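The new get_required_kvcache_layout classmethod above gives connectors a way to constrain the KV cache layout. A minimal hypothetical override (the subclass below is illustrative and omits the abstract worker/scheduler methods, so it cannot be instantiated as-is):

from typing import TYPE_CHECKING, Optional

from vllm.distributed.kv_transfer.kv_connector.v1.base import (
    KVConnectorBase_V1)

if TYPE_CHECKING:
    from vllm.config import VllmConfig


class LayoutPinnedConnector(KVConnectorBase_V1):
    """Hypothetical connector that only works with the NHD layout."""

    @classmethod
    def get_required_kvcache_layout(
            cls, vllm_config: "VllmConfig") -> Optional[str]:
        # Returning None would mean "no layout constraint".
        return "NHD"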

vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py

Lines changed: 18 additions & 1 deletion
@@ -734,16 +734,33 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
             block_size, kv_latent_dim = block_shape
             self.slot_size_bytes = kv_elem_size * kv_latent_dim
         else:
+            # [2 (k and v), num_blocks, ...]
+            #if self._use_flashinfer:
+            #    # FlashInfer swaps 2<->num_blocks dimensions.
+            #    self.num_blocks = first_kv_cache.shape[0]
+            #    block_rank = 4  # [2, block_size, kv_heads, head_dim]
+            #else:
+            #    self.num_blocks = first_kv_cache.shape[1]
+            #    block_rank = 3  # [block_size, kv_heads, head_dim]
+            #block_shape = first_kv_cache.shape[-block_rank:]
+            #block_size, n_kv_heads, head_dim = block_shape[-3:]
+
+            # TODO see if below is necessary, else uncomment above
             # [2 (k and v), num_blocks, ...]
             if self._use_flashinfer:
                 # FlashInfer swaps 2<->num_blocks dimensions.
                 self.num_blocks = first_kv_cache.shape[0]
                 block_rank = 4  # [2, block_size, kv_heads, head_dim]
             else:
-                self.num_blocks = first_kv_cache.shape[1]
+                # habana kv_cache: [2, num_blocks*block_size, kv_heads, head_dim]
+                self.num_blocks = first_kv_cache.shape[1] // self.block_size
                 block_rank = 3  # [block_size, kv_heads, head_dim]
             block_shape = first_kv_cache.shape[-block_rank:]
+            block_shape = list(block_shape)
+            block_shape[0] = block_shape[0] // self.num_blocks
+            block_shape = torch.Size(block_shape)
             block_size, n_kv_heads, head_dim = block_shape[-3:]
+
             # head size in bytes.
             self.slot_size_bytes = kv_elem_size * n_kv_heads * head_dim
             assert block_size == self.block_size
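To sanity-check the reshaped bookkeeping in the new else-branch, here is a small numeric walk-through under the habana-style layout named in the diff comment, [2, num_blocks * block_size, kv_heads, head_dim]; the concrete sizes below are made up for illustration:

import torch

# Assumed sizes for illustration only.
block_size = 128
num_blocks_expected = 10
kv_heads, head_dim = 8, 64
first_kv_cache = torch.empty(2, num_blocks_expected * block_size, kv_heads,
                             head_dim, dtype=torch.bfloat16)
kv_elem_size = first_kv_cache.element_size()

# Mirrors the new else-branch in register_kv_caches:
num_blocks = first_kv_cache.shape[1] // block_size       # 1280 // 128 = 10
block_rank = 3                                           # [block_size, kv_heads, head_dim]
block_shape = list(first_kv_cache.shape[-block_rank:])   # [1280, 8, 64]
block_shape[0] = block_shape[0] // num_blocks            # 1280 // 10 = 128 (= block_size)
slot_size_bytes = kv_elem_size * kv_heads * head_dim     # 2 * 8 * 64 = 1024

assert num_blocks == num_blocks_expected
assert block_shape == [block_size, kv_heads, head_dim]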
