Commit 3e11240

address review: clarity

Signed-off-by: nicklucche <[email protected]>
1 parent: 0e8f82a

File tree

2 files changed (+49, -50 lines)

tests/v1/kv_connector/nixl_integration/toy_proxy_server.py

Lines changed: 19 additions & 23 deletions
@@ -151,8 +151,18 @@ async def send_request_to_service(client_info: dict, endpoint: str,
     Send a request to a service using a client from the pool.
     """
     req_data = req_data.copy()
-    req_data['do_remote_decode'] = True
+    req_data['kv_transfer_params'] = {
+        "do_remote_decode": True,
+        "do_remote_prefill": False,
+        "remote_engine_id": None,
+        "remote_block_ids": None,
+        "remote_host": None,
+        "remote_port": None
+    }
     req_data["stream"] = False
+    req_data["max_tokens"] = 1
+    if "stream_options" in req_data:
+        del req_data["stream_options"]
     headers = {
         "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
         "X-Request-Id": request_id
@@ -167,22 +177,14 @@ async def send_request_to_service(client_info: dict, endpoint: str,
 
 
 async def stream_service_response(client_info: dict, endpoint: str,
-                                  req_data: dict, remote_block_ids: list[int],
-                                  remote_engine_id: str, remote_host: str,
-                                  remote_port: int, request_id: str):
+                                  req_data: dict, request_id: str):
     """
     Asynchronously stream response from a service using a client from the pool.
     """
     headers = {
         "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
         "X-Request-Id": request_id
     }
-    req_data = req_data.copy()
-    req_data['do_remote_prefill'] = True
-    req_data["remote_block_ids"] = remote_block_ids
-    req_data['remote_engine_id'] = remote_engine_id
-    req_data["remote_host"] = remote_host
-    req_data["remote_port"] = remote_port
 
     async with client_info['client'].stream("POST",
                                             endpoint,
@@ -209,10 +211,9 @@ async def handle_completions(request: Request):
 
     # Extract the needed fields
     response_json = response.json()
-    remote_block_ids = response_json.get('remote_block_ids', [])
-    remote_engine_id = response_json.get('remote_engine_id', '')
-    remote_host = response_json.get('remote_host', '')
-    remote_port = response_json.get('remote_port', 0)
+    kv_transfer_params = response_json.get('kv_transfer_params', {})
+    if kv_transfer_params:
+        req_data["kv_transfer_params"] = kv_transfer_params
 
     # Get the next decode client in round-robin fashion
     decode_client_info = get_next_client(request.app, 'decode')
@@ -221,15 +222,10 @@ async def handle_completions(request: Request):
 
     # Stream response from decode service
    async def generate_stream():
-        async for chunk in stream_service_response(
-                decode_client_info,
-                "/completions",
-                req_data,
-                remote_block_ids=remote_block_ids,
-                remote_engine_id=remote_engine_id,
-                remote_host=remote_host,
-                remote_port=remote_port,
-                request_id=request_id):
+        async for chunk in stream_service_response(decode_client_info,
+                                                   "/completions",
+                                                   req_data,
+                                                   request_id=request_id):
             yield chunk
 
     return StreamingResponse(generate_stream(),
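Net effect on the proxy: it no longer unpacks individual remote_* fields; it simply forwards whatever kv_transfer_params the prefill response returns into the decode request. A condensed sketch of the round trip, assuming httpx-style async clients named prefill_client and decode_client (hypothetical names; the real server pools clients and streams the decode response):

async def proxy_round_trip(req_data: dict, request_id: str) -> dict:
    headers = {"X-Request-Id": request_id}
    # 1. Prefill: non-streaming, max_tokens=1, seeded kv_transfer_params.
    prefill_resp = await prefill_client.post("/completions",
                                             json=req_data, headers=headers)
    # 2. Forward the returned transfer metadata opaquely to the decoder.
    kv_params = prefill_resp.json().get("kv_transfer_params", {})
    if kv_params:
        req_data["kv_transfer_params"] = kv_params
    # 3. Decode: generates the actual completion, pulling KV via NIXL.
    decode_resp = await decode_client.post("/completions",
                                           json=req_data, headers=headers)
    return decode_resp.json()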

vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py

Lines changed: 30 additions & 27 deletions
@@ -318,6 +318,10 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str):
         logger.info("Initializing NIXL wrapper")
         logger.info("Initializing NIXL worker %s", engine_id)
 
+        # Config.
+        self.vllm_config = vllm_config
+        self.block_size = vllm_config.cache_config.block_size
+
         # Agent.
         self.nixl_wrapper = NixlWrapper(str(uuid.uuid4()), None)
         # Map of engine_id -> {rank0: agent_name0, rank1: agent_name1..}.
@@ -378,7 +382,8 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str):
         self._tp_size: dict[str, int] = {self.engine_id: self.world_size}
         # With heterogeneous TP, P must wait for all assigned D TP workers to
         # finish reading before safely freeing the blocks.
-        self.consumer_notification_counts_by_req = defaultdict(int)
+        self.consumer_notification_counts_by_req: dict[str,
+                                                       int] = defaultdict(int)
 
     @staticmethod
     def _nixl_handshake_listener(metadata: NixlAgentMetadata,
@@ -424,41 +429,39 @@ def _nixl_handshake(self, host: str, port: int):
         # a hack to keep us moving. We will switch when moving to etcd
         # or where we have a single ZMQ socket in the scheduler.
 
-        def handshake(sock, rank: int) -> NixlAgentMetadata:
+        def handshake(path: str, rank: int) -> NixlAgentMetadata:
             # Send query for the request.
-            sock.send(GET_META_MSG)
-            metadata_bytes = sock.recv()
-            decoder = msgspec.msgpack.Decoder(NixlAgentMetadata)
-            metadata = decoder.decode(metadata_bytes)
-            got_metadata_time = time.perf_counter()
-
-            # Register Remote agent.
-            self.add_remote_agent(metadata, rank)
-            setup_agent_time = time.perf_counter()
-
-            logger.debug("NIXL handshake: get metadata took: %s",
-                         got_metadata_time - start_time)
-            logger.debug("NIXL handshake: add agent took: %s",
-                         setup_agent_time - got_metadata_time)
-            return metadata
+            with zmq_ctx(zmq.REQ, path) as sock:
+                sock.send(GET_META_MSG)
+                metadata_bytes = sock.recv()
+                decoder = msgspec.msgpack.Decoder(NixlAgentMetadata)
+                metadata = decoder.decode(metadata_bytes)
+                got_metadata_time = time.perf_counter()
+
+                # Register Remote agent.
+                self.add_remote_agent(metadata, rank)
+                setup_agent_time = time.perf_counter()
+
+                logger.debug("NIXL handshake: get metadata took: %s",
+                             got_metadata_time - start_time)
+                logger.debug("NIXL handshake: add agent took: %s",
+                             setup_agent_time - got_metadata_time)
+                return metadata
 
         # Handshake with remote agent-rank0 first to get the tp_size of remote
         path = f"tcp://{host}:{port}"
         logger.debug("Querying master rank metadata on path: %s", path)
-        with zmq_ctx(zmq.REQ, path) as sock:
-            metadata = handshake(sock, 0)
+        metadata = handshake(path, 0)
 
         # Handshake only with the other TP remote the current local rank will
         # pull from. With homogeneous TP it happens to be the same rank_i.
-        d_workers_per_p_worker = self._tp_size[
-            self.engine_id] // metadata.tp_size
-        p_remote_rank = self.rank // d_workers_per_p_worker
+        tp_rate = self._tp_size[self.engine_id] // metadata.tp_size
+        p_remote_rank = self.rank // tp_rate
         if p_remote_rank > 0:
             path = f"tcp://{host}:{port + p_remote_rank}"
             logger.debug("Querying metadata on path: %s at remote rank %s",
                          path, p_remote_rank)
-            with zmq_ctx(zmq.REQ, path) as sock:
-                metadata = handshake(sock, p_remote_rank)
+            _ = handshake(path, p_remote_rank)
 
     def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
         """Register the KV Cache data in nixl."""
@@ -473,17 +476,17 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
             self.num_blocks = first_kv_cache.shape[0]
             block_rank = 2  # [block_size, latent_dim]
             block_shape = first_kv_cache.shape[-block_rank:]
-            self.block_size, kv_latent_dim = block_shape
+            block_size, kv_latent_dim = block_shape
             self.slot_size_bytes = kv_elem_size * kv_latent_dim
         else:
             # [2 (k and v), num_blocks, block_size, kv_heads, head_dim]
             self.num_blocks = first_kv_cache.shape[1]
             block_rank = 3  # [block_size, kv_heads, head_dim]
             block_shape = first_kv_cache.shape[-block_rank:]
-            self.block_size, n_kv_heads, head_dim = block_shape
+            block_size, n_kv_heads, head_dim = block_shape
             # head size in bytes.
             self.slot_size_bytes = kv_elem_size * n_kv_heads * head_dim
-
+        assert block_size == self.block_size
         # TODO(tms): self.block_len needs to be per-layer for sliding window,
         # hybrid attn, etc
         # block size in bytes