
Commit c3e2978

[NIXL] fix cpu PD after physical <> logical block_size PR (#28904)
Signed-off-by: Chendi Xue <[email protected]>
1 parent e4bb268 · commit c3e2978

3 files changed: +17 -5 lines

tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh

Lines changed: 7 additions & 2 deletions

@@ -55,7 +55,7 @@ DECODE_BLOCK_SIZE=${DECODE_BLOCK_SIZE:-128}
 # Find the git repository root directory
 GIT_ROOT=$(git rev-parse --show-toplevel)
 
-SMI_BIN=$(which nvidia-smi || which rocm-smi)
+SMI_BIN=$(which nvidia-smi || which rocm-smi || echo "")
 
 # Trap the SIGINT signal (triggered by Ctrl+C)
 trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT

@@ -91,8 +91,13 @@ get_model_args() {
 get_num_gpus() {
   if [[ "$SMI_BIN" == *"nvidia"* ]]; then
     echo "$($SMI_BIN --query-gpu=name --format=csv,noheader | wc -l)"
-  else
+  elif [[ "$SMI_BIN" == *"rocm"* ]]; then
     echo "$($SMI_BIN -l | grep GPU | wc -l)"
+  else
+    # Works for non-CUDA platforms:
+    # assume at least 1 device and
+    # let the system decide which card to use
+    echo "1"
   fi
 }
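
For reference, a minimal Python sketch of the same fallback logic (an
illustration only, not part of the commit): prefer nvidia-smi, then rocm-smi,
and otherwise report a single device so non-CUDA platforms can still run the
accuracy test. The helper name mirrors the shell function but is hypothetical.

    import shutil
    import subprocess

    def get_num_gpus() -> int:
        # Prefer nvidia-smi, then rocm-smi; empty string if neither exists.
        smi = shutil.which("nvidia-smi") or shutil.which("rocm-smi") or ""
        if "nvidia" in smi:
            out = subprocess.run(
                [smi, "--query-gpu=name", "--format=csv,noheader"],
                capture_output=True, text=True,
            ).stdout
            return len(out.splitlines())
        if "rocm" in smi:
            out = subprocess.run([smi, "-l"], capture_output=True, text=True).stdout
            return sum("GPU" in line for line in out.splitlines())
        # No SMI tool found: assume at least one device and let the
        # system decide which card to use.
        return 1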

tools/install_nixl_from_source_ubuntu.py

Lines changed: 1 addition & 0 deletions

@@ -95,6 +95,7 @@ def install_system_dependencies():
         "meson",
         "libtool",
         "libtool-bin",
+        "pkg-config",
     ]
     run_command(["apt-get", "update"])
     run_command(["apt-get", "install", "-y"] + apt_packages)

vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py

Lines changed: 9 additions & 3 deletions

@@ -1161,6 +1161,14 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
         # to better exploit the memory layout (ie num_blocks is the first dim).
         split_k_and_v = self.kv_topo.split_k_and_v
         tensor_size_bytes = None
+
+        # TODO (NickLucche): Get kernel_block_size in a cleaner way
+        # NHD default "view" for non-MLA cache
+        if self.device_type == "cpu":
+            block_size_position = -2
+        else:
+            block_size_position = -2 if self.use_mla else -3
+
         # Enable different block lengths for different layers when MLA is used.
         self.block_len_per_layer = list[int]()
         self.slot_size_per_layer = list[int]()  # HD bytes in kv terms

@@ -1175,9 +1183,7 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
             if base_addr in seen_base_addresses:
                 continue
 
-            # TODO (NickLucche): Get kernel_block_size in a cleaner way
-            # NHD default "view" for non-MLA cache
-            kernel_block_size = cache.shape[-2] if self.use_mla else cache.shape[-3]
+            kernel_block_size = cache.shape[block_size_position]
 
             if self.block_size != kernel_block_size:
                 logger.info_once(
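
To make the connector change above concrete, here is a minimal sketch (not
vLLM code) of the new dimension selection: the block-size axis of a registered
KV-cache tensor is now chosen from device type and MLA usage instead of being
hard-coded per layout. The cache shapes below are illustrative assumptions,
not the exact layouts of any particular backend.

    import torch

    def block_size_dim(device_type: str, use_mla: bool) -> int:
        # Mirrors the block_size_position logic added in register_kv_caches().
        if device_type == "cpu":
            return -2
        return -2 if use_mla else -3

    # Hypothetical non-MLA GPU cache in NHD order:
    # (num_blocks, block_size, num_heads, head_size)
    gpu_cache = torch.empty(10, 16, 8, 64)
    assert gpu_cache.shape[block_size_dim("cuda", use_mla=False)] == 16

    # Hypothetical MLA cache: (num_blocks, block_size, head_size)
    mla_cache = torch.empty(10, 16, 576)
    assert mla_cache.shape[block_size_dim("cuda", use_mla=True)] == 16

    # Hypothetical CPU cache with block_size second-to-last
    cpu_cache = torch.empty(10, 16, 512)
    assert cpu_cache.shape[block_size_dim("cpu", use_mla=False)] == 16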
