
Commit be5bcbc

[feat] cherry-pick KVComp in NPU -- HBM version into the 0.2.0-release branch (#619)
# Purpose

What this PR does / why we need it?

Cherry-picks KVComp in NPU (HBM version) into the 0.2.0-release branch.

Co-authored-by: Lei Deng <[email protected]>
1 parent db04137 commit be5bcbc

File tree

8 files changed: +533 −78 lines


examples/offline_inference_kvcomphbm.py

Lines changed: 1 addition & 1 deletion
@@ -77,7 +77,7 @@ def build_llm_with_uc(module_path: str, name: str, model: str):
                 },
             }
         ],
-        "ucm_sparse_config": {"GSA": {}},
+        "ucm_sparse_config": {"KvCompOnDevice": {}},
     },
 )
 
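For context, a minimal sketch of how this option is typically passed to vLLM. Only the ucm_sparse_config entry comes from this diff; the connector name, role, and model path below are placeholders, not taken from this commit.

from vllm import LLM
from vllm.config import KVTransferConfig

# Sketch only: surrounding keys are assumptions, not part of this diff.
llm = LLM(
    model="/path/to/model",                       # placeholder
    kv_transfer_config=KVTransferConfig(
        kv_connector="UnifiedCacheConnectorV1",   # assumed UCM connector name
        kv_role="kv_both",
        kv_connector_extra_config={
            "ucm_sparse_config": {"KvCompOnDevice": {}},  # from this diff
        },
    ),
)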

examples/ucm_config_example.yaml

Lines changed: 1 addition & 2 deletions
@@ -31,8 +31,7 @@ load_only_first_rank: false
 # Or for GSA:
 #   GSA: {}
 # Or for KvCompOnDevice:
-#   GSA:
-#     "kvcompOnDevice_config_path": "workspace/unified-cache-management/ucm/sparse/kvcomp/configs/kvcomp_qwen3_32B_config.json"
+#   KvCompOnDevice: {}
 
 
 # Whether to use layerwise loading/saving (optional, default: True for UCMConnector)

ucm/integration/vllm/patch/0.9.2/vllm-ascend-adapt.patch

Lines changed: 230 additions & 4 deletions
@@ -11,7 +11,7 @@ Subject: [PATCH] modify ascend patch for register_kv_cache
  4 files changed, 204 insertions(+), 16 deletions(-)
 
 diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py
-index 7d7f488f..ea982244 100644
+index 7d7f488..ea98224 100644
 --- a/vllm_ascend/attention/attention_v1.py
 +++ b/vllm_ascend/attention/attention_v1.py
 @@ -24,6 +24,9 @@ import torch_npu
@@ -129,7 +129,7 @@ index 7d7f488f..ea982244 100644
  def unified_attention_with_output_fake(
      query: torch.Tensor,
 diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py
-index f50fe56e..ae8f50bf 100644
+index f50fe56..ae8f50b 100644
 --- a/vllm_ascend/attention/mla_v1.py
 +++ b/vllm_ascend/attention/mla_v1.py
 @@ -13,10 +13,12 @@ from vllm.distributed import get_tensor_model_parallel_world_size
@@ -396,7 +396,7 @@ index eabcdbcc..2762fbc7 100644
 +        ucm_sparse.request_finished_in_worker(request_id)
 \ No newline at end of file
 diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py
-index df03d508..5d5d9b5a 100644
+index df03d50..5d5d9b5 100644
 --- a/vllm_ascend/worker/worker_v1.py
 +++ b/vllm_ascend/worker/worker_v1.py
 @@ -17,6 +17,7 @@
@@ -468,5 +468,231 @@ index df03d508..5d5d9b5a 100644
      def _init_profiler(self):
          # Torch profiler. Enabled and configured through env vars:
 --
-2.50.1.windows.1
+2.43.0
+
+
+From 9b685ed319bae31f5e56d596499fbd7c7f60c4b0 Mon Sep 17 00:00:00 2001
+From: ldeng <[email protected]>
+Date: Mon, 29 Dec 2025 17:59:28 +0800
+Subject: [PATCH 2/4] update attention_v1 for kvcomp in NPU
+
+---
+ vllm_ascend/attention/attention_v1.py | 59 +++++++++++++++++++--------
+ 1 file changed, 43 insertions(+), 16 deletions(-)
+
+diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py
+index ea98224..b924d8e 100644
+--- a/vllm_ascend/attention/attention_v1.py
++++ b/vllm_ascend/attention/attention_v1.py
+@@ -37,7 +37,7 @@ from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
+                                nd_to_nz_2d, nd_to_nz_spec)
+ 
+ from ucm.sparse.state import get_ucm_sparse, has_ucm_sparse
+-
++import os
+ 
+ class AscendAttentionBackend(AttentionBackend):
+     accept_output_buffer: bool = True
+@@ -132,8 +132,9 @@ class AscendMetadata:
+     # the computed tokens + new tokens None if it is a decoding.
+     query_start_loc: torch.Tensor
+     query_lens: torch.Tensor
++    query_lens_device: torch.Tensor  # (ldeng) added for KVComp
+     seq_lens: torch.Tensor
+-
++    seq_lens_device: torch.Tensor  # (ldeng) added for KVComp
+     # max value of number of tokens across dp group
+     max_num_tokens_across_dp: int = 0
+ 
+@@ -182,15 +183,22 @@ class AscendAttentionMetadataBuilder:
+             block_table[:num_reqs])
+ 
+         query_lens = self.runner.query_lens
++        query_lens_device = query_lens.pin_memory().to(self.runner.device, non_blocking=True)
+         seq_lens = self.runner.seq_lens_cpu[:num_reqs]
++        seq_lens_device = seq_lens.pin_memory().to(self.runner.device, non_blocking=True)
+         slot_mapping = self.runner.slot_mapping_cpu[:num_actual_tokens].to(
+             self.runner.device, non_blocking=True)
+         attn_mask = self.runner.attn_mask
+         attn_state = self.runner.attn_state
+         query_start_loc_cpu = self.runner.query_start_loc_cpu[:num_reqs + 1]
+-        query_start_loc = query_start_loc_cpu.to(self.runner.device,
++        query_start_loc = query_start_loc_cpu.pin_memory().to(self.runner.device,
+                                                  non_blocking=True)
+ 
++        if has_ucm_sparse():
++            ucm_sparse = get_ucm_sparse()
++            if os.getenv("VLLM_HASH_ATTENTION", "0") == "1":
++                ucm_sparse.build_decode_attention_meta_npu(query_lens, seq_lens, block_table)
++
+         if is_310p():
+             if attn_state == AscendAttentionState.PrefillNoCache:
+                 mask_nz = nd_to_nz_2d(attn_mask)
+@@ -206,7 +214,9 @@ class AscendAttentionMetadataBuilder:
+             block_tables=block_table,
+             query_start_loc=query_start_loc,
+             query_lens=query_lens,
++            query_lens_device=query_lens_device,
+             seq_lens=seq_lens,
++            seq_lens_device=seq_lens_device,
+             max_query_len=max_query_len,
+             slot_mapping=slot_mapping,
+             attn_mask=attn_mask,
+@@ -279,8 +289,17 @@ class AscendAttentionBackendImpl(AttentionImpl):
+             shape = [batch_size * seq_len, num_heads, head_size]
+         """
+         num_tokens = query.shape[0]
+-        use_kv_cache_int8 = kv_cache.numel(
+-        ) > 0 and kv_cache[0].dtype == torch.int8
++
++        # In NPU, forward could be called directly, not by unified_ascend_attention_with_output
++        actual_cache = kv_cache[0] if isinstance(kv_cache, tuple) else kv_cache
++        if actual_cache is not None:
++            use_kv_cache_int8 = actual_cache.numel() > 0 and actual_cache.dtype == torch.int8
++        else:
++            use_kv_cache_int8 = False
++        kv_cache = actual_cache
++
++        #use_kv_cache_int8 = kv_cache.numel(
++        #) > 0 and kv_cache[0].dtype == torch.int8
+         if output is None:
+             output = torch.empty(num_tokens,
+                                  self.num_heads,
+@@ -449,14 +468,20 @@ def unified_ascend_attention_with_output(
+     output: torch.Tensor,
+     layer_name: str,
+ ) -> None:
+-    wait_for_kv_layer_from_connector(layer_name)
++    # wait_for_kv_layer_from_connector(layer_name)
+ 
+     forward_context: ForwardContext = get_forward_context()
+     attn_metadata = forward_context.attn_metadata
+     self = forward_context.no_compile_layers[layer_name]
+     kv_cache = self.kv_cache[forward_context.virtual_engine]
+-    if not self.use_mla:
+-        query, _, _, _ = maybe_execute_sparse_attention_begin(query, key, value, layer_name, forward_context)
++
++    # In NPU, during dummy_run, kv_cache could be a empty tensor, so we need to check the length of kv_cache
++    if os.getenv("VLLM_HASH_ATTENTION", "0") == "1" and len(kv_cache) > 0:
++        kv_cache, k_hash = kv_cache
++    else:
++        k_hash = None
++    if attn_metadata is not None:
++        maybe_execute_sparse_attention_begin(query, key, value, layer_name, forward_context, output, k_hash=k_hash)
+     self.impl.forward(self,
+                       query,
+                       key,
+@@ -465,9 +490,10 @@ def unified_ascend_attention_with_output(
+                       attn_metadata,
+                       output,
+                       trace_flag=False)
+-    if not self.use_mla:
++
++    if attn_metadata is not None:
+         maybe_execute_sparse_attention_finished(query, key, value, output, layer_name, forward_context)
+-    maybe_save_kv_layer_to_connector(layer_name, kv_cache)
++    # maybe_save_kv_layer_to_connector(layer_name, kv_cache)
+     return
+ 
+ def wait_for_kv_layer_from_connector(layer_name: str):
+@@ -506,19 +532,20 @@ def maybe_execute_sparse_attention_begin(
+     forward_context: ForwardContext,
+     output: Optional[torch.Tensor] = None,
+     phase: Optional[str] = None,
++    k_hash: Optional[torch.Tensor] = None,
++    decode_ql_nope: Optional[torch.Tensor] = None,
++    decode_q_pe: Optional[torch.Tensor] = None,
+ ):
+     if not has_ucm_sparse():
+-        return query, key, value, output
++        return
+ 
+     ucm_sparse = get_ucm_sparse()
+ 
+     attn_metadata = forward_context.attn_metadata
+     if attn_metadata is None:
+-        return query, key, value, output
++        return
+ 
+-    return ucm_sparse.attention_begin(
+-        query, key, value, layer_name, forward_context, output, phase
+-    )
++    ucm_sparse.attention_begin(query, key, value, layer_name, forward_context, output, phase, k_hash, decode_ql_nope, decode_q_pe)
+ 
+ def maybe_execute_sparse_attention_finished(
+     query: torch.Tensor,
+@@ -555,4 +582,4 @@ direct_register_custom_op(
+     mutates_args=["output"],
+     fake_impl=unified_attention_with_output_fake,
+     dispatch_key="PrivateUse1",
+-)
++)
+\ No newline at end of file
+--
+2.43.0
+
+
+From d82f08b6981ca4d01432309b659bea57b9d18576 Mon Sep 17 00:00:00 2001
+From: ldeng <[email protected]>
+Date: Mon, 29 Dec 2025 18:04:02 +0800
+Subject: [PATCH 3/4] call initialize_kv_hash_cache_tensors_npu to allocate
+ hashk cache in NPUModelRunner when VLLM_HASH_ATTENTION is enabled for KVComp
+
+---
+ vllm_ascend/worker/model_runner_v1.py | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
+index 782b9a3..766316e 100644
+--- a/vllm_ascend/worker/model_runner_v1.py
++++ b/vllm_ascend/worker/model_runner_v1.py
+@@ -1993,6 +1993,11 @@ class NPUModelRunner(LoRAModelRunnerMixin):
+             # KV cache specs.
+             raise ValueError("Unknown KV cache spec type.")
+ 
++        if has_ucm_sparse():
++            ucm_sparse = get_ucm_sparse()
++            if os.getenv("VLLM_HASH_ATTENTION", "0") == "1":
++                ucm_sparse.initialize_kv_hash_cache_tensors_npu(kv_caches, self.device)
++
+         bind_kv_cache(
+             kv_caches,
+             self.vllm_config.compilation_config.static_forward_context,
+--
+2.43.0
+
+
+From 3d28d477a4887e7d5c909a66b448086994751567 Mon Sep 17 00:00:00 2001
+From: ldeng <[email protected]>
+Date: Mon, 29 Dec 2025 18:04:50 +0800
+Subject: [PATCH 4/4] uncomment connector
+
+---
+ vllm_ascend/attention/attention_v1.py | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py
+index b924d8e..ece7d17 100644
+--- a/vllm_ascend/attention/attention_v1.py
++++ b/vllm_ascend/attention/attention_v1.py
+@@ -468,7 +468,7 @@ def unified_ascend_attention_with_output(
+     output: torch.Tensor,
+     layer_name: str,
+ ) -> None:
+-    # wait_for_kv_layer_from_connector(layer_name)
++    wait_for_kv_layer_from_connector(layer_name)
+ 
+     forward_context: ForwardContext = get_forward_context()
+     attn_metadata = forward_context.attn_metadata
+@@ -493,7 +493,7 @@ def unified_ascend_attention_with_output(
+ 
+     if attn_metadata is not None:
+         maybe_execute_sparse_attention_finished(query, key, value, output, layer_name, forward_context)
+-    # maybe_save_kv_layer_to_connector(layer_name, kv_cache)
++    maybe_save_kv_layer_to_connector(layer_name, kv_cache)
+     return
+ 
+ def wait_for_kv_layer_from_connector(layer_name: str):
+--
+2.43.0
 
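The behavioral core of the appended patches is that, with VLLM_HASH_ATTENTION=1, each layer's registered cache becomes a (kv_cache, k_hash) pair that must be unpacked before use, and that it may be empty during dummy runs. A standalone sketch of that unpacking pattern, for illustration only (this helper does not exist in the patch):

import os
from typing import Optional, Tuple

import torch

def split_kv_and_hash(kv_cache) -> Tuple[object, Optional[torch.Tensor]]:
    """Illustrative only. Mirrors the gating added to
    unified_ascend_attention_with_output: when VLLM_HASH_ATTENTION=1 the
    registered cache is a (kv_cache, k_hash) pair; during dummy runs it can
    be empty, in which case no hash tensor is available."""
    if os.getenv("VLLM_HASH_ATTENTION", "0") == "1" and len(kv_cache) > 0:
        kv_cache, k_hash = kv_cache
    else:
        k_hash = None
    return kv_cache, k_hash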

ucm/integration/vllm/ucm_connector.py

Lines changed: 1 addition & 1 deletion
@@ -229,7 +229,7 @@ def _generate_storage_backends(
         return backends
 
     def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
-        if os.getenv("VLLM_HASH_ATTENTION") == "1":
+        if os.getenv("VLLM_HASH_ATTENTION", "0") == "1":
             for layer_name, value in kv_caches.items():
                 kv_cache, k_hash = value
                 self.kv_caches[layer_name] = kv_cache

ucm/sparse/factory.py

Lines changed: 1 addition & 1 deletion
@@ -48,7 +48,7 @@ def create_sparse_method(
 UcmSparseFactory.register_sparse_method("ESA", "ucm.sparse.esa.esa", "ESA")
 UcmSparseFactory.register_sparse_method("KvComp", "ucm.sparse.kvcomp.kvcomp", "KvComp")
 UcmSparseFactory.register_sparse_method(
-    "GSA", "ucm.sparse.kvcomp.kvcomp_hbm", "KvCompOnDevice"
+    "KvCompOnDevice", "ucm.sparse.kvcomp.kvcomp_hbm", "KvCompOnDevice"
 )
 # UcmSparseFactory.register_sparse_method("GSA", "ucm.sparse.gsa.gsa", "GSA")
 UcmSparseFactory.register_sparse_method(
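With this change, the "KvCompOnDevice" key in ucm_sparse_config resolves to the KvCompOnDevice class in ucm.sparse.kvcomp.kvcomp_hbm instead of being aliased under "GSA". A rough sketch of what such a name-to-class registration ultimately resolves to; this is not the factory's actual code, only an illustration of dynamic import by module path and class name.

import importlib

# Illustration only: resolve the registered module path and class name,
# as a lazy factory typically would.
module = importlib.import_module("ucm.sparse.kvcomp.kvcomp_hbm")
KvCompOnDevice = getattr(module, "KvCompOnDevice")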

ucm/sparse/kvcomp/CMakeLists.txt

Lines changed: 6 additions & 2 deletions
@@ -47,5 +47,9 @@ else()
     message(STATUS "Skipping numactl build...")
 endif()
 
-add_subdirectory(hash_retrieval)
-add_subdirectory(ham_dist)
+string(TOLOWER "$ENV{PLATFORM}" PLATFORM_ENV)
+if(PLATFORM_ENV STREQUAL "cuda")
+    message(STATUS "Building kvcomp for CUDA...")
+    add_subdirectory(hash_retrieval)
+    add_subdirectory(ham_dist)
+endif()

ucm/sparse/kvcomp/hamming_topk.py

Lines changed: 2 additions & 1 deletion
@@ -1,6 +1,7 @@
 import torch
 
-from ucm.sparse.kvcomp.ham_dist import hamming
+if hasattr(torch, "cuda") and torch.cuda.is_available():
+    from ucm.sparse.kvcomp.ham_dist import hamming
 
 
 @torch.compile()
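With the import now guarded, the hamming kernel from the ham_dist extension is only pulled in when CUDA is present, matching the CMake change above that skips building hash_retrieval and ham_dist on non-CUDA platforms. For reference, a pure-PyTorch Hamming distance over packed uint8 hash codes could look like the sketch below; this is illustrative only and is not the kernel shipped in ucm.sparse.kvcomp.ham_dist.

import torch

# Lookup table of per-byte popcounts (illustrative fallback, not the CUDA kernel).
_POPCOUNT = torch.tensor([bin(i).count("1") for i in range(256)], dtype=torch.int32)

def hamming_fallback(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    """a: (n, d) uint8, b: (m, d) uint8 -> (n, m) integer Hamming distances."""
    xor = a.unsqueeze(1) ^ b.unsqueeze(0)                 # (n, m, d) byte-wise XOR
    return _POPCOUNT.to(xor.device)[xor.long()].sum(dim=-1)  # count differing bits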
