-From c92cb68fd1fa6215cd6d5b207b95c841ac20dbe1 Mon Sep 17 00:00:00 2001
+From 3ce6a6053ca9854d95828ab600e01b848a253a56 Mon Sep 17 00:00:00 2001
 From: wenxinwang <[email protected]>
 Date: Tue, 23 Dec 2025 19:21:33 -0800
-Subject: [PATCH] sparse patch for vllm-ascend
+Subject: [PATCH 1/4] sparse patch for vllm-ascend
 
 ---
  vllm_ascend/attention/attention_v1.py | 80 ++++++++++++++++++++++
@@ -11,7 +11,7 @@ Subject: [PATCH] sparse patch for vllm-ascend
  4 files changed, 201 insertions(+), 16 deletions(-)
 
 diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py
-index 7d7f488f..ea982244 100644
+index 7d7f488..ea98224 100644
 --- a/vllm_ascend/attention/attention_v1.py
 +++ b/vllm_ascend/attention/attention_v1.py
 @@ -24,6 +24,9 @@ import torch_npu
@@ -129,7 +129,7 @@ index 7d7f488f..ea982244 100644
  def unified_attention_with_output_fake(
      query: torch.Tensor,
 diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py
-index f50fe56e..ae8f50bf 100644
+index f50fe56..ae8f50b 100644
 --- a/vllm_ascend/attention/mla_v1.py
 +++ b/vllm_ascend/attention/mla_v1.py
 @@ -13,10 +13,12 @@ from vllm.distributed import get_tensor_model_parallel_world_size
@@ -185,7 +185,7 @@ index f50fe56e..ae8f50bf 100644
 
      return output_padded
 diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
-index eabcdbcc..782b9a3b 100644
+index eabcdbc..782b9a3 100644
 --- a/vllm_ascend/worker/model_runner_v1.py
 +++ b/vllm_ascend/worker/model_runner_v1.py
 @@ -39,7 +39,10 @@ from vllm.config import CompilationLevel, VllmConfig
@@ -386,7 +386,7 @@ index eabcdbcc..782b9a3b 100644
 +        ucm_sparse.request_finished_in_worker(request_id)
 \ No newline at end of file
 diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py
-index df03d508..5d5d9b5a 100644
+index df03d50..5d5d9b5 100644
 --- a/vllm_ascend/worker/worker_v1.py
 +++ b/vllm_ascend/worker/worker_v1.py
 @@ -17,6 +17,7 @@
@@ -458,5 +458,231 @@ index df03d508..5d5d9b5a 100644
     def _init_profiler(self):
         # Torch profiler. Enabled and configured through env vars:
 --
-2.34.1
+2.43.0
+
+
+From 9b685ed319bae31f5e56d596499fbd7c7f60c4b0 Mon Sep 17 00:00:00 2001
+Date: Mon, 29 Dec 2025 17:59:28 +0800
+Subject: [PATCH 2/4] update attention_v1 for kvcomp in NPU
+
+---
+ vllm_ascend/attention/attention_v1.py | 59 +++++++++++++++++++--------
+ 1 file changed, 43 insertions(+), 16 deletions(-)
+
+diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py
+index ea98224..b924d8e 100644
+--- a/vllm_ascend/attention/attention_v1.py
++++ b/vllm_ascend/attention/attention_v1.py
+@@ -37,7 +37,7 @@ from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
+                               nd_to_nz_2d, nd_to_nz_spec)
+ 
+ from ucm.sparse.state import get_ucm_sparse, has_ucm_sparse
+-
++import os
+ 
+ class AscendAttentionBackend(AttentionBackend):
+     accept_output_buffer: bool = True
+@@ -132,8 +132,9 @@ class AscendMetadata:
+     # the computed tokens + new tokens None if it is a decoding.
+     query_start_loc: torch.Tensor
+     query_lens: torch.Tensor
++    query_lens_device: torch.Tensor  # (ldeng) added for KVComp
+     seq_lens: torch.Tensor
+-
++    seq_lens_device: torch.Tensor  # (ldeng) added for KVComp
+     # max value of number of tokens across dp group
+     max_num_tokens_across_dp: int = 0
+ 
+@@ -182,15 +183,22 @@ class AscendAttentionMetadataBuilder:
+                 block_table[:num_reqs])
+ 
+         query_lens = self.runner.query_lens
++        query_lens_device = query_lens.pin_memory().to(self.runner.device, non_blocking=True)
+         seq_lens = self.runner.seq_lens_cpu[:num_reqs]
++        seq_lens_device = seq_lens.pin_memory().to(self.runner.device, non_blocking=True)
+         slot_mapping = self.runner.slot_mapping_cpu[:num_actual_tokens].to(
+             self.runner.device, non_blocking=True)
+         attn_mask = self.runner.attn_mask
+         attn_state = self.runner.attn_state
+         query_start_loc_cpu = self.runner.query_start_loc_cpu[:num_reqs + 1]
+-        query_start_loc = query_start_loc_cpu.to(self.runner.device,
++        query_start_loc = query_start_loc_cpu.pin_memory().to(self.runner.device,
+                                                  non_blocking=True)
+ 
++        if has_ucm_sparse():
++            ucm_sparse = get_ucm_sparse()
++            if os.getenv("VLLM_HASH_ATTENTION", "0") == "1":
++                ucm_sparse.build_decode_attention_meta_npu(query_lens, seq_lens, block_table)
++
+         if is_310p():
+             if attn_state == AscendAttentionState.PrefillNoCache:
+                 mask_nz = nd_to_nz_2d(attn_mask)
+@@ -206,7 +214,9 @@ class AscendAttentionMetadataBuilder:
+             block_tables=block_table,
+             query_start_loc=query_start_loc,
+             query_lens=query_lens,
++            query_lens_device=query_lens_device,
+             seq_lens=seq_lens,
++            seq_lens_device=seq_lens_device,
+             max_query_len=max_query_len,
+             slot_mapping=slot_mapping,
+             attn_mask=attn_mask,
+@@ -279,8 +289,17 @@ class AscendAttentionBackendImpl(AttentionImpl):
+             shape = [batch_size * seq_len, num_heads, head_size]
+         """
+         num_tokens = query.shape[0]
+-        use_kv_cache_int8 = kv_cache.numel(
+-        ) > 0 and kv_cache[0].dtype == torch.int8
++
++        # In NPU, forward can be called directly, not via unified_ascend_attention_with_output
++        actual_cache = kv_cache[0] if isinstance(kv_cache, tuple) else kv_cache
++        if actual_cache is not None:
++            use_kv_cache_int8 = actual_cache.numel() > 0 and actual_cache.dtype == torch.int8
++        else:
++            use_kv_cache_int8 = False
++        kv_cache = actual_cache
++
++        # use_kv_cache_int8 = kv_cache.numel(
++        # ) > 0 and kv_cache[0].dtype == torch.int8
+         if output is None:
+             output = torch.empty(num_tokens,
+                                  self.num_heads,
+@@ -449,14 +468,20 @@ def unified_ascend_attention_with_output(
+     output: torch.Tensor,
+     layer_name: str,
+ ) -> None:
+-    wait_for_kv_layer_from_connector(layer_name)
++    # wait_for_kv_layer_from_connector(layer_name)
+ 
+     forward_context: ForwardContext = get_forward_context()
+     attn_metadata = forward_context.attn_metadata
+     self = forward_context.no_compile_layers[layer_name]
+     kv_cache = self.kv_cache[forward_context.virtual_engine]
+-    if not self.use_mla:
+-        query, _, _, _ = maybe_execute_sparse_attention_begin(query, key, value, layer_name, forward_context)
++
++    # In NPU, during dummy_run, kv_cache could be an empty tensor, so check its length first
++    if os.getenv("VLLM_HASH_ATTENTION", "0") == "1" and len(kv_cache) > 0:
++        kv_cache, k_hash = kv_cache
++    else:
++        k_hash = None
++    if attn_metadata is not None:
++        maybe_execute_sparse_attention_begin(query, key, value, layer_name, forward_context, output, k_hash=k_hash)
+     self.impl.forward(self,
+                       query,
+                       key,
+@@ -465,9 +490,10 @@ def unified_ascend_attention_with_output(
+                       attn_metadata,
+                       output,
+                       trace_flag=False)
+-    if not self.use_mla:
++
++    if attn_metadata is not None:
+         maybe_execute_sparse_attention_finished(query, key, value, output, layer_name, forward_context)
+-    maybe_save_kv_layer_to_connector(layer_name, kv_cache)
++    # maybe_save_kv_layer_to_connector(layer_name, kv_cache)
+     return
+ 
+ def wait_for_kv_layer_from_connector(layer_name: str):
+@@ -506,19 +532,20 @@ def maybe_execute_sparse_attention_begin(
+     forward_context: ForwardContext,
+     output: Optional[torch.Tensor] = None,
+     phase: Optional[str] = None,
++    k_hash: Optional[torch.Tensor] = None,
++    decode_ql_nope: Optional[torch.Tensor] = None,
++    decode_q_pe: Optional[torch.Tensor] = None,
+ ):
+     if not has_ucm_sparse():
+-        return query, key, value, output
++        return
+ 
+     ucm_sparse = get_ucm_sparse()
+ 
+     attn_metadata = forward_context.attn_metadata
+     if attn_metadata is None:
+-        return query, key, value, output
++        return
+ 
+-    return ucm_sparse.attention_begin(
+-        query, key, value, layer_name, forward_context, output, phase
+-    )
++    ucm_sparse.attention_begin(query, key, value, layer_name, forward_context, output, phase, k_hash, decode_ql_nope, decode_q_pe)
+ 
+ def maybe_execute_sparse_attention_finished(
+     query: torch.Tensor,
+@@ -555,4 +582,4 @@ direct_register_custom_op(
+     mutates_args=["output"],
+     fake_impl=unified_attention_with_output_fake,
+     dispatch_key="PrivateUse1",
+- )
++ )
+\ No newline at end of file
+--
+2.43.0
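
A minimal sketch of the host-to-device copy pattern patch 2 applies in the metadata builder: stage the CPU tensor in page-locked (pinned) memory so the transfer issued with non_blocking=True can actually run asynchronously instead of silently falling back to a synchronous copy. The tensor values and the "npu" device string are illustrative assumptions (the device requires torch_npu; substitute "cuda" on stock PyTorch).

import torch

def copy_async(cpu_tensor: torch.Tensor, device: torch.device) -> torch.Tensor:
    # Only page-locked host memory supports truly asynchronous copies;
    # pin_memory() returns a pinned copy of the tensor.
    return cpu_tensor.pin_memory().to(device, non_blocking=True)

# Mirrors the patch: seq_lens stays on the host for bookkeeping while a
# device-side copy (seq_lens_device) is produced for the attention kernels.
seq_lens = torch.tensor([128, 64, 32], dtype=torch.int32)
seq_lens_device = copy_async(seq_lens, torch.device("npu"))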
+
+
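The forward-path change in patch 2 assumes kv_cache can arrive either as a plain tensor or, when VLLM_HASH_ATTENTION=1, as a (kv_cache, k_hash) pair, and that it may be an empty tensor during dummy runs. A standalone sketch of that defensive unpacking, with illustrative names:

import os
from typing import Optional, Tuple, Union
import torch

KVCache = Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]

def split_kv_cache(kv_cache: KVCache) -> Tuple[KVCache, Optional[torch.Tensor]]:
    # Dummy runs hand over an empty tensor, so guard on length before
    # unpacking the (cache, hash) pair used by hash attention.
    if os.getenv("VLLM_HASH_ATTENTION", "0") == "1" and len(kv_cache) > 0:
        kv_cache, k_hash = kv_cache
    else:
        k_hash = None
    return kv_cache, k_hash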
+From d82f08b6981ca4d01432309b659bea57b9d18576 Mon Sep 17 00:00:00 2001
+Date: Mon, 29 Dec 2025 18:04:02 +0800
+Subject: [PATCH 3/4] call initialize_kv_hash_cache_tensors_npu to allocate
+ hashk cache in NPUModelRunner when VLLM_HASH_ATTENTION is enabled for KVComp
+
+---
+ vllm_ascend/worker/model_runner_v1.py | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
+index 782b9a3..766316e 100644
+--- a/vllm_ascend/worker/model_runner_v1.py
++++ b/vllm_ascend/worker/model_runner_v1.py
+@@ -1993,6 +1993,11 @@ class NPUModelRunner(LoRAModelRunnerMixin):
+             # KV cache specs.
+             raise ValueError("Unknown KV cache spec type.")
+ 
++        if has_ucm_sparse():
++            ucm_sparse = get_ucm_sparse()
++            if os.getenv("VLLM_HASH_ATTENTION", "0") == "1":
++                ucm_sparse.initialize_kv_hash_cache_tensors_npu(kv_caches, self.device)
++
+         bind_kv_cache(
+             kv_caches,
+             self.vllm_config.compilation_config.static_forward_context,
+--
+2.43.0
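
Patch 3 places the hash-cache allocation after the regular KV caches exist and before bind_kv_cache publishes them to the forward context, gated on the same VLLM_HASH_ATTENTION switch. initialize_kv_hash_cache_tensors_npu is known only from this call site; the sketch below shows the general shape of such a hook under stated assumptions (one hash tensor per layer, sized by that layer's block count, both hypothetical):

import os
import torch

def maybe_allocate_hash_caches(kv_caches: dict, device: torch.device) -> dict:
    # Opt-in via the environment, mirroring the gate in the patch.
    if os.getenv("VLLM_HASH_ATTENTION", "0") != "1":
        return {}
    # Hypothetical layout: one int64 hash slot per KV-cache block per layer;
    # the real layout is defined inside ucm.sparse.
    return {name: torch.empty(cache.shape[0], dtype=torch.int64, device=device)
            for name, cache in kv_caches.items()}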
+
+
+From 3d28d477a4887e7d5c909a66b448086994751567 Mon Sep 17 00:00:00 2001
+Date: Mon, 29 Dec 2025 18:04:50 +0800
+Subject: [PATCH 4/4] uncomment connector
+
+---
+ vllm_ascend/attention/attention_v1.py | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py
+index b924d8e..ece7d17 100644
+--- a/vllm_ascend/attention/attention_v1.py
++++ b/vllm_ascend/attention/attention_v1.py
+@@ -468,7 +468,7 @@ def unified_ascend_attention_with_output(
+     output: torch.Tensor,
+     layer_name: str,
+ ) -> None:
+-    # wait_for_kv_layer_from_connector(layer_name)
++    wait_for_kv_layer_from_connector(layer_name)
+ 
+     forward_context: ForwardContext = get_forward_context()
+     attn_metadata = forward_context.attn_metadata
+@@ -493,7 +493,7 @@ def unified_ascend_attention_with_output(
+ 
+     if attn_metadata is not None:
+         maybe_execute_sparse_attention_finished(query, key, value, output, layer_name, forward_context)
+-    # maybe_save_kv_layer_to_connector(layer_name, kv_cache)
++    maybe_save_kv_layer_to_connector(layer_name, kv_cache)
+ 
+ def wait_for_kv_layer_from_connector(layer_name: str):
+--
+2.43.0
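
With patch 4 the connector path restored by this series is symmetric again: wait for the layer's KV blocks from the connector, run the attention kernel with the sparse begin/finished hooks around it (skipped when attn_metadata is None, i.e. during dummy runs), then publish the layer's KV back to the connector. A schematic, runnable restatement of that ordering with stand-in callables (the real implementations live in attention_v1.py):

from typing import Callable, Optional

def run_attention_layer(layer_name: str,
                        attn_metadata: Optional[object],
                        wait_kv: Callable[[str], None],
                        sparse_begin: Callable[[], None],
                        forward: Callable[[], None],
                        sparse_finished: Callable[[], None],
                        save_kv: Callable[[str], None]) -> None:
    wait_kv(layer_name)            # receive KV written by a remote prefill
    if attn_metadata is not None:  # dummy runs carry no metadata
        sparse_begin()
    forward()                      # the NPU attention kernel itself
    if attn_metadata is not None:
        sparse_finished()
    save_kv(layer_name)            # hand KV back for disaggregated serving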