
Commit d608be8

[bugfix]cherry pick from 0.2.0-release sparse patch & cmake (#581)
[bugfix] sparse patch & cmake
1 parent 9727314 commit d608be8

File tree

3 files changed (+29 / -85 lines)


ucm/integration/vllm/patch/0.9.2/vllm-adapt.patch

Lines changed: 16 additions & 38 deletions
@@ -1,12 +1,11 @@
-From 2f4adc594410c61a8f7bc5172ef443eca72d230e Mon Sep 17 00:00:00 2001
+From 8cb493f9ece884cbc2ba71e367bed2b4116ae1b3 Mon Sep 17 00:00:00 2001
 From: wenxinwang <[email protected]>
-Date: Tue, 23 Dec 2025 18:35:23 -0800
+Date: Tue, 23 Dec 2025 19:44:21 -0800
 Subject: [PATCH] kvcomp qwen deepseek
 
 ---
- vllm/attention/layer.py | 64 ++++++++++++++++-
- vllm/envs.py | 2 +-
- vllm/model_executor/models/llama.py | 23 ++++++-
+ vllm/attention/layer.py | 63 ++++++++++++++++-
+ vllm/model_executor/models/llama.py | 21 +++++-
  vllm/model_executor/models/qwen2.py | 23 ++++++-
  vllm/v1/attention/backends/flash_attn.py | 7 ++
  vllm/v1/attention/backends/mla/common.py | 15 +++-
@@ -18,35 +17,29 @@ Subject: [PATCH] kvcomp qwen deepseek
  vllm/v1/worker/block_table.py | 13 ++++
  vllm/v1/worker/gpu_model_runner.py | 80 +++++++++++++++++++---
  vllm/v1/worker/gpu_worker.py | 2 +
- 14 files changed, 278 insertions(+), 22 deletions(-)
+ 13 files changed, 275 insertions(+), 20 deletions(-)
 
 diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
-index f0ad68b16..bc48849b1 100644
+index f0ad68b16..ba93960de 100644
 --- a/vllm/attention/layer.py
 +++ b/vllm/attention/layer.py
-@@ -2,12 +2,12 @@
- # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
- """Attention layer."""
- from typing import Any, Dict, List, Optional
--
- import torch
- import torch.nn as nn
+@@ -8,6 +8,7 @@ import torch.nn as nn
  import torch.nn.functional as F
 
  import vllm.envs as envs
 +import os
  from vllm.attention import AttentionType
  from vllm.attention.selector import backend_name_to_enum, get_attn_backend
  from vllm.config import CacheConfig, get_current_vllm_config
-@@ -22,6 +22,7 @@ from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
+@@ -22,6 +23,7 @@ from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
  from vllm.platforms import _Backend, current_platform
  from vllm.utils import direct_register_custom_op
  from vllm.v1.attention.backends.utils import validate_kv_sharing_target
 +from ucm.sparse.state import get_ucm_sparse, has_ucm_sparse
 
 
 class Attention(nn.Module):
-@@ -409,9 +410,10 @@ def unified_attention(
+@@ -409,9 +411,10 @@ def unified_attention(
 attn_metadata = attn_metadata[layer_name]
 self = forward_context.no_compile_layers[layer_name]
 kv_cache = self.kv_cache[forward_context.virtual_engine]
@@ -58,7 +51,7 @@ index f0ad68b16..bc48849b1 100644
 maybe_save_kv_layer_to_connector(layer_name, kv_cache)
 return output
 
-@@ -449,6 +451,15 @@ def unified_attention_with_output(
+@@ -449,6 +452,15 @@ def unified_attention_with_output(
 attn_metadata = attn_metadata[layer_name]
 self = forward_context.no_compile_layers[layer_name]
 kv_cache = self.kv_cache[forward_context.virtual_engine]
@@ -74,7 +67,7 @@ index f0ad68b16..bc48849b1 100644
 self.impl.forward(self,
 query,
 key,
-@@ -457,6 +468,10 @@ def unified_attention_with_output(
+@@ -457,6 +469,10 @@ def unified_attention_with_output(
 attn_metadata,
 output=output,
 output_scale=output_scale)
@@ -85,7 +78,7 @@ index f0ad68b16..bc48849b1 100644
 
 maybe_save_kv_layer_to_connector(layer_name, kv_cache)
 
-@@ -479,3 +494,48 @@ direct_register_custom_op(
+@@ -479,3 +495,48 @@ direct_register_custom_op(
 fake_impl=unified_attention_with_output_fake,
 dispatch_key=current_platform.dispatch_key,
 )
@@ -134,31 +127,16 @@ index f0ad68b16..bc48849b1 100644
 + return
 +
 + ucm_sparse.attention_finished(query, key, value, attn_output, layer_name, forward_context, phase)
-diff --git a/vllm/envs.py b/vllm/envs.py
-index 0cc6792d7..495adfbf7 100644
---- a/vllm/envs.py
-+++ b/vllm/envs.py
-@@ -953,7 +953,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
- # generations on machines < 100 for compressed-tensors
- # models
- "VLLM_USE_NVFP4_CT_EMULATIONS":
-- lambda: bool(int(os.getenv("VLLM_USE_NVFP4_CT_EMULATIONS", "0")))
-+ lambda: bool(int(os.getenv("VLLM_USE_NVFP4_CT_EMULATIONS", "0"))),
- }
-
- # --8<-- [end:env-vars-definition]
 diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
-index 5d5080479..c5af4491d 100644
+index 5d5080479..39cb2f4fb 100644
 --- a/vllm/model_executor/models/llama.py
 +++ b/vllm/model_executor/models/llama.py
-@@ -54,7 +54,14 @@ from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index,
+@@ -54,7 +54,12 @@ from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index,
 is_pp_missing_parameter,
 make_empty_intermediate_tensors_factory, make_layers,
 maybe_prefix)
 -
 +from ucm.sparse.state import (
-+ get_ucm_sparse,
-+ has_ucm_sparse,
 + maybe_execute_sparse_ffn_begin,
 + maybe_execute_sparse_ffn_finished,
 + maybe_execute_sparse_layer_begin,
@@ -167,7 +145,7 @@ index 5d5080479..c5af4491d 100644
 
 class LlamaMLP(nn.Module):
 
-@@ -305,10 +312,16 @@ class LlamaDecoderLayer(nn.Module):
+@@ -305,10 +310,16 @@ class LlamaDecoderLayer(nn.Module):
 hidden_states = self.self_attn(positions=positions,
 hidden_states=hidden_states)
 
@@ -184,7 +162,7 @@ index 5d5080479..c5af4491d 100644
 return hidden_states, residual
 
 
-@@ -387,9 +400,17 @@ class LlamaModel(nn.Module):
+@@ -387,9 +398,17 @@ class LlamaModel(nn.Module):
 aux_hidden_states = []
 for idx, layer in enumerate(
 self.layers[self.start_layer:self.end_layer]):
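For orientation, the hunks above thread a UCM sparse hook through vLLM's unified attention entry points. Below is a minimal sketch of that pattern, assuming only the has_ucm_sparse / get_ucm_sparse accessors and the attention_finished callback visible in the diff; the registry and function body are illustrative stand-ins, not the actual vLLM or UCM code.

# Illustrative sketch of the sparse-attention hook pattern; only
# has_ucm_sparse / get_ucm_sparse / attention_finished mirror names from the diff.
from typing import Any, Optional

_UCM_SPARSE: Optional[Any] = None  # stand-in for the ucm.sparse.state singleton


def has_ucm_sparse() -> bool:
    return _UCM_SPARSE is not None


def get_ucm_sparse() -> Any:
    assert _UCM_SPARSE is not None, "sparse module not initialised"
    return _UCM_SPARSE


def unified_attention_with_output(query, key, value, output,
                                  layer_name: str, forward_context: Any,
                                  phase: str = "prefill"):
    # ... the attention backend runs here and writes its result into `output` ...
    # After the kernel, the patch gives the sparse module a chance to
    # post-process this layer's attention result (e.g. for KV compression).
    if has_ucm_sparse():
        ucm_sparse = get_ucm_sparse()
        ucm_sparse.attention_finished(query, key, value, output,
                                      layer_name, forward_context, phase)
    return output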

ucm/integration/vllm/patch/0.9.2/vllm-ascend-adapt.patch

Lines changed: 12 additions & 46 deletions
@@ -1,14 +1,14 @@
-From 140050ef0ca76c5fe35f8050a49917b067102250 Mon Sep 17 00:00:00 2001
+From c92cb68fd1fa6215cd6d5b207b95c841ac20dbe1 Mon Sep 17 00:00:00 2001
 From: wenxinwang <[email protected]>
-Date: Tue, 23 Dec 2025 18:44:45 -0800
+Date: Tue, 23 Dec 2025 19:21:33 -0800
 Subject: [PATCH] sparse patch for vllm-ascend
 
 ---
- vllm_ascend/attention/attention_v1.py | 80 ++++++++++++++++++++
- vllm_ascend/attention/mla_v1.py | 14 +++-
- vllm_ascend/worker/model_runner_v1.py | 104 +++++++++++++++++++++++---
- vllm_ascend/worker/worker_v1.py | 25 ++++++-
- 4 files changed, 205 insertions(+), 18 deletions(-)
+ vllm_ascend/attention/attention_v1.py | 80 ++++++++++++++++++++++
+ vllm_ascend/attention/mla_v1.py | 14 +++-
+ vllm_ascend/worker/model_runner_v1.py | 98 ++++++++++++++++++++++++---
+ vllm_ascend/worker/worker_v1.py | 25 +++++--
+ 4 files changed, 201 insertions(+), 16 deletions(-)
 
 diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py
 index 7d7f488f..ea982244 100644
@@ -185,7 +185,7 @@ index f50fe56e..ae8f50bf 100644
 
 return output_padded
 diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
-index eabcdbcc..5129ecc8 100644
+index eabcdbcc..782b9a3b 100644
 --- a/vllm_ascend/worker/model_runner_v1.py
 +++ b/vllm_ascend/worker/model_runner_v1.py
 @@ -39,7 +39,10 @@ from vllm.config import CompilationLevel, VllmConfig
@@ -317,15 +317,7 @@ index eabcdbcc..5129ecc8 100644
 attn_state=attn_state)
 self.attn_state = attn_state # type: ignore
 
-@@ -1100,6 +1129,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
- positions = self.positions[:padded_batch_size]
-
- # Run forward pass
-+ finished_dumping = None
- with set_forward_context(attn_metadata,
- self.vllm_config,
- num_tokens=num_input_tokens):
-@@ -1125,6 +1155,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
+@@ -1125,6 +1154,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
 assert self.model is not None
 maybe_converting_weight_acl_format(self.model,
 ACL_FORMAT_FRACTAL_ND)
@@ -334,42 +326,16 @@ index eabcdbcc..5129ecc8 100644
 
 hidden_states = self.model(
 input_ids=input_ids,
-@@ -1133,6 +1165,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
+@@ -1133,6 +1164,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
 inputs_embeds=inputs_embeds,
 **model_kwargs,
 )
-+ finished_dumping = self.maybe_wait_for_kv_save()
++ self.maybe_wait_for_kv_save()
 + logits_indices = self.maybe_execute_ucm_sparse_finished(logits_indices)
 
 use_spec_decode = len(
 scheduler_output.scheduled_spec_decode_tokens) > 0
-@@ -1163,7 +1197,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
-
- return (attn_metadata, hidden_states, spec_decode_metadata, positions,
- total_num_scheduled_tokens, logits_indices, aux_hidden_states,
-- num_scheduled_tokens)
-+ num_scheduled_tokens, finished_dumping)
-
- def _get_cumsum_and_arange(
- self,
-@@ -1400,7 +1434,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
- return EMPTY_MODEL_RUNNER_OUTPUT
- (attn_metadata, hidden_states, spec_decode_metadata, positions,
- num_scheduled_tokens, logits_indices, aux_hidden_states,
-- num_scheduled_tokens_np) = (self._process_reqs(
-+ num_scheduled_tokens_np, finished_dumping) = (self._process_reqs(
- scheduler_output, intermediate_tensors))
-
- with ProfileExecuteDuration().capture_async("post process"):
-@@ -1561,6 +1595,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
- logprobs=logprobs_lists,
- prompt_logprobs_dict=prompt_logprobs_dict,
- pooler_output=[],
-+ finished_dumping=finished_dumping
- )
-
- durations = ProfileExecuteDuration().pop_captured_sync()
-@@ -2369,3 +2404,48 @@ class NPUModelRunner(LoRAModelRunnerMixin):
+@@ -2369,3 +2402,48 @@ class NPUModelRunner(LoRAModelRunnerMixin):
 if batch_size <= padded_batch_size < selected_batch_size:
 selected_batch_size = padded_batch_size
 return selected_batch_size
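The model_runner_v1.py hunks above drop the old finished_dumping plumbing from _process_reqs: the runner now simply waits for the KV save and lets the UCM sparse module adjust logits_indices after the forward pass. A rough sketch of that post-forward sequence follows; everything except maybe_wait_for_kv_save and maybe_execute_ucm_sparse_finished is an illustrative stand-in, not the vllm-ascend class.

# Illustrative sketch of the post-forward sequence after this cherry-pick.
from typing import Any, Dict, List


class RunnerSketch:
    """Stand-in for NPUModelRunner (illustrative only)."""

    def maybe_wait_for_kv_save(self) -> None:
        # Block until the KV-cache connector has finished dumping the layers
        # scheduled for offload; no finished_dumping value is propagated.
        pass

    def maybe_execute_ucm_sparse_finished(self, logits_indices: List[int]) -> List[int]:
        # Let the UCM sparse module rewrite which positions need logits.
        return logits_indices

    def process_reqs(self, model: Any, model_inputs: Dict[str, Any],
                     logits_indices: List[int]):
        hidden_states = model(**model_inputs)
        # Post-forward hooks added by the patch: wait for the KV dump, then
        # adjust logits_indices before sampling.
        self.maybe_wait_for_kv_save()
        logits_indices = self.maybe_execute_ucm_sparse_finished(logits_indices)
        return hidden_states, logits_indices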

ucm/sparse/kvcomp/ham_dist/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ set_target_properties(hamming PROPERTIES
 # ---- Includes / Link dirs / Link libs ----
 target_include_directories(hamming PRIVATE ${INCLUDE_DIRS})
 target_link_directories(hamming PRIVATE ${LIBRARY_DIRS})
-target_link_libraries(hamming PRIVATE Torch::Torch Python::Module)
+target_link_libraries(hamming PRIVATE ${LIBRARIES} Python::Module)
 
 # ---- Extra compile options (keep your original intent) ----
 target_compile_options(hamming PRIVATE
