
Commit d608be8

[bugfix]cherry pick from 0.2.0-release sparse patch & cmake (#581)
[bugfix] sparse patch & cmake
1 parent 9727314 commit d608be8

File tree

3 files changed (+29 / -85 lines)


ucm/integration/vllm/patch/0.9.2/vllm-adapt.patch

Lines changed: 16 additions & 38 deletions
@@ -1,12 +1,11 @@
-From 2f4adc594410c61a8f7bc5172ef443eca72d230e Mon Sep 17 00:00:00 2001
+From 8cb493f9ece884cbc2ba71e367bed2b4116ae1b3 Mon Sep 17 00:00:00 2001
 From: wenxinwang <[email protected]>
-Date: Tue, 23 Dec 2025 18:35:23 -0800
+Date: Tue, 23 Dec 2025 19:44:21 -0800
 Subject: [PATCH] kvcomp qwen deepseek
 
 ---
- vllm/attention/layer.py | 64 ++++++++++++++++-
- vllm/envs.py | 2 +-
- vllm/model_executor/models/llama.py | 23 ++++++-
+ vllm/attention/layer.py | 63 ++++++++++++++++-
+ vllm/model_executor/models/llama.py | 21 +++++-
  vllm/model_executor/models/qwen2.py | 23 ++++++-
  vllm/v1/attention/backends/flash_attn.py | 7 ++
  vllm/v1/attention/backends/mla/common.py | 15 +++-
@@ -18,35 +17,29 @@ Subject: [PATCH] kvcomp qwen deepseek
  vllm/v1/worker/block_table.py | 13 ++++
  vllm/v1/worker/gpu_model_runner.py | 80 +++++++++++++++++++---
  vllm/v1/worker/gpu_worker.py | 2 +
- 14 files changed, 278 insertions(+), 22 deletions(-)
+ 13 files changed, 275 insertions(+), 20 deletions(-)
 
 diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
-index f0ad68b16..bc48849b1 100644
+index f0ad68b16..ba93960de 100644
 --- a/vllm/attention/layer.py
 +++ b/vllm/attention/layer.py
-@@ -2,12 +2,12 @@
- # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
- """Attention layer."""
- from typing import Any, Dict, List, Optional
--
- import torch
- import torch.nn as nn
+@@ -8,6 +8,7 @@ import torch.nn as nn
  import torch.nn.functional as F
 
  import vllm.envs as envs
 +import os
  from vllm.attention import AttentionType
  from vllm.attention.selector import backend_name_to_enum, get_attn_backend
  from vllm.config import CacheConfig, get_current_vllm_config
-@@ -22,6 +22,7 @@ from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
+@@ -22,6 +23,7 @@ from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
  from vllm.platforms import _Backend, current_platform
  from vllm.utils import direct_register_custom_op
  from vllm.v1.attention.backends.utils import validate_kv_sharing_target
 +from ucm.sparse.state import get_ucm_sparse, has_ucm_sparse
 
 
 class Attention(nn.Module):
-@@ -409,9 +410,10 @@ def unified_attention(
+@@ -409,9 +411,10 @@ def unified_attention(
 attn_metadata = attn_metadata[layer_name]
 self = forward_context.no_compile_layers[layer_name]
 kv_cache = self.kv_cache[forward_context.virtual_engine]
@@ -58,7 +51,7 @@ index f0ad68b16..bc48849b1 100644
 maybe_save_kv_layer_to_connector(layer_name, kv_cache)
 return output
 
-@@ -449,6 +451,15 @@ def unified_attention_with_output(
+@@ -449,6 +452,15 @@ def unified_attention_with_output(
 attn_metadata = attn_metadata[layer_name]
 self = forward_context.no_compile_layers[layer_name]
 kv_cache = self.kv_cache[forward_context.virtual_engine]
@@ -74,7 +67,7 @@ index f0ad68b16..bc48849b1 100644
 self.impl.forward(self,
 query,
 key,
-@@ -457,6 +468,10 @@ def unified_attention_with_output(
+@@ -457,6 +469,10 @@ def unified_attention_with_output(
 attn_metadata,
 output=output,
 output_scale=output_scale)
@@ -85,7 +78,7 @@ index f0ad68b16..bc48849b1 100644
 
 maybe_save_kv_layer_to_connector(layer_name, kv_cache)
 
-@@ -479,3 +494,48 @@ direct_register_custom_op(
+@@ -479,3 +495,48 @@ direct_register_custom_op(
 fake_impl=unified_attention_with_output_fake,
 dispatch_key=current_platform.dispatch_key,
 )
@@ -134,31 +127,16 @@ index f0ad68b16..bc48849b1 100644
 + return
 +
 + ucm_sparse.attention_finished(query, key, value, attn_output, layer_name, forward_context, phase)
-diff --git a/vllm/envs.py b/vllm/envs.py
-index 0cc6792d7..495adfbf7 100644
---- a/vllm/envs.py
-+++ b/vllm/envs.py
-@@ -953,7 +953,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
- # generations on machines < 100 for compressed-tensors
- # models
- "VLLM_USE_NVFP4_CT_EMULATIONS":
-- lambda: bool(int(os.getenv("VLLM_USE_NVFP4_CT_EMULATIONS", "0")))
-+ lambda: bool(int(os.getenv("VLLM_USE_NVFP4_CT_EMULATIONS", "0"))),
- }
-
- # --8<-- [end:env-vars-definition]
 diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
-index 5d5080479..c5af4491d 100644
+index 5d5080479..39cb2f4fb 100644
 --- a/vllm/model_executor/models/llama.py
 +++ b/vllm/model_executor/models/llama.py
-@@ -54,7 +54,14 @@ from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index,
+@@ -54,7 +54,12 @@ from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index,
 is_pp_missing_parameter,
 make_empty_intermediate_tensors_factory, make_layers,
 maybe_prefix)
 -
 +from ucm.sparse.state import (
-+ get_ucm_sparse,
-+ has_ucm_sparse,
 + maybe_execute_sparse_ffn_begin,
 + maybe_execute_sparse_ffn_finished,
 + maybe_execute_sparse_layer_begin,
@@ -167,7 +145,7 @@ index 5d5080479..c5af4491d 100644
 
 class LlamaMLP(nn.Module):
 
-@@ -305,10 +312,16 @@ class LlamaDecoderLayer(nn.Module):
+@@ -305,10 +310,16 @@ class LlamaDecoderLayer(nn.Module):
 hidden_states = self.self_attn(positions=positions,
 hidden_states=hidden_states)
 
@@ -184,7 +162,7 @@ index 5d5080479..c5af4491d 100644
 return hidden_states, residual
 
 
-@@ -387,9 +400,17 @@ class LlamaModel(nn.Module):
+@@ -387,9 +398,17 @@ class LlamaModel(nn.Module):
 aux_hidden_states = []
 for idx, layer in enumerate(
 self.layers[self.start_layer:self.end_layer]):
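For orientation, the hunks above thread a UCM sparse hook through vLLM's unified attention entry points. Below is a minimal sketch of that pattern, assuming only the has_ucm_sparse / get_ucm_sparse accessors and the attention_finished callback visible in the diff; the registry and function body are illustrative stand-ins, not the actual vLLM or UCM code.

# Illustrative sketch of the sparse-attention hook pattern; only
# has_ucm_sparse / get_ucm_sparse / attention_finished mirror names from the diff.
from typing import Any, Optional

_UCM_SPARSE: Optional[Any] = None  # stand-in for the ucm.sparse.state singleton


def has_ucm_sparse() -> bool:
    return _UCM_SPARSE is not None


def get_ucm_sparse() -> Any:
    assert _UCM_SPARSE is not None, "sparse module not initialised"
    return _UCM_SPARSE


def unified_attention_with_output(query, key, value, output,
                                  layer_name: str, forward_context: Any,
                                  phase: str = "prefill"):
    # ... the attention backend runs here and writes its result into `output` ...
    # After the kernel, the patch gives the sparse module a chance to
    # post-process this layer's attention result (e.g. for KV compression).
    if has_ucm_sparse():
        ucm_sparse = get_ucm_sparse()
        ucm_sparse.attention_finished(query, key, value, output,
                                      layer_name, forward_context, phase)
    return output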

ucm/integration/vllm/patch/0.9.2/vllm-ascend-adapt.patch

Lines changed: 12 additions & 46 deletions
@@ -1,14 +1,14 @@
-From 140050ef0ca76c5fe35f8050a49917b067102250 Mon Sep 17 00:00:00 2001
+From c92cb68fd1fa6215cd6d5b207b95c841ac20dbe1 Mon Sep 17 00:00:00 2001
 From: wenxinwang <[email protected]>
-Date: Tue, 23 Dec 2025 18:44:45 -0800
+Date: Tue, 23 Dec 2025 19:21:33 -0800
 Subject: [PATCH] sparse patch for vllm-ascend
 
 ---
- vllm_ascend/attention/attention_v1.py | 80 ++++++++++++++++++++
- vllm_ascend/attention/mla_v1.py | 14 +++-
- vllm_ascend/worker/model_runner_v1.py | 104 +++++++++++++++++++++++---
- vllm_ascend/worker/worker_v1.py | 25 ++++++-
- 4 files changed, 205 insertions(+), 18 deletions(-)
+ vllm_ascend/attention/attention_v1.py | 80 ++++++++++++++++++++++
+ vllm_ascend/attention/mla_v1.py | 14 +++-
+ vllm_ascend/worker/model_runner_v1.py | 98 ++++++++++++++++++++++++---
+ vllm_ascend/worker/worker_v1.py | 25 +++++--
+ 4 files changed, 201 insertions(+), 16 deletions(-)
 
 diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py
 index 7d7f488f..ea982244 100644
@@ -185,7 +185,7 @@ index f50fe56e..ae8f50bf 100644
 
 return output_padded
 diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
-index eabcdbcc..5129ecc8 100644
+index eabcdbcc..782b9a3b 100644
 --- a/vllm_ascend/worker/model_runner_v1.py
 +++ b/vllm_ascend/worker/model_runner_v1.py
 @@ -39,7 +39,10 @@ from vllm.config import CompilationLevel, VllmConfig
@@ -317,15 +317,7 @@ index eabcdbcc..5129ecc8 100644
 attn_state=attn_state)
 self.attn_state = attn_state # type: ignore
 
-@@ -1100,6 +1129,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
- positions = self.positions[:padded_batch_size]
-
- # Run forward pass
-+ finished_dumping = None
- with set_forward_context(attn_metadata,
- self.vllm_config,
- num_tokens=num_input_tokens):
-@@ -1125,6 +1155,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
+@@ -1125,6 +1154,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
 assert self.model is not None
 maybe_converting_weight_acl_format(self.model,
 ACL_FORMAT_FRACTAL_ND)
@@ -334,42 +326,16 @@ index eabcdbcc..5129ecc8 100644
 
 hidden_states = self.model(
 input_ids=input_ids,
-@@ -1133,6 +1165,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
+@@ -1133,6 +1164,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
 inputs_embeds=inputs_embeds,
 **model_kwargs,
 )
-+ finished_dumping = self.maybe_wait_for_kv_save()
++ self.maybe_wait_for_kv_save()
 + logits_indices = self.maybe_execute_ucm_sparse_finished(logits_indices)
 
 use_spec_decode = len(
 scheduler_output.scheduled_spec_decode_tokens) > 0
-@@ -1163,7 +1197,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
-
- return (attn_metadata, hidden_states, spec_decode_metadata, positions,
- total_num_scheduled_tokens, logits_indices, aux_hidden_states,
-- num_scheduled_tokens)
-+ num_scheduled_tokens, finished_dumping)
-
- def _get_cumsum_and_arange(
- self,
-@@ -1400,7 +1434,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
- return EMPTY_MODEL_RUNNER_OUTPUT
- (attn_metadata, hidden_states, spec_decode_metadata, positions,
- num_scheduled_tokens, logits_indices, aux_hidden_states,
-- num_scheduled_tokens_np) = (self._process_reqs(
-+ num_scheduled_tokens_np, finished_dumping) = (self._process_reqs(
- scheduler_output, intermediate_tensors))
-
- with ProfileExecuteDuration().capture_async("post process"):
-@@ -1561,6 +1595,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
- logprobs=logprobs_lists,
- prompt_logprobs_dict=prompt_logprobs_dict,
- pooler_output=[],
-+ finished_dumping=finished_dumping
- )
-
- durations = ProfileExecuteDuration().pop_captured_sync()
-@@ -2369,3 +2404,48 @@ class NPUModelRunner(LoRAModelRunnerMixin):
+@@ -2369,3 +2402,48 @@ class NPUModelRunner(LoRAModelRunnerMixin):
 if batch_size <= padded_batch_size < selected_batch_size:
 selected_batch_size = padded_batch_size
 return selected_batch_size
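The model_runner_v1.py hunks above drop the old finished_dumping plumbing from _process_reqs: the runner now simply waits for the KV save and lets the UCM sparse module adjust logits_indices after the forward pass. A rough sketch of that post-forward sequence follows; everything except maybe_wait_for_kv_save and maybe_execute_ucm_sparse_finished is an illustrative stand-in, not the vllm-ascend class.

# Illustrative sketch of the post-forward sequence after this cherry-pick.
from typing import Any, Dict, List


class RunnerSketch:
    """Stand-in for NPUModelRunner (illustrative only)."""

    def maybe_wait_for_kv_save(self) -> None:
        # Block until the KV-cache connector has finished dumping the layers
        # scheduled for offload; no finished_dumping value is propagated.
        pass

    def maybe_execute_ucm_sparse_finished(self, logits_indices: List[int]) -> List[int]:
        # Let the UCM sparse module rewrite which positions need logits.
        return logits_indices

    def process_reqs(self, model: Any, model_inputs: Dict[str, Any],
                     logits_indices: List[int]):
        hidden_states = model(**model_inputs)
        # Post-forward hooks added by the patch: wait for the KV dump, then
        # adjust logits_indices before sampling.
        self.maybe_wait_for_kv_save()
        logits_indices = self.maybe_execute_ucm_sparse_finished(logits_indices)
        return hidden_states, logits_indices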

ucm/sparse/kvcomp/ham_dist/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ set_target_properties(hamming PROPERTIES
 # ---- Includes / Link dirs / Link libs ----
 target_include_directories(hamming PRIVATE ${INCLUDE_DIRS})
 target_link_directories(hamming PRIVATE ${LIBRARY_DIRS})
-target_link_libraries(hamming PRIVATE Torch::Torch Python::Module)
+target_link_libraries(hamming PRIVATE ${LIBRARIES} Python::Module)
 
 # ---- Extra compile options (keep your original intent) ----
 target_compile_options(hamming PRIVATE
