-From 2f4adc594410c61a8f7bc5172ef443eca72d230e Mon Sep 17 00:00:00 2001
+From 8cb493f9ece884cbc2ba71e367bed2b4116ae1b3 Mon Sep 17 00:00:00 2001
 From: wenxinwang <[email protected]>
-Date: Tue, 23 Dec 2025 18:35:23 -0800
+Date: Tue, 23 Dec 2025 19:44:21 -0800
 Subject: [PATCH] kvcomp qwen deepseek
 
 ---
- vllm/attention/layer.py | 64 ++++++++++++++++-
- vllm/envs.py | 2 +-
- vllm/model_executor/models/llama.py | 23 ++++++-
+ vllm/attention/layer.py | 63 ++++++++++++++++-
+ vllm/model_executor/models/llama.py | 21 +++++-
  vllm/model_executor/models/qwen2.py | 23 ++++++-
  vllm/v1/attention/backends/flash_attn.py | 7 ++
  vllm/v1/attention/backends/mla/common.py | 15 +++-
@@ -18,35 +17,29 @@ Subject: [PATCH] kvcomp qwen deepseek
  vllm/v1/worker/block_table.py | 13 ++++
  vllm/v1/worker/gpu_model_runner.py | 80 +++++++++++++++++++---
  vllm/v1/worker/gpu_worker.py | 2 +
- 14 files changed, 278 insertions(+), 22 deletions(-)
+ 13 files changed, 275 insertions(+), 20 deletions(-)
 
 diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
-index f0ad68b16..bc48849b1 100644
+index f0ad68b16..ba93960de 100644
 --- a/vllm/attention/layer.py
 +++ b/vllm/attention/layer.py
-@@ -2,12 +2,12 @@
- # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
- """Attention layer."""
- from typing import Any, Dict, List, Optional
--
- import torch
- import torch.nn as nn
+@@ -8,6 +8,7 @@ import torch.nn as nn
  import torch.nn.functional as F
 
  import vllm.envs as envs
 +import os
  from vllm.attention import AttentionType
  from vllm.attention.selector import backend_name_to_enum, get_attn_backend
  from vllm.config import CacheConfig, get_current_vllm_config
-@@ -22,6 +22,7 @@ from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
+@@ -22,6 +23,7 @@ from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
  from vllm.platforms import _Backend, current_platform
  from vllm.utils import direct_register_custom_op
  from vllm.v1.attention.backends.utils import validate_kv_sharing_target
 +from ucm.sparse.state import get_ucm_sparse, has_ucm_sparse
 
 
  class Attention(nn.Module):
-@@ -409,9 +410,10 @@ def unified_attention(
+@@ -409,9 +411,10 @@ def unified_attention(
      attn_metadata = attn_metadata[layer_name]
      self = forward_context.no_compile_layers[layer_name]
      kv_cache = self.kv_cache[forward_context.virtual_engine]
@@ -58,7 +51,7 @@ index f0ad68b16..bc48849b1 100644
      maybe_save_kv_layer_to_connector(layer_name, kv_cache)
      return output
 
-@@ -449,6 +451,15 @@ def unified_attention_with_output(
+@@ -449,6 +452,15 @@ def unified_attention_with_output(
      attn_metadata = attn_metadata[layer_name]
      self = forward_context.no_compile_layers[layer_name]
      kv_cache = self.kv_cache[forward_context.virtual_engine]
@@ -74,7 +67,7 @@ index f0ad68b16..bc48849b1 100644
      self.impl.forward(self,
                        query,
                        key,
-@@ -457,6 +468,10 @@ def unified_attention_with_output(
+@@ -457,6 +469,10 @@ def unified_attention_with_output(
                        attn_metadata,
                        output=output,
                        output_scale=output_scale)
@@ -85,7 +78,7 @@ index f0ad68b16..bc48849b1 100644
 
      maybe_save_kv_layer_to_connector(layer_name, kv_cache)
 
-@@ -479,3 +494,48 @@ direct_register_custom_op(
+@@ -479,3 +495,48 @@ direct_register_custom_op(
      fake_impl=unified_attention_with_output_fake,
      dispatch_key=current_platform.dispatch_key,
  )
@@ -134,31 +127,16 @@ index f0ad68b16..bc48849b1 100644
 +        return
 +
 +    ucm_sparse.attention_finished(query, key, value, attn_output, layer_name, forward_context, phase)
-diff --git a/vllm/envs.py b/vllm/envs.py
-index 0cc6792d7..495adfbf7 100644
---- a/vllm/envs.py
-+++ b/vllm/envs.py
-@@ -953,7 +953,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
-     # generations on machines < 100 for compressed-tensors
-     # models
-     "VLLM_USE_NVFP4_CT_EMULATIONS":
--    lambda: bool(int(os.getenv("VLLM_USE_NVFP4_CT_EMULATIONS", "0")))
-+    lambda: bool(int(os.getenv("VLLM_USE_NVFP4_CT_EMULATIONS", "0"))),
- }
-
- # --8<-- [end:env-vars-definition]
 diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
-index 5d5080479..c5af4491d 100644
+index 5d5080479..39cb2f4fb 100644
 --- a/vllm/model_executor/models/llama.py
 +++ b/vllm/model_executor/models/llama.py
-@@ -54,7 +54,14 @@ from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index,
+@@ -54,7 +54,12 @@ from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index,
                      is_pp_missing_parameter,
                      make_empty_intermediate_tensors_factory, make_layers,
                      maybe_prefix)
 -
 +from ucm.sparse.state import (
-+    get_ucm_sparse,
-+    has_ucm_sparse,
 +    maybe_execute_sparse_ffn_begin,
 +    maybe_execute_sparse_ffn_finished,
 +    maybe_execute_sparse_layer_begin,
@@ -167,7 +145,7 @@ index 5d5080479..c5af4491d 100644
 
  class LlamaMLP(nn.Module):
 
-@@ -305,10 +312,16 @@ class LlamaDecoderLayer(nn.Module):
+@@ -305,10 +310,16 @@ class LlamaDecoderLayer(nn.Module):
          hidden_states = self.self_attn(positions=positions,
                                         hidden_states=hidden_states)
 
@@ -184,7 +162,7 @@ index 5d5080479..c5af4491d 100644
          return hidden_states, residual
 
 
-@@ -387,9 +400,17 @@ class LlamaModel(nn.Module):
+@@ -387,9 +398,17 @@ class LlamaModel(nn.Module):
          aux_hidden_states = []
         for idx, layer in enumerate(
                 self.layers[self.start_layer:self.end_layer]):
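Note for readers skimming the patch: the recurring change is a pair of guarded begin/finished hooks from ucm.sparse.state, threaded around vLLM's attention (and, in llama.py, layer/FFN) calls, which no-op unless a UCM sparse method is registered. Below is a minimal, self-contained Python sketch of that dispatch pattern. Only the get_ucm_sparse/has_ucm_sparse names and the attention_begin/attention_finished call shapes come from the diff above; the registry and the demo method class are illustrative stand-ins, not the real ucm.sparse.state API.

# Sketch of the guarded hook pattern the patch inserts around
# self.impl.forward(...). Everything except the hook names and call
# shapes visible in the diff is an assumed stand-in.
from typing import Any, Optional

_UCM_SPARSE: Optional["DemoSparseMethod"] = None  # stand-in module-level registry


def register_ucm_sparse(method: "DemoSparseMethod") -> None:
    global _UCM_SPARSE
    _UCM_SPARSE = method


def has_ucm_sparse() -> bool:
    return _UCM_SPARSE is not None


def get_ucm_sparse() -> "DemoSparseMethod":
    assert _UCM_SPARSE is not None, "no UCM sparse method registered"
    return _UCM_SPARSE


class DemoSparseMethod:
    """Stand-in for a UCM sparse / KV-compression method."""

    def attention_begin(self, query: Any, key: Any, value: Any,
                        layer_name: str, forward_context: Any) -> None:
        # e.g. choose which KV blocks to fetch before the backend kernel runs
        print(f"begin: {layer_name}")

    def attention_finished(self, query: Any, key: Any, value: Any,
                           attn_output: Any, layer_name: str,
                           forward_context: Any, phase: str) -> None:
        # e.g. update compression state/statistics after the kernel
        print(f"finished: {layer_name} ({phase})")


def maybe_execute_sparse_attention_begin(query, key, value, layer_name,
                                         forward_context) -> None:
    # No-op unless a sparse method is registered, mirroring the guarded
    # call placed before self.impl.forward(...) in the patch.
    if not has_ucm_sparse():
        return
    get_ucm_sparse().attention_begin(query, key, value, layer_name,
                                     forward_context)


def maybe_execute_sparse_attention_finished(query, key, value, attn_output,
                                            layer_name, forward_context,
                                            phase) -> None:
    if not has_ucm_sparse():
        return
    get_ucm_sparse().attention_finished(query, key, value, attn_output,
                                        layer_name, forward_context, phase)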