|
1 | | -From c92cb68fd1fa6215cd6d5b207b95c841ac20dbe1 Mon Sep 17 00:00:00 2001 |
2 | | -From: wenxinwang <wangwenxin21@huawei.com> |
3 | | -Date: Tue, 23 Dec 2025 19:21:33 -0800 |
4 | | -Subject: [PATCH] sparse patch for vllm-ascend |
| 1 | +From 57681500369b33dc3ac9a2cc97ad10980bab56fc Mon Sep 17 00:00:00 2001 |
| 2 | +From: qyh <qiuyuhao1@huawei.com> |
| 3 | +Date: Wed, 31 Dec 2025 17:15:08 +0800 |
| 4 | +Subject: [PATCH] modify ascend patch for register_kv_cache |
5 | 5 |
|
6 | 6 | --- |
7 | | - vllm_ascend/attention/attention_v1.py | 80 ++++++++++++++++++++++ |
8 | | - vllm_ascend/attention/mla_v1.py | 14 +++- |
9 | | - vllm_ascend/worker/model_runner_v1.py | 98 ++++++++++++++++++++++++--- |
10 | | - vllm_ascend/worker/worker_v1.py | 25 +++++-- |
11 | | - 4 files changed, 201 insertions(+), 16 deletions(-) |
| 7 | + vllm_ascend/attention/attention_v1.py | 80 ++++++++++++++++++++ |
| 8 | + vllm_ascend/attention/mla_v1.py | 14 +++- |
| 9 | + vllm_ascend/worker/model_runner_v1.py | 101 +++++++++++++++++++++++--- |
| 10 | + vllm_ascend/worker/worker_v1.py | 25 ++++++- |
| 11 | + 4 files changed, 204 insertions(+), 16 deletions(-) |
12 | 12 |
|
13 | 13 | diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py |
14 | 14 | index 7d7f488f..ea982244 100644 |
@@ -185,7 +185,7 @@ index f50fe56e..ae8f50bf 100644 |
185 | 185 |
|
186 | 186 | return output_padded |
187 | 187 | diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py |
188 | | -index eabcdbcc..782b9a3b 100644 |
| 188 | +index eabcdbcc..2762fbc7 100644 |
189 | 189 | --- a/vllm_ascend/worker/model_runner_v1.py |
190 | 190 | +++ b/vllm_ascend/worker/model_runner_v1.py |
191 | 191 | @@ -39,7 +39,10 @@ from vllm.config import CompilationLevel, VllmConfig |
@@ -335,7 +335,17 @@ index eabcdbcc..782b9a3b 100644 |
335 | 335 |
|
336 | 336 | use_spec_decode = len( |
337 | 337 | scheduler_output.scheduled_spec_decode_tokens) > 0 |
338 | | -@@ -2369,3 +2402,48 @@ class NPUModelRunner(LoRAModelRunnerMixin): |
| 338 | +@@ -1965,6 +1998,9 @@ class NPUModelRunner(LoRAModelRunnerMixin): |
| 339 | + self.vllm_config.compilation_config.static_forward_context, |
| 340 | + self.kv_caches) |
| 341 | + |
| 342 | ++ if has_kv_transfer_group(): |
| 343 | ++ get_kv_transfer_group().register_kv_caches(kv_caches) |
| 344 | ++ |
| 345 | + def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: |
| 346 | + """ |
| 347 | + Generates the KVCacheSpec by parsing the kv cache format from each |
| 348 | +@@ -2369,3 +2405,48 @@ class NPUModelRunner(LoRAModelRunnerMixin): |
339 | 349 | if batch_size <= padded_batch_size < selected_batch_size: |
340 | 350 | selected_batch_size = padded_batch_size |
341 | 351 | return selected_batch_size |
@@ -458,5 +468,5 @@ index df03d508..5d5d9b5a 100644 |
458 | 468 | def _init_profiler(self): |
459 | 469 | # Torch profiler. Enabled and configured through env vars: |
460 | 470 | -- |
461 | | -2.34.1 |
| 471 | +2.50.1.windows.1 |
462 | 472 |
|
0 commit comments