From cfdf958e210d0f4d708a35bb380975471d300bea Mon Sep 17 00:00:00 2001
From: qyh <qiuyuhao1@huawei.com>
Date: Wed, 31 Dec 2025 17:24:31 +0800
Subject: [PATCH] Register KV caches in vllm-ascend patch; bump version to 0.2.0

---
 setup.py                                      |  2 +-
 .../vllm/patch/0.9.2/vllm-ascend-adapt.patch  | 34 ++++++++++++-------
 2 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/setup.py b/setup.py
index 4d7fca0e3..cdc3de172 100644
--- a/setup.py
+++ b/setup.py
@@ -139,7 +139,7 @@ def build_cmake(self, ext: CMakeExtension):
 
 setup(
     name="uc-manager",
-    version="0.2.0rc2",
+    version="0.2.0",
     description="Unified Cache Management",
     author="Unified Cache Team",
     packages=find_packages(),
diff --git a/ucm/integration/vllm/patch/0.9.2/vllm-ascend-adapt.patch b/ucm/integration/vllm/patch/0.9.2/vllm-ascend-adapt.patch
index cdf567167..f74e7cb0d 100644
--- a/ucm/integration/vllm/patch/0.9.2/vllm-ascend-adapt.patch
+++ b/ucm/integration/vllm/patch/0.9.2/vllm-ascend-adapt.patch
@@ -1,14 +1,14 @@
-From c92cb68fd1fa6215cd6d5b207b95c841ac20dbe1 Mon Sep 17 00:00:00 2001
-From: wenxinwang <wangwenxin21@huawei.com>
-Date: Tue, 23 Dec 2025 19:21:33 -0800
-Subject: [PATCH] sparse patch for vllm-ascend
+From 57681500369b33dc3ac9a2cc97ad10980bab56fc Mon Sep 17 00:00:00 2001
+From: qyh <qiuyuhao1@huawei.com>
+Date: Wed, 31 Dec 2025 17:15:08 +0800
+Subject: [PATCH] modify ascend patch for register_kv_cache
 
 ---
- vllm_ascend/attention/attention_v1.py | 80 ++++++++++++++++++++++
- vllm_ascend/attention/mla_v1.py       | 14 +++-
- vllm_ascend/worker/model_runner_v1.py | 98 ++++++++++++++++++++++++---
- vllm_ascend/worker/worker_v1.py       | 25 +++++--
- 4 files changed, 201 insertions(+), 16 deletions(-)
+ vllm_ascend/attention/attention_v1.py |  80 ++++++++++++++++++++
+ vllm_ascend/attention/mla_v1.py       |  14 +++-
+ vllm_ascend/worker/model_runner_v1.py | 101 +++++++++++++++++++++++---
+ vllm_ascend/worker/worker_v1.py       |  25 ++++++-
+ 4 files changed, 204 insertions(+), 16 deletions(-)
 
 diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py
 index 7d7f488f..ea982244 100644
@@ -185,7 +185,7 @@ index f50fe56e..ae8f50bf 100644
  
          return output_padded
 diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
-index eabcdbcc..782b9a3b 100644
+index eabcdbcc..2762fbc7 100644
 --- a/vllm_ascend/worker/model_runner_v1.py
 +++ b/vllm_ascend/worker/model_runner_v1.py
 @@ -39,7 +39,10 @@ from vllm.config import CompilationLevel, VllmConfig
@@ -335,7 +335,17 @@ index eabcdbcc..782b9a3b 100644
  
          use_spec_decode = len(
              scheduler_output.scheduled_spec_decode_tokens) > 0
-@@ -2369,3 +2402,48 @@ class NPUModelRunner(LoRAModelRunnerMixin):
+@@ -1965,6 +1998,9 @@ class NPUModelRunner(LoRAModelRunnerMixin):
+             self.vllm_config.compilation_config.static_forward_context,
+             self.kv_caches)
+ 
++        if has_kv_transfer_group():
++            get_kv_transfer_group().register_kv_caches(kv_caches)
++
+     def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
+         """
+         Generates the KVCacheSpec by parsing the kv cache format from each
+@@ -2369,3 +2405,48 @@ class NPUModelRunner(LoRAModelRunnerMixin):
              if batch_size <= padded_batch_size < selected_batch_size:
                  selected_batch_size = padded_batch_size
          return selected_batch_size
@@ -458,5 +468,5 @@ index df03d508..5d5d9b5a 100644
      def _init_profiler(self):
          # Torch profiler. Enabled and configured through env vars:
 -- 
-2.34.1
+2.50.1.windows.1