Commit 1a7cbdb

A5 support reshape and cache in CP situation (#7636)
### What this PR does / why we need it?
Adds context-parallel (CP) support in the A5 scenario. On A5, the reshape-and-cache operation must go through an aclnn operator, so a routing entry is added to the DeviceAdaptor. In addition, the A5 aclnn operator requires contiguous inputs, while some operations produce non-contiguous tensors. For example, slicing with a stride, `slot_mapping = attn_metadata.slot_mapping[: num_decode_tokens * self.pcp_size : self.pcp_size]`, yields a non-contiguous `slot_mapping`. This PR therefore makes `key`, `value`, and `slot_mapping` contiguous before the call.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
- vLLM version: v0.18.0
- vLLM main: vllm-project/vllm@ed359c4

---------

Signed-off-by: lenghuixing0330 <2531948770@qq.com>
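The contiguity issue described above can be reproduced in a few lines of PyTorch (sizes are hypothetical; only the strided-slicing pattern is taken from the patch):

```python
import torch

# Hypothetical sizes for illustration; pcp_size is the CP world size.
pcp_size = 2
num_decode_tokens = 4

slot_mapping_full = torch.arange(num_decode_tokens * pcp_size)

# Strided slicing (step = pcp_size) returns a view with a non-unit stride,
# so the result is not contiguous in memory.
slot_mapping = slot_mapping_full[: num_decode_tokens * pcp_size : pcp_size]
print(slot_mapping.is_contiguous())   # False

# .contiguous() copies the view into a fresh, densely packed buffer,
# which is what the aclnn operator requires.
slot_mapping = slot_mapping.contiguous()
print(slot_mapping.is_contiguous())   # True
```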
1 parent dbf1348 commit 1a7cbdb

File tree

2 files changed: +10 −5 lines changed


vllm_ascend/attention/context_parallel/attention_cp.py

Lines changed: 5 additions & 4 deletions
```diff
@@ -49,6 +49,7 @@
     split_decodes_and_prefills,
 )
 from vllm_ascend.compilation.acl_graph import get_graph_params, update_graph_params_workspaces
+from vllm_ascend.device.device_op import DeviceOperator
 from vllm_ascend.utils import cp_chunkedprefill_comm_stream, weak_ref_tensors
@@ -752,12 +753,12 @@ def reshape_and_cache(
 
         if has_decode:
             slot_mapping = attn_metadata.slot_mapping[: num_decode_tokens * self.pcp_size : self.pcp_size]
-            torch_npu._npu_reshape_and_cache(
+            DeviceOperator.reshape_and_cache(
                 key=key[:num_decode_tokens],
                 value=value[:num_decode_tokens],
                 key_cache=self.key_cache,
                 value_cache=self.value_cache,
-                slot_indices=slot_mapping,
+                slot_mapping=slot_mapping,
             )
 
         if has_prefill:
@@ -784,12 +785,12 @@ def reshape_and_cache(
             slot_mapping = attn_metadata.slot_mapping[
                 self.pcp_size * num_decode_tokens : attn_metadata.num_actual_tokens_pcp_padded
             ]
-            torch_npu._npu_reshape_and_cache(
+            DeviceOperator.reshape_and_cache(
                 key=prefill_key,
                 value=prefill_value,
                 key_cache=self.key_cache,
                 value_cache=self.value_cache,
-                slot_indices=slot_mapping,
+                slot_mapping=slot_mapping,
             )
             if self.is_kv_producer:
                 attn_metadata.reshape_cache_event.record()
```
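As a plain-Python sketch (token counts are hypothetical; only the slicing expressions mirror the hunks above), the decode path takes every `pcp_size`-th entry of the padded slot mapping, and the prefill path takes the tail:

```python
# Hypothetical sizes chosen for illustration.
pcp_size = 2
num_decode_tokens = 3
num_prefill_tokens = 4

# Padded slot mapping: pcp_size entries per decode token, then prefill entries.
slot_mapping_padded = list(range(num_decode_tokens * pcp_size + num_prefill_tokens))

# Decode side: every pcp_size-th entry (a strided, non-contiguous view in torch).
decode_slots = slot_mapping_padded[: num_decode_tokens * pcp_size : pcp_size]
print(decode_slots)   # [0, 2, 4]

# Prefill side: the remainder after the decode block.
prefill_slots = slot_mapping_padded[pcp_size * num_decode_tokens :]
print(prefill_slots)  # [6, 7, 8, 9]
```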

vllm_ascend/device/device_op.py

Lines changed: 5 additions & 1 deletion
```diff
@@ -204,7 +204,11 @@ class A5DeviceAdaptor(BaseDeviceAdaptor):
     @classmethod
     def reshape_and_cache(cls, key, value, key_cache, value_cache, slot_mapping):
         torch_npu.npu_scatter_pa_kv_cache(
-            key=key, value=value.contiguous(), key_cache=key_cache, value_cache=value_cache, slot_mapping=slot_mapping
+            key=key.contiguous(),
+            value=value.contiguous(),
+            key_cache=key_cache,
+            value_cache=value_cache,
+            slot_mapping=slot_mapping.contiguous(),
         )
 
     @staticmethod
```
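The `DeviceOperator` indirection this PR routes through follows a device-adaptor dispatch pattern. A minimal sketch (class names borrowed from the diff, selection logic hypothetical, with a stand-in body instead of `torch_npu.npu_scatter_pa_kv_cache`):

```python
class BaseDeviceAdaptor:
    """Default device routing; subclasses override ops with device-specific kernels."""

    @classmethod
    def reshape_and_cache(cls, key, value, key_cache, value_cache, slot_mapping):
        raise NotImplementedError("no default reshape-and-cache kernel in this sketch")


class A5DeviceAdaptor(BaseDeviceAdaptor):
    @classmethod
    def reshape_and_cache(cls, key, value, key_cache, value_cache, slot_mapping):
        # The A5 aclnn kernel requires contiguous inputs; the real adaptor calls
        # torch_npu.npu_scatter_pa_kv_cache after making key/value/slot_mapping
        # contiguous. Here we just return a marker for the chosen path.
        return "a5-aclnn-path"


def select_adaptor(soc_version: str) -> type:
    # Hypothetical selection logic: route A5 SoCs to the aclnn-based adaptor.
    return A5DeviceAdaptor if soc_version.startswith("A5") else BaseDeviceAdaptor
```

Callers then invoke `select_adaptor(...).reshape_and_cache(...)` without caring which kernel backs it, which is why `attention_cp.py` above can swap `torch_npu._npu_reshape_and_cache` for `DeviceOperator.reshape_and_cache` in one place.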

0 commit comments