Skip to content

Commit d4a3fbe

Browse files
author
p00465316
committed
fix e2e
Signed-off-by: p00465316 <[email protected]>
1 parent d2374c0 commit d4a3fbe

File tree

3 files changed: +6 / -6 lines changed

3 files changed: +6 / -6 lines changed

vllm_ascend/attention/attention_v1_torchair.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -378,8 +378,9 @@ def forward(
378378
shape = [batch_size * seq_len, num_heads, head_size]
379379
"""
380380
num_tokens = query.shape[0]
381-
use_kv_cache_quant = len(
382-
kv_cache) > 0 and kv_cache[0].dtype == torch.int8
381+
use_kv_cache_quant = (kv_cache is not None and len(kv_cache) > 0
382+
and kv_cache[0].numel() > 0
383+
and kv_cache[0].dtype == torch.int8)
383384
if output is None:
384385
output = torch.empty(num_tokens,
385386
self.num_heads,

vllm_ascend/models/qwen3_moe.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -373,9 +373,7 @@ def forward(
373373

374374
if not self.use_aclgraph:
375375
hidden_states = self.mlp(
376-
hidden_states,
377-
attn_metadata,
378-
_metadata_for_padding=_metadata_for_padding)
376+
hidden_states, _metadata_for_padding=_metadata_for_padding)
379377
else:
380378
hidden_states = self.mlp(hidden_states)
381379

vllm_ascend/ops/rotary_embedding.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -305,7 +305,8 @@ def rope_forward(
305305
is_prefill: Optional[bool] = True,
306306
is_qwen_torchair: Optional[bool] = False,
307307
):
308-
if not get_ascend_config().torchair_graph_config.enabled or is_prefill:
308+
if (not get_ascend_config().torchair_graph_config.enabled
309+
or not is_qwen_torchair or is_prefill):
309310
return rope_forward_oot(self, positions, query, key, offsets,
310311
is_neox_style_override,
311312
is_qwen_torchair) # type: ignore

0 commit comments

Comments (0)