
Commit e7d32ed

[BugFix] Fix the problem that torchair doesn't support tp > 4. (vllm-project#1508)
This PR removes the restriction that TP cannot be greater than 4 in the torchair scenario, because the latest version of CANN has fixed this bug.

- vLLM version: v0.10.0
- vLLM main: vllm-project/vllm@04ff4be

Signed-off-by: whx-sjtu <[email protected]>
1 parent 4a008c4 commit e7d32ed
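
Why the old assert capped TP at 4: per the removed assertion's message, num_queries_per_kv is num_heads / num_kv_heads after the tensor-parallel split, and with MLA each rank is left with a single latent KV head, so it works out to num_attention_heads / tp_size. DeepSeek-V3 and R1 have 128 attention heads, so tp in {1, 2, 4} gives {128, 64, 32} (all allowed), while tp = 8 gives 16, which the assert rejected. A minimal sketch of that arithmetic (hypothetical helper, not code from the repo; the 128-head figure and the one-KV-head-per-rank assumption are stated above):

# Sketch (not repo code): why {32, 64, 128} implied TP <= 4 for DeepSeek-V3/R1.
# Assumes 128 query heads and one latent KV head per rank under MLA, so
# num_queries_per_kv == num_attention_heads // tp_size.

_ALLOWED_NUM_QUERIES_PER_KV = [32, 64, 128]  # constant deleted by this commit

def passes_old_check(num_attention_heads: int, tp_size: int) -> bool:
    """Re-creation of the deleted torchair-graph assertion (hypothetical name)."""
    num_queries_per_kv = num_attention_heads // tp_size
    return num_queries_per_kv in _ALLOWED_NUM_QUERIES_PER_KV

for tp in (1, 2, 4, 8, 16):
    qpkv = 128 // tp
    print(f"tp={tp:<2} num_queries_per_kv={qpkv:<3} old_check={passes_old_check(128, tp)}")
# tp=1/2/4 -> 128/64/32 (allowed); tp=8 -> 16 and tp=16 -> 8 (previously rejected).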

1 file changed: +0 −11 lines changed

vllm_ascend/attention/mla_v1.py

Lines changed: 0 additions & 11 deletions
@@ -28,8 +28,6 @@
 if TYPE_CHECKING:
     from vllm.v1.core.sched.output import SchedulerOutput
 
-_ALLOWED_NUM_QUERIES_PER_KV = [32, 64, 128]
-
 
 class AscendMLABackend(AttentionBackend):
 
@@ -548,15 +546,6 @@ def __init__(
             self.spec_token_num = speculative_config.num_speculative_tokens
             assert self.spec_token_num > 0
 
-        # TODO: support numHeads / numKvHeads < 16 in MLA kernel
-        if self.torchair_graph_enabled:
-            assert self.num_queries_per_kv in _ALLOWED_NUM_QUERIES_PER_KV, \
-                ("The allowed number of queries per kv when enabling both MLA and Graph mode"
-                 " only support {32, 64, 128}, Thus this is not supported for DeepSeek-V2-Lite,"
-                 " as it only has 16 attention heads. And if you're using DeepSeek-V3 or DeepSeek-R1,"
-                 " please make sure after the tensor parallel split, num_heads / num_kv_heads in "
-                 "{32, 64, 128}.")
-
     def _v_up_proj_and_o_proj(self, x, enable_multistream_mla: bool = False):
         # Convert from (B, N, L) to (N, B, L)
         x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1)
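
Note: the deleted guard appears to be the only user of _ALLOWED_NUM_QUERIES_PER_KV, which is why the constant at the top of the file is removed in the same commit. Per the commit message, the per-kv-head limit is now handled by the fixed CANN MLA kernel, so configurations such as tp = 8 on DeepSeek-V3/R1 (num_queries_per_kv = 16) are no longer rejected at backend init.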
