
Commit f12f76d

Drop 0.10.2 (#3284)
authored by wangxiyuan

Drop v0.10.2 support; we support vLLM v0.11.0rc3 now.
- vLLM version: v0.11.0rc3
- vLLM main: vllm-project/vllm@releases/v0.11.0
Signed-off-by: wangxiyuan <[email protected]>

1 parent 2dde126 · commit f12f76d

17 files changed: +202 -653 lines changed
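
Every hunk below removes the same kind of branch: a runtime gate on vllm_version_is("0.10.2") from vllm_ascend.utils, which kept a 0.10.2-compatible code path alongside the current one. The helper's real implementation is not part of this diff, so the following is only a simplified, self-contained sketch of the pattern being deleted; the version strings, the default argument, and the maybe_ignore stand-in are chosen for the example only.

def vllm_version_is(target: str, installed: str = "0.11.0rc3") -> bool:
    # Simplified stand-in: the real helper compares the installed vLLM
    # version string against the target release.
    return installed == target

def pick_quant_config(quant_config, maybe_ignore=lambda cfg: None):
    # Old pattern, now removed: vLLM 0.10.2 needed the quant config filtered
    # (e.g. via _maybe_ignore_quant_config); newer vLLM takes it as-is.
    if vllm_version_is("0.10.2"):
        return maybe_ignore(quant_config)
    return quant_config

# With 0.10.2 support dropped, the gate always falls through to the second
# branch, so the commit deletes the branch and keeps only the plain path.
print(pick_quant_config({"method": "w8a8"}))  # -> {'method': 'w8a8'}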

vllm_ascend/models/qwen2_5_vl.py

Lines changed: 6 additions & 16 deletions
@@ -42,8 +42,6 @@
 from vllm.model_executor.models.utils import maybe_prefix
 from vllm.multimodal import MULTIMODAL_REGISTRY

-from vllm_ascend.utils import vllm_version_is
-
 MIN_PAD_SIZE = 64  # min_size to pad weight
 MAX_PAD_SIZE = 128  # max_size to pad weight

@@ -498,20 +496,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(vllm_config=vllm_config, prefix=prefix)
         config: Qwen2_5_VLConfig = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        if vllm_version_is("0.10.2"):
-            self.visual = AscendQwen2_5_VisionTransformer(
-                vision_config=config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=self._maybe_ignore_quant_config(quant_config),
-                prefix=maybe_prefix(prefix, "visual"),
-            )
-        else:
-            self.visual = AscendQwen2_5_VisionTransformer(
-                vision_config=config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=quant_config,
-                prefix=maybe_prefix(prefix, "visual"),
-            )
+        self.visual = AscendQwen2_5_VisionTransformer(
+            vision_config=config.vision_config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "visual"),
+        )

     def _process_image_input(self, image_input) -> tuple[torch.Tensor, ...]:

vllm_ascend/models/qwen2_5_vl_without_padding.py

Lines changed: 19 additions & 45 deletions
@@ -68,7 +68,6 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY

 from vllm_ascend.models.qwen2_5_vl import AscendQwen2_5_VisionRotaryEmbedding
-from vllm_ascend.utils import vllm_version_is


 class AscendQwen2_5_VisionAttention_Without_Padding(Qwen2_5_VisionAttention):
@@ -484,20 +483,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(vllm_config=vllm_config, prefix=prefix)
         config: Qwen2_5_VLConfig = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        if vllm_version_is("0.10.2"):
-            self.visual = AscendQwen2_5_VisionTransformer_Without_Padding(
-                vision_config=config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=self._maybe_ignore_quant_config(quant_config),
-                prefix=maybe_prefix(prefix, "visual"),
-            )
-        else:
-            self.visual = AscendQwen2_5_VisionTransformer_Without_Padding(
-                vision_config=config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=quant_config,
-                prefix=maybe_prefix(prefix, "visual"),
-            )
+        self.visual = AscendQwen2_5_VisionTransformer_Without_Padding(
+            vision_config=config.vision_config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "visual"),
+        )

     def _process_image_input(self, image_input) -> tuple[torch.Tensor, ...]:

@@ -563,20 +554,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(vllm_config=vllm_config, prefix=prefix)
         config: Qwen3VLConfig = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        if vllm_version_is("0.10.2"):
-            self.visual = AscendQwen3_VisionTransformer(
-                config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=self._maybe_ignore_quant_config(quant_config),
-                prefix=maybe_prefix(prefix, "visual"),
-                use_data_parallel=self.use_data_parallel)
-        else:
-            self.visual = AscendQwen3_VisionTransformer(
-                config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=quant_config,
-                prefix=maybe_prefix(prefix, "visual"),
-                use_data_parallel=self.use_data_parallel)
+        self.visual = AscendQwen3_VisionTransformer(
+            config.vision_config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "visual"),
+            use_data_parallel=self.use_data_parallel)


 @MULTIMODAL_REGISTRY.register_processor(Qwen3VLMultiModalProcessor,
@@ -613,19 +596,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         multimodal_config = vllm_config.model_config.multimodal_config
         self.multimodal_config = multimodal_config
         self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
-        if vllm_version_is("0.10.2"):
-            self.visual = AscendQwen3_VisionTransformer(
-                config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=self._maybe_ignore_quant_config(quant_config),
-                prefix=maybe_prefix(prefix, "visual"),
-                use_data_parallel=self.use_data_parallel,
-            )
-        else:
-            self.visual = AscendQwen3_VisionTransformer(
-                config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=quant_config,
-                prefix=maybe_prefix(prefix, "visual"),
-                use_data_parallel=self.use_data_parallel,
-            )
+        self.visual = AscendQwen3_VisionTransformer(
+            config.vision_config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "visual"),
+            use_data_parallel=self.use_data_parallel,
+        )

vllm_ascend/models/qwen2_vl.py

Lines changed: 6 additions & 17 deletions
@@ -40,8 +40,6 @@
 from vllm.model_executor.models.utils import maybe_prefix
 from vllm.multimodal import MULTIMODAL_REGISTRY

-from vllm_ascend.utils import vllm_version_is
-
 MIN_PAD_SIZE = 64  # min_size to pad weight
 MAX_PAD_SIZE = 128  # max_size to pad weight

@@ -345,18 +343,9 @@ class AscendQwen2VLForConditionalGeneration(Qwen2VLForConditionalGeneration):

     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(vllm_config=vllm_config, prefix=prefix)
-        if vllm_version_is("0.10.2"):
-            self.visual = AscendQwen2VisionTransformer(
-                self.config.vision_config,
-                norm_eps=getattr(self.config, "rms_norm_eps", 1e-6),
-                quant_config=self._maybe_ignore_quant_config(
-                    vllm_config.quant_config),
-                prefix=maybe_prefix(prefix, "visual"),
-            )
-        else:
-            self.visual = AscendQwen2VisionTransformer(
-                self.config.vision_config,
-                norm_eps=getattr(self.config, "rms_norm_eps", 1e-6),
-                quant_config=vllm_config.quant_config,
-                prefix=maybe_prefix(prefix, "visual"),
-            )
+        self.visual = AscendQwen2VisionTransformer(
+            self.config.vision_config,
+            norm_eps=getattr(self.config, "rms_norm_eps", 1e-6),
+            quant_config=vllm_config.quant_config,
+            prefix=maybe_prefix(prefix, "visual"),
+        )

vllm_ascend/models/qwen3_moe.py

Lines changed: 2 additions & 9 deletions
@@ -47,7 +47,6 @@
     make_empty_intermediate_tensors_factory, make_layers, maybe_prefix)

 from vllm_ascend.ops.fused_moe import AscendFusedMoE
-from vllm_ascend.utils import vllm_version_is


 class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock):
@@ -170,14 +169,8 @@ def __init__(
                     quant_config=quant_config,
                     prefix=f"{prefix}.mlp")
             else:
-                if vllm_version_is("0.10.2"):
-                    self.mlp = Qwen3MoeSparseMoeBlock(
-                        config=config,
-                        quant_config=quant_config,
-                        prefix=f"{prefix}.mlp")
-                else:
-                    self.mlp = Qwen3MoeSparseMoeBlock(vllm_config=vllm_config,
-                                                      prefix=f"{prefix}.mlp")
+                self.mlp = Qwen3MoeSparseMoeBlock(vllm_config=vllm_config,
+                                                  prefix=f"{prefix}.mlp")
         else:
             self.mlp = Qwen3MoeMLP(hidden_size=config.hidden_size,
                                    intermediate_size=config.intermediate_size,

vllm_ascend/ops/fused_moe.py

Lines changed: 9 additions & 21 deletions
@@ -43,8 +43,7 @@
 from vllm_ascend.ops.moe.moe_comm_method import setup_moe_comm_method
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ,
                                get_all_reduce_merge_state,
-                               get_rm_router_logits_state, is_310p,
-                               vllm_version_is)
+                               get_rm_router_logits_state, is_310p)


 class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
@@ -275,25 +274,14 @@ def __init__(
         if self.scoring_func != "softmax" and not self.use_grouped_topk:
             raise ValueError("Only softmax scoring function is supported for "
                              "non-grouped topk.")
-        if vllm_version_is("0.10.2"):
-            moe = FusedMoEConfig.make(
-                num_experts=self.global_num_experts,
-                experts_per_token=top_k,
-                hidden_dim=hidden_size,
-                num_local_experts=self.local_num_experts,
-                moe_parallel_config=self.moe_parallel_config,
-                # TODO (bnell): this needs to be fixed for quantized types.
-                in_dtype=params_dtype,
-                quant_config=quant_config)
-        else:
-            moe = FusedMoEConfig(
-                num_experts=self.global_num_experts,
-                experts_per_token=top_k,
-                hidden_dim=hidden_size,
-                num_local_experts=self.local_num_experts,
-                moe_parallel_config=self.moe_parallel_config,
-                in_dtype=params_dtype,
-            )
+        moe = FusedMoEConfig(
+            num_experts=self.global_num_experts,
+            experts_per_token=top_k,
+            hidden_dim=hidden_size,
+            num_local_experts=self.local_num_experts,
+            moe_parallel_config=self.moe_parallel_config,
+            in_dtype=params_dtype,
+        )
         self.moe_config = moe
         # TODO: The self.moe_config.tp_size here is not correct, fixme soon
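
Aside on the hunk above: on vLLM 0.10.2 the MoE config came from the FusedMoEConfig.make(...) factory (which also accepted quant_config), while current vLLM constructs FusedMoEConfig directly, as the replacement lines show. Below is a rough, self-contained illustration of that shift using a stand-in dataclass rather than vLLM's real class; the field names are copied from the hunk, the values are invented for the example.

from dataclasses import dataclass

@dataclass
class FusedMoEConfigSketch:
    # Stand-in mirroring the keyword arguments visible in the hunk above;
    # not the actual vllm.model_executor.layers.fused_moe.FusedMoEConfig.
    num_experts: int
    experts_per_token: int
    hidden_dim: int
    num_local_experts: int

# After this commit the config is constructed directly, with no factory call
# and no version gate.
moe = FusedMoEConfigSketch(num_experts=64,
                           experts_per_token=8,
                           hidden_dim=4096,
                           num_local_experts=8)
print(moe)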

vllm_ascend/ops/moe/fused_moe_prepare_and_finalize.py

Lines changed: 2 additions & 8 deletions
@@ -26,8 +26,6 @@
 from vllm.forward_context import get_forward_context
 from vllm.model_executor.layers.fused_moe import FusedMoEConfig

-from vllm_ascend.utils import vllm_version_is
-

 class FusedMoEPrepareAndFinalize(ABC):
     """
@@ -416,12 +414,8 @@ def prepare(self,
         self.enable_shared_expert_dp = enable_shared_expert_dp

         if self.moe_config.dp_size > 1:
-            if vllm_version_is("0.10.2"):
-                self.cu_tokens_across_dp_cpu = get_forward_context(
-                ).dp_metadata.cu_tokens_across_dp_cpu
-            else:
-                self.cu_tokens_across_dp_cpu = get_forward_context(
-                ).dp_metadata.cu_tokens_across_sp(1)
+            self.cu_tokens_across_dp_cpu = get_forward_context(
+            ).dp_metadata.cu_tokens_across_sp(1)
             hidden_states = self._naive_multicast(hidden_states,
                                                   self.cu_tokens_across_dp_cpu)
         if rm_router_logits:
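
Context for the prepare() hunk: per the replacement lines, newer vLLM exposes the cumulative token counts through dp_metadata.cu_tokens_across_sp(1) instead of the old cu_tokens_across_dp_cpu attribute, and _naive_multicast uses those cumulative counts to slice each data-parallel rank's span of hidden states. A small stand-in example of what such a cumulative tensor looks like (invented rank sizes, not vLLM's actual DPMetadata API):

import torch

# Hypothetical number of tokens owned by each data-parallel rank.
tokens_per_dp_rank = torch.tensor([5, 3, 7, 1])
# Cumulative counts: rank i's tokens live in [cu_tokens[i-1], cu_tokens[i]).
cu_tokens = torch.cumsum(tokens_per_dp_rank, dim=0)
print(cu_tokens)  # tensor([ 5,  8, 15, 16])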

vllm_ascend/patch/worker/patch_common/patch_attention_layer.py

Lines changed: 7 additions & 21 deletions
@@ -16,8 +16,6 @@
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.platforms import current_platform

-from vllm_ascend.utils import vllm_version_is
-

 class AscendAttention(Attention, nn.Module, AttentionLayerBase):
     """Attention layer.
@@ -69,12 +67,10 @@ def __init__(
         if cache_config is not None:
             kv_cache_dtype = cache_config.cache_dtype
             block_size = cache_config.block_size
-            is_attention_free = cache_config.is_attention_free
             calculate_kv_scales = cache_config.calculate_kv_scales
         else:
             kv_cache_dtype = "auto"
             block_size = 16
-            is_attention_free = False
             calculate_kv_scales = False
         if num_kv_heads is None:
             num_kv_heads = num_heads
@@ -135,23 +131,13 @@ def __init__(
         # weight and activation dtype.
         dtype = torch.get_default_dtype()
         if attn_backend is None:
-            if vllm_version_is("0.10.2"):
-                self.attn_backend = get_attn_backend(head_size,
-                                                     dtype,
-                                                     kv_cache_dtype,
-                                                     block_size,
-                                                     is_attention_free,
-                                                     use_mla=use_mla,
-                                                     use_sfa=use_sfa,
-                                                     has_sink=self.has_sink)
-            else:
-                self.attn_backend = get_attn_backend(head_size,
-                                                     dtype,
-                                                     kv_cache_dtype,
-                                                     block_size,
-                                                     use_mla=use_mla,
-                                                     use_sfa=use_sfa,
-                                                     has_sink=self.has_sink)
+            self.attn_backend = get_attn_backend(head_size,
+                                                 dtype,
+                                                 kv_cache_dtype,
+                                                 block_size,
+                                                 use_mla=use_mla,
+                                                 use_sfa=use_sfa,
+                                                 has_sink=self.has_sink)
         else:
             self.attn_backend = attn_backend
