22 changes: 6 additions & 16 deletions vllm_ascend/models/qwen2_5_vl.py
@@ -42,8 +42,6 @@
 from vllm.model_executor.models.utils import maybe_prefix
 from vllm.multimodal import MULTIMODAL_REGISTRY
 
-from vllm_ascend.utils import vllm_version_is
-
 MIN_PAD_SIZE = 64  # min_size to pad weight
 MAX_PAD_SIZE = 128  # max_size to pad weight
 
@@ -498,20 +496,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(vllm_config=vllm_config, prefix=prefix)
         config: Qwen2_5_VLConfig = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        if vllm_version_is("0.10.2"):
-            self.visual = AscendQwen2_5_VisionTransformer(
-                vision_config=config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=self._maybe_ignore_quant_config(quant_config),
-                prefix=maybe_prefix(prefix, "visual"),
-            )
-        else:
-            self.visual = AscendQwen2_5_VisionTransformer(
-                vision_config=config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=quant_config,
-                prefix=maybe_prefix(prefix, "visual"),
-            )
+        self.visual = AscendQwen2_5_VisionTransformer(
+            vision_config=config.vision_config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "visual"),
+        )
 
     def _process_image_input(self, image_input) -> tuple[torch.Tensor, ...]:
 
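Note: every hunk in this PR follows the same pattern. A branch guarded by vllm_ascend.utils.vllm_version_is("0.10.2") kept a legacy call path for vLLM 0.10.2, and the PR deletes that branch so only the newer call signature survives. Purely as a hedged sketch (not the actual helper in vllm_ascend.utils, which may handle version overrides differently), a version gate of this kind can look like:

    # Hedged sketch of a version gate in the spirit of vllm_ascend.utils.vllm_version_is.
    # Assumption: it compares the installed vLLM release string to a target version.
    from vllm import __version__ as VLLM_VERSION


    def vllm_version_is(target: str) -> bool:
        """Return True when the installed vLLM matches `target` exactly."""
        return VLLM_VERSION == target


    # Before this PR:  if vllm_version_is("0.10.2"): <legacy kwargs> else: <current kwargs>
    # After this PR:   only the current-kwargs path remains, so these modules no
    #                  longer build the vision tower the 0.10.2 way.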
64 changes: 19 additions & 45 deletions vllm_ascend/models/qwen2_5_vl_without_padding.py
@@ -68,7 +68,6 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 
 from vllm_ascend.models.qwen2_5_vl import AscendQwen2_5_VisionRotaryEmbedding
-from vllm_ascend.utils import vllm_version_is
 
 
 class AscendQwen2_5_VisionAttention_Without_Padding(Qwen2_5_VisionAttention):
@@ -484,20 +483,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(vllm_config=vllm_config, prefix=prefix)
         config: Qwen2_5_VLConfig = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        if vllm_version_is("0.10.2"):
-            self.visual = AscendQwen2_5_VisionTransformer_Without_Padding(
-                vision_config=config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=self._maybe_ignore_quant_config(quant_config),
-                prefix=maybe_prefix(prefix, "visual"),
-            )
-        else:
-            self.visual = AscendQwen2_5_VisionTransformer_Without_Padding(
-                vision_config=config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=quant_config,
-                prefix=maybe_prefix(prefix, "visual"),
-            )
+        self.visual = AscendQwen2_5_VisionTransformer_Without_Padding(
+            vision_config=config.vision_config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "visual"),
+        )
 
     def _process_image_input(self, image_input) -> tuple[torch.Tensor, ...]:
 
@@ -563,20 +554,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(vllm_config=vllm_config, prefix=prefix)
         config: Qwen3VLConfig = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        if vllm_version_is("0.10.2"):
-            self.visual = AscendQwen3_VisionTransformer(
-                config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=self._maybe_ignore_quant_config(quant_config),
-                prefix=maybe_prefix(prefix, "visual"),
-                use_data_parallel=self.use_data_parallel)
-        else:
-            self.visual = AscendQwen3_VisionTransformer(
-                config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=quant_config,
-                prefix=maybe_prefix(prefix, "visual"),
-                use_data_parallel=self.use_data_parallel)
+        self.visual = AscendQwen3_VisionTransformer(
+            config.vision_config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "visual"),
+            use_data_parallel=self.use_data_parallel)
 
 
 @MULTIMODAL_REGISTRY.register_processor(Qwen3VLMultiModalProcessor,
@@ -613,19 +596,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         multimodal_config = vllm_config.model_config.multimodal_config
         self.multimodal_config = multimodal_config
         self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
-        if vllm_version_is("0.10.2"):
-            self.visual = AscendQwen3_VisionTransformer(
-                config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=self._maybe_ignore_quant_config(quant_config),
-                prefix=maybe_prefix(prefix, "visual"),
-                use_data_parallel=self.use_data_parallel,
-            )
-        else:
-            self.visual = AscendQwen3_VisionTransformer(
-                config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=quant_config,
-                prefix=maybe_prefix(prefix, "visual"),
-                use_data_parallel=self.use_data_parallel,
-            )
+        self.visual = AscendQwen3_VisionTransformer(
+            config.vision_config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "visual"),
+            use_data_parallel=self.use_data_parallel,
+        )
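One detail worth calling out in the deleted branches above: the 0.10.2 path routed quant_config through self._maybe_ignore_quant_config(...) before building the vision tower, whereas the retained path passes quant_config straight through. Upstream, helpers of that name have typically been used to skip quantizing the vision encoder for certain schemes. The sketch below only illustrates that idea; the scheme names are assumptions, and it is not the upstream implementation:

    # Hedged sketch of a "_maybe_ignore_quant_config"-style helper.
    # Assumption: some checkpoints (e.g. GPTQ-style) quantize only the language
    # model, so the vision tower should be built without a quant config.
    from typing import Optional

    from vllm.model_executor.layers.quantization import QuantizationConfig


    def maybe_ignore_quant_config(
            quant_config: Optional[QuantizationConfig]
    ) -> Optional[QuantizationConfig]:
        # The scheme names here are illustrative, not an exhaustive list.
        if quant_config is not None and quant_config.get_name() in ("gptq",
                                                                    "gptq_marlin"):
            return None  # leave the visual encoder unquantized
        return quant_config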
23 changes: 6 additions & 17 deletions vllm_ascend/models/qwen2_vl.py
@@ -40,8 +40,6 @@
 from vllm.model_executor.models.utils import maybe_prefix
 from vllm.multimodal import MULTIMODAL_REGISTRY
 
-from vllm_ascend.utils import vllm_version_is
-
 MIN_PAD_SIZE = 64  # min_size to pad weight
 MAX_PAD_SIZE = 128  # max_size to pad weight
 
@@ -345,18 +343,9 @@ class AscendQwen2VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(vllm_config=vllm_config, prefix=prefix)
-        if vllm_version_is("0.10.2"):
-            self.visual = AscendQwen2VisionTransformer(
-                self.config.vision_config,
-                norm_eps=getattr(self.config, "rms_norm_eps", 1e-6),
-                quant_config=self._maybe_ignore_quant_config(
-                    vllm_config.quant_config),
-                prefix=maybe_prefix(prefix, "visual"),
-            )
-        else:
-            self.visual = AscendQwen2VisionTransformer(
-                self.config.vision_config,
-                norm_eps=getattr(self.config, "rms_norm_eps", 1e-6),
-                quant_config=vllm_config.quant_config,
-                prefix=maybe_prefix(prefix, "visual"),
-            )
+        self.visual = AscendQwen2VisionTransformer(
+            self.config.vision_config,
+            norm_eps=getattr(self.config, "rms_norm_eps", 1e-6),
+            quant_config=vllm_config.quant_config,
+            prefix=maybe_prefix(prefix, "visual"),
+        )
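For context, the constructors touched above run whenever one of these models is instantiated through vLLM's normal entry points; the change is internal to the constructors. A minimal usage sketch follows (model name, length limit and sampling settings are illustrative assumptions):

    # Hedged usage sketch: loading a Qwen2.5-VL checkpoint on an Ascend build of vLLM.
    from vllm import LLM, SamplingParams

    llm = LLM(model="Qwen/Qwen2.5-VL-7B-Instruct", max_model_len=8192)
    outputs = llm.generate(["Describe a sunset over the sea."],
                           SamplingParams(max_tokens=64))
    print(outputs[0].outputs[0].text)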
11 changes: 2 additions & 9 deletions vllm_ascend/models/qwen3_moe.py
@@ -47,7 +47,6 @@
     make_empty_intermediate_tensors_factory, make_layers, maybe_prefix)
 
 from vllm_ascend.ops.fused_moe import AscendFusedMoE
-from vllm_ascend.utils import vllm_version_is
 
 
 class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock):
@@ -170,14 +169,8 @@ def __init__(
                     quant_config=quant_config,
                     prefix=f"{prefix}.mlp")
             else:
-                if vllm_version_is("0.10.2"):
-                    self.mlp = Qwen3MoeSparseMoeBlock(
-                        config=config,
-                        quant_config=quant_config,
-                        prefix=f"{prefix}.mlp")
-                else:
-                    self.mlp = Qwen3MoeSparseMoeBlock(vllm_config=vllm_config,
-                                                      prefix=f"{prefix}.mlp")
+                self.mlp = Qwen3MoeSparseMoeBlock(vllm_config=vllm_config,
+                                                  prefix=f"{prefix}.mlp")
         else:
             self.mlp = Qwen3MoeMLP(hidden_size=config.hidden_size,
                                    intermediate_size=config.intermediate_size,
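The MoE hunk reflects an upstream constructor change: newer vLLM builds Qwen3MoeSparseMoeBlock from the whole VllmConfig rather than from separate config/quant_config arguments, which is why the retained branch passes only vllm_config and prefix. A rough sketch of that idiom follows; the class and attribute names below are illustrative, not the upstream class:

    # Hedged sketch of the "accept VllmConfig, unpack what you need" idiom that
    # the retained call site relies on.
    import torch.nn as nn
    from vllm.config import VllmConfig


    class SparseMoeBlockLike(nn.Module):

        def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
            super().__init__()
            # Everything the layer needs is derived from the single config object,
            # so callers no longer thread config/quant_config through explicitly.
            hf_config = vllm_config.model_config.hf_config
            quant_config = vllm_config.quant_config
            self.num_experts = getattr(hf_config, "num_experts", 0)  # assumed field
            self.prefix = prefix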
30 changes: 9 additions & 21 deletions vllm_ascend/ops/fused_moe.py
@@ -43,8 +43,7 @@
 from vllm_ascend.ops.moe.moe_comm_method import setup_moe_comm_method
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ,
                                get_all_reduce_merge_state,
-                               get_rm_router_logits_state, is_310p,
-                               vllm_version_is)
+                               get_rm_router_logits_state, is_310p)
 
 
 class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
@@ -275,25 +274,14 @@ def __init__(
         if self.scoring_func != "softmax" and not self.use_grouped_topk:
             raise ValueError("Only softmax scoring function is supported for "
                              "non-grouped topk.")
-        if vllm_version_is("0.10.2"):
-            moe = FusedMoEConfig.make(
-                num_experts=self.global_num_experts,
-                experts_per_token=top_k,
-                hidden_dim=hidden_size,
-                num_local_experts=self.local_num_experts,
-                moe_parallel_config=self.moe_parallel_config,
-                # TODO (bnell): this needs to be fixed for quantized types.
-                in_dtype=params_dtype,
-                quant_config=quant_config)
-        else:
-            moe = FusedMoEConfig(
-                num_experts=self.global_num_experts,
-                experts_per_token=top_k,
-                hidden_dim=hidden_size,
-                num_local_experts=self.local_num_experts,
-                moe_parallel_config=self.moe_parallel_config,
-                in_dtype=params_dtype,
-            )
+        moe = FusedMoEConfig(
+            num_experts=self.global_num_experts,
+            experts_per_token=top_k,
+            hidden_dim=hidden_size,
+            num_local_experts=self.local_num_experts,
+            moe_parallel_config=self.moe_parallel_config,
+            in_dtype=params_dtype,
+        )
         self.moe_config = moe
         # TODO: The self.moe_config.tp_size here is not correct, fixme soon
 
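Here the retained branch constructs FusedMoEConfig directly and drops the quant_config argument that the 0.10.2 factory FusedMoEConfig.make(...) accepted, which suggests quantization details are now carried outside this config object. Purely as an illustration of the fields this call site populates (not the upstream class definition, which has more fields and validation):

    # Hedged sketch of the fields filled in at this call site.
    from dataclasses import dataclass

    import torch


    @dataclass
    class FusedMoEConfigSketch:
        num_experts: int             # global expert count across all ranks
        experts_per_token: int       # top-k experts routed per token
        hidden_dim: int              # model hidden size
        num_local_experts: int       # experts owned by the local expert-parallel rank
        moe_parallel_config: object  # TP/EP/DP layout for the MoE layers
        in_dtype: torch.dtype        # activation dtype entering the MoE block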
10 changes: 2 additions & 8 deletions vllm_ascend/ops/moe/fused_moe_prepare_and_finalize.py
@@ -26,8 +26,6 @@
 from vllm.forward_context import get_forward_context
 from vllm.model_executor.layers.fused_moe import FusedMoEConfig
 
-from vllm_ascend.utils import vllm_version_is
-
 
 class FusedMoEPrepareAndFinalize(ABC):
     """
@@ -416,12 +414,8 @@ def prepare(self,
         self.enable_shared_expert_dp = enable_shared_expert_dp
 
         if self.moe_config.dp_size > 1:
-            if vllm_version_is("0.10.2"):
-                self.cu_tokens_across_dp_cpu = get_forward_context(
-                ).dp_metadata.cu_tokens_across_dp_cpu
-            else:
-                self.cu_tokens_across_dp_cpu = get_forward_context(
-                ).dp_metadata.cu_tokens_across_sp(1)
+            self.cu_tokens_across_dp_cpu = get_forward_context(
+            ).dp_metadata.cu_tokens_across_sp(1)
             hidden_states = self._naive_multicast(hidden_states,
                                                   self.cu_tokens_across_dp_cpu)
             if rm_router_logits:
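The DP-metadata accessor changed upstream as well: the old cu_tokens_across_dp_cpu attribute is replaced by cu_tokens_across_sp(...), called here with 1 and stored under the old attribute name, which implies the same per-DP-rank cumulative token counts. Below is a hedged sketch of how such cumulative counts drive a naive multicast; the helper is hypothetical and only mirrors the call shape seen in the hunk:

    # Hedged sketch: using cumulative token counts across DP ranks to place this
    # rank's tokens into a globally sized buffer before an exchange across ranks.
    # Layout assumption: cu_tokens[i] = total tokens contributed by ranks 0..i.
    import torch


    def naive_multicast_sketch(hidden_states: torch.Tensor,
                               cu_tokens: torch.Tensor,
                               dp_rank: int) -> torch.Tensor:
        total = int(cu_tokens[-1])
        buffer = hidden_states.new_zeros((total, hidden_states.shape[-1]))
        start = 0 if dp_rank == 0 else int(cu_tokens[dp_rank - 1])
        end = int(cu_tokens[dp_rank])
        buffer[start:end].copy_(hidden_states)
        # The real implementation would now exchange `buffer` across the DP group
        # so that every rank ends up with all tokens.
        return buffer


    # Example: ranks contribute 4, 2 and 5 tokens; this rank (rank 1) holds 2.
    cu = torch.tensor([4, 6, 11])
    out = naive_multicast_sketch(torch.randn(2, 8), cu, dp_rank=1)
    print(out.shape)  # torch.Size([11, 8])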
28 changes: 7 additions & 21 deletions vllm_ascend/patch/worker/patch_common/patch_attention_layer.py
@@ -16,8 +16,6 @@
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.platforms import current_platform
 
-from vllm_ascend.utils import vllm_version_is
-
 
 class AscendAttention(Attention, nn.Module, AttentionLayerBase):
     """Attention layer.
@@ -69,12 +67,10 @@ def __init__(
         if cache_config is not None:
             kv_cache_dtype = cache_config.cache_dtype
             block_size = cache_config.block_size
-            is_attention_free = cache_config.is_attention_free
             calculate_kv_scales = cache_config.calculate_kv_scales
         else:
             kv_cache_dtype = "auto"
             block_size = 16
-            is_attention_free = False
             calculate_kv_scales = False
         if num_kv_heads is None:
             num_kv_heads = num_heads
@@ -135,23 +131,13 @@ def __init__(
         # weight and activation dtype.
         dtype = torch.get_default_dtype()
         if attn_backend is None:
-            if vllm_version_is("0.10.2"):
-                self.attn_backend = get_attn_backend(head_size,
-                                                     dtype,
-                                                     kv_cache_dtype,
-                                                     block_size,
-                                                     is_attention_free,
-                                                     use_mla=use_mla,
-                                                     use_sfa=use_sfa,
-                                                     has_sink=self.has_sink)
-            else:
-                self.attn_backend = get_attn_backend(head_size,
-                                                     dtype,
-                                                     kv_cache_dtype,
-                                                     block_size,
-                                                     use_mla=use_mla,
-                                                     use_sfa=use_sfa,
-                                                     has_sink=self.has_sink)
+            self.attn_backend = get_attn_backend(head_size,
+                                                 dtype,
+                                                 kv_cache_dtype,
+                                                 block_size,
+                                                 use_mla=use_mla,
+                                                 use_sfa=use_sfa,
+                                                 has_sink=self.has_sink)
         else:
             self.attn_backend = attn_backend
 
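Finally, the attention-layer patch drops the is_attention_free argument (and the locals that fed it), matching the narrower get_attn_backend signature the retained call uses. This PR simply deletes the legacy path; when dual support for two signatures is genuinely needed, a generic shim like the hedged sketch below is one alternative to scattering version checks (it is not something this PR or vLLM does):

    # Hedged sketch of a signature-compatibility shim: drop keyword arguments the
    # callee does not accept. Shown only as a contrast to version-gated branches.
    import inspect
    from typing import Any, Callable


    def call_with_supported_kwargs(fn: Callable[..., Any], *args: Any,
                                   **kwargs: Any) -> Any:
        params = inspect.signature(fn).parameters
        accepts_var_kw = any(p.kind is inspect.Parameter.VAR_KEYWORD
                             for p in params.values())
        if not accepts_var_kw:
            kwargs = {k: v for k, v in kwargs.items() if k in params}
        return fn(*args, **kwargs)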