diff --git a/vllm_ascend/models/qwen2_5_vl.py b/vllm_ascend/models/qwen2_5_vl.py
index f240fd1c7e..33b4be2b58 100644
--- a/vllm_ascend/models/qwen2_5_vl.py
+++ b/vllm_ascend/models/qwen2_5_vl.py
@@ -42,8 +42,6 @@ from vllm.model_executor.models.utils import maybe_prefix
 from vllm.multimodal import MULTIMODAL_REGISTRY
 
-from vllm_ascend.utils import vllm_version_is
-
 MIN_PAD_SIZE = 64  # min_size to pad weight
 MAX_PAD_SIZE = 128  # max_size to pad weight
 
@@ -498,20 +496,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(vllm_config=vllm_config, prefix=prefix)
         config: Qwen2_5_VLConfig = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        if vllm_version_is("0.10.2"):
-            self.visual = AscendQwen2_5_VisionTransformer(
-                vision_config=config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=self._maybe_ignore_quant_config(quant_config),
-                prefix=maybe_prefix(prefix, "visual"),
-            )
-        else:
-            self.visual = AscendQwen2_5_VisionTransformer(
-                vision_config=config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=quant_config,
-                prefix=maybe_prefix(prefix, "visual"),
-            )
+        self.visual = AscendQwen2_5_VisionTransformer(
+            vision_config=config.vision_config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "visual"),
+        )
 
     def _process_image_input(self, image_input) -> tuple[torch.Tensor, ...]:
diff --git a/vllm_ascend/models/qwen2_5_vl_without_padding.py b/vllm_ascend/models/qwen2_5_vl_without_padding.py
index f62009b341..6c3bbc8cfa 100644
--- a/vllm_ascend/models/qwen2_5_vl_without_padding.py
+++ b/vllm_ascend/models/qwen2_5_vl_without_padding.py
@@ -68,7 +68,6 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
 
 from vllm_ascend.models.qwen2_5_vl import AscendQwen2_5_VisionRotaryEmbedding
-from vllm_ascend.utils import vllm_version_is
 
 
 class AscendQwen2_5_VisionAttention_Without_Padding(Qwen2_5_VisionAttention):
@@ -484,20 +483,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(vllm_config=vllm_config, prefix=prefix)
         config: Qwen2_5_VLConfig = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        if vllm_version_is("0.10.2"):
-            self.visual = AscendQwen2_5_VisionTransformer_Without_Padding(
-                vision_config=config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=self._maybe_ignore_quant_config(quant_config),
-                prefix=maybe_prefix(prefix, "visual"),
-            )
-        else:
-            self.visual = AscendQwen2_5_VisionTransformer_Without_Padding(
-                vision_config=config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=quant_config,
-                prefix=maybe_prefix(prefix, "visual"),
-            )
+        self.visual = AscendQwen2_5_VisionTransformer_Without_Padding(
+            vision_config=config.vision_config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "visual"),
+        )
 
     def _process_image_input(self, image_input) -> tuple[torch.Tensor, ...]:
@@ -563,20 +554,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(vllm_config=vllm_config, prefix=prefix)
         config: Qwen3VLConfig = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        if vllm_version_is("0.10.2"):
-            self.visual = AscendQwen3_VisionTransformer(
-                config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=self._maybe_ignore_quant_config(quant_config),
-                prefix=maybe_prefix(prefix, "visual"),
-                use_data_parallel=self.use_data_parallel)
-        else:
-            self.visual = AscendQwen3_VisionTransformer(
-                config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=quant_config,
-                prefix=maybe_prefix(prefix, "visual"),
-                use_data_parallel=self.use_data_parallel)
+        self.visual = AscendQwen3_VisionTransformer(
+            config.vision_config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "visual"),
+            use_data_parallel=self.use_data_parallel)
 
 
 @MULTIMODAL_REGISTRY.register_processor(Qwen3VLMultiModalProcessor,
@@ -613,19 +596,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         multimodal_config = vllm_config.model_config.multimodal_config
         self.multimodal_config = multimodal_config
         self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
-        if vllm_version_is("0.10.2"):
-            self.visual = AscendQwen3_VisionTransformer(
-                config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=self._maybe_ignore_quant_config(quant_config),
-                prefix=maybe_prefix(prefix, "visual"),
-                use_data_parallel=self.use_data_parallel,
-            )
-        else:
-            self.visual = AscendQwen3_VisionTransformer(
-                config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=quant_config,
-                prefix=maybe_prefix(prefix, "visual"),
-                use_data_parallel=self.use_data_parallel,
-            )
+        self.visual = AscendQwen3_VisionTransformer(
+            config.vision_config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "visual"),
+            use_data_parallel=self.use_data_parallel,
+        )
diff --git a/vllm_ascend/models/qwen2_vl.py b/vllm_ascend/models/qwen2_vl.py
index 9648e070b1..d9b3e035bd 100644
--- a/vllm_ascend/models/qwen2_vl.py
+++ b/vllm_ascend/models/qwen2_vl.py
@@ -40,8 +40,6 @@ from vllm.model_executor.models.utils import maybe_prefix
 from vllm.multimodal import MULTIMODAL_REGISTRY
 
-from vllm_ascend.utils import vllm_version_is
-
 MIN_PAD_SIZE = 64  # min_size to pad weight
 MAX_PAD_SIZE = 128  # max_size to pad weight
 
@@ -345,18 +343,9 @@ class AscendQwen2VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(vllm_config=vllm_config, prefix=prefix)
-        if vllm_version_is("0.10.2"):
-            self.visual = AscendQwen2VisionTransformer(
-                self.config.vision_config,
-                norm_eps=getattr(self.config, "rms_norm_eps", 1e-6),
-                quant_config=self._maybe_ignore_quant_config(
-                    vllm_config.quant_config),
-                prefix=maybe_prefix(prefix, "visual"),
-            )
-        else:
-            self.visual = AscendQwen2VisionTransformer(
-                self.config.vision_config,
-                norm_eps=getattr(self.config, "rms_norm_eps", 1e-6),
-                quant_config=vllm_config.quant_config,
-                prefix=maybe_prefix(prefix, "visual"),
-            )
+        self.visual = AscendQwen2VisionTransformer(
+            self.config.vision_config,
+            norm_eps=getattr(self.config, "rms_norm_eps", 1e-6),
+            quant_config=vllm_config.quant_config,
+            prefix=maybe_prefix(prefix, "visual"),
+        )
diff --git a/vllm_ascend/models/qwen3_moe.py b/vllm_ascend/models/qwen3_moe.py
index bc0a04ebb5..d04349c2f6 100644
--- a/vllm_ascend/models/qwen3_moe.py
+++ b/vllm_ascend/models/qwen3_moe.py
@@ -47,7 +47,6 @@
     make_empty_intermediate_tensors_factory, make_layers, maybe_prefix)
 from vllm_ascend.ops.fused_moe import AscendFusedMoE
-from vllm_ascend.utils import vllm_version_is
 
 
 class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock):
@@ -170,14 +169,8 @@ def __init__(
                                                 quant_config=quant_config,
                                                 prefix=f"{prefix}.mlp")
             else:
-                if vllm_version_is("0.10.2"):
-                    self.mlp = Qwen3MoeSparseMoeBlock(
-                        config=config,
-                        quant_config=quant_config,
-                        prefix=f"{prefix}.mlp")
-                else:
-                    self.mlp = Qwen3MoeSparseMoeBlock(vllm_config=vllm_config,
-                                                      prefix=f"{prefix}.mlp")
+                self.mlp = Qwen3MoeSparseMoeBlock(vllm_config=vllm_config,
+                                                  prefix=f"{prefix}.mlp")
         else:
             self.mlp = Qwen3MoeMLP(hidden_size=config.hidden_size,
                                    intermediate_size=config.intermediate_size,
diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py
index 97489f9ac3..045fd0fbba 100644
--- a/vllm_ascend/ops/fused_moe.py
+++ b/vllm_ascend/ops/fused_moe.py
@@ -43,8 +43,7 @@ from vllm_ascend.ops.moe.moe_comm_method import setup_moe_comm_method
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ,
                                get_all_reduce_merge_state,
-                               get_rm_router_logits_state, is_310p,
-                               vllm_version_is)
+                               get_rm_router_logits_state, is_310p)
 
 
 class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
@@ -275,25 +274,14 @@ def __init__(
         if self.scoring_func != "softmax" and not self.use_grouped_topk:
             raise ValueError("Only softmax scoring function is supported for "
                              "non-grouped topk.")
-        if vllm_version_is("0.10.2"):
-            moe = FusedMoEConfig.make(
-                num_experts=self.global_num_experts,
-                experts_per_token=top_k,
-                hidden_dim=hidden_size,
-                num_local_experts=self.local_num_experts,
-                moe_parallel_config=self.moe_parallel_config,
-                # TODO (bnell): this needs to be fixed for quantized types.
-                in_dtype=params_dtype,
-                quant_config=quant_config)
-        else:
-            moe = FusedMoEConfig(
-                num_experts=self.global_num_experts,
-                experts_per_token=top_k,
-                hidden_dim=hidden_size,
-                num_local_experts=self.local_num_experts,
-                moe_parallel_config=self.moe_parallel_config,
-                in_dtype=params_dtype,
-            )
+        moe = FusedMoEConfig(
+            num_experts=self.global_num_experts,
+            experts_per_token=top_k,
+            hidden_dim=hidden_size,
+            num_local_experts=self.local_num_experts,
+            moe_parallel_config=self.moe_parallel_config,
+            in_dtype=params_dtype,
+        )
 
         self.moe_config = moe
         # TODO: The self.moe_config.tp_size here is not correct, fixme soon
diff --git a/vllm_ascend/ops/moe/fused_moe_prepare_and_finalize.py b/vllm_ascend/ops/moe/fused_moe_prepare_and_finalize.py
index 3d800e4a94..109ba2f9d2 100644
--- a/vllm_ascend/ops/moe/fused_moe_prepare_and_finalize.py
+++ b/vllm_ascend/ops/moe/fused_moe_prepare_and_finalize.py
@@ -26,8 +26,6 @@ from vllm.forward_context import get_forward_context
 from vllm.model_executor.layers.fused_moe import FusedMoEConfig
 
-from vllm_ascend.utils import vllm_version_is
-
 
 class FusedMoEPrepareAndFinalize(ABC):
     """
@@ -416,12 +414,8 @@ def prepare(self,
         self.enable_shared_expert_dp = enable_shared_expert_dp
 
         if self.moe_config.dp_size > 1:
-            if vllm_version_is("0.10.2"):
-                self.cu_tokens_across_dp_cpu = get_forward_context(
-                ).dp_metadata.cu_tokens_across_dp_cpu
-            else:
-                self.cu_tokens_across_dp_cpu = get_forward_context(
-                ).dp_metadata.cu_tokens_across_sp(1)
+            self.cu_tokens_across_dp_cpu = get_forward_context(
+            ).dp_metadata.cu_tokens_across_sp(1)
             hidden_states = self._naive_multicast(hidden_states,
                                                   self.cu_tokens_across_dp_cpu)
             if rm_router_logits:
diff --git a/vllm_ascend/patch/worker/patch_common/patch_attention_layer.py b/vllm_ascend/patch/worker/patch_common/patch_attention_layer.py
index 6f4ad36427..b778d8a832 100644
--- a/vllm_ascend/patch/worker/patch_common/patch_attention_layer.py
+++ b/vllm_ascend/patch/worker/patch_common/patch_attention_layer.py
@@ -16,8 +16,6 @@ from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.platforms import current_platform
 
-from vllm_ascend.utils import vllm_version_is
-
 
 class AscendAttention(Attention, nn.Module, AttentionLayerBase):
     """Attention layer.
@@ -69,12 +67,10 @@ def __init__(
         if cache_config is not None:
             kv_cache_dtype = cache_config.cache_dtype
             block_size = cache_config.block_size
-            is_attention_free = cache_config.is_attention_free
             calculate_kv_scales = cache_config.calculate_kv_scales
         else:
             kv_cache_dtype = "auto"
             block_size = 16
-            is_attention_free = False
             calculate_kv_scales = False
         if num_kv_heads is None:
             num_kv_heads = num_heads
@@ -135,23 +131,13 @@ def __init__(
         # weight and activation dtype.
         dtype = torch.get_default_dtype()
         if attn_backend is None:
-            if vllm_version_is("0.10.2"):
-                self.attn_backend = get_attn_backend(head_size,
-                                                     dtype,
-                                                     kv_cache_dtype,
-                                                     block_size,
-                                                     is_attention_free,
-                                                     use_mla=use_mla,
-                                                     use_sfa=use_sfa,
-                                                     has_sink=self.has_sink)
-            else:
-                self.attn_backend = get_attn_backend(head_size,
-                                                     dtype,
-                                                     kv_cache_dtype,
-                                                     block_size,
-                                                     use_mla=use_mla,
-                                                     use_sfa=use_sfa,
-                                                     has_sink=self.has_sink)
+            self.attn_backend = get_attn_backend(head_size,
+                                                 dtype,
+                                                 kv_cache_dtype,
+                                                 block_size,
+                                                 use_mla=use_mla,
+                                                 use_sfa=use_sfa,
+                                                 has_sink=self.has_sink)
         else:
             self.attn_backend = attn_backend
diff --git a/vllm_ascend/patch/worker/patch_common/patch_attention_selector.py b/vllm_ascend/patch/worker/patch_common/patch_attention_selector.py
index 793fef1859..b456e66301 100644
--- a/vllm_ascend/patch/worker/patch_common/patch_attention_selector.py
+++ b/vllm_ascend/patch/worker/patch_common/patch_attention_selector.py
@@ -27,154 +27,72 @@ from vllm.platforms import _Backend, current_platform
 from vllm.utils import resolve_obj_by_qualname
 
-from vllm_ascend.utils import vllm_version_is
-
-if vllm_version_is("0.10.2"):
+def get_attn_backend(  # type: ignore[misc]
+    head_size: int,
+    dtype: torch.dtype,
+    kv_cache_dtype: Optional[str],
+    block_size: int,
+    use_mla: bool = False,
+    use_sfa: bool = False,
+    has_sink: bool = False,
+) -> type[AttentionBackend]:
+    """Selects which attention backend to use and lazily imports it."""
+    # Accessing envs.* behind an @lru_cache decorator can cause the wrong
+    # value to be returned from the cache if the value changes between calls.
+    # To avoid this, we read envs.VLLM_USE_V1 here and pass it explicitly to the
+    # private function.
+    return _cached_get_attn_backend(
+        head_size=head_size,
+        dtype=dtype,
+        kv_cache_dtype=kv_cache_dtype,
+        block_size=block_size,
+        use_v1=envs.VLLM_USE_V1,
+        use_mla=use_mla,
+        use_sfa=use_sfa,
+        has_sink=has_sink,
+    )
 
-    def get_attn_backend(
-        head_size: int,
-        dtype: torch.dtype,
-        kv_cache_dtype: Optional[str],
-        block_size: int,
-        is_attention_free: bool = False,
-        use_mla: bool = False,
-        use_sfa: bool = False,
-        has_sink: bool = False,
-    ) -> type[AttentionBackend]:
-        """Selects which attention backend to use and lazily imports it."""
-        # Accessing envs.* behind an @lru_cache decorator can cause the wrong
-        # value to be returned from the cache if the value changes between calls.
-        # To avoid this, we read envs.VLLM_USE_V1 here and pass it explicitly to the
-        # private function.
-        return _cached_get_attn_backend(
-            head_size=head_size,
-            dtype=dtype,
-            kv_cache_dtype=kv_cache_dtype,
-            block_size=block_size,
-            is_attention_free=is_attention_free,
-            use_v1=envs.VLLM_USE_V1,
-            use_mla=use_mla,
-            use_sfa=use_sfa,
-            has_sink=has_sink,
-        )
-
-    @cache
-    def _cached_get_attn_backend(
-        head_size: int,
-        dtype: torch.dtype,
-        kv_cache_dtype: Optional[str],
-        block_size: int,
-        is_attention_free: bool,
-        use_v1: bool = False,
-        use_mla: bool = False,
-        use_sfa: bool = False,
-        has_sink: bool = False,
-    ) -> type[AttentionBackend]:
-        # If there are no attention layers (e.g. we are running Mamba),
-        # use the placeholder NO_ATTENTION
-        if is_attention_free:
-            from vllm.attention.backends.placeholder_attn import \
-                PlaceholderAttentionBackend
-            return PlaceholderAttentionBackend
+@cache
+def _cached_get_attn_backend(
+    head_size: int,
+    dtype: torch.dtype,
+    kv_cache_dtype: Optional[str],
+    block_size: int,
+    use_v1: bool = False,
+    use_mla: bool = False,
+    use_sfa: bool = False,
+    has_sink: bool = False,
+) -> type[AttentionBackend]:
+    # Check whether a particular choice of backend was
+    # previously forced.
+    #
+    # THIS SELECTION OVERRIDES THE VLLM_ATTENTION_BACKEND
+    # ENVIRONMENT VARIABLE.
+    selected_backend = None
+    backend_by_global_setting: Optional[_Backend] = (
+        get_global_forced_attn_backend())
+    if backend_by_global_setting is not None:
+        selected_backend = backend_by_global_setting
+    else:
+        # Check the environment variable and override if specified
+        backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND
+        if backend_by_env_var is not None:
+            selected_backend = backend_name_to_enum(backend_by_env_var)
+            if selected_backend is None:
+                raise ValueError(
+                    f"Invalid attention backend: '{backend_by_env_var}'. "
+                    f"Valid backends are: {list(_Backend.__members__.keys())}")
 
-        # Check whether a particular choice of backend was
-        # previously forced.
-        #
-        # THIS SELECTION OVERRIDES THE VLLM_ATTENTION_BACKEND
-        # ENVIRONMENT VARIABLE.
-        selected_backend = None
-        backend_by_global_setting: Optional[_Backend] = (
-            get_global_forced_attn_backend())
-        if backend_by_global_setting is not None:
-            selected_backend = backend_by_global_setting
-        else:
-            # Check the environment variable and override if specified
-            backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND
-            if backend_by_env_var is not None:
-                selected_backend = backend_name_to_enum(backend_by_env_var)
-                if selected_backend is None:
-                    raise ValueError(
-                        f"Invalid attention backend: '{backend_by_env_var}'. "
-                        f"Valid backends are: {list(_Backend.__members__.keys())}"
-                    )
-
-        # get device-specific attn_backend
-        attention_cls = current_platform.get_attn_backend_cls(
-            selected_backend, head_size, dtype, kv_cache_dtype, block_size,
-            use_v1, use_mla, use_sfa, has_sink)
-        if not attention_cls:
-            raise ValueError(
-                f"Invalid attention backend for {current_platform.device_name}"
-            )
-        return resolve_obj_by_qualname(attention_cls)
-else:
-
-    def get_attn_backend(  # type: ignore[misc]
-        head_size: int,
-        dtype: torch.dtype,
-        kv_cache_dtype: Optional[str],
-        block_size: int,
-        use_mla: bool = False,
-        use_sfa: bool = False,
-        has_sink: bool = False,
-    ) -> type[AttentionBackend]:
-        """Selects which attention backend to use and lazily imports it."""
-        # Accessing envs.* behind an @lru_cache decorator can cause the wrong
-        # value to be returned from the cache if the value changes between calls.
-        # To avoid this, we read envs.VLLM_USE_V1 here and pass it explicitly to the
-        # private function.
-        return _cached_get_attn_backend(
-            head_size=head_size,
-            dtype=dtype,
-            kv_cache_dtype=kv_cache_dtype,
-            block_size=block_size,
-            use_v1=envs.VLLM_USE_V1,
-            use_mla=use_mla,
-            use_sfa=use_sfa,
-            has_sink=has_sink,
-        )
-
-    @cache
-    def _cached_get_attn_backend(
-        head_size: int,
-        dtype: torch.dtype,
-        kv_cache_dtype: Optional[str],
-        block_size: int,
-        use_v1: bool = False,
-        use_mla: bool = False,
-        use_sfa: bool = False,
-        has_sink: bool = False,
-    ) -> type[AttentionBackend]:
-        # Check whether a particular choice of backend was
-        # previously forced.
-        #
-        # THIS SELECTION OVERRIDES THE VLLM_ATTENTION_BACKEND
-        # ENVIRONMENT VARIABLE.
-        selected_backend = None
-        backend_by_global_setting: Optional[_Backend] = (
-            get_global_forced_attn_backend())
-        if backend_by_global_setting is not None:
-            selected_backend = backend_by_global_setting
-        else:
-            # Check the environment variable and override if specified
-            backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND
-            if backend_by_env_var is not None:
-                selected_backend = backend_name_to_enum(backend_by_env_var)
-                if selected_backend is None:
-                    raise ValueError(
-                        f"Invalid attention backend: '{backend_by_env_var}'. "
-                        f"Valid backends are: {list(_Backend.__members__.keys())}"
-                    )
-
-        # get device-specific attn_backend
-        attention_cls = current_platform.get_attn_backend_cls(
-            selected_backend, head_size, dtype, kv_cache_dtype, block_size,
-            use_v1, use_mla, use_sfa, has_sink)
-        if not attention_cls:
-            raise ValueError(
-                f"Invalid attention backend for {current_platform.device_name}"
-            )
-        return resolve_obj_by_qualname(attention_cls)
+    # get device-specific attn_backend
+    attention_cls = current_platform.get_attn_backend_cls(
+        selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1,
+        use_mla, use_sfa, has_sink)
+    if not attention_cls:
+        raise ValueError(
+            f"Invalid attention backend for {current_platform.device_name}")
+    return resolve_obj_by_qualname(attention_cls)
 
 
 vllm.attention.get_attn_backend = get_attn_backend
diff --git a/vllm_ascend/patch/worker/patch_common/patch_weight_loader.py b/vllm_ascend/patch/worker/patch_common/patch_weight_loader.py
index 10705d3d81..ec3da9d714 100644
--- a/vllm_ascend/patch/worker/patch_common/patch_weight_loader.py
+++ b/vllm_ascend/patch/worker/patch_common/patch_weight_loader.py
@@ -1,11 +1,10 @@
 import torch
 from torch.nn.parameter import Parameter
 from vllm.logger import init_logger
+from vllm.model_executor.layers.linear import UnquantizedLinearMethod
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.utils import GiB_bytes
 
-from vllm_ascend.utils import vllm_version_is
-
 logger = init_logger(__name__)
 
@@ -39,6 +38,4 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int,
 
     set_weight_attrs(weight, extra_weight_attrs)
 
-if not vllm_version_is("0.10.2"):
-    from vllm.model_executor.layers.linear import UnquantizedLinearMethod
-    UnquantizedLinearMethod.create_weights = create_weights
+UnquantizedLinearMethod.create_weights = create_weights
diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py
index f1581df068..a85895a16c 100644
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -32,7 +32,7 @@ from vllm_ascend.torchair.utils import (check_torchair_cache_exist,
                                         delete_torchair_cache_file)
 from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, is_310p,
-                               update_aclgraph_sizes, vllm_version_is)
+                               update_aclgraph_sizes)
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig, VllmConfig
@@ -131,10 +131,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         cache_config = vllm_config.cache_config
         scheduler_config = vllm_config.scheduler_config
         ascend_scheduler_config = ascend_config.ascend_scheduler_config
-        if vllm_version_is("0.10.2"):
-            structured_outputs_config = vllm_config.decoding_config
-        else:
-            structured_outputs_config = vllm_config.structured_outputs_config
+        structured_outputs_config = vllm_config.structured_outputs_config
 
         if (model_config is not None and not model_config.use_mla
                 and not scheduler_config.async_scheduling):
@@ -212,9 +209,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
             vllm_config._set_cudagraph_sizes()
 
         # TODO: Full graph is fully supported later, and the default value will be set to full graph.
-        if not vllm_version_is("0.10.2"):
-            if compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE:
-                compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
+        if compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE:
+            compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
 
         if compilation_config.cudagraph_mode == CUDAGraphMode.NONE:
             compilation_config.level = CompilationLevel.NO_COMPILATION
diff --git a/vllm_ascend/sample/sampler.py b/vllm_ascend/sample/sampler.py
index 9cceda6c96..37abdd4965 100644
--- a/vllm_ascend/sample/sampler.py
+++ b/vllm_ascend/sample/sampler.py
@@ -3,13 +3,9 @@ from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler, random_sample
 from vllm.v1.sample.sampler import Sampler
 
-from vllm_ascend.utils import is_310p, vllm_version_is
+from vllm_ascend.utils import is_310p
 
-if vllm_version_is("0.10.2"):
-    from vllm.config import LogprobsMode
-    DEFAULT_LOGPROBS_MODE = LogprobsMode.RAW_LOGPROBS
-else:
-    DEFAULT_LOGPROBS_MODE = "raw_logprobs"
+DEFAULT_LOGPROBS_MODE = "raw_logprobs"
 
 
 class AscendSampler(Sampler):
@@ -69,18 +65,10 @@ def forward_native(self, logits, generators, k, p):
         """Override pytorch native implementation to torch_npu"""
         logits = self._apply_top_k_top_p(logits, k, p)
         logits_to_return = None
-        if vllm_version_is("0.10.2"):
-            if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS:
-                logits_to_return = logits
-            elif self.logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS:
-                logits_to_return = logits.log_softmax(dim=-1,
-                                                      dtype=torch.float32)
-        else:
-            if self.logprobs_mode == "processed_logits":
-                logits_to_return = logits
-            elif self.logprobs_mode == "processed_logprobs":
-                logits_to_return = logits.log_softmax(dim=-1,
-                                                      dtype=torch.float32)
+        if self.logprobs_mode == "processed_logits":
+            logits_to_return = logits
+        elif self.logprobs_mode == "processed_logprobs":
+            logits_to_return = logits.log_softmax(dim=-1, dtype=torch.float32)
 
         probs = logits.softmax(dim=-1, dtype=torch.float32)
         return random_sample(probs, generators), logits_to_return
diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py
index d14dc6d2a4..7959a71c28 100644
--- a/vllm_ascend/spec_decode/eagle_proposer.py
+++ b/vllm_ascend/spec_decode/eagle_proposer.py
@@ -21,7 +21,6 @@ from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
 from vllm_ascend.spec_decode.interface import Proposer, SpecDcodeType
-from vllm_ascend.utils import vllm_version_is
 
 PADDING_SLOT_ID = -1
 
@@ -352,10 +351,7 @@ def _get_eagle_atten_dict(
             decode_token_per_req=self.runner.decode_token_per_req,
             num_computed_tokens_cpu=None,
             seq_lens=None)
-        if vllm_version_is("0.10.2"):
-            builder = self.runner.attn_groups[0][0].metadata_builder
-        else:
-            builder = self.runner.attn_groups[0][0].get_metadata_builder()
+        builder = self.runner.attn_groups[0][0].get_metadata_builder()
         attn_metadata_i = builder.build(0, common_attn_metadata,
                                         self.runner.get_model())
         for layer_name in kv_cache_group_spec.layer_names:
@@ -447,10 +443,7 @@ def _propose(
             num_computed_tokens_cpu=None,
             seq_lens=None)
         # FIXME(woosuk): The below two ops cause synchronization. Optimize.
-        if vllm_version_is("0.10.2"):
-            builder = self.runner.attn_groups[0][0].metadata_builder
-        else:
-            builder = self.runner.attn_groups[0][0].get_metadata_builder()
+        builder = self.runner.attn_groups[0][0].get_metadata_builder()
         attn_metadata = builder.build(0, common_attn_metadata,
                                       self.runner.get_model())
         if self.use_cuda_graph and \
@@ -479,10 +472,7 @@ def _propose(
             hidden_states=self.hidden_states[:num_input_tokens],
         )
         sample_hidden_states = last_hidden_states[last_token_indices]
-        if vllm_version_is("0.10.2"):
-            logits = self.model.compute_logits(sample_hidden_states, None)
-        else:
-            logits = self.model.compute_logits(sample_hidden_states)
+        logits = self.model.compute_logits(sample_hidden_states)
         draft_token_ids = logits.argmax(dim=-1)
 
         # Early exit if there is only one draft token to be generated.
@@ -586,12 +576,7 @@ def _propose(
             hidden_states=self.hidden_states[:input_batch_size],
         )
         hidden_states = hidden_states[:batch_size]
-        if vllm_version_is("0.10.2"):
-            logits = self.model.compute_logits(
-                last_hidden_states[:batch_size], None)
-        else:
-            logits = self.model.compute_logits(
-                last_hidden_states[:batch_size])
+        logits = self.model.compute_logits(last_hidden_states[:batch_size])
 
         # TODO(wenlong): get more than one token for tree attention
         draft_token_ids = logits.argmax(dim=-1)
diff --git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py
index d0a0d507fa..6889efb45a 100644
--- a/vllm_ascend/spec_decode/mtp_proposer.py
+++ b/vllm_ascend/spec_decode/mtp_proposer.py
@@ -24,8 +24,7 @@
     TorchairDeepSeekMTP
 from vllm_ascend.torchair.utils import (TORCHAIR_CACHE_DIR,
                                         TorchairCommonAttentionMetadata)
-from vllm_ascend.utils import (ProfileExecuteDuration, lmhead_tp_enable,
-                               vllm_version_is)
+from vllm_ascend.utils import ProfileExecuteDuration, lmhead_tp_enable
 
 PADDING_SLOT_ID = -1
 
@@ -400,10 +399,7 @@ def _propose(
             seq_lens=None)
 
         if not self.torchair_graph_enabled:
-            if vllm_version_is("0.10.2"):
-                builder = self.runner.attn_groups[0][0].metadata_builder
-            else:
-                builder = self.runner.attn_groups[0][0].get_metadata_builder()
+            builder = self.runner.attn_groups[0][0].get_metadata_builder()
             attn_metadata_mtp = builder.build(0, common_attn_metadata,
                                               self.runner.get_model())
 
diff --git a/vllm_ascend/torchair/models/qwen3_moe.py b/vllm_ascend/torchair/models/qwen3_moe.py
index c6aad6add4..4f013d2c18 100644
--- a/vllm_ascend/torchair/models/qwen3_moe.py
+++ b/vllm_ascend/torchair/models/qwen3_moe.py
@@ -56,7 +56,6 @@ from vllm_ascend.ops.fused_moe import AscendFusedMoE
 from vllm_ascend.torchair.ops.sequence_parallel import (MetadataForPadding,
                                                         init_metadata_for_sp)
-from vllm_ascend.utils import vllm_version_is
 
 
 class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock):
@@ -312,14 +311,8 @@ def __init__(
                                                 quant_config=quant_config,
                                                 prefix=f"{prefix}.mlp")
             else:
-                if vllm_version_is("0.10.2"):
-                    self.mlp = Qwen3MoeSparseMoeBlock(
-                        config=config,
-                        quant_config=quant_config,
-                        prefix=f"{prefix}.mlp")
-                else:
-                    self.mlp = Qwen3MoeSparseMoeBlock(vllm_config=vllm_config,
-                                                      prefix=f"{prefix}.mlp")
+                self.mlp = Qwen3MoeSparseMoeBlock(vllm_config=vllm_config,
+                                                  prefix=f"{prefix}.mlp")
         else:
             self.mlp = Qwen3MoeMLP(hidden_size=config.hidden_size,
                                    intermediate_size=config.intermediate_size,
diff --git a/vllm_ascend/torchair/ops/torchair_fused_moe.py b/vllm_ascend/torchair/ops/torchair_fused_moe.py
index bd25a79562..d833341ac5 100644
--- a/vllm_ascend/torchair/ops/torchair_fused_moe.py
+++ b/vllm_ascend/torchair/ops/torchair_fused_moe.py
@@ -50,8 +50,7 @@ from vllm_ascend.utils import (AscendSocVersion, dispose_tensor,
                                get_all_reduce_merge_state,
                                get_ascend_soc_version,
-                               get_rm_router_logits_state, is_310p,
-                               vllm_version_is)
+                               get_rm_router_logits_state, is_310p)
 
 
 def torchair_fused_experts_with_mc2(
@@ -1061,26 +1060,14 @@ def __init__(
         if self.scoring_func != "softmax" and not self.use_grouped_topk:
             raise ValueError("Only softmax scoring function is supported for "
                              "non-grouped topk.")
-
-        if vllm_version_is("0.10.2"):
-            self.moe = FusedMoEConfig.make(
-                num_experts=self.global_num_experts,
-                experts_per_token=top_k,
-                hidden_dim=hidden_size,
-                num_local_experts=self.local_num_experts,
-                moe_parallel_config=self.moe_parallel_config,
-                # TODO (bnell): this needs to be fixed for quantized types.
-                in_dtype=params_dtype,
-                quant_config=quant_config)
-        else:
-            self.moe = FusedMoEConfig(
-                num_experts=self.global_num_experts,
-                experts_per_token=top_k,
-                hidden_dim=hidden_size,
-                num_local_experts=self.local_num_experts,
-                moe_parallel_config=self.moe_parallel_config,
-                in_dtype=params_dtype,
-            )
+        self.moe = FusedMoEConfig(
+            num_experts=self.global_num_experts,
+            experts_per_token=top_k,
+            hidden_dim=hidden_size,
+            num_local_experts=self.local_num_experts,
+            moe_parallel_config=self.moe_parallel_config,
+            in_dtype=params_dtype,
+        )
         if quant_config is None:
             self.quant_method = TorchairAscendUnquantizedFusedMoEMethod(
                 self.moe)
@@ -1242,12 +1229,8 @@ def forward(self,
                 router_logits = get_dp_group().all_gather(router_logits, 0)
 
         elif fused_moe_state == FusedMoEState.NaiveMulticast:
-            if vllm_version_is("0.10.2"):
-                cu_tokens_across_dp_cpu = get_forward_context(
-                ).dp_metadata.cu_tokens_across_dp_cpu
-            else:
-                cu_tokens_across_dp_cpu = get_forward_context(
-                ).dp_metadata.cu_tokens_across_sp(1)
+            cu_tokens_across_dp_cpu = get_forward_context(
+            ).dp_metadata.cu_tokens_across_sp(1)
             hidden_states = self.naive_multicast(hidden_states,
                                                  cu_tokens_across_dp_cpu)
             if self.rm_router_logits:
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 08484fe409..3576fc5d21 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -78,10 +78,12 @@
 # yapf: disable
 from vllm.v1.kv_cache_interface import (AttentionSpec, FullAttentionSpec,
                                         KVCacheConfig, KVCacheGroupSpec,
-                                        KVCacheSpec, MambaSpec)
+                                        KVCacheSpec, MambaSpec,
+                                        UniformTypeKVCacheSpecs)
 # yapf: enable
 from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput,
-                             DraftTokenIds, LogprobsTensors, ModelRunnerOutput)
+                             DraftTokenIds, LogprobsTensors, ModelRunnerOutput,
+                             PoolerOutput)
 from vllm.v1.pool.metadata import PoolingMetadata
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
@@ -121,7 +123,7 @@ from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
                                AscendSocVersion, ProfileExecuteDuration,
                                get_ascend_soc_version, is_310p,
-                               lmhead_tp_enable, vllm_version_is)
+                               lmhead_tp_enable)
 from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch
 
 if TYPE_CHECKING:
@@ -143,13 +145,6 @@ else:
     ACL_FORMAT = ACL_FORMAT_FRACTAL_ND
 
-if not vllm_version_is("0.10.2"):
-    from vllm.v1.kv_cache_interface import UniformTypeKVCacheSpecs
-    from vllm.v1.outputs import PoolerOutput
-else:
-    from vllm.sequence import PoolerOutput
-    UniformTypeKVCacheSpecs = None
-
 
 @dataclass
 class GraphCaptureContext:
@@ -308,23 +303,13 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
                                          dtype=self.dtype,
                                          device=self.device)
         # Set up Attention
-        if vllm_version_is("0.10.2"):
-            self.attn_backend = get_attn_backend(
-                0,
-                self.dtype,
-                None,
-                self.block_size,
-                self.model_config.is_attention_free,
-                use_mla=self.model_config.use_mla,
-                use_sfa=self.ascend_config.use_sfa)
-        else:
-            self.attn_backend = get_attn_backend(
-                0,
-                self.dtype,
-                None,
-                self.block_size,
-                use_mla=self.model_config.use_mla,
-                use_sfa=self.ascend_config.use_sfa)
+        self.attn_backend = get_attn_backend(
+            0,
+            self.dtype,
+            None,
+            self.block_size,
+            use_mla=self.model_config.use_mla,
+            use_sfa=self.ascend_config.use_sfa)
         if torch.version.cann.startswith("8.3"):
             self.attn_mask_builder = AttentionMaskBuilder(
                 self.scheduler_config.max_num_batched_tokens, self.dtype,
@@ -602,12 +587,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
                     to_update.apply(pooling_params)
 
             backward_kwargs = {}
-            if vllm_version_is("0.10.2"):
-                backward_kwargs["mm_kwargs"] = new_req_data.mm_kwargs
-                backward_kwargs["mm_hashes"] = new_req_data.mm_hashes
-                backward_kwargs["mm_positions"] = new_req_data.mm_positions
-            else:
-                backward_kwargs["mm_features"] = new_req_data.mm_features
+            backward_kwargs["mm_features"] = new_req_data.mm_features
 
             self.requests[req_id] = CachedRequestState(
                 req_id=req_id,
@@ -624,10 +604,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
 
             # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
             if self.uses_mrope:
-                if vllm_version_is("0.10.2"):
-                    self._init_mrope_positions_0102(self.requests[req_id])
-                else:
-                    self._init_mrope_positions(self.requests[req_id])
+                self._init_mrope_positions(self.requests[req_id])
 
             req_ids_to_add.append(req_id)
 
@@ -759,39 +736,6 @@ def _init_mrope_positions(self, req_state: CachedRequestState):
             use_audio_in_video=use_audio_in_video,
         )
 
-    def _init_mrope_positions_0102(self, req_state: CachedRequestState):
-        image_grid_thw = []
-        video_grid_thw = []
-        second_per_grid_ts = []
-        audio_feature_lengths = []
-        use_audio_in_video = False
-        assert req_state.mm_kwargs is not None
-        for mm_item in req_state.mm_kwargs:
-            mm_input = mm_item.get_data()
-            if mm_input.get("image_grid_thw") is not None:
-                image_grid_thw.append(mm_input["image_grid_thw"].tolist())
-            if mm_input.get("video_grid_thw") is not None:
-                video_grid_thw.append(mm_input["video_grid_thw"].tolist())
-            if mm_input.get("second_per_grid_ts") is not None:
-                second_per_grid_ts.append(mm_input["second_per_grid_ts"])
-            if mm_input.get("audio_feature_lengths") is not None:
-                audio_feature_lengths.append(mm_input["audio_feature_lengths"])
-            if mm_input.get("use_audio_in_video") is True:
-                use_audio_in_video = True
-
-        hf_config = self.model_config.hf_config
-
-        req_state.mrope_positions, req_state.mrope_position_delta = \
-            MRotaryEmbedding.get_input_positions_tensor(
-                req_state.prompt_token_ids,
-                hf_config=hf_config,
-                image_grid_thw=image_grid_thw,
-                video_grid_thw=video_grid_thw,
-                second_per_grid_ts=second_per_grid_ts,
-                audio_feature_lengths=audio_feature_lengths,
-                use_audio_in_video=use_audio_in_video,
-            )
-
     def _sync_metadata_across_dp(
            self, num_tokens: int, with_prefill: bool, enable_dbo: bool
     ) -> tuple[int, Optional[torch.Tensor], bool, bool]:
@@ -966,12 +910,8 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):
             return
 
         # Batch the multi-modal inputs.
-        if vllm_version_is("0.10.2"):
-            mm_kwargs, mm_hashes_pos = self._batch_mm_kwargs_from_scheduler_0102(
-                scheduler_output)
-        else:
-            mm_kwargs, mm_hashes_pos = self._batch_mm_kwargs_from_scheduler(
-                scheduler_output)
+        mm_kwargs, mm_hashes_pos = self._batch_mm_kwargs_from_scheduler(
+            scheduler_output)
 
         encoder_outputs = []
         for _, num_items, mm_kwargs_group in group_mm_kwargs_by_modality(
@@ -1003,31 +943,6 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):
                 is_embed=pos_info.is_embed,
             )
 
-    # TODO: remove this once we drop support for vLLM 0.10.2
-    def _batch_mm_kwargs_from_scheduler_0102(
-        self,
-        scheduler_output: "SchedulerOutput",
-    ) -> tuple[list[MultiModalKwargsItem], list[tuple[str, PlaceholderRange]]]:
-        scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs
-        if not scheduled_encoder_inputs:
-            return [], []
-        # Batch the multi-modal inputs.
-        mm_kwargs = list[MultiModalKwargsItem]()
-        # list of tuple (mm_hash, position_info)
-        mm_hashes_pos = list[tuple[str, PlaceholderRange]]()
-        for req_id, encoder_input_ids in scheduled_encoder_inputs.items():
-            req_state = self.requests[req_id]
-            assert req_state.mm_hashes is not None
-            assert req_state.mm_kwargs is not None
-            assert req_state.mm_positions is not None
-            for mm_input_id in encoder_input_ids:
-                mm_hash = req_state.mm_hashes[mm_input_id]
-                mm_kwargs.append(req_state.mm_kwargs[mm_input_id])
-                mm_hashes_pos.append(
-                    (mm_hash, req_state.mm_positions[mm_input_id]))
-
-        return mm_kwargs, mm_hashes_pos
-
     def _batch_mm_kwargs_from_scheduler(
         self,
         scheduler_output: "SchedulerOutput",
@@ -1067,20 +982,11 @@ def _gather_mm_embeddings(
     ) -> list[torch.Tensor]:
 
         def _iter_mm_features(req_state: CachedRequestState):
-            if vllm_version_is("0.10.2"):
-                # legacy path (to be removed later)
-                assert req_state.mm_hashes is not None
-                assert req_state.mm_positions is not None
-                for mm_hash, pos_info in zip(req_state.mm_hashes,
-                                             req_state.mm_positions):
-                    yield mm_hash, pos_info, getattr(pos_info, "is_embed",
-                                                     None)
-            else:
-                assert req_state.mm_features is not None
-                for mm_feature in req_state.mm_features:
-                    pos_info = mm_feature.mm_position
-                    yield mm_feature.identifier, pos_info, getattr(
-                        pos_info, "is_embed", None)
+            assert req_state.mm_features is not None
+            for mm_feature in req_state.mm_features:
+                pos_info = mm_feature.mm_position
+                yield mm_feature.identifier, pos_info, getattr(
+                    pos_info, "is_embed", None)
 
         mm_embeds: list[torch.Tensor] = []
 
@@ -1527,10 +1433,7 @@ def _prepare_inputs(
             for attn_group in self.attn_groups[kv_cache_group_id]:
                 common_prefix_len = 0
                 extra_attn_metadata_args = {}
-                if vllm_version_is("0.10.2"):
-                    builder = attn_group.metadata_builder
-                else:
-                    builder = attn_group.get_metadata_builder()
+                builder = attn_group.get_metadata_builder()
                 if isinstance(builder, GDNAttentionMetadataBuilder):
                     if use_spec_decode:
                         extra_attn_metadata_args = dict(
@@ -1809,29 +1712,21 @@ def _pool(
                                           device=hidden_states.device)
         seq_lens_cpu = self.seq_lens_cpu[:self.input_batch.num_reqs]
 
-        if vllm_version_is("0.10.2"):
-            # Pooling models D2H & synchronize occurs in pooler.py:build_output
-            raw_pooler_output = self.model.pooler(
-                hidden_states=hidden_states, pooling_metadata=pooling_metadata)
-        else:
-            model = cast(VllmModelForPooling, self.model)
-            raw_pooler_output = model.pooler(
-                hidden_states=hidden_states,
-                pooling_metadata=pooling_metadata,
-            )
-            raw_pooler_output = json_map_leaves(
-                lambda x: x.to("cpu", non_blocking=True),
-                raw_pooler_output,
-            )
-            torch.npu.synchronize()
+        model = cast(VllmModelForPooling, self.model)
+        raw_pooler_output = model.pooler(
+            hidden_states=hidden_states,
+            pooling_metadata=pooling_metadata,
+        )
+        raw_pooler_output = json_map_leaves(
+            lambda x: x.to("cpu", non_blocking=True),
+            raw_pooler_output,
+        )
+        torch.npu.synchronize()
 
         pooler_output: list[Optional[torch.Tensor]] = []
         for raw_output, seq_len, prompt_len in zip(
                 raw_pooler_output, seq_lens_cpu, pooling_metadata.prompt_lens):
-            if vllm_version_is("0.10.2"):
-                output = raw_output.data if seq_len == prompt_len else None
-            else:
-                output = raw_output if seq_len == prompt_len else None
+            output = raw_output if seq_len == prompt_len else None
             pooler_output.append(output)
 
         return ModelRunnerOutput(
@@ -2006,8 +1901,7 @@ def execute_model(
                     num_scheduled_tokens_np, finished_sending,
                     finished_recving, kv_connector_output)
                 sample_hidden_states = hidden_states[logits_indices]
-                logits = self._compute_logits_wrapper(sample_hidden_states,
-                                                      None)
+                logits = self.model.compute_logits(sample_hidden_states)
                 if broadcast_pp_output:
                     model_output_broadcast_data = {
                         "logits": logits.contiguous(),
@@ -2302,10 +2196,7 @@ def _build_attention_metadata(self, create_mixed_batch, num_reqs,
             )
             for attn_group in self.attn_groups[kv_cache_group_id]:
-                if vllm_version_is("0.10.2"):
-                    builder = attn_group.metadata_builder
-                else:
-                    builder = attn_group.get_metadata_builder()
+                builder = attn_group.get_metadata_builder()
                 attn_metadata_i = builder.build_for_graph_capture(
                     common_attn_metadata)
                 for layer_name in kv_cache_group_spec.layer_names:
@@ -2463,8 +2354,8 @@ def _dummy_run(
                                          dtype=torch.int32)
 
             def dummy_compute_logits(hidden_states):
-                return self._compute_logits_wrapper(
-                    hidden_states[dummy_indices], None)
+                return self.model.compute_logits(
+                    hidden_states[dummy_indices])
 
         with set_ascend_forward_context(
                 attn_metadata,
@@ -2542,18 +2433,13 @@ def profile_run(self) -> None:
             logit_indices = np.cumsum(num_scheduled_tokens) - 1
             # TODO: need to rum a dummy sampler for generate task
             hidden_states = hidden_states[logit_indices]
-            output = self._compute_logits_wrapper(hidden_states, None)
+            output = self.model.compute_logits(hidden_states)
 
         NPUPlatform.synchronize()
         del hidden_states, output
         self.encoder_cache.clear()
         gc.collect()
 
-    def _compute_logits_wrapper(self, hidden_states, sampling_metadata):
-        if vllm_version_is("0.10.2"):
-            return self.model.compute_logits(hidden_states, sampling_metadata)
-        return self.model.compute_logits(hidden_states)
-
     def _dummy_pooler_run_task(
         self,
         hidden_states: torch.Tensor,
@@ -2615,10 +2501,7 @@ def _dummy_pooler_run(
         for task in self.get_supported_pooling_tasks():
             # Run a full batch with each task to ensure none of them OOMs
             output = self._dummy_pooler_run_task(hidden_states, task)
-            if vllm_version_is("0.10.2"):
-                output_size[task] = output.get_data_nbytes()
-            else:
-                output_size[task] = sum(o.nbytes for o in output)
+            output_size[task] = sum(o.nbytes for o in output)
             del output  # Allow GC
 
         max_task = max(output_size.items(), key=lambda x: x[1])[0]
@@ -2657,16 +2540,8 @@ def load_model(self) -> None:
                     self.model.get_eagle3_aux_hidden_state_layers())
 
         if self.lora_config:
-            if vllm_version_is("0.10.2"):
-                self.model = self.load_lora_model(self.model,
-                                                  self.model_config,
-                                                  self.scheduler_config,
-                                                  self.lora_config,
-                                                  self.device)
-            else:
-                self.model = self.load_lora_model(self.model,
-                                                  self.vllm_config,
-                                                  self.device)
+            self.model = self.load_lora_model(self.model, self.vllm_config,
+                                              self.device)
         logger.info("Loading model weights took %.4f GB",
                     m.consumed_memory / float(2**30))
 
@@ -2694,17 +2569,10 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
         self.initialize_attn_backend(kv_cache_config)
         self.use_hybrid_blocks = (len(self.attn_groups) > 1)
         # NOTE: Currently, we determine whether we need `num_accepted_tokens` through `MambaSpec`.
-        if vllm_version_is("0.10.2"):
-            self.need_accepted_tokens = any([
-                isinstance(
-                    self.kv_cache_config.kv_cache_groups[0].kv_cache_spec,
-                    MambaSpec) for attn_group in self.attn_groups
-            ])
-        else:
-            self.need_accepted_tokens = any([
-                isinstance(attn_group[0].kv_cache_spec, MambaSpec)
-                for attn_group in self.attn_groups
-            ])
+        self.need_accepted_tokens = any([
+            isinstance(attn_group[0].kv_cache_spec, MambaSpec)
+            for attn_group in self.attn_groups
+        ])
 
         self.may_reinitialize_input_batch(kv_cache_config)
 
@@ -2737,11 +2605,8 @@ def initialize_kv_cache_tensors_deepseek_sfa(
             kv_cache_sizes[kv_cache_tensor.shared_by[0]] = kv_cache_tensor.size
 
         kv_caches: Dict[str, torch.Tensor] = {}
-        for group in self._kv_cache_spec_attn_group_iterator_dispatcher():
-            if vllm_version_is("0.10.2"):
-                kv_cache_spec, group = group
-            else:
-                kv_cache_spec = group.kv_cache_spec
+        for group in self._kv_cache_spec_attn_group_iterator():
+            kv_cache_spec = group.kv_cache_spec
             attn_backend = group.backend
             for layer_name in group.layer_names:
                 if layer_name in self.runner_only_attn_layers:
@@ -2846,11 +2711,8 @@ def initialize_kv_cache_tensors_deepseek_mla(
             kv_cache_sizes[kv_cache_tensor.shared_by[0]] = kv_cache_tensor.size
 
         kv_caches: Dict[str, torch.Tensor] = {}
-        for group in self._kv_cache_spec_attn_group_iterator_dispatcher():
-            if vllm_version_is("0.10.2"):
-                kv_cache_spec, group = group
-            else:
-                kv_cache_spec = group.kv_cache_spec
+        for group in self._kv_cache_spec_attn_group_iterator():
+            kv_cache_spec = group.kv_cache_spec
             attn_backend = group.backend
             for layer_name in group.layer_names:
                 if layer_name in self.runner_only_attn_layers:
@@ -2996,11 +2858,8 @@ def initialize_kv_cache_tensors(
         )), "Some layers are not correctly initialized"
         kv_caches: Dict[str, torch.Tensor] = {}
-        for group in self._kv_cache_spec_attn_group_iterator_dispatcher():
-            if vllm_version_is("0.10.2"):
-                kv_cache_spec, group = group
-            else:
-                kv_cache_spec = group.kv_cache_spec
+        for group in self._kv_cache_spec_attn_group_iterator():
+            kv_cache_spec = group.kv_cache_spec
             attn_backend = group.backend
             for layer_name in group.layer_names:
                 if layer_name in self.runner_only_attn_layers:
@@ -3211,50 +3070,6 @@ def get_attn_backends_for_group(
                 for k, v in attn_backend_layers.items()
             }
 
-        def get_attn_backends_for_layers(
-                layer_names: list[str]
-        ) -> dict[type[AttentionBackend], list[str]]:
-            """Get attention_backend for all attention layers
-            TODO: Only used in v0.10.2, drop me when 0.10.2 is dropped
-            """
-            layers = get_layers_from_vllm_config(self.vllm_config,
-                                                 AttentionLayerBase,
-                                                 layer_names)
-            attn_backends = {}
-            attn_backend_layers = defaultdict(list)
-            # Dedupe based on full class name; this is a bit safer than
-            # using the class itself as the key because when we create dynamic
-            # attention backend subclasses (e.g. ChunkedLocalAttention) unless
-            # they are cached correctly, there will be different objects per
-            # layer.
-            for layer_name in layer_names:
-                attn_backend = layers[layer_name].get_attn_backend()
-                key = attn_backend.full_cls_name()
-                attn_backends[key] = attn_backend
-                attn_backend_layers[key].append(layer_name)
-            return {
-                attn_backends[k]: v
-                for k, v in attn_backend_layers.items()
-            }
-
-        def create_attn_groups_v0102(
-            attn_backends_map: dict[AttentionBackend, list[str]],
-            kv_cache_spec: KVCacheSpec,
-        ) -> list[AttentionGroup]:
-            attn_groups: list[AttentionGroup] = []
-            for attn_backend, layer_names in attn_backends_map.items():
-                attn_metadata_builder_i = attn_backend.get_builder_cls()(
-                    kv_cache_spec,
-                    layer_names,
-                    self.vllm_config,
-                    self.device,
-                )
-                attn_group = AttentionGroup(attn_backend,
-                                            attn_metadata_builder_i,
-                                            layer_names)
-                attn_groups.append(attn_group)
-            return attn_groups
-
         def create_attn_groups(
             attn_backends_map: dict[AttentionBackend, list[str]],
         ) -> list[AttentionGroup]:
@@ -3274,18 +3089,10 @@ def create_attn_groups(
                 attn_groups.append(attn_group)
             return attn_groups
 
-        if vllm_version_is("0.10.2"):
-            for kv_cache_group_spec in kv_cache_config.kv_cache_groups:
-                kv_cache_spec = kv_cache_group_spec.kv_cache_spec
-                attn_backends = get_attn_backends_for_layers(
-                    kv_cache_group_spec.layer_names)
-                self.attn_groups.append(
-                    create_attn_groups_v0102(attn_backends, kv_cache_spec))
-        else:
-            for kv_cache_group_spec in kv_cache_config.kv_cache_groups:
-                attn_backends = get_attn_backends_for_group(  # type: ignore
-                    kv_cache_group_spec)
-                self.attn_groups.append(create_attn_groups(attn_backends))
+        for kv_cache_group_spec in kv_cache_config.kv_cache_groups:
+            attn_backends = get_attn_backends_for_group(  # type: ignore
+                kv_cache_group_spec)
+            self.attn_groups.append(create_attn_groups(attn_backends))
 
         # Calculate reorder batch threshold (if needed)
         self.calculate_reorder_batch_threshold()
@@ -3299,31 +3106,13 @@ def _kv_cache_spec_attn_group_iterator(self) -> Iterator[AttentionGroup]:
         for attn_groups in self.attn_groups:
             yield from attn_groups
 
-    def _kv_cache_spec_attn_group_iterator_v0102(
-            self) -> Iterator[tuple[KVCacheSpec, AttentionGroup]]:
-        if not self.kv_cache_config.kv_cache_groups:
-            return
-        for kv_cache_spec_id, attn_groups in enumerate(self.attn_groups):
-            for attn_group in attn_groups:
-                yield self.kv_cache_config.kv_cache_groups[
-                    kv_cache_spec_id].kv_cache_spec, attn_group
-
-    def _kv_cache_spec_attn_group_iterator_dispatcher(self):
-        if vllm_version_is("0.10.2"):
-            return self._kv_cache_spec_attn_group_iterator_v0102()
-        else:
-            return self._kv_cache_spec_attn_group_iterator()
-
     def calculate_reorder_batch_threshold(self) -> None:
         """
         Check that if any backends reorder batches; that the reordering
         is compatible (e.g., decode threshold is the same)
         """
         for group in self._attn_group_iterator():
-            if vllm_version_is("0.10.2"):
-                attn_metadata_builder_i = group.metadata_builder
-            else:
-                attn_metadata_builder_i = group.get_metadata_builder()
+            attn_metadata_builder_i = group.get_metadata_builder()
             if hasattr(attn_metadata_builder_i, "reorder_batch_threshold"):
                 # check that if any backends reorder batches; that the reordering
                 # is compatible (e.g., decode threshold is the same)
@@ -3427,10 +3216,7 @@ def initialize_aclgraph_capture(self) -> None:
         min_ag_builder_name = None
 
         for attn_group in self._attn_group_iterator():
-            if vllm_version_is("0.10.2"):
-                builder = attn_group.metadata_builder
-            else:
-                builder = attn_group.get_metadata_builder()
+            builder = attn_group.get_metadata_builder()
             if builder.aclgraph_support.value < min_ag_support.value:
                min_ag_support = builder.aclgraph_support
                min_ag_builder_name = builder.__class__.__name__
@@ -3674,7 +3460,7 @@ def _get_prompt_logprobs_dict(
             req_idx = self.input_batch.req_id_to_index[req_id]
             offset = self.query_start_loc_np[req_idx].item()
             prompt_hidden_states = hidden_states[offset:offset + num_logits]
-            logits = self._compute_logits_wrapper(prompt_hidden_states, None)
+            logits = self.model.compute_logits(prompt_hidden_states)
 
             # Get the "target" tokens for each index. For prompt at index i,
             # the token at prompt index i+1 is the "sampled" token we want
diff --git a/vllm_ascend/worker/npu_input_batch.py b/vllm_ascend/worker/npu_input_batch.py
index d1ebd023c3..9375a4c660 100644
--- a/vllm_ascend/worker/npu_input_batch.py
+++ b/vllm_ascend/worker/npu_input_batch.py
@@ -39,7 +39,6 @@ from vllm.v1.spec_decode.utils import is_spec_decode_unsupported
 from vllm.v1.utils import copy_slice
 
-from vllm_ascend.utils import vllm_version_is
 from vllm_ascend.worker.block_table import MultiGroupBlockTable
 
 
@@ -79,12 +78,6 @@ def num_tokens(self) -> int:
     @deprecated("`mm_inputs` is superseded by `mm_kwargs` and will be "
                 "removed in v0.13. Please use `mm_kwargs` instead.")
     def mm_inputs(self) -> list[MultiModalKwargsItems]:
-        if vllm_version_is("0.10.2"):
-            assert self.mm_kwargs is not None
-            return [
-                MultiModalKwargsItems.from_seq([item])
-                for item in self.mm_kwargs
-            ]
         assert self.mm_features is not None
         return [
             MultiModalKwargsItems.from_seq([f.data]) for f in self.mm_features