diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 0034a65375e..4efa197247a 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -224,17 +224,9 @@ def inference_mode(cls): @classmethod def update_block_size_for_backend(cls, vllm_config: VllmConfig) -> None: - cache_config = vllm_config.cache_config - if cache_config.user_specified_block_size: - # User specified --block-size; keep it. - return - model_config = vllm_config.model_config - if model_config is not None and model_config.is_hybrid: - # Hybrid attention+mamba models rely on the model-specific sizing - # logic rather than the generic platform default. - return - - super().update_block_size_for_backend(vllm_config) + # TODO: NPU still sets block_size in check_and_update_config. + # Move that logic here so block_size is chosen by the backend. + pass @classmethod def set_device(cls, device: torch.device): diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index 4db163c7e0b..8a1ff2202d2 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -1097,12 +1097,12 @@ def refresh_block_size(vllm_config): if not scheduler_config or not model_config: return - # TODO(MengqingCao): Remove the model_type check, after resolving the hidden error in get_kv_cache_groups. - if ( - "qwen3_next" not in model_config.hf_text_config.model_type - and "qwen3_5" not in model_config.hf_text_config.model_type - and cache_config.block_size != 128 - ): + if model_config.is_hybrid: + # Hybrid attention+mamba models rely on the model-specific sizing + # logic rather than the generic platform default. + return + + if cache_config.block_size != 128: if cache_config.enable_prefix_caching or scheduler_config.enable_chunked_prefill: logger.info("Block size is set to 128 if prefix cache or chunked prefill is enabled.") cache_config.block_size = 128