Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions tests/ut/test_platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,7 @@ def test_check_and_update_config_cache_config_block_size(
vllm_config = TestNPUPlatform.mock_vllm_config()
vllm_config.cache_config.block_size = None
vllm_config.cache_config.enable_prefix_caching = True
vllm_config.cache_config.user_specified_block_size = False
vllm_config.parallel_config.decode_context_parallel_size = 1
vllm_config.parallel_config.prefill_context_parallel_size = 1
vllm_config.parallel_config.tensor_parallel_size = 1
Expand Down
14 changes: 3 additions & 11 deletions vllm_ascend/platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,17 +183,9 @@

@classmethod
def update_block_size_for_backend(cls, vllm_config: VllmConfig) -> None:
cache_config = vllm_config.cache_config
if cache_config.user_specified_block_size:
# User specified --block-size; keep it.
return
model_config = vllm_config.model_config
if model_config is not None and model_config.is_hybrid:
# Hybrid attention+mamba models rely on the model-specific sizing
# logic rather than the generic platform default.
return

super().update_block_size_for_backend(vllm_config)
# TODO: NPU still sets block_size in check_and_update_config.
# Move that logic here so block_size is chosen by the backend.
pass
Comment on lines +227 to +229
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The update_block_size_for_backend method has been refactored to a pass statement with a TODO. While the is_hybrid model logic has been correctly moved to refresh_block_size (which is called by check_and_update_config), the critical check for cache_config.user_specified_block_size has been removed from this call path. This omission means that user-defined block sizes might be unintentionally overridden, leading to unexpected behavior. The TODO also highlights that the block size selection logic is not yet fully centralized, indicating an incomplete refactoring.


@classmethod
def set_device(cls, device: torch.device):
Expand All @@ -208,7 +200,7 @@

# initialize ascend config from vllm additional_config
cls._fix_incompatible_config(vllm_config)

Check failure on line 203 in vllm_ascend/platform.py

View workflow job for this annotation

GitHub Actions / lint / pre-commit

Ruff (F811)

vllm_ascend/platform.py:203:9: F811 Redefinition of unused `apply_config_platform_defaults` from line 193
ascend_config = init_ascend_config(vllm_config)

if vllm_config.kv_transfer_config is not None:
Expand Down
20 changes: 14 additions & 6 deletions vllm_ascend/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1091,18 +1091,26 @@ def refresh_block_size(vllm_config):
if not cache_config:
return

if cache_config.user_specified_block_size:
# User specified --block-size; keep it.
if cache_config.block_size != 128:
logger.warning(
"The user specified --block-size and the value is not 128, which can lead to performance degradation"
)
return

if cache_config.block_size is None:
cache_config.block_size = 128

if not scheduler_config or not model_config:
return

# TODO(MengqingCao): Remove the model_type check, after resolving the hidden error in get_kv_cache_groups.
if (
"qwen3_next" not in model_config.hf_text_config.model_type
and "qwen3_5" not in model_config.hf_text_config.model_type
and cache_config.block_size != 128
):
if model_config.is_hybrid:
# Hybrid attention+mamba models rely on the model-specific sizing
# logic rather than the generic platform default.
return
Comment on lines +1100 to +1103
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The refresh_block_size function now correctly handles hybrid models. However, it currently does not respect cache_config.user_specified_block_size. If a user explicitly sets --block-size, this function might still override it to 128 if enable_prefix_caching or enable_chunked_prefill is enabled. This can lead to unexpected behavior and should be addressed to ensure user configuration is prioritized. Please reintroduce the check for user_specified_block_size before applying default logic.

    if cache_config.user_specified_block_size:
        # User specified --block-size; keep it.
        return

    if model_config.is_hybrid:
        # Hybrid attention+mamba models rely on the model-specific sizing
        # logic rather than the generic platform default.
        return


if cache_config.block_size != 128:
if cache_config.enable_prefix_caching or scheduler_config.enable_chunked_prefill:
logger.info("Block size is set to 128 if prefix cache or chunked prefill is enabled.")
cache_config.block_size = 128
Expand Down
Loading