diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index 015dd90b7c0..1cca778321c 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -17,7 +17,6 @@ from dataclasses import dataclass from enum import Enum -from typing import ClassVar import torch import torch_npu @@ -213,7 +212,7 @@ class AscendAttentionMetadataBuilder(AttentionMetadataBuilder[AscendMetadata]): # Does this backend/builder reorder the batch? # If not, set this to None. Otherwise set it to the query # length that will be pulled into the front of the batch. - reorder_batch_threshold: ClassVar[int] = 1 + reorder_batch_threshold: int = 1 def __init__( self, @@ -242,7 +241,7 @@ def __init__( got {self.decode_threshold}" ) - AscendAttentionMetadataBuilder.reorder_batch_threshold = self.decode_threshold + self.reorder_batch_threshold = self.decode_threshold scheduler_config = vllm_config.scheduler_config self.chunked_prefill_enabled = scheduler_config.enable_chunked_prefill diff --git a/vllm_ascend/attention/context_parallel/attention_cp.py b/vllm_ascend/attention/context_parallel/attention_cp.py index af23ae90b9d..f2d3961db5c 100644 --- a/vllm_ascend/attention/context_parallel/attention_cp.py +++ b/vllm_ascend/attention/context_parallel/attention_cp.py @@ -15,8 +15,6 @@ # This file is a part of the vllm-ascend project. # -from typing import ClassVar - import numpy as np import torch import torch.distributed as dist @@ -61,11 +59,6 @@ class AscendAttentionCPMetadataBuilder(AscendAttentionMetadataBuilder): Extends AscendAttentionMetadataBuilder with PCP/DCP metadata handling. """ - # Does this backend/builder reorder the batch? - # If not, set this to None. Otherwise set it to the query - # length that will be pulled into the front of the batch. - reorder_batch_threshold: ClassVar[int] = 1 - def __init__( self, kv_cache_spec: AttentionSpec,