5 files changed, +8 −10 lines changed

File 1 of 5:
 import torch
 import torch_npu
-from vllm.config import VllmConfig
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionLayer, AttentionType)
 from vllm.attention.backends.utils import CommonAttentionState
-from vllm.config import get_current_vllm_config
+from vllm.config import VllmConfig, get_current_vllm_config
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.utils import direct_register_custom_op
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.worker.gpu_input_batch import InputBatch

+from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
 from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig
 from vllm_ascend.ops.attention import vanilla_chunked_prefill
 from vllm_ascend.utils import get_graph_params
-from vllm_ascend.attention.utils import AscendCommonAttentionMetadata


 class AscendAttentionBackend(AttentionBackend):
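This hunk only consolidates and reorders imports: the two separate from vllm.config statements are merged into one, and the AscendCommonAttentionMetadata import moves into its alphabetical slot in the vllm_ascend group. As a hedged illustration of why the merge is behavior-preserving, here is a minimal sketch of the usual vllm pattern in which both imported names appear side by side (the class and attribute access below are assumptions for illustration, not code from this PR):

from vllm.config import VllmConfig, get_current_vllm_config


class ExampleAttentionImpl:  # hypothetical class, not from this PR
    def __init__(self) -> None:
        # get_current_vllm_config() returns the VllmConfig that vllm
        # installs while the model is being constructed, so the two
        # names are naturally used together and belong in one import.
        vllm_config: VllmConfig = get_current_vllm_config()
        self.block_size = vllm_config.cache_config.block_size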
File 2 of 5:

 import numpy as np
 import torch
 import torch_npu
-from vllm_ascend import envs
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer,
                                               AttentionMetadata,
                                               MLAAttentionImpl)
 from vllm.attention.backends.utils import PAD_SLOT_ID
-from vllm.config import get_current_vllm_config, VllmConfig
+from vllm.config import VllmConfig, get_current_vllm_config
 from vllm.model_executor.layers.linear import (LinearBase,
                                                UnquantizedLinearMethod)
 from vllm.utils import cdiv, round_down

-from vllm_ascend.attention.utils import (AscendCommonAttentionMetadata,split_decodes_and_prefills)
-
+from vllm_ascend import envs
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
+from vllm_ascend.attention.utils import (AscendCommonAttentionMetadata,
+                                         split_decodes_and_prefills)
 from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig
 from vllm_ascend.multistream.context import get_multistream_comm_context
 from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn
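Besides fixing ordering, the rewritten import wraps the long (AscendCommonAttentionMetadata, split_decodes_and_prefills) tuple across two lines in the parenthesized continuation style used elsewhere in the file. The signature of split_decodes_and_prefills is not shown in this diff; the sketch below assumes it mirrors the upstream vllm helper of the same name, which partitions a batch into decode and prefill requests, so the argument and return names are assumptions:

from vllm_ascend.attention.utils import (AscendCommonAttentionMetadata,
                                         split_decodes_and_prefills)


def plan_batch(metadata: AscendCommonAttentionMetadata) -> None:
    # Hypothetical call shape, mirroring the upstream vllm helper of the
    # same name; the vllm_ascend signature may differ.
    num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = \
        split_decodes_and_prefills(metadata)
    print(num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens)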
File 3 of 5:

 from dataclasses import dataclass
-
 from typing import Any, Optional

 import torch
77
from vllm_ascend .ascend_forward_context import set_ascend_forward_context
78
78
from vllm_ascend .attention .attention import AttentionMaskBuilder
79
79
from vllm_ascend .attention .attention_v1 import AscendAttentionState
80
+ from vllm_ascend .attention .utils import AscendCommonAttentionMetadata
80
81
from vllm_ascend .distributed .utils import is_lmhead_tp
81
82
from vllm_ascend .eplb .adaptor .vllm_adaptor import VllmEplbAdaptor
82
83
from vllm_ascend .eplb .eplb_updator import EplbUpdator
87
88
check_torchair_cache_exist ,
88
89
write_kv_cache_bytes_to_file )
89
90
from vllm_ascend .worker .mtp_proposer_v1 import MtpProposer
90
- from vllm_ascend .attention .utils import AscendCommonAttentionMetadata
91
91
92
92
if TYPE_CHECKING :
93
93
import xgrammar as xgr # type: ignore[import-untyped]
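The if TYPE_CHECKING: guard visible at the end of this hunk is the standard idiom for type-only imports: xgrammar is available to the type checker without becoming a hard runtime dependency. A minimal self-contained sketch of the pattern (the function below is hypothetical, for illustration only):

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Evaluated only by static type checkers, never at runtime, so
    # xgrammar stays an optional dependency.
    import xgrammar as xgr  # type: ignore[import-untyped]


def fill_bitmask(matcher: "xgr.GrammarMatcher") -> None:
    # Hypothetical function. The annotation must be a string (or be
    # deferred via `from __future__ import annotations`) because the
    # xgr name does not exist at runtime.
    ...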
File 5 of 5:

 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.ascend_forward_context import set_ascend_forward_context
+from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
 from vllm_ascend.distributed.utils import is_lmhead_tp
 from vllm_ascend.models.deepseek_mtp import CustomDeepSeekMTP
 from vllm_ascend.utils import ProfileExecuteDuration


 # FIXME(woosuk): The logic here is duplicated with the main sampling code.
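Every hunk in this PR is the kind of reordering an import sorter produces: stdlib, third-party, and first-party vllm_ascend blocks, alphabetized within each group. Assuming the project uses isort (a guess; the tool is not named anywhere in this diff, and the file paths below are likewise guesses based on the modules imported above), the result can be verified with isort's Python API:

import isort

# Paths are assumptions inferred from the modules these diffs import.
for path in (
    "vllm_ascend/attention/attention_v1.py",
    "vllm_ascend/attention/mla_v1.py",
    "vllm_ascend/attention/utils.py",
):
    # isort.check_file returns True when the file's imports are already
    # sorted; show_diff prints the reordering it would otherwise apply.
    if not isort.check_file(path, show_diff=True):
        print(f"{path} needs sorting")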