Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion vllm_ascend/attention/attention_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ def build(
)

block_table = common_attn_metadata.block_table_tensor
seq_lens = common_attn_metadata.seq_lens_cpu[:num_reqs]
seq_lens = common_attn_metadata.seq_lens[:num_reqs]

slot_mapping = common_attn_metadata.slot_mapping[:num_actual_tokens]
# This slot_mapping override doesn't work since vllm will override it again. We should fix it in vllm.
Expand Down
4 changes: 2 additions & 2 deletions vllm_ascend/attention/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,8 +169,8 @@
query_start_loc=self.query_start_loc[: num_actual_reqs + 1],
query_start_loc_cpu=self.query_start_loc_cpu[: num_actual_reqs + 1],
seq_lens=self.seq_lens[:num_actual_reqs],
seq_lens_cpu=self.seq_lens_cpu[:num_actual_reqs],
num_computed_tokens_cpu=self.num_computed_tokens_cpu[:num_actual_reqs],
seq_lens_cpu=self.seq_lens_cpu[:num_actual_reqs] if self.seq_lens_cpu is not None else None,
num_computed_tokens_cpu=self.num_computed_tokens_cpu[:num_actual_reqs] if self.num_computed_tokens_cpu is not None else None,

Check failure on line 173 in vllm_ascend/attention/utils.py

View workflow job for this annotation

GitHub Actions / lint / pre-commit

Ruff (E501)

vllm_ascend/attention/utils.py:173:121: E501 Line too long (137 > 120)
num_reqs=num_actual_reqs,
num_actual_tokens=num_actual_tokens,
max_query_len=self.max_query_len,
Expand Down
25 changes: 12 additions & 13 deletions vllm_ascend/spec_decode/eagle_proposer.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ class SpecDecodeBaseProposer(EagleProposer):

def __init__(self, vllm_config: VllmConfig, device: torch.device, pass_hidden_states_to_model: bool, runner=None):
super().__init__(vllm_config, device, runner)

self.runner = runner
self.use_async_scheduling = self.vllm_config.scheduler_config.async_scheduling
self.pass_hidden_states_to_model = pass_hidden_states_to_model
self.decode_threshold = 1 + self.num_speculative_tokens
Expand Down Expand Up @@ -367,7 +367,7 @@ def dummy_run(
common_attn_metadata = AscendCommonAttentionMetadata(
query_start_loc=self.query_start_loc.gpu[: num_reqs + 1],
query_start_loc_cpu=self.query_start_loc.cpu[: num_reqs + 1],
seq_lens_cpu=self.runner.seq_lens.cpu,
seq_lens_cpu=self.runner.optimistic_seq_lens_cpu,
seq_lens=self.runner.seq_lens.gpu[:num_reqs],
num_reqs=num_reqs,
num_actual_tokens=num_tokens,
Expand Down Expand Up @@ -531,7 +531,7 @@ def _propose(
common_attn_metadata.block_table_tensor, num_reqs_padded
)
common_attn_metadata.seq_lens = self.runner.seq_lens.gpu[:num_reqs_padded]
common_attn_metadata.seq_lens_cpu = self.runner.seq_lens.cpu[:num_reqs_padded]
common_attn_metadata.seq_lens_cpu = self.runner.optimistic_seq_lens_cpu[:num_reqs_padded]

if self.supports_mm_inputs:
mm_embeds, is_mm_embed = mm_embed_inputs or (None, None)
Expand Down Expand Up @@ -1177,11 +1177,12 @@ def attn_update_stack_num_spec_norm(
# For the requests that exceed the max model length, we set the
# sequence length to 1 to minimize their overheads in attention.
common_attn_metadata.seq_lens[:batch_size].masked_fill_(exceeds_max_model_len, 1)

common_attn_metadata.seq_lens_cpu[:batch_size] = common_attn_metadata.seq_lens_cpu[:batch_size] + 1
exceeds_mask = common_attn_metadata.seq_lens_cpu[:batch_size] >= self.max_model_len
common_attn_metadata.seq_lens_cpu[:batch_size].masked_fill_(exceeds_mask, 1)
common_attn_metadata.num_computed_tokens_cpu[:batch_size] += 1
if common_attn_metadata.seq_lens_cpu is not None:
common_attn_metadata.seq_lens_cpu[:batch_size] = common_attn_metadata.seq_lens_cpu[:batch_size] + 1
exceeds_mask = common_attn_metadata.seq_lens_cpu[:batch_size] >= self.max_model_len
common_attn_metadata.seq_lens_cpu[:batch_size].masked_fill_(exceeds_mask, 1)
if common_attn_metadata.num_computed_tokens_cpu is not None:
Comment on lines +1180 to +1184
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

The addition of `if ... is not None` checks for `common_attn_metadata.seq_lens_cpu` and `common_attn_metadata.num_computed_tokens_cpu` is a critical improvement. It prevents a `TypeError` (slicing `None`) in scenarios where these attributes may be `None` due to the async spec decode logic, ensuring robustness and correctness.

common_attn_metadata.num_computed_tokens_cpu[:batch_size] += 1
if self.uses_mrope:
common_attn_metadata.positions[:batch_size].copy_(clamped_positions[0])
else:
Expand Down Expand Up @@ -1244,7 +1245,7 @@ def attn_update_stack_num_spec_norm(

def prepare_next_token_ids_padded(
self,
common_attn_metadata: CommonAttentionMetadata,
seq_lens_cpu: torch.Tensor,
sampled_token_ids: torch.Tensor,
requests: dict[str, CachedRequestState],
gpu_input_batch: InputBatch,
Expand All @@ -1264,11 +1265,9 @@ def prepare_next_token_ids_padded(

# Precompute get_token_id for when there is no valid next token
num_reqs = gpu_input_batch.num_reqs
seq_lens_list = seq_lens_cpu[:num_reqs].tolist()
self.backup_next_token_ids.np[:num_reqs] = np.array(
[
requests[gpu_input_batch.req_ids[i]].get_token_id(common_attn_metadata.seq_lens_cpu[i].item())
for i in range(num_reqs)
]
[requests[gpu_input_batch.req_ids[i]].get_token_id(seq_lens_list[i]) for i in range(num_reqs)]
)
self.backup_next_token_ids.copy_to_gpu(num_reqs)

Expand Down
109 changes: 33 additions & 76 deletions vllm_ascend/worker/block_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
import torch
from vllm.distributed import get_dcp_group, get_pcp_group
from vllm.utils.math_utils import cdiv
from vllm.v1.attention.backends.utils import PAD_SLOT_ID
from vllm.v1.utils import CpuGpuBuffer
from vllm.v1.worker.block_table import _compute_slot_mapping_kernel
from vllm.v1.worker.cp_utils import get_total_cp_world_size


Expand Down Expand Up @@ -117,80 +119,34 @@ def swap_row(self, src: int, tgt: int) -> None:

self.block_table.np[[src, tgt]] = self.block_table.np[[tgt, src]]

def compute_slot_mapping(self, req_indices: np.ndarray, positions: np.ndarray) -> None:
# E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
# -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1]
# where K is the max_num_blocks_per_req and the block size is 2.
# NOTE(woosuk): We can't simply use `token_indices // block_size`
# here because M (max_model_len) is not necessarily divisible by
# block_size.

if self.dcp_world_size * self.pcp_world_size > 1:
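# Note(hc): The DCP implementation stores the kvcache in an interleaved
# style: the kvcache for the token at position i is stored on the rank whose
# combined index (dcp_world_size * pcp_rank + dcp_rank) equals
# (i % virtual_block_size) // cp_kv_cache_interleave_size % (dcp_world_size * pcp_world_size).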

# Use a "virtual block" which equals to world_size * block_size
# for block_table_indices calculation.
virtual_block_size = self.block_size * self.dcp_world_size * self.pcp_world_size

# IMPORTANT: In hybrid mode, positions are in logical block space,
# but we need to map them to the correct logical block table indices
logical_block_idx = positions // virtual_block_size

# Account for the expanded logical table
# (always needed with unified tensor)
# Each physical block is split into multiple logical blocks
# The logical table has been expanded to accommodate this
block_table_indices = (
req_indices * self.max_num_blocks_per_req * self.blocks_per_phys_block + logical_block_idx
)

block_numbers = self.block_table.np.ravel()[block_table_indices]
# Use virtual_block_size for mask calculation, which marks local
# tokens.
virtual_block_offsets = positions % virtual_block_size
self.current_rank = self.dcp_world_size * self.pcp_rank + self.dcp_rank
mask = (
virtual_block_offsets // self.cp_kv_cache_interleave_size % (self.dcp_world_size * self.pcp_world_size)
== self.current_rank
)
# Calculate local block_offsets
block_offsets = (
virtual_block_offsets
// (self.dcp_world_size * self.pcp_world_size * self.cp_kv_cache_interleave_size)
* self.cp_kv_cache_interleave_size
+ virtual_block_offsets % self.cp_kv_cache_interleave_size
)
# Calculate slot_mapping
slot_mapping = block_numbers * self.block_size + block_offsets
# Write final slots, use -1 for not-local
self.slot_mapping.np[: req_indices.shape[0]] = np.where(mask, slot_mapping, -1)
else:
assert self.kernel_sizes is not None
if self.block_size == self.kernel_sizes[0]:
# IMPORTANT: In hybrid mode, positions are in logical block space,
# but we need to map them to the correct logical block table indices
logical_block_idx = positions // self.block_size

# Account for the expanded logical table
# (always needed with unified tensor)
# Each physical block is split into multiple logical blocks
# The logical table has been expanded to accommodate this
block_table_indices = (
req_indices * self.max_num_blocks_per_req * self.blocks_per_phys_block + logical_block_idx
)

block_numbers = self.block_table.np.ravel()[block_table_indices]
block_offsets = positions % self.block_size
np.add(block_numbers * self.block_size, block_offsets, out=self.slot_mapping.np[: req_indices.shape[0]])
def compute_slot_mapping(
self,
num_reqs: int,
query_start_loc: torch.Tensor,
positions: torch.Tensor,
) -> None:
num_tokens = positions.shape[0]
total_cp_world_size = self.pcp_world_size * self.dcp_world_size
total_cp_rank = self.pcp_rank * self.dcp_world_size + self.dcp_rank
_compute_slot_mapping_kernel[(num_reqs + 1,)](
num_tokens,
self.max_num_batched_tokens,
query_start_loc,
positions,
self.block_table.gpu,
self.block_table.gpu.stride(0),
self.block_size,
self.slot_mapping.gpu,
TOTAL_CP_WORLD_SIZE=total_cp_world_size,
TOTAL_CP_RANK=total_cp_rank,
CP_KV_CACHE_INTERLEAVE_SIZE=self.cp_kv_cache_interleave_size,
PAD_ID=PAD_SLOT_ID,
BLOCK_SIZE=1024,
)

def commit_block_table(self, num_reqs: int) -> None:
self.block_table.copy_to_gpu(num_reqs)

def commit_slot_mapping(self, num_tokens: int) -> None:
self.slot_mapping.copy_to_gpu(num_tokens)

def clear(self) -> None:
self.block_table.fill_(0)
self.block_table.cpu.fill_(0)
Expand Down Expand Up @@ -299,18 +255,19 @@ def swap_row(self, src: int, tgt: int) -> None:
for block_table in self.block_tables:
block_table.swap_row(src, tgt)

def compute_slot_mapping(self, req_indices: np.ndarray, positions: np.ndarray) -> None:
def compute_slot_mapping(
self,
num_reqs: int,
query_start_loc: torch.Tensor,
positions: torch.Tensor,
) -> None:
for block_table in self.block_tables:
block_table.compute_slot_mapping(req_indices, positions)
block_table.compute_slot_mapping(num_reqs, query_start_loc, positions)

def commit_block_table(self, num_reqs: int) -> None:
for block_table in self.block_tables:
block_table.commit_block_table(num_reqs)

def commit_slot_mapping(self, num_tokens: int) -> None:
for block_table in self.block_tables:
block_table.commit_slot_mapping(num_tokens)

def clear(self) -> None:
for block_table in self.block_tables:
block_table.clear()
Expand Down
Loading
Loading