Commit e535d90

[ModelRunner V2] Misc minor simplifications and optimizations (vllm-project#33467)
Signed-off-by: Nick Hill <[email protected]>
1 parent 0b225fb commit e535d90

21 files changed (+86, −220 lines)

vllm/v1/worker/gpu/async_utils.py

Lines changed: 1 addition & 5 deletions
@@ -4,11 +4,7 @@
 import numpy as np
 import torch

-from vllm.v1.outputs import (
-    AsyncModelRunnerOutput,
-    LogprobsTensors,
-    ModelRunnerOutput,
-)
+from vllm.v1.outputs import AsyncModelRunnerOutput, LogprobsTensors, ModelRunnerOutput
 from vllm.v1.worker.gpu.sample.output import SamplerOutput

vllm/v1/worker/gpu/attn_utils.py

Lines changed: 7 additions & 17 deletions
@@ -32,9 +32,7 @@ def get_kv_cache_spec(vllm_config: VllmConfig) -> dict[str, KVCacheSpec]:


 def init_attn_backend(
-    kv_cache_config: KVCacheConfig,
-    vllm_config: VllmConfig,
-    device: torch.device,
+    kv_cache_config: KVCacheConfig, vllm_config: VllmConfig, device: torch.device
 ):
     attn_backends: dict[str, type[AttentionBackend]] = {}
     attn_metadata_builders: list[AttentionMetadataBuilder] = []
@@ -50,10 +48,7 @@ def init_attn_backend(
         attn_backends[layer_name] = attn_backend

         attn_metadata_builder = attn_backend.get_builder_cls()(
-            kv_cache_group_spec.kv_cache_spec,
-            layer_names,
-            vllm_config,
-            device,
+            kv_cache_group_spec.kv_cache_spec, layer_names, vllm_config, device
         )
         attn_metadata_builders.append(attn_metadata_builder)  # type: ignore
@@ -65,10 +60,7 @@ def init_attn_backend(
     return attn_backends, attn_metadata_builders


-def _allocate_kv_cache(
-    kv_cache_config: KVCacheConfig,
-    device: torch.device,
-):
+def _allocate_kv_cache(kv_cache_config: KVCacheConfig, device: torch.device):
     kv_cache_raw_tensors: dict[str, torch.Tensor] = {}
     for kv_cache_tensor in kv_cache_config.kv_cache_tensors:
         tensor = torch.zeros(kv_cache_tensor.size, dtype=torch.int8, device=device)
@@ -141,12 +133,11 @@ def init_kv_cache(


 def build_slot_mappings_by_layer(
-    slot_mappings: torch.Tensor,
-    kv_cache_config: KVCacheConfig,
+    slot_mappings: torch.Tensor, kv_cache_config: KVCacheConfig
 ) -> dict[str, torch.Tensor]:
     slot_mappings_by_layer: dict[str, torch.Tensor] = {}
-    for i, kv_cache_group in enumerate(kv_cache_config.kv_cache_groups):
-        slot_mapping = slot_mappings[i]
+    kv_cache_groups = kv_cache_config.kv_cache_groups
+    for slot_mapping, kv_cache_group in zip(slot_mappings, kv_cache_groups):
         for layer_name in kv_cache_group.layer_names:
             slot_mappings_by_layer[layer_name] = slot_mapping
     return slot_mappings_by_layer
@@ -188,8 +179,7 @@ def build_attn_metadata(

         attn_metadata_builder = attn_metadata_builders[i]
         metadata = attn_metadata_builder.build(
-            common_prefix_len=0,
-            common_attn_metadata=common_attn_metadata,
+            common_prefix_len=0, common_attn_metadata=common_attn_metadata
         )
         for layer_name in kv_cache_spec.layer_names:
             attn_metadata[layer_name] = metadata
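
Aside (not part of the commit): the build_slot_mappings_by_layer change trades indexed access for direct iteration, relying on the fact that iterating a 2-D tensor yields its rows, so zip can pair each row with its KV cache group without an explicit index. A minimal standalone sketch of the pattern, using a hypothetical Group stand-in rather than vLLM's real KV cache group spec:

import torch
from dataclasses import dataclass

@dataclass
class Group:  # hypothetical stand-in for a KV cache group spec
    layer_names: list[str]

slot_mappings = torch.arange(6).reshape(2, 3)  # one row per KV cache group
groups = [Group(["layers.0", "layers.1"]), Group(["layers.2"])]

# zip pairs each tensor row with its group, replacing enumerate + slot_mappings[i].
by_layer: dict[str, torch.Tensor] = {}
for slot_mapping, group in zip(slot_mappings, groups):
    for layer_name in group.layer_names:
        by_layer[layer_name] = slot_mapping

assert torch.equal(by_layer["layers.2"], slot_mappings[1])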

vllm/v1/worker/gpu/block_table.py

Lines changed: 2 additions & 5 deletions
@@ -71,9 +71,7 @@ def __init__(
     def _make_ptr_tensor(self, x: Iterable[torch.Tensor]) -> torch.Tensor:
         # NOTE(woosuk): Use uint64 instead of int64 to cover all possible addresses.
         return torch.tensor(
-            [t.data_ptr() for t in x],
-            dtype=torch.uint64,
-            device=self.device,
+            [t.data_ptr() for t in x], dtype=torch.uint64, device=self.device
         )

     def append_block_ids(
@@ -96,8 +94,7 @@ def apply_staged_writes(self) -> None:
         self.num_blocks.copy_to_uva()

     def gather_block_tables(
-        self,
-        idx_mapping: torch.Tensor,
+        self, idx_mapping: torch.Tensor
     ) -> tuple[torch.Tensor, ...]:
         num_reqs = idx_mapping.shape[0]
         _gather_block_tables_kernel[(self.num_kv_cache_groups, num_reqs)](

vllm/v1/worker/gpu/buffer_utils.py

Lines changed: 9 additions & 20 deletions
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable, Sequence
+from functools import partial

 import numpy as np
 import torch
@@ -81,10 +82,7 @@ def copy_to_gpu(

 class UvaBackedTensor:
     def __init__(
-        self,
-        size: int | Sequence[int],
-        dtype: torch.dtype,
-        max_concurrency: int = 2,
+        self, size: int | Sequence[int], dtype: torch.dtype, max_concurrency: int = 2
     ):
         self.dtype = dtype
         self.max_concurrency = max_concurrency
@@ -135,25 +133,16 @@ def __init__(
         self._staged_write_contents: list[int | float] = []
         self._staged_write_cu_lens: list[int] = []

-        self.write_indices = UvaBufferPool(
-            self.num_rows, dtype=torch.int32, max_concurrency=max_concurrency
-        )
-        self.write_starts = UvaBufferPool(
-            self.num_rows, dtype=torch.int32, max_concurrency=max_concurrency
-        )
+        new_buffer = partial(UvaBufferPool, max_concurrency=max_concurrency)
+
+        self.write_indices = new_buffer(self.num_rows, dtype=torch.int32)
+        self.write_starts = new_buffer(self.num_rows, dtype=torch.int32)
         init_size = next_power_of_2(self.num_rows)
-        self.write_contents = UvaBufferPool(
-            init_size, dtype=dtype, max_concurrency=max_concurrency
-        )
-        self.write_cu_lens = UvaBufferPool(
-            self.num_rows, dtype=torch.int32, max_concurrency=max_concurrency
-        )
+        self.write_contents = new_buffer(init_size, dtype=dtype)
+        self.write_cu_lens = new_buffer(self.num_rows, dtype=torch.int32)

     def stage_write(
-        self,
-        index: int,
-        start: int,
-        x: Iterable[int] | Iterable[float],
+        self, index: int, start: int, x: Iterable[int] | Iterable[float]
     ) -> None:
         assert index >= 0
         assert start >= 0
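
Aside (not part of the commit): the buffer_utils.py change uses functools.partial to bind the shared max_concurrency argument once instead of repeating it for every UvaBufferPool construction. A generic sketch of the same pattern; the Pool class below is an illustrative placeholder, not vLLM's UvaBufferPool:

from functools import partial

import torch

class Pool:  # illustrative placeholder
    def __init__(self, size: int, dtype: torch.dtype, max_concurrency: int = 2):
        self.buffers = [torch.empty(size, dtype=dtype) for _ in range(max_concurrency)]

# partial() returns a constructor with max_concurrency pre-filled, so each
# call below only spells out what actually varies (size and dtype).
new_buffer = partial(Pool, max_concurrency=4)
write_indices = new_buffer(128, dtype=torch.int32)
write_contents = new_buffer(256, dtype=torch.float16)
assert len(write_contents.buffers) == 4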

vllm/v1/worker/gpu/cudagraph_utils.py

Lines changed: 2 additions & 11 deletions
@@ -24,12 +24,7 @@


 class CudaGraphManager:
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        uses_mrope: bool,
-        device: torch.device,
-    ):
+    def __init__(self, vllm_config: VllmConfig, uses_mrope: bool, device: torch.device):
         self.vllm_config = vllm_config
         self.scheduler_config = vllm_config.scheduler_config
         self.uses_mrope = uses_mrope
@@ -41,11 +36,7 @@ def __init__(
         self.dp_size = vllm_config.parallel_config.data_parallel_size
         self.compilation_config = vllm_config.compilation_config
         assert self.compilation_config is not None
-        self.cudagraph_mode: CUDAGraphMode
-        if self.compilation_config.cudagraph_mode is None:
-            self.cudagraph_mode = CUDAGraphMode.NONE
-        else:
-            self.cudagraph_mode = self.compilation_config.cudagraph_mode
+        self.cudagraph_mode = self.compilation_config.cudagraph_mode
         self.cudagraph_sizes = get_cudagraph_sizes(
             self.compilation_config.cudagraph_capture_sizes,
             self.max_num_reqs,

vllm/v1/worker/gpu/dp_utils.py

Lines changed: 2 additions & 8 deletions
@@ -13,10 +13,7 @@ def make_num_tokens_across_dp(dp_size: int, num_tokens: int) -> torch.Tensor | N


 def get_batch_metadata_across_dp(
-    num_tokens: int,
-    cudagraph_size: int,
-    dp_size: int,
-    dp_rank: int,
+    num_tokens: int, cudagraph_size: int, dp_size: int, dp_rank: int
 ) -> tuple[torch.Tensor, torch.Tensor]:
     assert dp_size > 1
     # Use CPU group to avoid CPU-GPU synchronization.
@@ -29,10 +26,7 @@ def get_batch_metadata_across_dp(


 def get_cudagraph_and_dp_padding(
-    num_tokens: int,
-    cudagraph_size: int | None,
-    dp_size: int,
-    dp_rank: int,
+    num_tokens: int, cudagraph_size: int | None, dp_size: int, dp_rank: int
 ) -> tuple[bool, int, torch.Tensor | None]:
     if dp_size == 1:
         if cudagraph_size is not None:

vllm/v1/worker/gpu/kv_connector.py

Lines changed: 4 additions & 4 deletions
@@ -65,10 +65,10 @@ def pre_forward(self, scheduler_output: "SchedulerOutput") -> None:

         if scheduler_output.preempted_req_ids:
             self.kv_connector.handle_preemptions(scheduler_output.preempted_req_ids)
-        assert scheduler_output.kv_connector_metadata is not None
-        self.kv_connector.bind_connector_metadata(
-            scheduler_output.kv_connector_metadata
-        )
+        kv_connector_metadata = scheduler_output.kv_connector_metadata
+        assert kv_connector_metadata is not None
+        self.kv_connector.bind_connector_metadata(kv_connector_metadata)
+
         # TODO: sort out KV Connectors' use of forward_context
         if is_forward_context_available():
             self.kv_connector.start_load_kv(get_forward_context())

vllm/v1/worker/gpu/lora_utils.py

Lines changed: 2 additions & 5 deletions
@@ -15,10 +15,7 @@ def __init__(self, max_num_reqs: int):
         self.lora_requests: dict[str, LoRARequest] = {}

     def add_request(
-        self,
-        req_id: str,
-        req_index: int,
-        lora_request: LoRARequest | None,
+        self, req_id: str, req_index: int, lora_request: LoRARequest | None
     ) -> None:
         if lora_request is not None:
             self.lora_requests[req_id] = lora_request
@@ -41,7 +38,7 @@ def make_lora_inputs(

         active_lora_requests: set[LoRARequest] = set()
         for req_id in req_ids:
-            lora_request = self.lora_requests.get(req_id, None)
+            lora_request = self.lora_requests.get(req_id)
             if lora_request is not None:
                 active_lora_requests.add(lora_request)
         return prompt_lora_mapping, token_lora_mapping, active_lora_requests
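
Aside (not part of the commit): dict.get already defaults to None, so the explicit second argument in self.lora_requests.get(req_id, None) was redundant. For example:

lora_requests = {"req-1": "adapter-a"}
assert lora_requests.get("req-2") is None        # default is already None
assert lora_requests.get("req-2", None) is None  # equivalent, just more verbose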

vllm/v1/worker/gpu/mm/encoder_runner.py

Lines changed: 8 additions & 23 deletions
@@ -23,10 +23,7 @@ def __init__(
         self.device = device

         self.inputs_embeds = torch.zeros(
-            max_num_tokens,
-            hidden_size,
-            dtype=dtype,
-            device=device,
+            max_num_tokens, hidden_size, dtype=dtype, device=device
         )
         self.req_id_to_mm_features: dict[str, list[MultiModalFeatureSpec]] = {}
         self.encoder_cache: dict[str, torch.Tensor] = {}
@@ -57,8 +54,7 @@ def remove_request(self, req_id: str) -> None:
         self.req_id_to_mm_features.pop(req_id, None)

     def prepare_mm_inputs(
-        self,
-        scheduled_encoder_inputs: dict[str, list[int]],
+        self, scheduled_encoder_inputs: dict[str, list[int]]
     ) -> tuple[list[str], list[tuple[str, MultiModalKwargsItem]]]:
         mm_hashes: list[str] = []
         mm_kwargs: list[tuple[str, MultiModalKwargsItem]] = []
@@ -85,20 +81,16 @@ def execute_mm_encoder(

         encoder_outputs: list[torch.Tensor] = []
         for modality, num_items, mm_kwargs_group in group_mm_kwargs_by_modality(
-            mm_kwargs,
-            device=self.device,
-            pin_memory=False,
+            mm_kwargs, device=self.device, pin_memory=False
         ):
             curr_group_outputs = model.embed_multimodal(**mm_kwargs_group)
             sanity_check_mm_encoder_outputs(
-                curr_group_outputs,
-                expected_num_items=num_items,
+                curr_group_outputs, expected_num_items=num_items
             )
             encoder_outputs.extend(curr_group_outputs)

         # Cache the encoder outputs by mm_hash
-        for mm_hash, output in zip(mm_hashes, encoder_outputs):
-            self.encoder_cache[mm_hash] = output
+        self.encoder_cache.update(zip(mm_hashes, encoder_outputs))
         return encoder_outputs

     def gather_mm_embeddings(
@@ -115,20 +107,15 @@ def gather_mm_embeddings(
         if all_decode:
             # All decode requests, so no need to gather any embeddings.
             return [], torch.zeros(
-                total_num_scheduled_tokens,
-                dtype=torch.bool,
-                device=self.device,
+                total_num_scheduled_tokens, dtype=torch.bool, device=self.device
             )

         query_start = computed_prefill_lens.tolist()
         query_end = (computed_prefill_lens + num_scheduled_tokens).tolist()

         mm_embeds: list[torch.Tensor] = []
         is_mm_embed = torch.zeros(
-            total_num_scheduled_tokens,
-            dtype=torch.bool,
-            device="cpu",
-            pin_memory=True,
+            total_num_scheduled_tokens, dtype=torch.bool, device="cpu", pin_memory=True
         )
         for i, req_id in enumerate(req_ids):
             if not is_prefilling[i]:
@@ -189,9 +176,7 @@ def get_inputs_embeds(
         is_mm_embed: torch.Tensor,
     ) -> torch.Tensor:
         x = model.embed_input_ids(
-            input_ids,
-            multimodal_embeddings=mm_embeds,
-            is_multimodal=is_mm_embed,
+            input_ids, multimodal_embeddings=mm_embeds, is_multimodal=is_mm_embed
         )
         # Copy to the pre-allocated buffer for CUDA graphs.
         self.inputs_embeds[: x.shape[0]] = x
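
Aside (not part of the commit): in execute_mm_encoder the per-item caching loop collapses into one dict.update call, since update accepts any iterable of key/value pairs, including the one produced by zip. An isolated sketch with dummy tensors:

import torch

encoder_cache: dict[str, torch.Tensor] = {}
mm_hashes = ["hash-a", "hash-b"]
encoder_outputs = [torch.zeros(4), torch.ones(4)]

# Equivalent to assigning each (hash, output) pair in a for loop.
encoder_cache.update(zip(mm_hashes, encoder_outputs))
assert torch.equal(encoder_cache["hash-b"], torch.ones(4))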

vllm/v1/worker/gpu/mm/mrope_utils.py

Lines changed: 1 addition & 4 deletions
@@ -51,10 +51,7 @@ def init_prefill_mrope_positions(
     mm_features: list,
 ) -> None:
     prefill_mrope_positions, prefill_mrope_delta = (
-        mrope_model.get_mrope_input_positions(
-            prefill_token_ids,
-            mm_features,
-        )
+        mrope_model.get_mrope_input_positions(prefill_token_ids, mm_features)
     )
     for i in range(3):
         pos = prefill_mrope_positions[i].tolist()
