Commit 13ac9ca

[Misc] Avoid direct access of global mm_registry in compute_encoder_budget (#15621)
Signed-off-by: DarkLight1337 <[email protected]>
1 parent 66aa4c0

File tree (4 files changed, +19 -7 lines):

vllm/v1/core/encoder_cache_manager.py
vllm/v1/core/sched/scheduler.py
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/tpu_model_runner.py
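
In short: compute_encoder_budget (and its _compute_encoder_budget_multimodal helper) now receive the multimodal registry as an explicit parameter instead of reading the module-level MULTIMODAL_REGISTRY, and Scheduler gains an mm_registry constructor argument that defaults to the global registry, so existing call sites keep working. A minimal sketch of the new calling convention, assuming model_config and scheduler_config are already constructed:

from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.v1.core.encoder_cache_manager import compute_encoder_budget

# The registry is now an explicit dependency; passing the global reproduces
# the old behavior, while tests can substitute their own instance.
encoder_compute_budget, encoder_cache_size = compute_encoder_budget(
    model_config=model_config,
    scheduler_config=scheduler_config,
    mm_registry=MULTIMODAL_REGISTRY,
)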

vllm/v1/core/encoder_cache_manager.py

Lines changed: 12 additions & 4 deletions
@@ -3,7 +3,7 @@
 from typing import TYPE_CHECKING
 
 from vllm.logger import init_logger
-from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal import MultiModalRegistry
 from vllm.v1.request import Request
 
 if TYPE_CHECKING:
@@ -67,13 +67,15 @@ def get_freed_ids(self) -> list[tuple[str, int]]:
 def compute_encoder_budget(
     model_config: "ModelConfig",
     scheduler_config: "SchedulerConfig",
+    mm_registry: MultiModalRegistry,
 ) -> tuple[int, int]:
     """Compute the encoder cache budget based on the model and scheduler
     configurations.
 
     Args:
         model_config: Model configuration.
         scheduler_config: Scheduler configuration.
+        mm_registry: Provides information about the token cost.
 
     Returns:
         - Compute budget for encoder execution, in unit of number of tokens
@@ -89,21 +91,27 @@ def compute_encoder_budget(
     (
         encoder_compute_budget,
         encoder_cache_size,
-    ) = _compute_encoder_budget_multimodal(model_config, scheduler_config)
+    ) = _compute_encoder_budget_multimodal(
+        model_config,
+        scheduler_config,
+        mm_registry,
+    )
 
     return encoder_compute_budget, encoder_cache_size
 
 
 def _compute_encoder_budget_multimodal(
     model_config: "ModelConfig",
     scheduler_config: "SchedulerConfig",
+    mm_registry: MultiModalRegistry,
 ) -> tuple[int, int]:
     """Compute the encoder cache budget based on the model and scheduler
     configurations for a multimodal model.
 
     Args:
         model_config: Model configuration.
         scheduler_config: Scheduler configuration.
+        mm_registry: Provides information about the token cost.
 
     Returns:
         - Compute budget for encoder execution, in unit of number of tokens
@@ -112,8 +120,8 @@ def _compute_encoder_budget_multimodal(
     in the input sequence.
     """
 
-    max_tokens_by_modality_dict = MULTIMODAL_REGISTRY.get_max_tokens_per_item_by_nonzero_modality(  # noqa: E501
-        model_config)
+    max_tokens_by_modality_dict = mm_registry \
+        .get_max_tokens_per_item_by_nonzero_modality(model_config)
 
     if not max_tokens_by_modality_dict:
         logger.warning(
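
Because the registry is injected, the budget computation can now be exercised with a stand-in registry. A hypothetical test sketch (StubRegistry and its fixed token cost are illustrative, not part of this commit; model_config and scheduler_config are assumed pre-built):

from vllm.multimodal import MultiModalRegistry
from vllm.v1.core.encoder_cache_manager import compute_encoder_budget

class StubRegistry(MultiModalRegistry):
    # Illustrative override: report a fixed per-item token cost so the
    # budget arithmetic can be checked deterministically.
    def get_max_tokens_per_item_by_nonzero_modality(self, model_config):
        return {"image": 576}

budget, cache_size = compute_encoder_budget(
    model_config=model_config,
    scheduler_config=scheduler_config,
    mm_registry=StubRegistry(),
)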

vllm/v1/core/sched/scheduler.py

Lines changed: 3 additions & 0 deletions
@@ -10,6 +10,7 @@
 from vllm.config import (CacheConfig, LoRAConfig, ModelConfig, SchedulerConfig,
                          SpeculativeConfig)
 from vllm.logger import init_logger
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
 from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager,
                                                 compute_encoder_budget)
 from vllm.v1.core.kv_cache_manager import KVCacheManager
@@ -38,6 +39,7 @@ def __init__(
         speculative_config: Optional[SpeculativeConfig],
         log_stats: bool,
         structured_output_manager: StructuredOutputManager,
+        mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
     ) -> None:
         self.scheduler_config = scheduler_config
         self.cache_config = cache_config
@@ -93,6 +95,7 @@ def __init__(
         encoder_compute_budget, encoder_cache_size = compute_encoder_budget(
             model_config=model_config,
             scheduler_config=scheduler_config,
+            mm_registry=mm_registry,
         )
 
         # NOTE(woosuk): Here, "encoder" includes the vision encoder (and
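
The default argument is what keeps this change non-breaking: callers that do not pass mm_registry still get the process-global MULTIMODAL_REGISTRY. A standalone sketch of the same default-argument injection pattern (names here are illustrative, not vLLM's API):

GLOBAL_REGISTRY = {"image": 576}

def compute_budget(registry=GLOBAL_REGISTRY):
    # Callers may inject a registry; call sites that pass nothing keep
    # the module-level default, exactly like Scheduler's mm_registry.
    return max(registry.values())

assert compute_budget() == 576                  # old behavior, unchanged
assert compute_budget({"audio": 1500}) == 1500  # injected for testing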

vllm/v1/worker/gpu_model_runner.py

Lines changed: 3 additions & 3 deletions
@@ -137,6 +137,7 @@ def __init__(
         encoder_compute_budget, encoder_cache_size = compute_encoder_budget(
             model_config=model_config,
             scheduler_config=scheduler_config,
+            mm_registry=self.mm_registry,
         )
         self.max_num_encoder_input_tokens = encoder_compute_budget
         self.encoder_cache_size = encoder_cache_size
@@ -1439,9 +1440,8 @@ def profile_run(self) -> None:
         # NOTE: Currently model is profiled with a single non-text
         # modality with the max possible input tokens even when
         # it supports multiple.
-        max_tokens_by_modality_dict = (
-            MULTIMODAL_REGISTRY.
-            get_max_tokens_per_item_by_nonzero_modality(self.model_config))
+        max_tokens_by_modality_dict = self.mm_registry \
+            .get_max_tokens_per_item_by_nonzero_modality(self.model_config)
         dummy_data_modality, max_tokens_per_mm_item = max(
             max_tokens_by_modality_dict.items(), key=lambda item: item[1])
 
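For the profiling path above, the runner picks the single modality whose one item costs the most encoder tokens. A small illustration of that selection (the token counts are made up for the example):

# max() over dict items, keyed on the value, yields the most expensive
# modality together with its per-item token cost.
max_tokens_by_modality_dict = {"image": 576, "video": 1280}
dummy_data_modality, max_tokens_per_mm_item = max(
    max_tokens_by_modality_dict.items(), key=lambda item: item[1])
assert (dummy_data_modality, max_tokens_per_mm_item) == ("video", 1280)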

vllm/v1/worker/tpu_model_runner.py

Lines changed: 1 addition & 0 deletions
@@ -109,6 +109,7 @@ def __init__(
         encoder_compute_budget, encoder_cache_size = compute_encoder_budget(
             model_config=model_config,
             scheduler_config=scheduler_config,
+            mm_registry=self.mm_registry,
         )
         self.max_num_encoder_input_tokens = encoder_compute_budget
         self.encoder_cache_size = encoder_cache_size
