Merged
Changes from 3 commits
6 changes: 6 additions & 0 deletions vllm/model_executor/models/qwen2_vl.py
@@ -819,6 +819,12 @@ def get_image_processor(
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None, "video": None}

    def get_max_tokens_per_item(
        self, seq_len: int, mm_counts: Mapping[str, int]
    ) -> Optional[Mapping[str, int]]:
        # Pre-computed maxima let vLLM skip generating and processing
        # dummy images/videos during profiling.
        max_image_tokens = self.get_max_image_tokens()
        max_video_tokens = self.get_max_video_tokens(seq_len, mm_counts)
        return {"image": max_image_tokens, "video": max_video_tokens}
DarkLight1337 (Member) commented on Jun 19, 2025:
Can you validate whether the startup time is actually reduced (compared to before this PR) after this latest change?

Reply (Member):
@DarkLight1337 Yep, that's exactly what I'm going to do next.


    def _get_vision_info(
        self,
        *,
18 changes: 18 additions & 0 deletions vllm/multimodal/processing.py
@@ -1100,6 +1100,24 @@

        return allowed_limits

    def get_max_tokens_per_item(
        self, seq_len: int, mm_counts: Mapping[str, int]
    ) -> Optional[Mapping[str, int]]:
        """Return the maximum number of tokens per item for each modality.

        By default, this returns `None`. When `None` is returned, vLLM
        generates dummy inputs (images/videos) at the maximum possible
        sizes and processes them to determine the maximum token count per
        modality. This approach works but can be very slow for certain
        models (e.g., Qwen2.5-VL), leading to very long startup times.
        For better performance, each model can override this method to
        return pre-computed maximum token counts, avoiding the need for
        dummy input generation and processing.

        NOTE: The maximum number of tokens per item of each modality
        returned from this method should respect the model's maximum
        sequence length and the maximum number of items allowed per
        modality, and should agree with the token counts obtained from
        dummy inputs (images/videos) at the maximum possible sizes.
        """
        return None


_I = TypeVar("_I", bound=BaseProcessingInfo)

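To illustrate the override contract described in the docstring above, here is a minimal sketch of a model-side info class returning pre-computed maxima. `MyModelProcessingInfo` and `_MAX_IMAGE_TOKENS` are hypothetical names, the cap value is made up, and a real subclass must also satisfy the rest of the `BaseProcessingInfo` interface:

```python
from collections.abc import Mapping
from typing import Optional

from vllm.multimodal.processing import BaseProcessingInfo


class MyModelProcessingInfo(BaseProcessingInfo):
    """Hypothetical model info class, for illustration only."""

    # Assumed pre-computed cap; a real model would derive this from its
    # vision config (patch size, maximum resolution, etc.).
    _MAX_IMAGE_TOKENS = 1024

    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None}

    def get_max_tokens_per_item(
        self, seq_len: int, mm_counts: Mapping[str, int]
    ) -> Optional[Mapping[str, int]]:
        # Returning a mapping lets vLLM skip dummy-input profiling.
        # Values must not exceed what maximum-size real inputs would
        # produce and must fit within the maximum sequence length.
        return {"image": min(self._MAX_IMAGE_TOKENS, seq_len)}
```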
5 changes: 4 additions & 1 deletion vllm/multimodal/profiling.py
@@ -253,6 +253,9 @@
        seq_len: int,
        mm_counts: Optional[Mapping[str, int]] = None,
    ) -> Mapping[str, int]:
        # Fast path: use pre-computed maxima when the model provides them
        # (the mypy pre-commit failures flagged the missing positional
        # arguments "seq_len" and "mm_counts" in this call).
        max_tokens_per_item = self.processing_info.get_max_tokens_per_item(
            seq_len, mm_counts)
        if max_tokens_per_item is not None:
            return max_tokens_per_item

        # Fallback: profile with dummy inputs at maximum possible sizes.
        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
        return self._get_mm_num_tokens(mm_inputs)
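As a rough caller-side sanity check of the fast path (assuming the enclosing method above is the profiler's `get_mm_max_tokens`, and that `profiler` and `info` are an already-constructed `MultiModalProfiler` and its `BaseProcessingInfo`), the pre-computed result should round-trip without any dummy inputs being built:

```python
# Hypothetical check; `profiler` and `info` are assumed to exist already.
seq_len = 32768
mm_counts = {"image": 1, "video": 1}

precomputed = info.get_max_tokens_per_item(seq_len, mm_counts)
if precomputed is not None:
    # The fast path should return the model's pre-computed maxima
    # verbatim, with no dummy images/videos generated along the way.
    assert profiler.get_mm_max_tokens(seq_len, mm_counts) == precomputed
```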