3 changes: 3 additions & 0 deletions vllm/model_executor/models/qwen2_vl.py
@@ -819,6 +819,9 @@ def get_image_processor(
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}
+
+    def get_max_tokens_per_item(self) -> Mapping[str, int]:
+        return {"image": 16384, "video": 98304}
Contributor

Severity: medium

The hardcoded values 16384 and 98304 are the maximum token counts for images and videos, respectively, in Qwen2-VL. For better readability and maintainability, consider defining them as named constants at the module level or within the class. This makes their meaning clearer and simplifies updates if the values ever change.

For example:

# At the module level or as class attributes
_MAX_IMAGE_TOKENS_QWEN2_VL = 16384
_MAX_VIDEO_TOKENS_QWEN2_VL = 98304

class Qwen2VLProcessingInfo(BaseProcessingInfo):
    # ...
    def get_max_tokens_per_item(self) -> Mapping[str, int]:
        return {
            "image": _MAX_IMAGE_TOKENS_QWEN2_VL,
            "video": _MAX_VIDEO_TOKENS_QWEN2_VL,
        }

Member

+1


     def _get_vision_info(
         self,
         *,
12 changes: 12 additions & 0 deletions vllm/multimodal/processing.py
@@ -1100,6 +1100,18 @@ def get_allowed_mm_limits(self) -> Mapping[str, int]:

         return allowed_limits
 
+    def get_max_tokens_per_item(self) -> Optional[Mapping[str, int]]:
+        """Return the maximum number of tokens per item for each modality.
+        By default, returns `None`. When `None` is returned, vLLM will
+        generate dummy inputs (images/videos) at maximum possible sizes and
+        process them to determine the maximum token count per modality.
+        This approach works but can be very slow for certain models (e.g.,
+        Qwen2.5-VL), leading to very long startup times. For better
+        performance, each model can override this method to return
+        pre-computed maximum token counts, avoiding the need for dummy input
+        generation and processing.
+        """
+        return None
 
 
 _I = TypeVar("_I", bound=BaseProcessingInfo)

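Taken together with the Qwen2-VL diff above, the contract this hook introduces is small enough to restate as a self-contained sketch. The Toy* classes below are illustrative stand-ins for this discussion, not vLLM's real class hierarchy:

from typing import Mapping, Optional


class ToyProcessingInfo:
    """Stand-in for vLLM's BaseProcessingInfo (simplified)."""

    def get_max_tokens_per_item(self) -> Optional[Mapping[str, int]]:
        # Default: no precomputed counts; the caller must fall back to
        # generating and processing worst-case dummy inputs.
        return None


class ToyQwen2VLProcessingInfo(ToyProcessingInfo):
    """Mirrors the Qwen2-VL override added in this PR."""

    def get_max_tokens_per_item(self) -> Optional[Mapping[str, int]]:
        # Precomputed per-item maxima, so no dummy inputs are needed.
        return {"image": 16384, "video": 98304}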
5 changes: 4 additions & 1 deletion vllm/multimodal/profiling.py
@@ -253,6 +253,9 @@ def get_mm_max_tokens(
         seq_len: int,
         mm_counts: Optional[Mapping[str, int]] = None,
     ) -> Mapping[str, int]:
-        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
+        max_tokens_per_item = self.processing_info.get_max_tokens_per_item()
+        if max_tokens_per_item is not None:
+            return max_tokens_per_item
+
+        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
         return self._get_mm_num_tokens(mm_inputs)
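
Continuing the toy sketch above, the profiler-side change reduces to a short-circuit: trust the precomputed mapping when the model provides one, and only fall back to the expensive dummy-input path otherwise. This is an illustrative sketch, not the real profiler in vllm/multimodal/profiling.py:

from typing import Mapping


class ToyProfiler:
    # `info` is any object exposing get_max_tokens_per_item(), such as the
    # Toy*ProcessingInfo classes sketched earlier.
    def __init__(self, info) -> None:
        self.info = info

    def get_mm_max_tokens(self) -> Mapping[str, int]:
        # Fast path added by this PR: use precomputed per-item maxima.
        precomputed = self.info.get_max_tokens_per_item()
        if precomputed is not None:
            return precomputed
        # Slow fallback: profile worst-case dummy inputs (stubbed here).
        return self._profile_with_dummy_inputs()

    def _profile_with_dummy_inputs(self) -> Mapping[str, int]:
        # Stand-in for building maximum-size dummy images/videos and
        # running them through the processor to count their tokens.
        return {"image": 16384, "video": 98304}


# With the override in place, startup skips dummy-input profiling entirely:
assert ToyProfiler(ToyQwen2VLProcessingInfo()).get_mm_max_tokens() == {
    "image": 16384,
    "video": 98304,
}

With the default None, the same call falls through to the dummy-input path, which is exactly the slow-startup behavior this PR avoids for Qwen2-VL.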