Merged
Changes from 3 commits
6 changes: 6 additions & 0 deletions vllm/model_executor/models/qwen2_vl.py
@@ -819,6 +819,12 @@ def get_image_processor(
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None, "video": None}

    def get_max_tokens_per_item(
        self, seq_len: int, mm_counts: Mapping[str, int]
    ) -> Optional[Mapping[str, int]]:
        # Pre-computed maxima let vLLM skip generating and processing
        # dummy images/videos during profiling.
        max_image_tokens = self.get_max_image_tokens()
        max_video_tokens = self.get_max_video_tokens(seq_len, mm_counts)
        return {"image": max_image_tokens, "video": max_video_tokens}
DarkLight1337 (Member) commented on Jun 19, 2025:
Can you validate whether the startup time is actually reduced (compared to before this PR) after this latest change?

Reply (Member):
@DarkLight1337 Yep, that's exactly what I'm going to do next.


    def _get_vision_info(
        self,
        *,
18 changes: 18 additions & 0 deletions vllm/multimodal/processing.py
@@ -1100,6 +1100,24 @@

        return allowed_limits

    def get_max_tokens_per_item(
        self, seq_len: int, mm_counts: Mapping[str, int]
    ) -> Optional[Mapping[str, int]]:
        """Return the maximum number of tokens per item for each modality.

        By default, this returns `None`. When `None` is returned, vLLM
        generates dummy inputs (images/videos) at the maximum possible
        sizes and processes them to determine the maximum token count per
        modality. This approach works but can be very slow for certain
        models (e.g., Qwen2.5-VL), leading to very long startup times.
        For better performance, each model can override this method to
        return pre-computed maximum token counts, avoiding the need for
        dummy input generation and processing.

        NOTE: The maximum number of tokens per item of each modality
        returned from this method should respect the model's maximum
        sequence length and the maximum number of items allowed per
        modality, and should agree with the token counts obtained from
        dummy inputs (images/videos) at the maximum possible sizes.
        """
        return None


_I = TypeVar("_I", bound=BaseProcessingInfo)

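To illustrate the override contract described in the docstring above, here is a minimal sketch of a model-side info class returning pre-computed maxima. `MyModelProcessingInfo` and `_MAX_IMAGE_TOKENS` are hypothetical names, the cap value is made up, and a real subclass must also satisfy the rest of the `BaseProcessingInfo` interface:

```python
from collections.abc import Mapping
from typing import Optional

from vllm.multimodal.processing import BaseProcessingInfo


class MyModelProcessingInfo(BaseProcessingInfo):
    """Hypothetical model info class, for illustration only."""

    # Assumed pre-computed cap; a real model would derive this from its
    # vision config (patch size, maximum resolution, etc.).
    _MAX_IMAGE_TOKENS = 1024

    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None}

    def get_max_tokens_per_item(
        self, seq_len: int, mm_counts: Mapping[str, int]
    ) -> Optional[Mapping[str, int]]:
        # Returning a mapping lets vLLM skip dummy-input profiling.
        # Values must not exceed what maximum-size real inputs would
        # produce and must fit within the maximum sequence length.
        return {"image": min(self._MAX_IMAGE_TOKENS, seq_len)}
```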
5 changes: 4 additions & 1 deletion vllm/multimodal/profiling.py
@@ -253,6 +253,9 @@
        seq_len: int,
        mm_counts: Optional[Mapping[str, int]] = None,
    ) -> Mapping[str, int]:
        # Fast path: use pre-computed maxima when the model provides them
        # (the mypy pre-commit failures flagged the missing positional
        # arguments "seq_len" and "mm_counts" in this call).
        max_tokens_per_item = self.processing_info.get_max_tokens_per_item(
            seq_len, mm_counts)
        if max_tokens_per_item is not None:
            return max_tokens_per_item

        # Fallback: profile with dummy inputs at maximum possible sizes.
        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
        return self._get_mm_num_tokens(mm_inputs)
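As a rough caller-side sanity check of the fast path (assuming the enclosing method above is the profiler's `get_mm_max_tokens`, and that `profiler` and `info` are an already-constructed `MultiModalProfiler` and its `BaseProcessingInfo`), the pre-computed result should round-trip without any dummy inputs being built:

```python
# Hypothetical check; `profiler` and `info` are assumed to exist already.
seq_len = 32768
mm_counts = {"image": 1, "video": 1}

precomputed = info.get_max_tokens_per_item(seq_len, mm_counts)
if precomputed is not None:
    # The fast path should return the model's pre-computed maxima
    # verbatim, with no dummy images/videos generated along the way.
    assert profiler.get_mm_max_tokens(seq_len, mm_counts) == precomputed
```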