diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index a658d97cc8c5..da24cb334a49 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -125,8 +125,20 @@ class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False): { "image_url": "https://example.com/image.jpg" } + + With UUID for caching optimization: + { + "image_url": "https://example.com/image.jpg", + "uuid": "abcde" + } + + UUID-only cache reference (empty content): + { + "uuid": "abcde" + } """ - image_url: Required[str] + image_url: str + uuid: str class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False): @@ -136,19 +148,33 @@ class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False): { "audio_url": "https://example.com/audio.mp3" } + + With UUID for caching optimization: + { + "audio_url": "https://example.com/audio.mp3", + "uuid": "abcde" + } """ - audio_url: Required[str] + audio_url: str + uuid: str class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False): - """A simpler version of the param that only accepts a plain audio_url. + """A simpler version of the param that only accepts a plain video_url. Example: { "video_url": "https://example.com/video.mp4" } + + With UUID for caching optimization: + { + "video_url": "https://example.com/video.mp4", + "uuid": "abcde" + } """ - video_url: Required[str] + video_url: str + uuid: str class CustomThinkCompletionContentParam(TypedDict, total=False): @@ -572,12 +598,29 @@ def create_parser(self) -> "BaseMultiModalContentParser": class MultiModalItemTracker(BaseMultiModalItemTracker[object]): + def __init__(self, model_config: ModelConfig, tokenizer: AnyTokenizer): + super().__init__(model_config, tokenizer) + # Track UUIDs separately for cache optimization + self._uuids_by_modality = defaultdict[str, list[str]](list) + + def add_uuid(self, modality: str, uuid: str) -> None: + """Add a UUID for the given modality.""" + self._uuids_by_modality[modality].append(uuid) def all_mm_data(self) -> Optional[MultiModalDataDict]: - if not self._items_by_modality: + if not self._items_by_modality and not self._uuids_by_modality: return None mm_inputs = {} items_by_modality = dict(self._items_by_modality) + + # Add UUID fields to mm_inputs for cache optimization + if "image" in self._uuids_by_modality: + mm_inputs["image_uuids"] = self._uuids_by_modality["image"] + if "audio" in self._uuids_by_modality: + mm_inputs["audio_uuids"] = self._uuids_by_modality["audio"] + if "video" in self._uuids_by_modality: + mm_inputs["video_uuids"] = self._uuids_by_modality["video"] + if "image" in items_by_modality and "image_embeds" in items_by_modality: raise ValueError(\ "Mixing raw image and embedding inputs is not allowed") @@ -601,15 +644,31 @@ def create_parser(self) -> "BaseMultiModalContentParser": class AsyncMultiModalItemTracker(BaseMultiModalItemTracker[Awaitable[object]]): + def __init__(self, model_config: ModelConfig, tokenizer: AnyTokenizer): + super().__init__(model_config, tokenizer) + # Track UUIDs separately for cache optimization + self._uuids_by_modality = defaultdict[str, list[str]](list) + + def add_uuid(self, modality: str, uuid: str) -> None: + """Add a UUID for the given modality.""" + self._uuids_by_modality[modality].append(uuid) async def all_mm_data(self) -> Optional[MultiModalDataDict]: - if not self._items_by_modality: + if not self._items_by_modality and not self._uuids_by_modality: return None mm_inputs = {} items_by_modality = { modality: await asyncio.gather(*items) for modality, items in self._items_by_modality.items() } + + # Add UUID fields to mm_inputs for cache optimization + if "image" in self._uuids_by_modality: + mm_inputs["image_uuids"] = self._uuids_by_modality["image"] + if "audio" in self._uuids_by_modality: + mm_inputs["audio_uuids"] = self._uuids_by_modality["audio"] + if "video" in self._uuids_by_modality: + mm_inputs["video_uuids"] = self._uuids_by_modality["video"] if "image" in items_by_modality and "image_embeds" in items_by_modality: raise ValueError( @@ -656,7 +715,7 @@ def mm_placeholder_storage(self) -> dict[str, list]: return dict(self._placeholder_storage) @abstractmethod - def parse_image(self, image_url: str) -> None: + def parse_image(self, image_url: Union[str, dict[str, str]]) -> None: raise NotImplementedError @abstractmethod @@ -669,7 +728,7 @@ def parse_image_pil(self, image_pil: Image.Image) -> None: raise NotImplementedError @abstractmethod - def parse_audio(self, audio_url: str) -> None: + def parse_audio(self, audio_url: Union[str, dict[str, str]]) -> None: raise NotImplementedError @abstractmethod @@ -677,7 +736,7 @@ def parse_input_audio(self, input_audio: InputAudio) -> None: raise NotImplementedError @abstractmethod - def parse_video(self, video_url: str) -> None: + def parse_video(self, video_url: Union[str, dict[str, str]]) -> None: raise NotImplementedError @@ -693,8 +752,24 @@ def __init__(self, tracker: MultiModalItemTracker) -> None: allowed_local_media_path=tracker.allowed_local_media_path, ) - def parse_image(self, image_url: str) -> None: - image = self._connector.fetch_image(image_url) + def parse_image(self, image_url: Union[str, dict[str, str]]) -> None: + if isinstance(image_url, dict): + # Handle UUID for cache optimization + if "uuid" in image_url: + self._tracker.add_uuid("image", image_url["uuid"]) + # Skip fetching if only UUID is provided (cache-only reference) + if "url" not in image_url or not image_url["url"]: + placeholder = self._tracker.add("image", None) # Placeholder for UUID-only + self._add_placeholder("image", placeholder) + return + url = image_url.get("url", "") + if url: + image = self._connector.fetch_image(url) + else: + # UUID-only reference, use None as placeholder + image = None + else: + image = self._connector.fetch_image(image_url) placeholder = self._tracker.add("image", image) self._add_placeholder("image", placeholder) @@ -718,8 +793,24 @@ def parse_image_pil(self, image_pil: Image.Image) -> None: placeholder = self._tracker.add("image", image_pil) self._add_placeholder("image", placeholder) - def parse_audio(self, audio_url: str) -> None: - audio = self._connector.fetch_audio(audio_url) + def parse_audio(self, audio_url: Union[str, dict[str, str]]) -> None: + if isinstance(audio_url, dict): + # Handle UUID for cache optimization + if "uuid" in audio_url: + self._tracker.add_uuid("audio", audio_url["uuid"]) + # Skip fetching if only UUID is provided (cache-only reference) + if "url" not in audio_url or not audio_url["url"]: + placeholder = self._tracker.add("audio", None) # Placeholder for UUID-only + self._add_placeholder("audio", placeholder) + return + url = audio_url.get("url", "") + if url: + audio = self._connector.fetch_audio(url) + else: + # UUID-only reference, use None as placeholder + audio = None + else: + audio = self._connector.fetch_audio(audio_url) placeholder = self._tracker.add("audio", audio) self._add_placeholder("audio", placeholder) @@ -731,8 +822,24 @@ def parse_input_audio(self, input_audio: InputAudio) -> None: return self.parse_audio(audio_url) - def parse_video(self, video_url: str) -> None: - video = self._connector.fetch_video(video_url=video_url) + def parse_video(self, video_url: Union[str, dict[str, str]]) -> None: + if isinstance(video_url, dict): + # Handle UUID for cache optimization + if "uuid" in video_url: + self._tracker.add_uuid("video", video_url["uuid"]) + # Skip fetching if only UUID is provided (cache-only reference) + if "url" not in video_url or not video_url["url"]: + placeholder = self._tracker.add("video", None) # Placeholder for UUID-only + self._add_placeholder("video", placeholder) + return + url = video_url.get("url", "") + if url: + video = self._connector.fetch_video(video_url=url) + else: + # UUID-only reference, use None as placeholder + video = None + else: + video = self._connector.fetch_video(video_url=video_url) placeholder = self._tracker.add("video", video) self._add_placeholder("video", placeholder) @@ -749,8 +856,28 @@ def __init__(self, tracker: AsyncMultiModalItemTracker) -> None: allowed_local_media_path=tracker.allowed_local_media_path ) - def parse_image(self, image_url: str) -> None: - image_coro = self._connector.fetch_image_async(image_url) + def parse_image(self, image_url: Union[str, dict[str, str]]) -> None: + if isinstance(image_url, dict): + # Handle UUID for cache optimization + if "uuid" in image_url: + self._tracker.add_uuid("image", image_url["uuid"]) + # Skip fetching if only UUID is provided (cache-only reference) + if "url" not in image_url or not image_url["url"]: + future: asyncio.Future[None] = asyncio.Future() + future.set_result(None) + placeholder = self._tracker.add("image", future) # Placeholder for UUID-only + self._add_placeholder("image", placeholder) + return + url = image_url.get("url", "") + if url: + image_coro = self._connector.fetch_image_async(url) + else: + # UUID-only reference, use None as placeholder + future: asyncio.Future[None] = asyncio.Future() + future.set_result(None) + image_coro = future + else: + image_coro = self._connector.fetch_image_async(image_url) placeholder = self._tracker.add("image", image_coro) self._add_placeholder("image", placeholder) @@ -781,8 +908,28 @@ def parse_image_pil(self, image_pil: Image.Image) -> None: placeholder = self._tracker.add("image", future) self._add_placeholder("image", placeholder) - def parse_audio(self, audio_url: str) -> None: - audio_coro = self._connector.fetch_audio_async(audio_url) + def parse_audio(self, audio_url: Union[str, dict[str, str]]) -> None: + if isinstance(audio_url, dict): + # Handle UUID for cache optimization + if "uuid" in audio_url: + self._tracker.add_uuid("audio", audio_url["uuid"]) + # Skip fetching if only UUID is provided (cache-only reference) + if "url" not in audio_url or not audio_url["url"]: + future: asyncio.Future[None] = asyncio.Future() + future.set_result(None) + placeholder = self._tracker.add("audio", future) # Placeholder for UUID-only + self._add_placeholder("audio", placeholder) + return + url = audio_url.get("url", "") + if url: + audio_coro = self._connector.fetch_audio_async(url) + else: + # UUID-only reference, use None as placeholder + future: asyncio.Future[None] = asyncio.Future() + future.set_result(None) + audio_coro = future + else: + audio_coro = self._connector.fetch_audio_async(audio_url) placeholder = self._tracker.add("audio", audio_coro) self._add_placeholder("audio", placeholder) @@ -794,8 +941,28 @@ def parse_input_audio(self, input_audio: InputAudio) -> None: return self.parse_audio(audio_url) - def parse_video(self, video_url: str) -> None: - video = self._connector.fetch_video_async(video_url=video_url) + def parse_video(self, video_url: Union[str, dict[str, str]]) -> None: + if isinstance(video_url, dict): + # Handle UUID for cache optimization + if "uuid" in video_url: + self._tracker.add_uuid("video", video_url["uuid"]) + # Skip fetching if only UUID is provided (cache-only reference) + if "url" not in video_url or not video_url["url"]: + future: asyncio.Future[None] = asyncio.Future() + future.set_result(None) + placeholder = self._tracker.add("video", future) # Placeholder for UUID-only + self._add_placeholder("video", placeholder) + return + url = video_url.get("url", "") + if url: + video = self._connector.fetch_video_async(video_url=url) + else: + # UUID-only reference, use None as placeholder + future: asyncio.Future[None] = asyncio.Future() + future.set_result(None) + video = future + else: + video = self._connector.fetch_video_async(video_url=video_url) placeholder = self._tracker.add("video", video) self._add_placeholder("video", placeholder) @@ -945,6 +1112,34 @@ def _get_full_multimodal_text_prompt(placeholder_storage: dict[str, list], ResponseInputImageParam).validate_python _ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio, PILImage] + +def _parse_image_url_with_uuid(parsed_part: dict[str, Any]) -> dict[str, str]: + """Extract both URL and UUID from image_url part.""" + image_url_data = parsed_part.get("image_url", {}) + result = {"url": image_url_data.get("url", "")} + if "uuid" in parsed_part: + result["uuid"] = parsed_part["uuid"] + return result + + +def _parse_audio_url_with_uuid(parsed_part: dict[str, Any]) -> dict[str, str]: + """Extract both URL and UUID from audio_url part.""" + audio_url_data = parsed_part.get("audio_url", {}) + result = {"url": audio_url_data.get("url", "")} + if "uuid" in parsed_part: + result["uuid"] = parsed_part["uuid"] + return result + + +def _parse_video_url_with_uuid(parsed_part: dict[str, Any]) -> dict[str, str]: + """Extract both URL and UUID from video_url part.""" + video_url_data = parsed_part.get("video_url", {}) + result = {"url": video_url_data.get("url", "")} + if "uuid" in parsed_part: + result["uuid"] = parsed_part["uuid"] + return result + + # Define a mapping from part types to their corresponding parsing functions. MM_PARSER_MAP: dict[ str, @@ -959,18 +1154,18 @@ def _get_full_multimodal_text_prompt(placeholder_storage: dict[str, list], "input_image": lambda part: _ResponsesInputImageParser(part).get("image_url", None), "image_url": - lambda part: _ImageParser(part).get("image_url", {}).get("url", None), + lambda part: _parse_image_url_with_uuid(_ImageParser(part)), "image_embeds": lambda part: _ImageEmbedsParser(part).get("image_embeds", None), "image_pil": lambda part: _PILImageParser(part).get("image_pil", None), "audio_url": - lambda part: _AudioParser(part).get("audio_url", {}).get("url", None), + lambda part: _parse_audio_url_with_uuid(_AudioParser(part)), "input_audio": lambda part: _InputAudioParser(part).get("input_audio", None), "refusal": lambda part: _RefusalParser(part).get("refusal", None), "video_url": - lambda part: _VideoParser(part).get("video_url", {}).get("url", None), + lambda part: _parse_video_url_with_uuid(_VideoParser(part)), } @@ -1008,21 +1203,47 @@ def _parse_chat_message_content_mm_part( # Handle missing 'type' but provided direct URL fields. # 'type' is required field by pydantic if part_type is None: + # Handle UUID-only references first + if part.get("uuid") is not None and len(part) == 1: + # Pure UUID reference without URL - need to infer type from context + # Default to image_url for now, but this should ideally be specified + return "image_url", {"uuid": part["uuid"]} + if part.get("image_url") is not None: image_params = cast(CustomChatCompletionContentSimpleImageParam, part) - return "image_url", image_params.get("image_url", "") + result = {"url": image_params.get("image_url", "")} + if "uuid" in image_params: + result["uuid"] = image_params["uuid"] + return "image_url", result if part.get("audio_url") is not None: audio_params = cast(CustomChatCompletionContentSimpleAudioParam, part) - return "audio_url", audio_params.get("audio_url", "") + result = {"url": audio_params.get("audio_url", "")} + if "uuid" in audio_params: + result["uuid"] = audio_params["uuid"] + return "audio_url", result if part.get("input_audio") is not None: input_audio_params = cast(dict[str, str], part) return "input_audio", input_audio_params if part.get("video_url") is not None: video_params = cast(CustomChatCompletionContentSimpleVideoParam, part) - return "video_url", video_params.get("video_url", "") + result = {"url": video_params.get("video_url", "")} + if "uuid" in video_params: + result["uuid"] = video_params["uuid"] + return "video_url", result + # Handle UUID-only references for specific modalities + if "uuid" in part: + # Check for modality-specific UUID references + if "image_uuid" in part or part.get("type") == "image": + return "image_url", {"uuid": part["uuid"]} + elif "audio_uuid" in part or part.get("type") == "audio": + return "audio_url", {"uuid": part["uuid"]} + elif "video_uuid" in part or part.get("type") == "video": + return "video_url", {"uuid": part["uuid"]} + # Default to image for generic UUID + return "image_url", {"uuid": part["uuid"]} # Raise an error if no 'type' or direct URL is found. raise ValueError("Missing 'type' field in multimodal part.") @@ -1113,24 +1334,24 @@ def _parse_chat_message_content_part( mm_parser.parse_image_pil(image_content) modality = "image" elif part_type in ("image_url", "input_image"): - str_content = cast(str, content) - mm_parser.parse_image(str_content) + # content can now be str or dict[str, str] (with UUID) + mm_parser.parse_image(content) modality = "image" elif part_type == "image_embeds": content = cast(Union[str, dict[str, str]], content) mm_parser.parse_image_embeds(content) modality = "image" elif part_type == "audio_url": - str_content = cast(str, content) - mm_parser.parse_audio(str_content) + # content can now be str or dict[str, str] (with UUID) + mm_parser.parse_audio(content) modality = "audio" elif part_type == "input_audio": dict_content = cast(InputAudio, content) mm_parser.parse_input_audio(dict_content) modality = "audio" elif part_type == "video_url": - str_content = cast(str, content) - mm_parser.parse_video(str_content) + # content can now be str or dict[str, str] (with UUID) + mm_parser.parse_video(content) modality = "video" else: raise NotImplementedError(f"Unknown part type: {part_type}") diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py index ac27bb66f7b5..5082f45bd67f 100644 --- a/vllm/multimodal/hasher.py +++ b/vllm/multimodal/hasher.py @@ -23,6 +23,45 @@ class MultiModalHasher: + @classmethod + def get_cache_keys_from_mm_data(cls, mm_data: Mapping[str, object]) -> list[str]: + """Extract cache keys from multimodal data, preferring UUIDs.""" + cache_keys = [] + + # Check for UUID fields first + for modality in ['image', 'video', 'audio']: + uuid_key = f"{modality}_uuids" + if uuid_key in mm_data: + uuids = mm_data[uuid_key] + if isinstance(uuids, list): + cache_keys.extend(uuids) + else: + cache_keys.append(uuids) + + # If no UUIDs found, fall back to URL-optimized hashing + if not cache_keys: + return [cls._hash_with_url_optimization(**mm_data)] + + return cache_keys + + @classmethod + def _hash_with_url_optimization(cls, **kwargs: object) -> str: + """ + Hash multimodal data with URL-only optimization when possible. + For URL-based inputs, hash only the URL instead of full content. + """ + optimized_kwargs: dict[str, object] = {} + + for key, value in kwargs.items(): + # For URL-based inputs, hash only the URL + if key.endswith('_url') and isinstance(value, str): + optimized_kwargs[key] = value + # For other inputs, use full content hashing + else: + optimized_kwargs[key] = value + + return cls.hash_kwargs(**optimized_kwargs) + @classmethod def serialize_item(cls, obj: object) -> Union[bytes, memoryview]: # Simple cases diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 18aae35c6fd4..37a0723d9d49 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -105,6 +105,16 @@ class MultiModalDataBuiltins(TypedDict, total=False): audio: ModalityData[AudioItem] """The input audio(s).""" + + # UUID fields for caching optimization + image_uuids: ModalityData[str] + """User-provided UUIDs for image cache optimization.""" + + video_uuids: ModalityData[str] + """User-provided UUIDs for video cache optimization.""" + + audio_uuids: ModalityData[str] + """User-provided UUIDs for audio cache optimization.""" MultiModalDataDict: TypeAlias = Mapping[str, ModalityData[Any]] diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 46240855d12a..c34b8912181d 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1021,6 +1021,28 @@ def get_item( value=self._cache.get(cache_key), ) + def get_by_key(self, cache_key: str) -> Optional[MultiModalKwargsItem]: + """ + Get a processed multi-modal item from the cache using a direct cache key. + + This method is used for UUID-based cache lookups where the cache key + is already known (e.g., when using UUIDs as direct cache keys). + + Args: + cache_key: The direct cache key to lookup + + Returns: + The cached item if found, None otherwise + """ + self._maybe_log_cache_stats() + + if self.debug_cache_hit_ratio_steps: + if cache_key in self._cache: + self.debug_cache_hits += 1 + self.debug_cache_total += 1 + + return self._cache.get(cache_key) + def put( self, model_id: str, @@ -1220,6 +1242,10 @@ def _to_mm_items( before passing them to [`_get_hf_mm_data`][vllm.multimodal.processing.BaseMultiModalProcessor._get_hf_mm_data]. """ + # Handle UUID-only references (empty content with UUIDs) + if self._is_uuid_only_reference(mm_data): + return self._create_uuid_placeholder_items(mm_data) + mm_items = self.data_parser.parse_mm_data(mm_data) for modality, items in mm_items.items(): @@ -1227,6 +1253,19 @@ def _to_mm_items( return mm_items + def _is_uuid_only_reference(self, mm_data: MultiModalDataDict) -> bool: + """Check if this is a UUID-only cache reference with no actual content.""" + has_uuids = any(key.endswith('_uuids') for key in mm_data) + has_content = any(key in ['image', 'video', 'audio'] for key in mm_data) + return has_uuids and not has_content + + def _create_uuid_placeholder_items(self, mm_data: MultiModalDataDict) -> MultiModalDataItems: + """Create placeholder items for UUID-only references.""" + from .parse import MultiModalDataItems + # For UUID-only references, return empty items + # The actual processing will be handled by the UUID cache lookup + return MultiModalDataItems() + @abstractmethod def _get_mm_fields_config( self, @@ -1823,6 +1862,29 @@ def _maybe_apply_prompt_updates( return prompt_ids, prompt, mm_placeholders + def _has_uuid_fields(self, mm_data: MultiModalDataDict) -> bool: + """Check if multimodal data contains any UUID fields.""" + return any(key.endswith('_uuids') for key in mm_data) + + def _create_uuid_hash_dict( + self, + mm_data: MultiModalDataDict + ) -> Mapping[str, list[str]]: + """Create a hash dictionary from UUIDs in multimodal data.""" + hash_dict = {} + + for modality in ['image', 'video', 'audio']: + uuid_key = f"{modality}_uuids" + if uuid_key in mm_data: + uuids = mm_data[uuid_key] + if isinstance(uuids, list): + hash_dict[modality] = uuids + else: + hash_dict[modality] = [uuids] + + return hash_dict + + def apply( self, prompt: Union[str, list[int]], @@ -1844,23 +1906,53 @@ def apply( 3. Extract information about the placeholder tokens from the processed token IDs. """ + # NEW: UUID-based optimization - bypass expensive MediaConnector and content hashing + # but still do proper prompt processing for placeholder insertion + uuid_cache_kwargs = None + if self.cache is not None and return_mm_hashes and self._has_uuid_fields(mm_data): + cache_keys = MultiModalHasher.get_cache_keys_from_mm_data(mm_data) + if cache_keys: + cache_hits = [self.cache.get_by_key(key) for key in cache_keys] + if all(hit is not None for hit in cache_hits): + # Full cache hit - we can reconstruct mm_kwargs directly + try: + non_null_hits = cast(list[MultiModalKwargsItem], cache_hits) + uuid_cache_kwargs = MultiModalKwargs.from_items(non_null_hits) + except (ValueError, KeyError): + # Race condition: cache evicted between check and reconstruction + pass + mm_items = self._to_mm_items(mm_data) if tokenization_kwargs is None: tokenization_kwargs = {} - ( - prompt_ids, - mm_kwargs, - mm_hashes, - is_update_applied, - ) = self._cached_apply_hf_processor( - prompt, - mm_items, - hf_processor_mm_kwargs, - tokenization_kwargs=tokenization_kwargs, - return_mm_hashes=return_mm_hashes, - ) + # Use cached kwargs if available, otherwise process normally + if uuid_cache_kwargs is not None: + # UUID cache hit - skip HF processing but do prompt tokenization + if isinstance(prompt, str): + tokenizer = self.info.get_tokenizer() + prompt_ids = encode_tokens(tokenizer, prompt, add_special_tokens=False) + else: + prompt_ids = prompt + mm_kwargs = uuid_cache_kwargs + # Convert UUIDs to proper MultiModalHashDict format + mm_hashes = self._create_uuid_hash_dict(mm_data) if return_mm_hashes else None + is_update_applied = False # Will need to apply prompt updates + else: + # Normal processing path + ( + prompt_ids, + mm_kwargs, + mm_hashes, + is_update_applied, + ) = self._cached_apply_hf_processor( + prompt, + mm_items, + hf_processor_mm_kwargs, + tokenization_kwargs=tokenization_kwargs, + return_mm_hashes=return_mm_hashes, + ) # NOTE: tokenization_kwargs are not required to init processor prompt_ids, prompt, mm_placeholders = self._maybe_apply_prompt_updates( diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 8dfbc6503520..c468901ad239 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -287,6 +287,7 @@ def fetch_image_embedding( return image_embedding_io.load_base64("", data) + def encode_audio_base64( audio: np.ndarray, sampling_rate: float,