[Frontend] Multithreaded async multimodal load_bytes (#22710)

milesial · web-flow · commit 20d65aa75548 · 2025-08-13T06:09:26.000-07:00
Signed-off-by: Alexandre Milesi &lt;30204471+milesial@users.noreply.github.com&gt;
Co-authored-by: Alexandre Milesi &lt;30204471+milesial@users.noreply.github.com&gt;
diff --git a/vllm/envs.py b/vllm/envs.py
@@ -63,6 +63,7 @@
     VLLM_IMAGE_FETCH_TIMEOUT: int = 5
     VLLM_VIDEO_FETCH_TIMEOUT: int = 30
     VLLM_AUDIO_FETCH_TIMEOUT: int = 10
+    VLLM_MEDIA_LOADING_THREAD_COUNT: int = 8
     VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25
     VLLM_VIDEO_LOADER_BACKEND: str = "opencv"
     VLLM_MM_INPUT_CACHE_GIB: int = 4
@@ -555,6 +556,12 @@ def get_vllm_port() -> Optional[int]:
     "VLLM_AUDIO_FETCH_TIMEOUT":
     lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")),
 
+    # Max number of workers for the thread pool handling
+    # media bytes loading. Set to 1 to disable parallel processing.
+    # Default is 8
+    "VLLM_MEDIA_LOADING_THREAD_COUNT":
+    lambda: int(os.getenv("VLLM_MEDIA_LOADING_THREAD_COUNT", "8")),
+
     # Maximum filesize in MB for a single audio file when processing
     # speech-to-text requests. Files larger than this will be rejected.
     # Default is 25 MB
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
@@ -1,6 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import asyncio
+import atexit
+from concurrent.futures import ThreadPoolExecutor
 from itertools import groupby
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
@@ -33,6 +36,10 @@
     MultiModalKwargs = Any
     MultiModalPlaceholderDict = Any
 
+global_thread_pool = ThreadPoolExecutor(
+    max_workers=envs.VLLM_MEDIA_LOADING_THREAD_COUNT)
+atexit.register(global_thread_pool.shutdown)
+
 
 class MediaConnector:
 
@@ -139,19 +146,26 @@ async def load_from_url_async(
         fetch_timeout: Optional[int] = None,
     ) -> _M:
         url_spec = urlparse(url)
+        loop = asyncio.get_running_loop()
 
         if url_spec.scheme.startswith("http"):
             connection = self.connection
             data = await connection.async_get_bytes(url, timeout=fetch_timeout)
-
-            return media_io.load_bytes(data)
+            future = loop.run_in_executor(global_thread_pool,
+                                          media_io.load_bytes, data)
+            return await future
 
         if url_spec.scheme == "data":
-            return self._load_data_url(url_spec, media_io)
+            future = loop.run_in_executor(global_thread_pool,
+                                          self._load_data_url, url_spec,
+                                          media_io)
+            return await future
 
         if url_spec.scheme == "file":
-            return self._load_file_url(url_spec, media_io)
-
+            future = loop.run_in_executor(global_thread_pool,
+                                          self._load_file_url, url_spec,
+                                          media_io)
+            return await future
         msg = "The URL must be either a HTTP, data or file URL."
         raise ValueError(msg)
 
@@ -489,4 +503,4 @@ def fetch_video(
         "video": video_io_kwargs
     }
     media_connector = MediaConnector(media_io_kwargs=media_io_kwargs)
-    return media_connector.fetch_video(video_url)
+    return media_connector.fetch_video(video_url)