vllm-project · vllm-bot · May 13, 2025 · May 13, 2025 · DarkLight1337 · May 13, 2025
@@ -150,10 +150,6 @@ async def build_async_engine_client(
 
     async with build_async_engine_client_from_engine_args(
             engine_args, args.disable_frontend_multiprocessing) as engine:
-
-        # Don't keep the dummy data in memory
-        await engine.reset_mm_cache()
-
         yield engine
 
 
@@ -189,6 +185,10 @@ async def build_async_engine_client_from_engine_args(
                 usage_context=usage_context,
                 disable_log_requests=engine_args.disable_log_requests,
                 disable_log_stats=engine_args.disable_log_stats)
+
+            # Reset the multi-modal cache so dummy data isn't kept in memory
+            await async_llm.reset_mm_cache()
+
             yield async_llm
         finally:
             if async_llm:

@@ -289,7 +289,7 @@ def profile(self, is_start: bool = True):
     def reset_mm_cache(self):
         # NOTE: Since this is mainly for debugging, we don't attempt to
         # re-sync the internal caches (P0 processor, P0 mirror, P1 mirror)
-        if self.scheduler.get_num_unfinished_requests():
+        if self.scheduler.has_unfinished_requests():
             logger.warning("Resetting the multi-modal cache when requests are "
                            "in progress may lead to desynced internal caches.")