[Bugfix] Fix entrypoints metrics tests (vllm-project#18063)

DarkLight1337 · Yuqi Zhang · commit 58beb6630509 · 2025-05-24T08:03:00.000Z
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Yuqi Zhang <yuqizhang@google.com>
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
@@ -150,10 +150,6 @@ async def build_async_engine_client(
 
     async with build_async_engine_client_from_engine_args(
             engine_args, args.disable_frontend_multiprocessing) as engine:
-
-        # Don't keep the dummy data in memory
-        await engine.reset_mm_cache()
-
         yield engine
 
 
@@ -189,6 +185,10 @@ async def build_async_engine_client_from_engine_args(
                 usage_context=usage_context,
                 disable_log_requests=engine_args.disable_log_requests,
                 disable_log_stats=engine_args.disable_log_stats)
+
+            # Don't keep the dummy data in memory
+            await async_llm.reset_mm_cache()
+
             yield async_llm
         finally:
             if async_llm:
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
@@ -289,7 +289,7 @@ def profile(self, is_start: bool = True):
     def reset_mm_cache(self):
         # NOTE: Since this is mainly for debugging, we don't attempt to
         # re-sync the internal caches (P0 processor, P0 mirror, P1 mirror)
-        if self.scheduler.get_num_unfinished_requests():
+        if self.scheduler.has_unfinished_requests():
             logger.warning("Resetting the multi-modal cache when requests are "
                            "in progress may lead to desynced internal caches.")