fix MM LRU caching

vadiklyutiy · vadiklyutiy · commit dd56058d4266 · 2025-04-14T14:53:23.000Z
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@centml.ai>
diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py
@@ -51,6 +51,13 @@ def get_and_update_p0(
         full_mm_inputs = list[Optional[MultiModalKwargs]]()
         for mm_input, mm_hash in zip(mm_inputs, mm_hashes):
             if mm_hash in self.mm_cache:
+                # The client and server caches must stay exactly in sync (see
+                # the description at the top of this file).
+                # By design, the `in` check above does not update the access
+                # time, but the server side accesses the entry directly, which
+                # does update it. Make a dummy access here so that the LRU
+                # order of the two caches stays consistent.
+                _ = self.mm_cache[mm_hash]
                 mm_input = None
             else:
                 self.mm_cache[mm_hash] = mm_input