76 changes: 76 additions & 0 deletions tests/entrypoints/llm/test_mm_cache_stats.py
@@ -0,0 +1,76 @@
# SPDX-License-Identifier: Apache-2.0

import pytest

from vllm import LLM
from vllm.engine.llm_engine import LLMEngine as V0LLMEngine
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine

from ..openai.test_vision import TEST_IMAGE_URLS


def _make_messages(image_url: str) -> list[ChatCompletionMessageParam]:
return [{
"role":
"user",
"content": [
{
"type": "image_url",
"image_url": {
"url": image_url
},
},
],
}]


@pytest.mark.parametrize("image_urls",
[[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
@pytest.mark.parametrize("use_v1", [True, False])
def test_mm_cache_stats(
image_urls: list[str],
use_v1: bool,
monkeypatch: pytest.MonkeyPatch,
):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")

llm = LLM(
model="HuggingFaceTB/SmolVLM-256M-Instruct",
max_model_len=4096,
max_num_seqs=5,
enforce_eager=True,
limit_mm_per_prompt={"image": 2},
)
engine = llm.llm_engine
if isinstance(engine, V0LLMEngine):
mm_registry = engine.input_preprocessor.mm_registry
elif isinstance(engine, V1LLMEngine):
mm_registry = engine.processor.mm_registry

# In case the previous test failed, we still need to reset the cache
# (which is shared across tests)
engine.reset_mm_cache()
mm_registry.make_processor_cache_stats()

llm.chat(_make_messages(image_urls[0]))

cache_stats = mm_registry.make_processor_cache_stats()
assert cache_stats.size_items == 1

llm.chat(_make_messages(image_urls[1]))

cache_stats = mm_registry.make_processor_cache_stats()
assert cache_stats.size_items == 2

llm.chat(_make_messages(image_urls[0]))

cache_stats = mm_registry.make_processor_cache_stats()
assert cache_stats.size_items == 2

engine.reset_mm_cache()

cache_stats = mm_registry.make_processor_cache_stats()
assert cache_stats.size_items == 0
assert cache_stats.reset is True
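
The flow exercised by this test can also be reproduced outside pytest. A minimal sketch, using only calls that appear elsewhere in this PR (the V1-engine attribute path and the image URL are taken from other files in the diff; that the V1 engine is selected by default in your build is an assumption):

# Minimal sketch of the processor-cache stats flow exercised by the test above.
# Uses only calls that appear in this PR: LLM.chat(), LLMEngine.reset_mm_cache()
# and mm_registry.make_processor_cache_stats(). Assumes the V1 engine, whose
# registry lives at engine.processor.mm_registry (see the elif branch above).
from vllm import LLM

llm = LLM(
    model="HuggingFaceTB/SmolVLM-256M-Instruct",
    max_model_len=4096,
    enforce_eager=True,
    limit_mm_per_prompt={"image": 2},
)
engine = llm.llm_engine
mm_registry = engine.processor.mm_registry

image_url = ("https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/"
             "Gfp-wisconsin-madison-the-nature-boardwalk.jpg/"
             "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg")
llm.chat([{
    "role": "user",
    "content": [{
        "type": "image_url",
        "image_url": {"url": image_url},
    }],
}])

stats = mm_registry.make_processor_cache_stats()
print(stats.size_items, stats.usage, stats.queries, stats.hits)

engine.reset_mm_cache()
assert mm_registry.make_processor_cache_stats().reset is True
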
163 changes: 113 additions & 50 deletions tests/entrypoints/openai/test_metrics.py
@@ -17,10 +17,18 @@

from ...utils import RemoteOpenAIServer

MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
MODELS = {
"text": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"multimodal": "HuggingFaceTB/SmolVLM-256M-Instruct",
}
PREV_MINOR_VERSION = version._prev_minor_version()


@pytest.fixture(scope="module", params=list(MODELS.keys()))
def model_key(request):
yield request.param


@pytest.fixture(scope="module", params=[True, False])
def use_v1(request):
# Module-scoped variant of run_with_both_engines
@@ -60,11 +68,13 @@ def default_server_args():
"--disable-frontend-multiprocessing",
f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}",
])
def server(use_v1, default_server_args, request):
def server(model_key, use_v1, default_server_args, request):
if request.param:
default_server_args.append(request.param)

model_name = MODELS[model_key]
env_dict = dict(VLLM_USE_V1='1' if use_v1 else '0')
with RemoteOpenAIServer(MODEL_NAME, default_server_args,
with RemoteOpenAIServer(model_name, default_server_args,
env_dict=env_dict) as remote_server:
yield remote_server

@@ -76,63 +86,74 @@ async def client(server):


_PROMPT = "Hello my name is Robert and I love magic"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
_TOKENIZED_PROMPT = tokenizer(_PROMPT)["input_ids"]
_IMAGE_URL = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"

_NUM_REQUESTS = 10
_NUM_PROMPT_TOKENS_PER_REQUEST = len(_TOKENIZED_PROMPT)
_NUM_GENERATION_TOKENS_PER_REQUEST = 10

# {metric_family: [(suffix, expected_value)]}
EXPECTED_VALUES = {
"vllm:time_to_first_token_seconds": [("_count", _NUM_REQUESTS)],
"vllm:time_per_output_token_seconds":
[("_count", _NUM_REQUESTS * (_NUM_GENERATION_TOKENS_PER_REQUEST - 1))],
"vllm:e2e_request_latency_seconds": [("_count", _NUM_REQUESTS)],
"vllm:request_queue_time_seconds": [("_count", _NUM_REQUESTS)],
"vllm:request_inference_time_seconds": [("_count", _NUM_REQUESTS)],
"vllm:request_prefill_time_seconds": [("_count", _NUM_REQUESTS)],
"vllm:request_decode_time_seconds": [("_count", _NUM_REQUESTS)],
"vllm:request_prompt_tokens":
[("_sum", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST),
("_count", _NUM_REQUESTS)],
"vllm:request_generation_tokens":
[("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
("_count", _NUM_REQUESTS)],
"vllm:request_params_n": [("_count", _NUM_REQUESTS)],
"vllm:request_params_max_tokens": [
("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
("_count", _NUM_REQUESTS)
],
"vllm:iteration_tokens_total":
[("_sum", _NUM_REQUESTS *
(_NUM_PROMPT_TOKENS_PER_REQUEST + _NUM_GENERATION_TOKENS_PER_REQUEST)),
("_count", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST)],
"vllm:prompt_tokens": [("_total",
_NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
"vllm:generation_tokens": [
("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)
],
"vllm:request_success": [("_total", _NUM_REQUESTS)],
}

def _get_expected_values(prompt_ids: list[int]):
num_prompt_tokens = len(prompt_ids)

# {metric_family: [(suffix, expected_value)]}
return {
"vllm:time_to_first_token_seconds": [("_count", _NUM_REQUESTS)],
"vllm:time_per_output_token_seconds":
[("_count", _NUM_REQUESTS * (_NUM_GENERATION_TOKENS_PER_REQUEST - 1))],
"vllm:e2e_request_latency_seconds": [("_count", _NUM_REQUESTS)],
"vllm:request_queue_time_seconds": [("_count", _NUM_REQUESTS)],
"vllm:request_inference_time_seconds": [("_count", _NUM_REQUESTS)],
"vllm:request_prefill_time_seconds": [("_count", _NUM_REQUESTS)],
"vllm:request_decode_time_seconds": [("_count", _NUM_REQUESTS)],
"vllm:request_prompt_tokens": [("_sum",
_NUM_REQUESTS * num_prompt_tokens),
("_count", _NUM_REQUESTS)],
"vllm:request_generation_tokens":
[("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
("_count", _NUM_REQUESTS)],
"vllm:request_params_n": [("_count", _NUM_REQUESTS)],
"vllm:request_params_max_tokens":
[("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
("_count", _NUM_REQUESTS)],
"vllm:iteration_tokens_total":
[("_sum", _NUM_REQUESTS *
(num_prompt_tokens + _NUM_GENERATION_TOKENS_PER_REQUEST)),
("_count", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST)],
"vllm:prompt_tokens": [("_total", _NUM_REQUESTS * num_prompt_tokens)],
"vllm:generation_tokens": [("_total",
_NUM_REQUESTS * num_prompt_tokens)],
"vllm:request_success": [("_total", _NUM_REQUESTS)],
}


@pytest.mark.asyncio
async def test_metrics_counts(server: RemoteOpenAIServer,
client: openai.AsyncClient, use_v1: bool):
async def test_metrics_counts(
server: RemoteOpenAIServer,
client: openai.AsyncClient,
model_key: str,
use_v1: bool,
):
if model_key == "multimodal":
pytest.skip("Unnecessary test")

model_name = MODELS[model_key]
tokenizer = AutoTokenizer.from_pretrained(model_name)
prompt_ids = tokenizer.encode(_PROMPT)

for _ in range(_NUM_REQUESTS):
# sending a request triggers the metrics to be logged.
await client.completions.create(
model=MODEL_NAME,
prompt=_TOKENIZED_PROMPT,
model=model_name,
prompt=prompt_ids,
max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST)

response = requests.get(server.url_for("metrics"))
print(response.text)
assert response.status_code == HTTPStatus.OK

# Loop over all expected metric_families
for metric_family, suffix_values_list in EXPECTED_VALUES.items():
expected_values = _get_expected_values(prompt_ids)
for metric_family, suffix_values_list in expected_values.items():
if ((use_v1 and metric_family not in EXPECTED_METRICS_V1)
or (not server.show_hidden_metrics
and metric_family in HIDDEN_DEPRECATED_METRICS)):
@@ -274,25 +295,67 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
"vllm:request_decode_time_seconds_count",
]

EXPECTED_METRICS_MM = [
"vllm:mm_cache_usage",
"vllm:mm_cache_size_G",
"vllm:mm_cache_size_items",
"vllm:mm_cache_queries",
"vllm:mm_cache_hits",
]

HIDDEN_DEPRECATED_METRICS = [
"vllm:num_requests_swapped",
"vllm:cpu_cache_usage_perc",
]


@pytest.mark.asyncio
async def test_metrics_exist(server: RemoteOpenAIServer,
client: openai.AsyncClient, use_v1: bool):
async def test_metrics_exist(
server: RemoteOpenAIServer,
client: openai.AsyncClient,
model_key: str,
use_v1: bool,
):
# sending a request triggers the metrics to be logged.
await client.completions.create(model=MODEL_NAME,
prompt="Hello, my name is",
max_tokens=5,
temperature=0.0)
model_name = MODELS[model_key]

if model_key == "text":
await client.completions.create(model=model_name,
prompt="Hello, my name is",
max_tokens=5,
temperature=0.0)
else:
messages = [{
"role":
"user",
"content": [
{
"type": "image_url",
"image_url": {
"url": _IMAGE_URL
}
},
{
"type": "text",
"text": "What's in this image?"
},
],
}]

await client.chat.completions.create(model=model_name,
messages=messages,
max_tokens=5,
temperature=0.0)

response = requests.get(server.url_for("metrics"))
assert response.status_code == HTTPStatus.OK

for metric in (EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS):
expected_metrics = EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS
if model_key == "multimodal":
# NOTE: Don't use in-place assignment
expected_metrics = expected_metrics + EXPECTED_METRICS_MM

for metric in expected_metrics:
if (not server.show_hidden_metrics
and metric not in HIDDEN_DEPRECATED_METRICS):
assert metric in response.text
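
Beyond the substring assertions above, the new multi-modal cache metrics can also be checked with a real Prometheus parser. A rough sketch, assuming a locally running server on port 8000 (the metric names are those in EXPECTED_METRICS_MM; the URL, the helper logic, and the handling of an optional "_total" suffix on counters are assumptions):

# Illustrative check of the new multi-modal cache metrics on /metrics,
# using a Prometheus parser instead of substring matching. Counters may be
# exposed with a "_total" suffix, so the check accepts either form.
import requests
from prometheus_client.parser import text_string_to_metric_families

MM_CACHE_METRICS = [
    "vllm:mm_cache_usage",
    "vllm:mm_cache_size_G",
    "vllm:mm_cache_size_items",
    "vllm:mm_cache_queries",
    "vllm:mm_cache_hits",
]

text = requests.get("http://localhost:8000/metrics").text
family_names = {family.name for family in text_string_to_metric_families(text)}

for metric in MM_CACHE_METRICS:
    present = any(name == metric or name.startswith(metric + "_")
                  for name in family_names)
    print(f"{metric}: {'present' if present else 'MISSING'}")
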
1 change: 0 additions & 1 deletion tests/metrics/test_metrics.py
@@ -288,7 +288,6 @@ def test_metric_spec_decode(
@pytest.mark.parametrize("max_tokens", [10])
@pytest.mark.parametrize("log_interval", [1, 3, 5, 7])
def test_metric_spec_decode_interval(
vllm_runner,
example_prompts,
model: str,
dtype: str,
5 changes: 2 additions & 3 deletions tests/v1/core/test_kv_cache_utils.py
@@ -12,15 +12,14 @@
# yapf: disable
from vllm.v1.core.kv_cache_utils import (NONE_HASH, BlockHashType,
FreeKVCacheBlockQueue, KVCacheBlock,
PrefixCachingMetrics,
estimate_max_model_len,
generate_block_hash_extra_keys,
hash_block_tokens,
hash_request_tokens,
unify_kv_cache_configs)
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
KVCacheGroupSpec, KVCacheTensor)
from vllm.v1.metrics.stats import PrefixCacheStats
from vllm.v1.metrics.stats import CachingMetrics, PrefixCacheStats
from vllm.v1.request import Request

# yapf: enable
@@ -351,7 +350,7 @@ def test_metrics():
def stats(requests, queries, hits):
return PrefixCacheStats(requests=requests, queries=queries, hits=hits)

metrics = PrefixCachingMetrics(max_recent_requests=5)
metrics = CachingMetrics(max_recent_requests=5)
assert metrics.hit_rate == 0.0

metrics.observe(stats(1, 20, 9))
15 changes: 15 additions & 0 deletions vllm/engine/llm_engine.py
@@ -1662,6 +1662,15 @@ def _get_stats(self,
gpu_prefix_cache_hit_rate = self.scheduler[
0].get_prefix_cache_hit_rate(Device.GPU)

# Multi-modal cache stats
mm_registry = self.input_preprocessor.mm_registry
processor_cache_stats = mm_registry.make_processor_cache_stats()
mm_cache_usage = processor_cache_stats.usage
mm_cache_size_G = processor_cache_stats.size_G
mm_cache_size_items = processor_cache_stats.size_items
mm_cache_queries = processor_cache_stats.queries
mm_cache_hits = processor_cache_stats.hits

# Iteration stats
num_prompt_tokens_iter = 0
num_generation_tokens_iter = 0
@@ -1848,6 +1857,12 @@ def _get_stats(self,
# Prefix Cache Hit Rate
cpu_prefix_cache_hit_rate=cpu_prefix_cache_hit_rate,
gpu_prefix_cache_hit_rate=gpu_prefix_cache_hit_rate,
# Multi-modal cache stats
mm_cache_usage=mm_cache_usage,
mm_cache_size_G=mm_cache_size_G,
mm_cache_size_items=mm_cache_size_items,
mm_cache_queries=mm_cache_queries,
mm_cache_hits=mm_cache_hits,

# Iteration stats
num_prompt_tokens_iter=num_prompt_tokens_iter,
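
The hunk above only plumbs raw counters into Stats; a hit rate can be derived from them in the same spirit as the prefix-cache hit rates already reported. A hedged sketch (the helper name and the zero-queries guard are illustrative; only the field names come from the diff):

# Sketch: turning the queries/hits fields added to Stats above into a hit
# rate, analogous to the prefix-cache hit rates already reported. The helper
# name and the zero-queries guard are illustrative, not part of this PR.
def mm_cache_hit_rate(mm_cache_queries: int, mm_cache_hits: int) -> float:
    if mm_cache_queries == 0:
        return 0.0
    return mm_cache_hits / mm_cache_queries


print(f"MM cache hit rate: {mm_cache_hit_rate(20, 9):.1%}")  # 45.0%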