Commit db606d9

[Metrics] Log multi-modal cache stats
Signed-off-by: DarkLight1337 <[email protected]>
1 parent 43c146c commit db606d9

File tree

6 files changed: +369 −238 lines changed


tests/entrypoints/openai/test_metrics.py

Lines changed: 115 additions & 140 deletions
@@ -18,25 +18,15 @@
 
 from ...utils import RemoteOpenAIServer
 
-MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+MODELS = {
+    "text": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "multimodal": "HuggingFaceTB/SmolVLM-256M-Instruct",
+}
 PREV_MINOR_VERSION = version._prev_minor_version()
 
 
-@pytest.fixture(scope="module", params=[True])
-def use_v1(request):
-    # Module-scoped variant of run_with_both_engines
-    #
-    # Use this fixture to run a test with both v0 and v1, and
-    # also to conditionalize the test logic e.g.
-    #
-    # def test_metrics_exist(use_v1, server, client):
-    #     ...
-    #     expected = EXPECTED_V1_METRICS if use_v1 else EXPECTED_METRICS
-    #     for metric in expected:
-    #         assert metric in response.text
-    #
-    # @skip_v1 wouldn't work here because this is a module-level
-    # fixture - per-function decorators would have no effect
+@pytest.fixture(scope="module", params=list(MODELS.keys()))
+def model_key(request):
     yield request.param
 
 
@@ -63,13 +53,12 @@ def default_server_args():
         f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}",
     ],
 )
-def server(use_v1, default_server_args, request):
+def server(model_key, default_server_args, request):
     if request.param:
         default_server_args.append(request.param)
-    env_dict = dict(VLLM_USE_V1="1" if use_v1 else "0")
-    with RemoteOpenAIServer(
-        MODEL_NAME, default_server_args, env_dict=env_dict
-    ) as remote_server:
+
+    model_name = MODELS[model_key]
+    with RemoteOpenAIServer(model_name, default_server_args) as remote_server:
         yield remote_server
 
 
@@ -80,62 +69,71 @@ async def client(server):
 
 
 _PROMPT = "Hello my name is Robert and I love magic"
-tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-_TOKENIZED_PROMPT = tokenizer(_PROMPT)["input_ids"]
+_IMAGE_URL = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
 
 _NUM_REQUESTS = 10
-_NUM_PROMPT_TOKENS_PER_REQUEST = len(_TOKENIZED_PROMPT)
 _NUM_GENERATION_TOKENS_PER_REQUEST = 10
 
-# {metric_family: [(suffix, expected_value)]}
-EXPECTED_VALUES = {
-    "vllm:time_to_first_token_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:time_per_output_token_seconds": [
-        ("_count", _NUM_REQUESTS * (_NUM_GENERATION_TOKENS_PER_REQUEST - 1))
-    ],
-    "vllm:e2e_request_latency_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:request_queue_time_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:request_inference_time_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:request_prefill_time_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:request_decode_time_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:request_prompt_tokens": [
-        ("_sum", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST),
-        ("_count", _NUM_REQUESTS),
-    ],
-    "vllm:request_generation_tokens": [
-        ("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
-        ("_count", _NUM_REQUESTS),
-    ],
-    "vllm:request_params_n": [("_count", _NUM_REQUESTS)],
-    "vllm:request_params_max_tokens": [
-        ("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
-        ("_count", _NUM_REQUESTS),
-    ],
-    "vllm:iteration_tokens_total": [
-        (
-            "_sum",
-            _NUM_REQUESTS
-            * (_NUM_PROMPT_TOKENS_PER_REQUEST + _NUM_GENERATION_TOKENS_PER_REQUEST),
-        ),
-        ("_count", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
-    ],
-    "vllm:prompt_tokens": [("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
-    "vllm:generation_tokens": [
-        ("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)
-    ],
-    "vllm:request_success": [("_total", _NUM_REQUESTS)],
-}
+
+def _get_expected_values(prompt_ids: list[int]):
+    num_prompt_tokens = len(prompt_ids)
+
+    # {metric_family: [(suffix, expected_value)]}
+    return {
+        "vllm:time_to_first_token_seconds": [("_count", _NUM_REQUESTS)],
+        "vllm:time_per_output_token_seconds": [
+            ("_count", _NUM_REQUESTS * (_NUM_GENERATION_TOKENS_PER_REQUEST - 1))
+        ],
+        "vllm:e2e_request_latency_seconds": [("_count", _NUM_REQUESTS)],
+        "vllm:request_queue_time_seconds": [("_count", _NUM_REQUESTS)],
+        "vllm:request_inference_time_seconds": [("_count", _NUM_REQUESTS)],
+        "vllm:request_prefill_time_seconds": [("_count", _NUM_REQUESTS)],
+        "vllm:request_decode_time_seconds": [("_count", _NUM_REQUESTS)],
+        "vllm:request_prompt_tokens": [
+            ("_sum", _NUM_REQUESTS * num_prompt_tokens),
+            ("_count", _NUM_REQUESTS),
+        ],
+        "vllm:request_generation_tokens": [
+            ("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
+            ("_count", _NUM_REQUESTS),
+        ],
+        "vllm:request_params_n": [("_count", _NUM_REQUESTS)],
+        "vllm:request_params_max_tokens": [
+            ("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
+            ("_count", _NUM_REQUESTS),
+        ],
+        "vllm:iteration_tokens_total": [
+            (
+                "_sum",
+                _NUM_REQUESTS
+                * (num_prompt_tokens + _NUM_GENERATION_TOKENS_PER_REQUEST),
+            ),
+            ("_count", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
+        ],
+        "vllm:prompt_tokens": [("_total", _NUM_REQUESTS * num_prompt_tokens)],
+        "vllm:generation_tokens": [("_total", _NUM_REQUESTS * num_prompt_tokens)],
+        "vllm:request_success": [("_total", _NUM_REQUESTS)],
+    }
 
 
 @pytest.mark.asyncio
 async def test_metrics_counts(
-    server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
+    server: RemoteOpenAIServer,
+    client: openai.AsyncClient,
+    model_key: str,
 ):
+    if model_key == "multimodal":
+        pytest.skip("Unnecessary test")
+
+    model_name = MODELS[model_key]
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    prompt_ids = tokenizer.encode(_PROMPT)
+
     for _ in range(_NUM_REQUESTS):
         # sending a request triggers the metrics to be logged.
         await client.completions.create(
-            model=MODEL_NAME,
-            prompt=_TOKENIZED_PROMPT,
+            model=model_name,
+            prompt=prompt_ids,
             max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST,
         )
 
@@ -144,8 +142,9 @@ async def test_metrics_counts(
     assert response.status_code == HTTPStatus.OK
 
     # Loop over all expected metric_families
-    for metric_family, suffix_values_list in EXPECTED_VALUES.items():
-        if (use_v1 and metric_family not in EXPECTED_METRICS_V1) or (
+    expected_values = _get_expected_values(prompt_ids)
+    for metric_family, suffix_values_list in expected_values.items():
+        if metric_family not in EXPECTED_METRICS_V1 or (
             not server.show_hidden_metrics
             and metric_family in HIDDEN_DEPRECATED_METRICS
         ):
@@ -183,62 +182,6 @@ async def test_metrics_counts(
         assert found_metric, f"Did not find {metric_family} in prom endpoint"
 
 
-EXPECTED_METRICS = [
-    "vllm:num_requests_running",
-    "vllm:num_requests_waiting",
-    "vllm:gpu_cache_usage_perc",
-    "vllm:time_to_first_token_seconds_sum",
-    "vllm:time_to_first_token_seconds_bucket",
-    "vllm:time_to_first_token_seconds_count",
-    "vllm:time_per_output_token_seconds_sum",
-    "vllm:time_per_output_token_seconds_bucket",
-    "vllm:time_per_output_token_seconds_count",
-    "vllm:e2e_request_latency_seconds_sum",
-    "vllm:e2e_request_latency_seconds_bucket",
-    "vllm:e2e_request_latency_seconds_count",
-    "vllm:request_queue_time_seconds_sum",
-    "vllm:request_queue_time_seconds_bucket",
-    "vllm:request_queue_time_seconds_count",
-    "vllm:request_inference_time_seconds_sum",
-    "vllm:request_inference_time_seconds_bucket",
-    "vllm:request_inference_time_seconds_count",
-    "vllm:request_prefill_time_seconds_sum",
-    "vllm:request_prefill_time_seconds_bucket",
-    "vllm:request_prefill_time_seconds_count",
-    "vllm:request_decode_time_seconds_sum",
-    "vllm:request_decode_time_seconds_bucket",
-    "vllm:request_decode_time_seconds_count",
-    "vllm:request_prompt_tokens_sum",
-    "vllm:request_prompt_tokens_bucket",
-    "vllm:request_prompt_tokens_count",
-    "vllm:request_generation_tokens_sum",
-    "vllm:request_generation_tokens_bucket",
-    "vllm:request_generation_tokens_count",
-    "vllm:request_params_n_sum",
-    "vllm:request_params_n_bucket",
-    "vllm:request_params_n_count",
-    "vllm:request_params_max_tokens_sum",
-    "vllm:request_params_max_tokens_bucket",
-    "vllm:request_params_max_tokens_count",
-    "vllm:iteration_tokens_total",
-    "vllm:num_preemptions_total",
-    "vllm:prompt_tokens_total",
-    "vllm:generation_tokens_total",
-    "vllm:request_success_total",
-    "vllm:cache_config_info",
-    # labels in cache_config_info
-    "block_size",
-    "cache_dtype",
-    "cpu_offload_gb",
-    "enable_prefix_caching",
-    "gpu_memory_utilization",
-    "num_cpu_blocks",
-    "num_gpu_blocks",
-    "num_gpu_blocks_override",
-    "sliding_window",
-    "swap_space_bytes",
-]
-
 EXPECTED_METRICS_V1 = [
     "vllm:num_requests_running",
     "vllm:num_requests_waiting",
@@ -292,6 +235,11 @@ async def test_metrics_counts(
     "vllm:request_decode_time_seconds_count",
 ]
 
+EXPECTED_METRICS_MM = [
+    "vllm:mm_cache_queries",
+    "vllm:mm_cache_hits",
+]
+
 HIDDEN_DEPRECATED_METRICS: list[str] = [
     "vllm:gpu_cache_usage_perc",
     "vllm:gpu_prefix_cache_queries",
@@ -304,28 +252,58 @@ async def test_metrics_counts(
 
 @pytest.mark.asyncio
 async def test_metrics_exist(
-    server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
+    server: RemoteOpenAIServer,
+    client: openai.AsyncClient,
+    model_key: str,
 ):
+    model_name = MODELS[model_key]
+
     # sending a request triggers the metrics to be logged.
-    await client.completions.create(
-        model=MODEL_NAME, prompt="Hello, my name is", max_tokens=5, temperature=0.0
-    )
+    if model_key == "text":
+        await client.completions.create(
+            model=model_name, prompt="Hello, my name is", max_tokens=5, temperature=0.0
+        )
+    else:
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": _IMAGE_URL}},
+                    {"type": "text", "text": "What's in this image?"},
+                ],
+            }
+        ]
+
+        await client.chat.completions.create(
+            model=model_name, messages=messages, max_tokens=5, temperature=0.0
+        )
 
     response = requests.get(server.url_for("metrics"))
     assert response.status_code == HTTPStatus.OK
 
-    for metric in EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS:
+    expected_metrics = EXPECTED_METRICS_V1
+    if model_key == "multimodal":
+        # NOTE: Don't use in-place assignment
+        expected_metrics = expected_metrics + EXPECTED_METRICS_MM
+
+    for metric in expected_metrics:
         if metric in HIDDEN_DEPRECATED_METRICS and not server.show_hidden_metrics:
             continue
         assert metric in response.text
 
 
 @pytest.mark.asyncio
 async def test_abort_metrics_reset(
-    server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
+    server: RemoteOpenAIServer,
+    client: openai.AsyncClient,
+    model_key: str,
 ):
+    model_name = MODELS[model_key]
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    prompt_ids = tokenizer.encode(_PROMPT)
+
     running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
-        server, use_v1
+        server,
     )
 
     # Expect no running requests or kvcache usage
@@ -338,8 +316,8 @@ async def test_abort_metrics_reset(
     for _ in range(3):
         task = asyncio.create_task(
             client.completions.create(
-                model=MODEL_NAME,
-                prompt=_TOKENIZED_PROMPT,
+                model=model_name,
+                prompt=prompt_ids,
                 max_tokens=100,  # Long generation to give time to abort
                 temperature=0.0,
            )
@@ -351,7 +329,7 @@ async def test_abort_metrics_reset(
 
     # Check that we have running requests
     running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
-        server, use_v1
+        server,
     )
 
     # Expect running requests and kvcache usage
@@ -371,7 +349,7 @@ async def test_abort_metrics_reset(
 
     # Verify running and waiting requests counts and KV cache usage are zero
    running_requests_after, waiting_requests_after, kv_cache_usage_after = (
-        _get_running_metrics_from_api(server, use_v1)
+        _get_running_metrics_from_api(server)
    )
 
     assert running_requests_after == 0, (
@@ -385,7 +363,7 @@ async def test_abort_metrics_reset(
     )
 
 
-def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
+def _get_running_metrics_from_api(server: RemoteOpenAIServer):
     """Return (running_count, waiting_count, kv_cache_usage)"""
 
     response = requests.get(server.url_for("metrics"))
@@ -394,9 +372,7 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
     # Verify running and waiting requests counts and KV cache usage are zero
     running_requests, waiting_requests, kv_cache_usage = None, None, None
 
-    kv_cache_usage_metric = (
-        "vllm:kv_cache_usage_perc" if use_v1 else "vllm:gpu_cache_usage_perc"
-    )
+    kv_cache_usage_metric = "vllm:kv_cache_usage_perc"
 
     for family in text_string_to_metric_families(response.text):
         if family.name == "vllm:num_requests_running":
@@ -422,7 +398,7 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
     return running_requests, waiting_requests, kv_cache_usage
 
 
-def test_metrics_exist_run_batch(use_v1: bool):
+def test_metrics_exist_run_batch():
    input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}"""  # noqa: E501
 
     base_url = "0.0.0.0"
@@ -452,7 +428,6 @@ def test_metrics_exist_run_batch(use_v1: bool):
             "--port",
             port,
         ],
-        env={"VLLM_USE_V1": "1"},
     )
 
     def is_server_up(url):
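Note: the multimodal-only assertions added in this commit only check that the new vllm:mm_cache_queries and vllm:mm_cache_hits metric names appear in the /metrics text. As a minimal sketch of how those counters could be read back in a test, here is an illustrative helper modeled on the existing _get_running_metrics_from_api; it is not part of this commit, and the helper name, the "_total"-suffix handling, and the hit-rate arithmetic are assumptions.

import requests
from prometheus_client.parser import text_string_to_metric_families


def _get_mm_cache_stats(metrics_url: str) -> tuple[float, float]:
    """Illustrative helper (not in this commit): read the multi-modal cache
    counters from a vLLM /metrics endpoint and return (queries, hits)."""
    response = requests.get(metrics_url)
    response.raise_for_status()

    queries = hits = 0.0
    for family in text_string_to_metric_families(response.text):
        for sample in family.samples:
            # Counters may be exposed with a "_total" suffix in the text format,
            # so match on the metric-name prefix rather than exact equality.
            if sample.name.startswith("vllm:mm_cache_queries"):
                queries += sample.value
            elif sample.name.startswith("vllm:mm_cache_hits"):
                hits += sample.value
    return queries, hits


# Example usage inside a test, assuming a RemoteOpenAIServer fixture:
#   queries, hits = _get_mm_cache_stats(server.url_for("metrics"))
#   hit_rate = hits / queries if queries else 0.0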
