76 changes: 76 additions & 0 deletions tests/entrypoints/llm/test_mm_cache_stats.py
@@ -0,0 +1,76 @@
# SPDX-License-Identifier: Apache-2.0

import pytest

from vllm import LLM
from vllm.engine.llm_engine import LLMEngine as V0LLMEngine
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine

from ..openai.test_vision import TEST_IMAGE_URLS


def _make_messages(image_url: str) -> list[ChatCompletionMessageParam]:
return [{
"role":
"user",
"content": [
{
"type": "image_url",
"image_url": {
"url": image_url
},
},
],
}]


@pytest.mark.parametrize("image_urls",
[[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
@pytest.mark.parametrize("use_v1", [True, False])
def test_mm_cache_stats(
image_urls: list[str],
use_v1: bool,
monkeypatch: pytest.MonkeyPatch,
):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")

llm = LLM(
model="HuggingFaceTB/SmolVLM-256M-Instruct",
max_model_len=4096,
max_num_seqs=5,
enforce_eager=True,
limit_mm_per_prompt={"image": 2},
)
engine = llm.llm_engine
if isinstance(engine, V0LLMEngine):
mm_registry = engine.input_preprocessor.mm_registry
elif isinstance(engine, V1LLMEngine):
mm_registry = engine.processor.mm_registry

# In case the previous test failed, we still need to reset the cache
# (which is shared across tests)
engine.reset_mm_cache()
mm_registry.make_processor_cache_stats()

llm.chat(_make_messages(image_urls[0]))

cache_stats = mm_registry.make_processor_cache_stats()
assert cache_stats.size_items == 1

llm.chat(_make_messages(image_urls[1]))

cache_stats = mm_registry.make_processor_cache_stats()
assert cache_stats.size_items == 2

llm.chat(_make_messages(image_urls[0]))

cache_stats = mm_registry.make_processor_cache_stats()
assert cache_stats.size_items == 2

engine.reset_mm_cache()

cache_stats = mm_registry.make_processor_cache_stats()
assert cache_stats.size_items == 0
assert cache_stats.reset is True
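
The flow exercised by this test can also be reproduced outside pytest. A minimal sketch, using only calls that appear elsewhere in this PR (the V1-engine attribute path and the image URL are taken from other files in the diff; that the V1 engine is selected by default in your build is an assumption):

# Minimal sketch of the processor-cache stats flow exercised by the test above.
# Uses only calls that appear in this PR: LLM.chat(), LLMEngine.reset_mm_cache()
# and mm_registry.make_processor_cache_stats(). Assumes the V1 engine, whose
# registry lives at engine.processor.mm_registry (see the elif branch above).
from vllm import LLM

llm = LLM(
    model="HuggingFaceTB/SmolVLM-256M-Instruct",
    max_model_len=4096,
    enforce_eager=True,
    limit_mm_per_prompt={"image": 2},
)
engine = llm.llm_engine
mm_registry = engine.processor.mm_registry

image_url = ("https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/"
             "Gfp-wisconsin-madison-the-nature-boardwalk.jpg/"
             "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg")
llm.chat([{
    "role": "user",
    "content": [{
        "type": "image_url",
        "image_url": {"url": image_url},
    }],
}])

stats = mm_registry.make_processor_cache_stats()
print(stats.size_items, stats.usage, stats.queries, stats.hits)

engine.reset_mm_cache()
assert mm_registry.make_processor_cache_stats().reset is True
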
163 changes: 113 additions & 50 deletions tests/entrypoints/openai/test_metrics.py
@@ -17,10 +17,18 @@

from ...utils import RemoteOpenAIServer

MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
MODELS = {
"text": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"multimodal": "HuggingFaceTB/SmolVLM-256M-Instruct",
}
PREV_MINOR_VERSION = version._prev_minor_version()


@pytest.fixture(scope="module", params=list(MODELS.keys()))
def model_key(request):
yield request.param


@pytest.fixture(scope="module", params=[True, False])
def use_v1(request):
# Module-scoped variant of run_with_both_engines
@@ -60,11 +68,13 @@ def default_server_args():
"--disable-frontend-multiprocessing",
f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}",
])
def server(use_v1, default_server_args, request):
def server(model_key, use_v1, default_server_args, request):
if request.param:
default_server_args.append(request.param)

model_name = MODELS[model_key]
env_dict = dict(VLLM_USE_V1='1' if use_v1 else '0')
with RemoteOpenAIServer(MODEL_NAME, default_server_args,
with RemoteOpenAIServer(model_name, default_server_args,
env_dict=env_dict) as remote_server:
yield remote_server

@@ -76,63 +86,74 @@ async def client(server):


_PROMPT = "Hello my name is Robert and I love magic"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
_TOKENIZED_PROMPT = tokenizer(_PROMPT)["input_ids"]
_IMAGE_URL = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"

_NUM_REQUESTS = 10
_NUM_PROMPT_TOKENS_PER_REQUEST = len(_TOKENIZED_PROMPT)
_NUM_GENERATION_TOKENS_PER_REQUEST = 10

# {metric_family: [(suffix, expected_value)]}
EXPECTED_VALUES = {
"vllm:time_to_first_token_seconds": [("_count", _NUM_REQUESTS)],
"vllm:time_per_output_token_seconds":
[("_count", _NUM_REQUESTS * (_NUM_GENERATION_TOKENS_PER_REQUEST - 1))],
"vllm:e2e_request_latency_seconds": [("_count", _NUM_REQUESTS)],
"vllm:request_queue_time_seconds": [("_count", _NUM_REQUESTS)],
"vllm:request_inference_time_seconds": [("_count", _NUM_REQUESTS)],
"vllm:request_prefill_time_seconds": [("_count", _NUM_REQUESTS)],
"vllm:request_decode_time_seconds": [("_count", _NUM_REQUESTS)],
"vllm:request_prompt_tokens":
[("_sum", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST),
("_count", _NUM_REQUESTS)],
"vllm:request_generation_tokens":
[("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
("_count", _NUM_REQUESTS)],
"vllm:request_params_n": [("_count", _NUM_REQUESTS)],
"vllm:request_params_max_tokens": [
("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
("_count", _NUM_REQUESTS)
],
"vllm:iteration_tokens_total":
[("_sum", _NUM_REQUESTS *
(_NUM_PROMPT_TOKENS_PER_REQUEST + _NUM_GENERATION_TOKENS_PER_REQUEST)),
("_count", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST)],
"vllm:prompt_tokens": [("_total",
_NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
"vllm:generation_tokens": [
("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)
],
"vllm:request_success": [("_total", _NUM_REQUESTS)],
}

def _get_expected_values(prompt_ids: list[int]):
num_prompt_tokens = len(prompt_ids)

# {metric_family: [(suffix, expected_value)]}
return {
"vllm:time_to_first_token_seconds": [("_count", _NUM_REQUESTS)],
"vllm:time_per_output_token_seconds":
[("_count", _NUM_REQUESTS * (_NUM_GENERATION_TOKENS_PER_REQUEST - 1))],
"vllm:e2e_request_latency_seconds": [("_count", _NUM_REQUESTS)],
"vllm:request_queue_time_seconds": [("_count", _NUM_REQUESTS)],
"vllm:request_inference_time_seconds": [("_count", _NUM_REQUESTS)],
"vllm:request_prefill_time_seconds": [("_count", _NUM_REQUESTS)],
"vllm:request_decode_time_seconds": [("_count", _NUM_REQUESTS)],
"vllm:request_prompt_tokens": [("_sum",
_NUM_REQUESTS * num_prompt_tokens),
("_count", _NUM_REQUESTS)],
"vllm:request_generation_tokens":
[("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
("_count", _NUM_REQUESTS)],
"vllm:request_params_n": [("_count", _NUM_REQUESTS)],
"vllm:request_params_max_tokens":
[("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
("_count", _NUM_REQUESTS)],
"vllm:iteration_tokens_total":
[("_sum", _NUM_REQUESTS *
(num_prompt_tokens + _NUM_GENERATION_TOKENS_PER_REQUEST)),
("_count", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST)],
"vllm:prompt_tokens": [("_total", _NUM_REQUESTS * num_prompt_tokens)],
"vllm:generation_tokens": [("_total",
_NUM_REQUESTS * num_prompt_tokens)],
"vllm:request_success": [("_total", _NUM_REQUESTS)],
}


@pytest.mark.asyncio
async def test_metrics_counts(server: RemoteOpenAIServer,
client: openai.AsyncClient, use_v1: bool):
async def test_metrics_counts(
server: RemoteOpenAIServer,
client: openai.AsyncClient,
model_key: str,
use_v1: bool,
):
if model_key == "multimodal":
pytest.skip("Unnecessary test")

model_name = MODELS[model_key]
tokenizer = AutoTokenizer.from_pretrained(model_name)
prompt_ids = tokenizer.encode(_PROMPT)

for _ in range(_NUM_REQUESTS):
# sending a request triggers the metrics to be logged.
await client.completions.create(
model=MODEL_NAME,
prompt=_TOKENIZED_PROMPT,
model=model_name,
prompt=prompt_ids,
max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST)

response = requests.get(server.url_for("metrics"))
print(response.text)
assert response.status_code == HTTPStatus.OK

# Loop over all expected metric_families
for metric_family, suffix_values_list in EXPECTED_VALUES.items():
expected_values = _get_expected_values(prompt_ids)
for metric_family, suffix_values_list in expected_values.items():
if ((use_v1 and metric_family not in EXPECTED_METRICS_V1)
or (not server.show_hidden_metrics
and metric_family in HIDDEN_DEPRECATED_METRICS)):
@@ -274,25 +295,67 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
"vllm:request_decode_time_seconds_count",
]

EXPECTED_METRICS_MM = [
"vllm:mm_cache_usage",
"vllm:mm_cache_size_G",
"vllm:mm_cache_size_items",
"vllm:mm_cache_queries",
"vllm:mm_cache_hits",
]

HIDDEN_DEPRECATED_METRICS = [
"vllm:num_requests_swapped",
"vllm:cpu_cache_usage_perc",
]


@pytest.mark.asyncio
async def test_metrics_exist(server: RemoteOpenAIServer,
client: openai.AsyncClient, use_v1: bool):
async def test_metrics_exist(
server: RemoteOpenAIServer,
client: openai.AsyncClient,
model_key: str,
use_v1: bool,
):
# sending a request triggers the metrics to be logged.
await client.completions.create(model=MODEL_NAME,
prompt="Hello, my name is",
max_tokens=5,
temperature=0.0)
model_name = MODELS[model_key]

if model_key == "text":
await client.completions.create(model=model_name,
prompt="Hello, my name is",
max_tokens=5,
temperature=0.0)
else:
messages = [{
"role":
"user",
"content": [
{
"type": "image_url",
"image_url": {
"url": _IMAGE_URL
}
},
{
"type": "text",
"text": "What's in this image?"
},
],
}]

await client.chat.completions.create(model=model_name,
messages=messages,
max_tokens=5,
temperature=0.0)

response = requests.get(server.url_for("metrics"))
assert response.status_code == HTTPStatus.OK

for metric in (EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS):
expected_metrics = EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS
if model_key == "multimodal":
# NOTE: Don't use in-place assignment
expected_metrics = expected_metrics + EXPECTED_METRICS_MM

for metric in expected_metrics:
if (not server.show_hidden_metrics
and metric not in HIDDEN_DEPRECATED_METRICS):
assert metric in response.text
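
Beyond the substring assertions above, the new multi-modal cache metrics can also be checked with a real Prometheus parser. A rough sketch, assuming a locally running server on port 8000 (the metric names are those in EXPECTED_METRICS_MM; the URL, the helper logic, and the handling of an optional "_total" suffix on counters are assumptions):

# Illustrative check of the new multi-modal cache metrics on /metrics,
# using a Prometheus parser instead of substring matching. Counters may be
# exposed with a "_total" suffix, so the check accepts either form.
import requests
from prometheus_client.parser import text_string_to_metric_families

MM_CACHE_METRICS = [
    "vllm:mm_cache_usage",
    "vllm:mm_cache_size_G",
    "vllm:mm_cache_size_items",
    "vllm:mm_cache_queries",
    "vllm:mm_cache_hits",
]

text = requests.get("http://localhost:8000/metrics").text
family_names = {family.name for family in text_string_to_metric_families(text)}

for metric in MM_CACHE_METRICS:
    present = any(name == metric or name.startswith(metric + "_")
                  for name in family_names)
    print(f"{metric}: {'present' if present else 'MISSING'}")
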
1 change: 0 additions & 1 deletion tests/metrics/test_metrics.py
@@ -288,7 +288,6 @@ def test_metric_spec_decode(
@pytest.mark.parametrize("max_tokens", [10])
@pytest.mark.parametrize("log_interval", [1, 3, 5, 7])
def test_metric_spec_decode_interval(
vllm_runner,
example_prompts,
model: str,
dtype: str,
5 changes: 2 additions & 3 deletions tests/v1/core/test_kv_cache_utils.py
@@ -12,15 +12,14 @@
# yapf: disable
from vllm.v1.core.kv_cache_utils import (NONE_HASH, BlockHashType,
FreeKVCacheBlockQueue, KVCacheBlock,
PrefixCachingMetrics,
estimate_max_model_len,
generate_block_hash_extra_keys,
hash_block_tokens,
hash_request_tokens,
unify_kv_cache_configs)
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
KVCacheGroupSpec, KVCacheTensor)
from vllm.v1.metrics.stats import PrefixCacheStats
from vllm.v1.metrics.stats import CachingMetrics, PrefixCacheStats
from vllm.v1.request import Request

# yapf: enable
@@ -351,7 +350,7 @@ def test_metrics():
def stats(requests, queries, hits):
return PrefixCacheStats(requests=requests, queries=queries, hits=hits)

metrics = PrefixCachingMetrics(max_recent_requests=5)
metrics = CachingMetrics(max_recent_requests=5)
assert metrics.hit_rate == 0.0

metrics.observe(stats(1, 20, 9))
15 changes: 15 additions & 0 deletions vllm/engine/llm_engine.py
@@ -1662,6 +1662,15 @@ def _get_stats(self,
gpu_prefix_cache_hit_rate = self.scheduler[
0].get_prefix_cache_hit_rate(Device.GPU)

# Multi-modal cache stats
mm_registry = self.input_preprocessor.mm_registry
processor_cache_stats = mm_registry.make_processor_cache_stats()
mm_cache_usage = processor_cache_stats.usage
mm_cache_size_G = processor_cache_stats.size_G
mm_cache_size_items = processor_cache_stats.size_items
mm_cache_queries = processor_cache_stats.queries
mm_cache_hits = processor_cache_stats.hits

# Iteration stats
num_prompt_tokens_iter = 0
num_generation_tokens_iter = 0
@@ -1848,6 +1857,12 @@ def _get_stats(self,
# Prefix Cache Hit Rate
cpu_prefix_cache_hit_rate=cpu_prefix_cache_hit_rate,
gpu_prefix_cache_hit_rate=gpu_prefix_cache_hit_rate,
# Multi-modal cache stats
mm_cache_usage=mm_cache_usage,
mm_cache_size_G=mm_cache_size_G,
mm_cache_size_items=mm_cache_size_items,
mm_cache_queries=mm_cache_queries,
mm_cache_hits=mm_cache_hits,

# Iteration stats
num_prompt_tokens_iter=num_prompt_tokens_iter,
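
The hunk above only plumbs raw counters into Stats; a hit rate can be derived from them in the same spirit as the prefix-cache hit rates already reported. A hedged sketch (the helper name and the zero-queries guard are illustrative; only the field names come from the diff):

# Sketch: turning the queries/hits fields added to Stats above into a hit
# rate, analogous to the prefix-cache hit rates already reported. The helper
# name and the zero-queries guard are illustrative, not part of this PR.
def mm_cache_hit_rate(mm_cache_queries: int, mm_cache_hits: int) -> float:
    if mm_cache_queries == 0:
        return 0.0
    return mm_cache_hits / mm_cache_queries


print(f"MM cache hit rate: {mm_cache_hit_rate(20, 9):.1%}")  # 45.0%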