Commit db606d9

[Metrics] Log multi-modal cache stats
Signed-off-by: DarkLight1337 <[email protected]>
1 parent 43c146c commit db606d9

File tree

6 files changed: +369 −238 lines changed


tests/entrypoints/openai/test_metrics.py

Lines changed: 115 additions & 140 deletions
@@ -18,25 +18,15 @@
 
 from ...utils import RemoteOpenAIServer
 
-MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+MODELS = {
+    "text": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "multimodal": "HuggingFaceTB/SmolVLM-256M-Instruct",
+}
 PREV_MINOR_VERSION = version._prev_minor_version()
 
 
-@pytest.fixture(scope="module", params=[True])
-def use_v1(request):
-    # Module-scoped variant of run_with_both_engines
-    #
-    # Use this fixture to run a test with both v0 and v1, and
-    # also to conditionalize the test logic e.g.
-    #
-    # def test_metrics_exist(use_v1, server, client):
-    #     ...
-    #     expected = EXPECTED_V1_METRICS if use_v1 else EXPECTED_METRICS
-    #     for metric in expected:
-    #         assert metric in response.text
-    #
-    # @skip_v1 wouldn't work here because this is a module-level
-    # fixture - per-function decorators would have no effect
+@pytest.fixture(scope="module", params=list(MODELS.keys()))
+def model_key(request):
     yield request.param
 
 
@@ -63,13 +53,12 @@ def default_server_args():
         f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}",
     ],
 )
-def server(use_v1, default_server_args, request):
+def server(model_key, default_server_args, request):
     if request.param:
         default_server_args.append(request.param)
-    env_dict = dict(VLLM_USE_V1="1" if use_v1 else "0")
-    with RemoteOpenAIServer(
-        MODEL_NAME, default_server_args, env_dict=env_dict
-    ) as remote_server:
+
+    model_name = MODELS[model_key]
+    with RemoteOpenAIServer(model_name, default_server_args) as remote_server:
         yield remote_server
 
 
@@ -80,62 +69,71 @@ async def client(server):
 
 
 _PROMPT = "Hello my name is Robert and I love magic"
-tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-_TOKENIZED_PROMPT = tokenizer(_PROMPT)["input_ids"]
+_IMAGE_URL = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
 
 _NUM_REQUESTS = 10
-_NUM_PROMPT_TOKENS_PER_REQUEST = len(_TOKENIZED_PROMPT)
 _NUM_GENERATION_TOKENS_PER_REQUEST = 10
 
-# {metric_family: [(suffix, expected_value)]}
-EXPECTED_VALUES = {
-    "vllm:time_to_first_token_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:time_per_output_token_seconds": [
-        ("_count", _NUM_REQUESTS * (_NUM_GENERATION_TOKENS_PER_REQUEST - 1))
-    ],
-    "vllm:e2e_request_latency_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:request_queue_time_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:request_inference_time_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:request_prefill_time_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:request_decode_time_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:request_prompt_tokens": [
-        ("_sum", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST),
-        ("_count", _NUM_REQUESTS),
-    ],
-    "vllm:request_generation_tokens": [
-        ("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
-        ("_count", _NUM_REQUESTS),
-    ],
-    "vllm:request_params_n": [("_count", _NUM_REQUESTS)],
-    "vllm:request_params_max_tokens": [
-        ("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
-        ("_count", _NUM_REQUESTS),
-    ],
-    "vllm:iteration_tokens_total": [
-        (
-            "_sum",
-            _NUM_REQUESTS
-            * (_NUM_PROMPT_TOKENS_PER_REQUEST + _NUM_GENERATION_TOKENS_PER_REQUEST),
-        ),
-        ("_count", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
-    ],
-    "vllm:prompt_tokens": [("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
-    "vllm:generation_tokens": [
-        ("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)
-    ],
-    "vllm:request_success": [("_total", _NUM_REQUESTS)],
-}
+
+def _get_expected_values(prompt_ids: list[int]):
+    num_prompt_tokens = len(prompt_ids)
+
+    # {metric_family: [(suffix, expected_value)]}
+    return {
+        "vllm:time_to_first_token_seconds": [("_count", _NUM_REQUESTS)],
+        "vllm:time_per_output_token_seconds": [
+            ("_count", _NUM_REQUESTS * (_NUM_GENERATION_TOKENS_PER_REQUEST - 1))
+        ],
+        "vllm:e2e_request_latency_seconds": [("_count", _NUM_REQUESTS)],
+        "vllm:request_queue_time_seconds": [("_count", _NUM_REQUESTS)],
+        "vllm:request_inference_time_seconds": [("_count", _NUM_REQUESTS)],
+        "vllm:request_prefill_time_seconds": [("_count", _NUM_REQUESTS)],
+        "vllm:request_decode_time_seconds": [("_count", _NUM_REQUESTS)],
+        "vllm:request_prompt_tokens": [
+            ("_sum", _NUM_REQUESTS * num_prompt_tokens),
+            ("_count", _NUM_REQUESTS),
+        ],
+        "vllm:request_generation_tokens": [
+            ("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
+            ("_count", _NUM_REQUESTS),
+        ],
+        "vllm:request_params_n": [("_count", _NUM_REQUESTS)],
+        "vllm:request_params_max_tokens": [
+            ("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
+            ("_count", _NUM_REQUESTS),
+        ],
+        "vllm:iteration_tokens_total": [
+            (
+                "_sum",
+                _NUM_REQUESTS
+                * (num_prompt_tokens + _NUM_GENERATION_TOKENS_PER_REQUEST),
+            ),
+            ("_count", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
+        ],
+        "vllm:prompt_tokens": [("_total", _NUM_REQUESTS * num_prompt_tokens)],
+        "vllm:generation_tokens": [("_total", _NUM_REQUESTS * num_prompt_tokens)],
+        "vllm:request_success": [("_total", _NUM_REQUESTS)],
+    }
 
 
 @pytest.mark.asyncio
 async def test_metrics_counts(
-    server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
+    server: RemoteOpenAIServer,
+    client: openai.AsyncClient,
+    model_key: str,
 ):
+    if model_key == "multimodal":
+        pytest.skip("Unnecessary test")
+
+    model_name = MODELS[model_key]
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    prompt_ids = tokenizer.encode(_PROMPT)
+
     for _ in range(_NUM_REQUESTS):
         # sending a request triggers the metrics to be logged.
         await client.completions.create(
-            model=MODEL_NAME,
-            prompt=_TOKENIZED_PROMPT,
+            model=model_name,
+            prompt=prompt_ids,
             max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST,
         )
 
@@ -144,8 +142,9 @@ async def test_metrics_counts(
     assert response.status_code == HTTPStatus.OK
 
     # Loop over all expected metric_families
-    for metric_family, suffix_values_list in EXPECTED_VALUES.items():
-        if (use_v1 and metric_family not in EXPECTED_METRICS_V1) or (
+    expected_values = _get_expected_values(prompt_ids)
+    for metric_family, suffix_values_list in expected_values.items():
+        if metric_family not in EXPECTED_METRICS_V1 or (
             not server.show_hidden_metrics
             and metric_family in HIDDEN_DEPRECATED_METRICS
         ):
@@ -183,62 +182,6 @@ async def test_metrics_counts(
         assert found_metric, f"Did not find {metric_family} in prom endpoint"
 
 
-EXPECTED_METRICS = [
-    "vllm:num_requests_running",
-    "vllm:num_requests_waiting",
-    "vllm:gpu_cache_usage_perc",
-    "vllm:time_to_first_token_seconds_sum",
-    "vllm:time_to_first_token_seconds_bucket",
-    "vllm:time_to_first_token_seconds_count",
-    "vllm:time_per_output_token_seconds_sum",
-    "vllm:time_per_output_token_seconds_bucket",
-    "vllm:time_per_output_token_seconds_count",
-    "vllm:e2e_request_latency_seconds_sum",
-    "vllm:e2e_request_latency_seconds_bucket",
-    "vllm:e2e_request_latency_seconds_count",
-    "vllm:request_queue_time_seconds_sum",
-    "vllm:request_queue_time_seconds_bucket",
-    "vllm:request_queue_time_seconds_count",
-    "vllm:request_inference_time_seconds_sum",
-    "vllm:request_inference_time_seconds_bucket",
-    "vllm:request_inference_time_seconds_count",
-    "vllm:request_prefill_time_seconds_sum",
-    "vllm:request_prefill_time_seconds_bucket",
-    "vllm:request_prefill_time_seconds_count",
-    "vllm:request_decode_time_seconds_sum",
-    "vllm:request_decode_time_seconds_bucket",
-    "vllm:request_decode_time_seconds_count",
-    "vllm:request_prompt_tokens_sum",
-    "vllm:request_prompt_tokens_bucket",
-    "vllm:request_prompt_tokens_count",
-    "vllm:request_generation_tokens_sum",
-    "vllm:request_generation_tokens_bucket",
-    "vllm:request_generation_tokens_count",
-    "vllm:request_params_n_sum",
-    "vllm:request_params_n_bucket",
-    "vllm:request_params_n_count",
-    "vllm:request_params_max_tokens_sum",
-    "vllm:request_params_max_tokens_bucket",
-    "vllm:request_params_max_tokens_count",
-    "vllm:iteration_tokens_total",
-    "vllm:num_preemptions_total",
-    "vllm:prompt_tokens_total",
-    "vllm:generation_tokens_total",
-    "vllm:request_success_total",
-    "vllm:cache_config_info",
-    # labels in cache_config_info
-    "block_size",
-    "cache_dtype",
-    "cpu_offload_gb",
-    "enable_prefix_caching",
-    "gpu_memory_utilization",
-    "num_cpu_blocks",
-    "num_gpu_blocks",
-    "num_gpu_blocks_override",
-    "sliding_window",
-    "swap_space_bytes",
-]
-
 EXPECTED_METRICS_V1 = [
     "vllm:num_requests_running",
     "vllm:num_requests_waiting",
@@ -292,6 +235,11 @@ async def test_metrics_counts(
     "vllm:request_decode_time_seconds_count",
 ]
 
+EXPECTED_METRICS_MM = [
+    "vllm:mm_cache_queries",
+    "vllm:mm_cache_hits",
+]
+
 HIDDEN_DEPRECATED_METRICS: list[str] = [
     "vllm:gpu_cache_usage_perc",
     "vllm:gpu_prefix_cache_queries",
@@ -304,28 +252,58 @@ async def test_metrics_counts(
 
 @pytest.mark.asyncio
 async def test_metrics_exist(
-    server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
+    server: RemoteOpenAIServer,
+    client: openai.AsyncClient,
+    model_key: str,
 ):
+    model_name = MODELS[model_key]
+
     # sending a request triggers the metrics to be logged.
-    await client.completions.create(
-        model=MODEL_NAME, prompt="Hello, my name is", max_tokens=5, temperature=0.0
-    )
+    if model_key == "text":
+        await client.completions.create(
+            model=model_name, prompt="Hello, my name is", max_tokens=5, temperature=0.0
+        )
+    else:
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": _IMAGE_URL}},
+                    {"type": "text", "text": "What's in this image?"},
+                ],
+            }
+        ]
+
+        await client.chat.completions.create(
+            model=model_name, messages=messages, max_tokens=5, temperature=0.0
+        )
 
     response = requests.get(server.url_for("metrics"))
     assert response.status_code == HTTPStatus.OK
 
-    for metric in EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS:
+    expected_metrics = EXPECTED_METRICS_V1
+    if model_key == "multimodal":
+        # NOTE: Don't use in-place assignment
+        expected_metrics = expected_metrics + EXPECTED_METRICS_MM
+
+    for metric in expected_metrics:
         if metric in HIDDEN_DEPRECATED_METRICS and not server.show_hidden_metrics:
             continue
         assert metric in response.text
 
 
 @pytest.mark.asyncio
 async def test_abort_metrics_reset(
-    server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
+    server: RemoteOpenAIServer,
+    client: openai.AsyncClient,
+    model_key: str,
 ):
+    model_name = MODELS[model_key]
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    prompt_ids = tokenizer.encode(_PROMPT)
+
     running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
-        server, use_v1
+        server,
     )
 
     # Expect no running requests or kvcache usage
@@ -338,8 +316,8 @@ async def test_abort_metrics_reset(
     for _ in range(3):
         task = asyncio.create_task(
             client.completions.create(
-                model=MODEL_NAME,
-                prompt=_TOKENIZED_PROMPT,
+                model=model_name,
+                prompt=prompt_ids,
                 max_tokens=100,  # Long generation to give time to abort
                 temperature=0.0,
            )
@@ -351,7 +329,7 @@ async def test_abort_metrics_reset(
 
     # Check that we have running requests
     running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
-        server, use_v1
+        server,
     )
 
     # Expect running requests and kvcache usage
@@ -371,7 +349,7 @@ async def test_abort_metrics_reset(
 
     # Verify running and waiting requests counts and KV cache usage are zero
    running_requests_after, waiting_requests_after, kv_cache_usage_after = (
-        _get_running_metrics_from_api(server, use_v1)
+        _get_running_metrics_from_api(server)
    )
 
     assert running_requests_after == 0, (
@@ -385,7 +363,7 @@ async def test_abort_metrics_reset(
     )
 
 
-def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
+def _get_running_metrics_from_api(server: RemoteOpenAIServer):
     """Return (running_count, waiting_count, kv_cache_usage)"""
 
     response = requests.get(server.url_for("metrics"))
@@ -394,9 +372,7 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
     # Verify running and waiting requests counts and KV cache usage are zero
     running_requests, waiting_requests, kv_cache_usage = None, None, None
 
-    kv_cache_usage_metric = (
-        "vllm:kv_cache_usage_perc" if use_v1 else "vllm:gpu_cache_usage_perc"
-    )
+    kv_cache_usage_metric = "vllm:kv_cache_usage_perc"
 
     for family in text_string_to_metric_families(response.text):
         if family.name == "vllm:num_requests_running":
@@ -422,7 +398,7 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
     return running_requests, waiting_requests, kv_cache_usage
 
 
-def test_metrics_exist_run_batch(use_v1: bool):
+def test_metrics_exist_run_batch():
    input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}"""  # noqa: E501
 
     base_url = "0.0.0.0"
@@ -452,7 +428,6 @@ def test_metrics_exist_run_batch(use_v1: bool):
             "--port",
             port,
         ],
-        env={"VLLM_USE_V1": "1"},
     )
 
     def is_server_up(url):
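Note: the multimodal-only assertions added in this commit only check that the new vllm:mm_cache_queries and vllm:mm_cache_hits metric names appear in the /metrics text. As a minimal sketch of how those counters could be read back in a test, here is an illustrative helper modeled on the existing _get_running_metrics_from_api; it is not part of this commit, and the helper name, the "_total"-suffix handling, and the hit-rate arithmetic are assumptions.

import requests
from prometheus_client.parser import text_string_to_metric_families


def _get_mm_cache_stats(metrics_url: str) -> tuple[float, float]:
    """Illustrative helper (not in this commit): read the multi-modal cache
    counters from a vLLM /metrics endpoint and return (queries, hits)."""
    response = requests.get(metrics_url)
    response.raise_for_status()

    queries = hits = 0.0
    for family in text_string_to_metric_families(response.text):
        for sample in family.samples:
            # Counters may be exposed with a "_total" suffix in the text format,
            # so match on the metric-name prefix rather than exact equality.
            if sample.name.startswith("vllm:mm_cache_queries"):
                queries += sample.value
            elif sample.name.startswith("vllm:mm_cache_hits"):
                hits += sample.value
    return queries, hits


# Example usage inside a test, assuming a RemoteOpenAIServer fixture:
#   queries, hits = _get_mm_cache_stats(server.url_for("metrics"))
#   hit_rate = hits / queries if queries else 0.0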
