 
 from ...utils import RemoteOpenAIServer
 
-MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+MODELS = {
+    "text": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "multimodal": "HuggingFaceTB/SmolVLM-256M-Instruct",
+}
 PREV_MINOR_VERSION = version._prev_minor_version()
 
 
-@pytest.fixture(scope="module", params=[True])
-def use_v1(request):
-    # Module-scoped variant of run_with_both_engines
-    #
-    # Use this fixture to run a test with both v0 and v1, and
-    # also to conditionalize the test logic e.g.
-    #
-    #   def test_metrics_exist(use_v1, server, client):
-    #       ...
-    #       expected = EXPECTED_V1_METRICS if use_v1 else EXPECTED_METRICS
-    #       for metric in expected:
-    #           assert metric in response.text
-    #
-    # @skip_v1 wouldn't work here because this is a module-level
-    # fixture - per-function decorators would have no effect
+@pytest.fixture(scope="module", params=list(MODELS.keys()))
+def model_key(request):
     yield request.param
 
 
@@ -63,13 +53,12 @@ def default_server_args():
         f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}",
     ],
 )
-def server(use_v1, default_server_args, request):
+def server(model_key, default_server_args, request):
     if request.param:
         default_server_args.append(request.param)
-    env_dict = dict(VLLM_USE_V1="1" if use_v1 else "0")
-    with RemoteOpenAIServer(
-        MODEL_NAME, default_server_args, env_dict=env_dict
-    ) as remote_server:
+
+    model_name = MODELS[model_key]
+    with RemoteOpenAIServer(model_name, default_server_args) as remote_server:
         yield remote_server
 
 
@@ -80,62 +69,71 @@ async def client(server):
 
 
 _PROMPT = "Hello my name is Robert and I love magic"
-tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-_TOKENIZED_PROMPT = tokenizer(_PROMPT)["input_ids"]
+_IMAGE_URL = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
 
 _NUM_REQUESTS = 10
-_NUM_PROMPT_TOKENS_PER_REQUEST = len(_TOKENIZED_PROMPT)
 _NUM_GENERATION_TOKENS_PER_REQUEST = 10
 
-# {metric_family: [(suffix, expected_value)]}
-EXPECTED_VALUES = {
-    "vllm:time_to_first_token_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:time_per_output_token_seconds": [
-        ("_count", _NUM_REQUESTS * (_NUM_GENERATION_TOKENS_PER_REQUEST - 1))
-    ],
-    "vllm:e2e_request_latency_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:request_queue_time_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:request_inference_time_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:request_prefill_time_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:request_decode_time_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:request_prompt_tokens": [
-        ("_sum", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST),
-        ("_count", _NUM_REQUESTS),
-    ],
-    "vllm:request_generation_tokens": [
-        ("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
-        ("_count", _NUM_REQUESTS),
-    ],
-    "vllm:request_params_n": [("_count", _NUM_REQUESTS)],
-    "vllm:request_params_max_tokens": [
-        ("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
-        ("_count", _NUM_REQUESTS),
-    ],
-    "vllm:iteration_tokens_total": [
-        (
-            "_sum",
-            _NUM_REQUESTS
-            * (_NUM_PROMPT_TOKENS_PER_REQUEST + _NUM_GENERATION_TOKENS_PER_REQUEST),
-        ),
-        ("_count", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
-    ],
-    "vllm:prompt_tokens": [("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
-    "vllm:generation_tokens": [
-        ("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)
-    ],
-    "vllm:request_success": [("_total", _NUM_REQUESTS)],
-}
+
+def _get_expected_values(prompt_ids: list[int]):
+    num_prompt_tokens = len(prompt_ids)
+
+    # {metric_family: [(suffix, expected_value)]}
+    return {
+        "vllm:time_to_first_token_seconds": [("_count", _NUM_REQUESTS)],
+        "vllm:time_per_output_token_seconds": [
+            ("_count", _NUM_REQUESTS * (_NUM_GENERATION_TOKENS_PER_REQUEST - 1))
+        ],
+        "vllm:e2e_request_latency_seconds": [("_count", _NUM_REQUESTS)],
+        "vllm:request_queue_time_seconds": [("_count", _NUM_REQUESTS)],
+        "vllm:request_inference_time_seconds": [("_count", _NUM_REQUESTS)],
+        "vllm:request_prefill_time_seconds": [("_count", _NUM_REQUESTS)],
+        "vllm:request_decode_time_seconds": [("_count", _NUM_REQUESTS)],
+        "vllm:request_prompt_tokens": [
+            ("_sum", _NUM_REQUESTS * num_prompt_tokens),
+            ("_count", _NUM_REQUESTS),
+        ],
+        "vllm:request_generation_tokens": [
+            ("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
+            ("_count", _NUM_REQUESTS),
+        ],
+        "vllm:request_params_n": [("_count", _NUM_REQUESTS)],
+        "vllm:request_params_max_tokens": [
+            ("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
+            ("_count", _NUM_REQUESTS),
+        ],
+        "vllm:iteration_tokens_total": [
+            (
+                "_sum",
+                _NUM_REQUESTS
+                * (num_prompt_tokens + _NUM_GENERATION_TOKENS_PER_REQUEST),
+            ),
+            ("_count", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
+        ],
+        "vllm:prompt_tokens": [("_total", _NUM_REQUESTS * num_prompt_tokens)],
+        "vllm:generation_tokens": [("_total", _NUM_REQUESTS * num_prompt_tokens)],
+        "vllm:request_success": [("_total", _NUM_REQUESTS)],
+    }
 
 
 @pytest.mark.asyncio
 async def test_metrics_counts(
-    server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
+    server: RemoteOpenAIServer,
+    client: openai.AsyncClient,
+    model_key: str,
 ):
+    if model_key == "multimodal":
+        pytest.skip("Unnecessary test")
+
+    model_name = MODELS[model_key]
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    prompt_ids = tokenizer.encode(_PROMPT)
+
     for _ in range(_NUM_REQUESTS):
         # sending a request triggers the metrics to be logged.
         await client.completions.create(
-            model=MODEL_NAME,
-            prompt=_TOKENIZED_PROMPT,
+            model=model_name,
+            prompt=prompt_ids,
             max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST,
         )
 
@@ -144,8 +142,9 @@ async def test_metrics_counts(
     assert response.status_code == HTTPStatus.OK
 
     # Loop over all expected metric_families
-    for metric_family, suffix_values_list in EXPECTED_VALUES.items():
-        if (use_v1 and metric_family not in EXPECTED_METRICS_V1) or (
+    expected_values = _get_expected_values(prompt_ids)
+    for metric_family, suffix_values_list in expected_values.items():
+        if metric_family not in EXPECTED_METRICS_V1 or (
             not server.show_hidden_metrics
             and metric_family in HIDDEN_DEPRECATED_METRICS
         ):
@@ -183,62 +182,6 @@ async def test_metrics_counts(
     assert found_metric, f"Did not find {metric_family} in prom endpoint"
 
 
-EXPECTED_METRICS = [
-    "vllm:num_requests_running",
-    "vllm:num_requests_waiting",
-    "vllm:gpu_cache_usage_perc",
-    "vllm:time_to_first_token_seconds_sum",
-    "vllm:time_to_first_token_seconds_bucket",
-    "vllm:time_to_first_token_seconds_count",
-    "vllm:time_per_output_token_seconds_sum",
-    "vllm:time_per_output_token_seconds_bucket",
-    "vllm:time_per_output_token_seconds_count",
-    "vllm:e2e_request_latency_seconds_sum",
-    "vllm:e2e_request_latency_seconds_bucket",
-    "vllm:e2e_request_latency_seconds_count",
-    "vllm:request_queue_time_seconds_sum",
-    "vllm:request_queue_time_seconds_bucket",
-    "vllm:request_queue_time_seconds_count",
-    "vllm:request_inference_time_seconds_sum",
-    "vllm:request_inference_time_seconds_bucket",
-    "vllm:request_inference_time_seconds_count",
-    "vllm:request_prefill_time_seconds_sum",
-    "vllm:request_prefill_time_seconds_bucket",
-    "vllm:request_prefill_time_seconds_count",
-    "vllm:request_decode_time_seconds_sum",
-    "vllm:request_decode_time_seconds_bucket",
-    "vllm:request_decode_time_seconds_count",
-    "vllm:request_prompt_tokens_sum",
-    "vllm:request_prompt_tokens_bucket",
-    "vllm:request_prompt_tokens_count",
-    "vllm:request_generation_tokens_sum",
-    "vllm:request_generation_tokens_bucket",
-    "vllm:request_generation_tokens_count",
-    "vllm:request_params_n_sum",
-    "vllm:request_params_n_bucket",
-    "vllm:request_params_n_count",
-    "vllm:request_params_max_tokens_sum",
-    "vllm:request_params_max_tokens_bucket",
-    "vllm:request_params_max_tokens_count",
-    "vllm:iteration_tokens_total",
-    "vllm:num_preemptions_total",
-    "vllm:prompt_tokens_total",
-    "vllm:generation_tokens_total",
-    "vllm:request_success_total",
-    "vllm:cache_config_info",
-    # labels in cache_config_info
-    "block_size",
-    "cache_dtype",
-    "cpu_offload_gb",
-    "enable_prefix_caching",
-    "gpu_memory_utilization",
-    "num_cpu_blocks",
-    "num_gpu_blocks",
-    "num_gpu_blocks_override",
-    "sliding_window",
-    "swap_space_bytes",
-]
-
 EXPECTED_METRICS_V1 = [
     "vllm:num_requests_running",
     "vllm:num_requests_waiting",
@@ -292,6 +235,11 @@ async def test_metrics_counts(
     "vllm:request_decode_time_seconds_count",
 ]
 
+EXPECTED_METRICS_MM = [
+    "vllm:mm_cache_queries",
+    "vllm:mm_cache_hits",
+]
+
 HIDDEN_DEPRECATED_METRICS: list[str] = [
     "vllm:gpu_cache_usage_perc",
     "vllm:gpu_prefix_cache_queries",
@@ -304,28 +252,58 @@ async def test_metrics_counts(
 
 @pytest.mark.asyncio
 async def test_metrics_exist(
-    server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
+    server: RemoteOpenAIServer,
+    client: openai.AsyncClient,
+    model_key: str,
 ):
+    model_name = MODELS[model_key]
+
     # sending a request triggers the metrics to be logged.
-    await client.completions.create(
-        model=MODEL_NAME, prompt="Hello, my name is", max_tokens=5, temperature=0.0
-    )
+    if model_key == "text":
+        await client.completions.create(
+            model=model_name, prompt="Hello, my name is", max_tokens=5, temperature=0.0
+        )
+    else:
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": _IMAGE_URL}},
+                    {"type": "text", "text": "What's in this image?"},
+                ],
+            }
+        ]
+
+        await client.chat.completions.create(
+            model=model_name, messages=messages, max_tokens=5, temperature=0.0
+        )
 
     response = requests.get(server.url_for("metrics"))
     assert response.status_code == HTTPStatus.OK
 
-    for metric in EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS:
+    expected_metrics = EXPECTED_METRICS_V1
+    if model_key == "multimodal":
+        # NOTE: Don't use in-place assignment
+        expected_metrics = expected_metrics + EXPECTED_METRICS_MM
+
+    for metric in expected_metrics:
         if metric in HIDDEN_DEPRECATED_METRICS and not server.show_hidden_metrics:
             continue
         assert metric in response.text
 
 
 @pytest.mark.asyncio
 async def test_abort_metrics_reset(
-    server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
+    server: RemoteOpenAIServer,
+    client: openai.AsyncClient,
+    model_key: str,
 ):
+    model_name = MODELS[model_key]
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    prompt_ids = tokenizer.encode(_PROMPT)
+
     running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
-        server, use_v1
+        server,
     )
 
     # Expect no running requests or kvcache usage
@@ -338,8 +316,8 @@ async def test_abort_metrics_reset(
     for _ in range(3):
         task = asyncio.create_task(
             client.completions.create(
-                model=MODEL_NAME,
-                prompt=_TOKENIZED_PROMPT,
+                model=model_name,
+                prompt=prompt_ids,
                 max_tokens=100,  # Long generation to give time to abort
                 temperature=0.0,
             )
@@ -351,7 +329,7 @@ async def test_abort_metrics_reset(
 
     # Check that we have running requests
     running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
-        server, use_v1
+        server,
     )
 
     # Expect running requests and kvcache usage
@@ -371,7 +349,7 @@ async def test_abort_metrics_reset(
 
     # Verify running and waiting requests counts and KV cache usage are zero
     running_requests_after, waiting_requests_after, kv_cache_usage_after = (
-        _get_running_metrics_from_api(server, use_v1)
+        _get_running_metrics_from_api(server)
     )
 
     assert running_requests_after == 0, (
@@ -385,7 +363,7 @@ async def test_abort_metrics_reset(
     )
 
 
-def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
+def _get_running_metrics_from_api(server: RemoteOpenAIServer):
     """Return (running_count, waiting_count, kv_cache_usage)"""
 
     response = requests.get(server.url_for("metrics"))
@@ -394,9 +372,7 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
     # Verify running and waiting requests counts and KV cache usage are zero
     running_requests, waiting_requests, kv_cache_usage = None, None, None
 
-    kv_cache_usage_metric = (
-        "vllm:kv_cache_usage_perc" if use_v1 else "vllm:gpu_cache_usage_perc"
-    )
+    kv_cache_usage_metric = "vllm:kv_cache_usage_perc"
 
     for family in text_string_to_metric_families(response.text):
         if family.name == "vllm:num_requests_running":
@@ -422,7 +398,7 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
     return running_requests, waiting_requests, kv_cache_usage
 
 
-def test_metrics_exist_run_batch(use_v1: bool):
+def test_metrics_exist_run_batch():
    input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}"""  # noqa: E501
 
     base_url = "0.0.0.0"
@@ -452,7 +428,6 @@ def test_metrics_exist_run_batch(use_v1: bool):
                 "--port",
                 port,
             ],
-            env={"VLLM_USE_V1": "1"},
         )
 
         def is_server_up(url):