
Commit 2942970

[Metrics] Hide deprecated metrics with gpu_ prefix (vllm-project#24245)

Authored by Mark McLoughlin
Signed-off-by: Mark McLoughlin <[email protected]>
1 parent 3c96e7b · commit 2942970

File tree: 2 files changed, +63 −46 lines

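In short, the three deprecated gpu_-prefixed metrics (vllm:gpu_cache_usage_perc, vllm:gpu_prefix_cache_queries, vllm:gpu_prefix_cache_hits) are no longer registered or recorded by default; per the comments in the diff they come back only when hidden metrics are re-enabled with --show-hidden-metrics-for-version=0.10. Their renamed replacements (vllm:kv_cache_usage_perc, vllm:prefix_cache_queries, vllm:prefix_cache_hits) stay exposed unconditionally. Below is a minimal sketch of how you might verify this against a running server, using the same prometheus_client parser the tests use; the server URL is an assumption, adjust it for your deployment.

```python
# Minimal sketch: check that the deprecated gpu_-prefixed metrics are hidden.
# Assumes a vLLM server listening on localhost:8000 (adjust as needed).
import requests
from prometheus_client.parser import text_string_to_metric_families

DEPRECATED = {
    "vllm:gpu_cache_usage_perc",
    "vllm:gpu_prefix_cache_queries",
    "vllm:gpu_prefix_cache_hits",
}

text = requests.get("http://localhost:8000/metrics").text
exposed = {family.name for family in text_string_to_metric_families(text)}

# After this commit the deprecated names should be absent by default...
assert not exposed & DEPRECATED, f"unexpectedly exposed: {exposed & DEPRECATED}"
# ...while the renamed replacement is still present.
assert "vllm:kv_cache_usage_perc" in exposed
```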

tests/entrypoints/openai/test_metrics.py — 15 additions & 6 deletions

@@ -232,6 +232,9 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
     "vllm:gpu_cache_usage_perc",
     "vllm:gpu_prefix_cache_queries",
     "vllm:gpu_prefix_cache_hits",
+    "vllm:kv_cache_usage_perc",
+    "vllm:prefix_cache_queries",
+    "vllm:prefix_cache_hits",
     "vllm:num_preemptions_total",
     "vllm:prompt_tokens_total",
     "vllm:generation_tokens_total",
@@ -277,6 +280,9 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
 ]
 
 HIDDEN_DEPRECATED_METRICS: list[str] = [
+    "vllm:gpu_cache_usage_perc",
+    "vllm:gpu_prefix_cache_queries",
+    "vllm:gpu_prefix_cache_hits",
     "vllm:time_per_output_token_seconds_sum",
     "vllm:time_per_output_token_seconds_bucket",
     "vllm:time_per_output_token_seconds_count",
@@ -307,7 +313,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
                                    client: openai.AsyncClient, use_v1: bool):
 
     running_requests, waiting_requests, kv_cache_usage = (
-        _get_running_metrics_from_api(server))
+        _get_running_metrics_from_api(server, use_v1))
 
     # Expect no running requests or kvcache usage
     assert running_requests == 0
@@ -330,7 +336,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
 
     # Check that we have running requests
     running_requests, waiting_requests, kv_cache_usage = (
-        _get_running_metrics_from_api(server))
+        _get_running_metrics_from_api(server, use_v1))
 
     # Expect running requests and kvcache usage
     assert running_requests > 0
@@ -349,7 +355,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
 
     # Verify running and waiting requests counts and KV cache usage are zero
     running_requests_after, waiting_requests_after, kv_cache_usage_after = (
-        _get_running_metrics_from_api(server))
+        _get_running_metrics_from_api(server, use_v1))
 
     assert running_requests_after == 0,\
         (f"Expected 0 running requests after abort, got "
@@ -362,7 +368,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
         f"{kv_cache_usage_after}")
 
 
-def _get_running_metrics_from_api(server: RemoteOpenAIServer):
+def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
     """Return (running_count, waiting_count, kv_cache_usage)"""
 
     response = requests.get(server.url_for("metrics"))
@@ -371,6 +377,9 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer):
     # Verify running and waiting requests counts and KV cache usage are zero
     running_requests, waiting_requests, kv_cache_usage = None, None, None
 
+    kv_cache_usage_metric = ("vllm:kv_cache_usage_perc"
+                             if use_v1 else "vllm:gpu_cache_usage_perc")
+
     for family in text_string_to_metric_families(response.text):
         if family.name == "vllm:num_requests_running":
             for sample in family.samples:
@@ -382,9 +391,9 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer):
             if sample.name == "vllm:num_requests_waiting":
                 waiting_requests = sample.value
                 break
-        elif family.name == "vllm:gpu_cache_usage_perc":
+        elif family.name == kv_cache_usage_metric:
             for sample in family.samples:
-                if sample.name == "vllm:gpu_cache_usage_perc":
+                if sample.name == kv_cache_usage_metric:
                     kv_cache_usage = sample.value
                     break
 
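Stitched together from the hunks above, the updated helper reads roughly as follows. This is a reconstruction, not the verbatim file: the imports, the status-code check, and the bodies of the first two metric families sit outside the visible diff context, so those lines are assumptions.

```python
# Reconstruction of the updated test helper. Lines outside the visible hunks
# (imports, the status-code assertion, and the num_requests_running /
# num_requests_waiting branches) are filled in by assumption.
# RemoteOpenAIServer comes from the vLLM test suite's utilities.
from http import HTTPStatus

import requests
from prometheus_client.parser import text_string_to_metric_families


def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
    """Return (running_count, waiting_count, kv_cache_usage)"""

    response = requests.get(server.url_for("metrics"))
    assert response.status_code == HTTPStatus.OK  # assumed check

    running_requests, waiting_requests, kv_cache_usage = None, None, None

    # V1 exposes the renamed metric; otherwise fall back to the old
    # gpu_-prefixed name.
    kv_cache_usage_metric = ("vllm:kv_cache_usage_perc"
                             if use_v1 else "vllm:gpu_cache_usage_perc")

    for family in text_string_to_metric_families(response.text):
        if family.name == "vllm:num_requests_running":
            for sample in family.samples:
                if sample.name == "vllm:num_requests_running":
                    running_requests = sample.value
                    break
        elif family.name == "vllm:num_requests_waiting":
            for sample in family.samples:
                if sample.name == "vllm:num_requests_waiting":
                    waiting_requests = sample.value
                    break
        elif family.name == kv_cache_usage_metric:
            for sample in family.samples:
                if sample.name == kv_cache_usage_metric:
                    kv_cache_usage = sample.value
                    break

    return running_requests, waiting_requests, kv_cache_usage
```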
vllm/v1/metrics/loggers.py — 48 additions & 40 deletions

@@ -202,40 +202,46 @@ def __init__(self,
         #
         # GPU cache
         #
-        # Deprecated in 0.9 - Renamed as vllm:kv_cache_usage_perc
-        # TODO: in 0.10, only enable if show_hidden_metrics=True
-        gauge_gpu_cache_usage = self._gauge_cls(
-            name="vllm:gpu_cache_usage_perc",
-            documentation=(
-                "GPU KV-cache usage. 1 means 100 percent usage."
-                "DEPRECATED: Use vllm:kv_cache_usage_perc instead."),
-            multiprocess_mode="mostrecent",
-            labelnames=labelnames)
-        self.gauge_gpu_cache_usage = make_per_engine(gauge_gpu_cache_usage,
-                                                     engine_indexes,
-                                                     model_name)
-
-        # Deprecated in 0.9 - Renamed as vllm:prefix_cache_queries
-        # TODO: in 0.10, only enable if show_hidden_metrics=True
-        counter_gpu_prefix_cache_queries = self._counter_cls(
-            name="vllm:gpu_prefix_cache_queries",
-            documentation=(
-                "GPU prefix cache queries, in terms of number of queried"
-                "tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."),
-            labelnames=labelnames)
-        self.counter_gpu_prefix_cache_queries = make_per_engine(
-            counter_gpu_prefix_cache_queries, engine_indexes, model_name)
-
-        # Deprecated in 0.9 - Renamed as vllm:prefix_cache_hits
-        # TODO: in 0.10, only enable if show_hidden_metrics=True
-        counter_gpu_prefix_cache_hits = self._counter_cls(
-            name="vllm:gpu_prefix_cache_hits",
-            documentation=(
-                "GPU prefix cache hits, in terms of number of cached "
-                "tokens. DEPRECATED: Use vllm:prefix_cache_hits instead."),
-            labelnames=labelnames)
-        self.counter_gpu_prefix_cache_hits = make_per_engine(
-            counter_gpu_prefix_cache_hits, engine_indexes, model_name)
+        # Deprecated in 0.9.2 - Renamed as vllm:kv_cache_usage_perc
+        # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
+        # TODO: remove in 0.12.0
+        if self.show_hidden_metrics:
+            gauge_gpu_cache_usage = self._gauge_cls(
+                name="vllm:gpu_cache_usage_perc",
+                documentation=(
+                    "GPU KV-cache usage. 1 means 100 percent usage."
+                    "DEPRECATED: Use vllm:kv_cache_usage_perc instead."),
+                multiprocess_mode="mostrecent",
+                labelnames=labelnames)
+            self.gauge_gpu_cache_usage = make_per_engine(
+                gauge_gpu_cache_usage, engine_indexes, model_name)
+
+        # Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_queries
+        # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
+        # TODO: remove in 0.12.0
+        if self.show_hidden_metrics:
+            counter_gpu_prefix_cache_queries = self._counter_cls(
+                name="vllm:gpu_prefix_cache_queries",
+                documentation=(
+                    "GPU prefix cache queries, in terms of number of queried"
+                    "tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."
+                ),
+                labelnames=labelnames)
+            self.counter_gpu_prefix_cache_queries = make_per_engine(
+                counter_gpu_prefix_cache_queries, engine_indexes, model_name)
+
+        # Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_hits
+        # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
+        # TODO: remove in 0.12.0
+        if self.show_hidden_metrics:
+            counter_gpu_prefix_cache_hits = self._counter_cls(
+                name="vllm:gpu_prefix_cache_hits",
+                documentation=(
+                    "GPU prefix cache hits, in terms of number of cached "
+                    "tokens. DEPRECATED: Use vllm:prefix_cache_hits instead."),
+                labelnames=labelnames)
+            self.counter_gpu_prefix_cache_hits = make_per_engine(
+                counter_gpu_prefix_cache_hits, engine_indexes, model_name)
 
         gauge_kv_cache_usage = self._gauge_cls(
             name="vllm:kv_cache_usage_perc",
@@ -509,15 +515,17 @@ def record(self,
             self.gauge_scheduler_waiting[engine_idx].set(
                 scheduler_stats.num_waiting_reqs)
 
-            self.gauge_gpu_cache_usage[engine_idx].set(
-                scheduler_stats.kv_cache_usage)
+            if self.show_hidden_metrics:
+                self.gauge_gpu_cache_usage[engine_idx].set(
+                    scheduler_stats.kv_cache_usage)
             self.gauge_kv_cache_usage[engine_idx].set(
                 scheduler_stats.kv_cache_usage)
 
-            self.counter_gpu_prefix_cache_queries[engine_idx].inc(
-                scheduler_stats.prefix_cache_stats.queries)
-            self.counter_gpu_prefix_cache_hits[engine_idx].inc(
-                scheduler_stats.prefix_cache_stats.hits)
+            if self.show_hidden_metrics:
+                self.counter_gpu_prefix_cache_queries[engine_idx].inc(
+                    scheduler_stats.prefix_cache_stats.queries)
+                self.counter_gpu_prefix_cache_hits[engine_idx].inc(
+                    scheduler_stats.prefix_cache_stats.hits)
 
             self.counter_prefix_cache_queries[engine_idx].inc(
                 scheduler_stats.prefix_cache_stats.queries)
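Both loggers.py hunks follow the same pattern: the deprecated instrument is created and updated only when self.show_hidden_metrics is true, while its renamed replacement is handled unconditionally. A standalone sketch of that pattern using plain prometheus_client is shown below; vLLM wraps these classes behind self._gauge_cls and friends, so build_metrics and its names here are illustrative, not vLLM's actual API.

```python
# Standalone sketch of the gating pattern: register the renamed metric
# always, and the deprecated alias only behind an opt-in flag.
from prometheus_client import CollectorRegistry, Gauge, generate_latest


def build_metrics(show_hidden_metrics: bool) -> CollectorRegistry:
    registry = CollectorRegistry()

    # The renamed metric is always registered and always recorded.
    Gauge("vllm:kv_cache_usage_perc",
          "KV-cache usage. 1 means 100 percent usage.",
          registry=registry).set(0.25)

    # The deprecated alias exists only when hidden metrics are re-enabled
    # (in vLLM, per the diff comments, via --show-hidden-metrics-for-version).
    if show_hidden_metrics:
        Gauge("vllm:gpu_cache_usage_perc",
              "DEPRECATED: Use vllm:kv_cache_usage_perc instead.",
              registry=registry).set(0.25)

    return registry


# Only vllm:kv_cache_usage_perc appears:
print(generate_latest(build_metrics(show_hidden_metrics=False)).decode())
# Both names appear:
print(generate_latest(build_metrics(show_hidden_metrics=True)).decode())
```

Keeping the renamed metric unconditional while gating only the alias gives dashboards a migration window: scrapes keep working against either name during the deprecation period, and the alias can be dropped outright in 0.12.0 per the TODOs.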
