
Commit 0e98964

[V1][Metrics] Remove metrics that were deprecated in 0.8 (vllm-project#18837)
Signed-off-by: Mark McLoughlin <[email protected]>
1 parent c68b5c6 commit 0e98964

File tree: 6 files changed, +1 −156 lines


docs/usage/metrics.md

Lines changed: 0 additions & 13 deletions
@@ -35,19 +35,6 @@ The following metrics are exposed:
 --8<-- "vllm/engine/metrics.py:metrics-definitions"
 ```
 
-The following metrics are deprecated and due to be removed in a future version:
-
-- `vllm:num_requests_swapped`, `vllm:cpu_cache_usage_perc`, and
-  `vllm:cpu_prefix_cache_hit_rate` because KV cache offloading is not
-  used in V1.
-- `vllm:gpu_prefix_cache_hit_rate` is replaced by queries+hits
-  counters in V1.
-- `vllm:time_in_queue_requests` because it duplicates
-  `vllm:request_queue_time_seconds`.
-- `vllm:model_forward_time_milliseconds` and
-  `vllm:model_execute_time_milliseconds` because
-  prefill/decode/inference time metrics should be used instead.
-
 Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`
 but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch,
 and are then removed in version `X.Y+2`.
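
The note retained above is now the whole deprecation contract, since the per-metric list is gone. As a rough illustration of that hide-then-remove policy (a sketch only, assuming the `packaging` library and same-major versions, not vLLM's actual implementation):

```python
from packaging.version import Version

def deprecated_metric_visible(deprecated_in: str, current: str,
                              show_hidden_for: str | None) -> bool:
    """Sketch of the documented policy: deprecated in X.Y, hidden in X.Y+1
    (re-enable with --show-hidden-metrics-for-version=X.Y), removed in X.Y+2."""
    dep, cur = Version(deprecated_in), Version(current)
    # Simplifying assumption: same major version, so only the minor matters.
    if cur.minor <= dep.minor:
        return True   # still exported in the deprecating release
    if cur.minor >= dep.minor + 2:
        return False  # removed outright (this commit); the flag no longer helps
    # exactly one minor release later: hidden unless the operator opts back in
    return show_hidden_for is not None and Version(show_hidden_for) == dep

# e.g. deprecated_metric_visible("0.8", "0.9", show_hidden_for="0.8") -> True
#      deprecated_metric_visible("0.8", "0.10", show_hidden_for="0.8") -> False
```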

examples/online_serving/prometheus_grafana/grafana.json

Lines changed: 0 additions & 30 deletions
@@ -577,23 +577,6 @@
           "refId": "A",
           "useBackend": false
         },
-        {
-          "datasource": {
-            "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
-          },
-          "disableTextWrap": false,
-          "editorMode": "builder",
-          "expr": "vllm:num_requests_swapped{model_name=\"$model_name\"}",
-          "fullMetaSearch": false,
-          "hide": false,
-          "includeNullMetadata": true,
-          "instant": false,
-          "legendFormat": "Num Swapped",
-          "range": true,
-          "refId": "B",
-          "useBackend": false
-        },
         {
           "datasource": {
             "type": "prometheus",
@@ -874,19 +857,6 @@
           "legendFormat": "GPU Cache Usage",
           "range": true,
           "refId": "A"
-        },
-        {
-          "datasource": {
-            "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
-          },
-          "editorMode": "code",
-          "expr": "vllm:cpu_cache_usage_perc{model_name=\"$model_name\"}",
-          "hide": false,
-          "instant": false,
-          "legendFormat": "CPU Cache Usage",
-          "range": true,
-          "refId": "B"
         }
       ],
       "title": "Cache Utilization",

tests/entrypoints/openai/test_metrics.py

Lines changed: 1 addition & 6 deletions
@@ -171,10 +171,8 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
 
 EXPECTED_METRICS = [
     "vllm:num_requests_running",
-    "vllm:num_requests_swapped",  # deprecated
     "vllm:num_requests_waiting",
     "vllm:gpu_cache_usage_perc",
-    "vllm:cpu_cache_usage_perc",  # deprecated
     "vllm:time_to_first_token_seconds_sum",
     "vllm:time_to_first_token_seconds_bucket",
     "vllm:time_to_first_token_seconds_count",
@@ -274,10 +272,7 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
     "vllm:request_decode_time_seconds_count",
 ]
 
-HIDDEN_DEPRECATED_METRICS = [
-    "vllm:num_requests_swapped",
-    "vllm:cpu_cache_usage_perc",
-]
+HIDDEN_DEPRECATED_METRICS: list[str] = []
 
 
 @pytest.mark.asyncio
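
For context, the shape of the check this test performs can be sketched as a standalone scrape of the server's Prometheus endpoint; the URL and the parsing below are illustrative assumptions, not the test's actual fixtures:

```python
import requests

def exposed_metric_names(base_url: str = "http://localhost:8000") -> set[str]:
    """Scrape the Prometheus text exposition and collect sample names."""
    text = requests.get(f"{base_url}/metrics", timeout=10).text
    names: set[str] = set()
    for line in text.splitlines():
        if line and not line.startswith("#"):
            names.add(line.split("{")[0].split(" ")[0])
    return names

names = exposed_metric_names()
assert "vllm:num_requests_running" in names
# Removed in this commit, so it should no longer appear even with the
# --show-hidden-metrics-for-version escape hatch:
assert "vllm:num_requests_swapped" not in names
```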

vllm/engine/llm_engine.py

Lines changed: 0 additions & 15 deletions
@@ -1680,9 +1680,6 @@ def _get_stats(self,
         time_inference_requests: List[float] = []
         time_prefill_requests: List[float] = []
         time_decode_requests: List[float] = []
-        time_in_queue_requests: List[float] = []
-        model_forward_time_requests: List[float] = []
-        model_execute_time_requests: List[float] = []
         # Metadata
         num_prompt_tokens_requests: List[int] = []
         num_generation_tokens_requests: List[int] = []
@@ -1790,15 +1787,6 @@ def _get_stats(self,
                             now - seq_group.metrics.first_token_time)
                         time_inference_requests.append(
                             now - seq_group.metrics.first_scheduled_time)
-                    if seq_group.metrics.time_in_queue is not None:
-                        time_in_queue_requests.append(
-                            seq_group.metrics.time_in_queue)
-                    if seq_group.metrics.model_forward_time is not None:
-                        model_forward_time_requests.append(
-                            seq_group.metrics.model_forward_time)
-                    if seq_group.metrics.model_execute_time is not None:
-                        model_execute_time_requests.append(
-                            seq_group.metrics.model_execute_time * 1000)
                     # Metadata
                     num_prompt_tokens_requests.append(
                         len(seq_group.prompt_token_ids))
@@ -1867,9 +1855,6 @@ def _get_stats(self,
             time_inference_requests=time_inference_requests,
             time_prefill_requests=time_prefill_requests,
             time_decode_requests=time_decode_requests,
-            time_in_queue_requests=time_in_queue_requests,
-            model_forward_time_requests=model_forward_time_requests,
-            model_execute_time_requests=model_execute_time_requests,
             # Metadata
             num_prompt_tokens_requests=num_prompt_tokens_requests,
             num_generation_tokens_requests=num_generation_tokens_requests,
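
What survives in `_get_stats` is the pattern of appending one interval per finished request and shipping the lists to the loggers. A reduced, self-contained sketch of that accumulation, using a hypothetical `RequestMetrics` stand-in rather than vLLM's sequence-group metrics:

```python
from dataclasses import dataclass
from typing import Dict, List
import time

@dataclass
class RequestMetrics:  # hypothetical stand-in for seq_group.metrics
    first_scheduled_time: float
    first_token_time: float

def collect_intervals(finished: List[RequestMetrics]) -> Dict[str, List[float]]:
    now = time.time()
    time_prefill_requests: List[float] = []
    time_inference_requests: List[float] = []
    for m in finished:
        # Prefill spans scheduling -> first token; inference spans
        # scheduling -> completion, mirroring the retained metrics.
        time_prefill_requests.append(m.first_token_time - m.first_scheduled_time)
        time_inference_requests.append(now - m.first_scheduled_time)
    return {
        "time_prefill_requests": time_prefill_requests,
        "time_inference_requests": time_inference_requests,
    }
```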

vllm/engine/metrics.py

Lines changed: 0 additions & 89 deletions
@@ -80,53 +80,13 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
             multiprocess_mode="livemostrecent",
         )
 
-        # Deprecated in 0.8 - KV cache offloading is not used in V1
-        # Hidden in 0.9, due to be removed in 0.10
-        if self.show_hidden_metrics:
-            self.gauge_scheduler_swapped = self._gauge_cls(
-                name="vllm:num_requests_swapped",
-                documentation=(
-                    "Number of requests swapped to CPU. "
-                    "DEPRECATED: KV cache offloading is not used in V1"),
-                labelnames=labelnames,
-                multiprocess_mode="sum")
-
         # KV Cache Usage in %
         self.gauge_gpu_cache_usage = self._gauge_cls(
             name="vllm:gpu_cache_usage_perc",
             documentation="GPU KV-cache usage. 1 means 100 percent usage.",
             labelnames=labelnames,
             multiprocess_mode="sum")
 
-        # Deprecated in 0.8 - KV cache offloading is not used in V1
-        # Hidden in 0.9, due to be removed in 0.10
-        if self.show_hidden_metrics:
-            self.gauge_cpu_cache_usage = self._gauge_cls(
-                name="vllm:cpu_cache_usage_perc",
-                documentation=(
-                    "CPU KV-cache usage. 1 means 100 percent usage. "
-                    "DEPRECATED: KV cache offloading is not used in V1"),
-                labelnames=labelnames,
-                multiprocess_mode="sum")
-            self.gauge_cpu_prefix_cache_hit_rate = self._gauge_cls(
-                name="vllm:cpu_prefix_cache_hit_rate",
-                documentation=(
-                    "CPU prefix cache block hit rate. "
-                    "DEPRECATED: KV cache offloading is not used in V1"),
-                labelnames=labelnames,
-                multiprocess_mode="sum")
-
-        # Deprecated in 0.8 - replaced by queries+hits counters in V1
-        # Hidden in 0.9, due to be removed in 0.10
-        if self.show_hidden_metrics:
-            self.gauge_gpu_prefix_cache_hit_rate = self._gauge_cls(
-                name="vllm:gpu_prefix_cache_hit_rate",
-                documentation=("GPU prefix cache block hit rate. "
-                               "DEPRECATED: use vllm:gpu_prefix_cache_queries "
-                               "and vllm:gpu_prefix_cache_queries in V1"),
-                labelnames=labelnames,
-                multiprocess_mode="sum")
-
         # Iteration stats
         self.counter_num_preemption = self._counter_cls(
             name="vllm:num_preemptions_total",
@@ -200,36 +160,6 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
             "Histogram of time spent in DECODE phase for request.",
             labelnames=labelnames,
             buckets=request_latency_buckets)
-        # Deprecated in 0.8 - duplicates vllm:request_queue_time_seconds:
-        # Hidden in 0.9, due to be removed in 0.10
-        if self.show_hidden_metrics:
-            self.histogram_time_in_queue_request = self._histogram_cls(
-                name="vllm:time_in_queue_requests",
-                documentation=
-                ("Histogram of time the request spent in the queue in seconds. "
-                 "DEPRECATED: use vllm:request_queue_time_seconds instead."),
-                labelnames=labelnames,
-                buckets=request_latency_buckets)
-
-        # Deprecated in 0.8 - use prefill/decode/inference time metrics
-        # Hidden in 0.9, due to be removed in 0.10
-        if self.show_hidden_metrics:
-            self.histogram_model_forward_time_request = self._histogram_cls(
-                name="vllm:model_forward_time_milliseconds",
-                documentation=
-                ("Histogram of time spent in the model forward pass in ms. "
-                 "DEPRECATED: use prefill/decode/inference time metrics instead"
-                 ),
-                labelnames=labelnames,
-                buckets=build_1_2_3_5_8_buckets(3000))
-            self.histogram_model_execute_time_request = self._histogram_cls(
-                name="vllm:model_execute_time_milliseconds",
-                documentation=
-                ("Histogram of time spent in the model execute function in ms."
-                 "DEPRECATED: use prefill/decode/inference time metrics instead"
-                 ),
-                labelnames=labelnames,
-                buckets=build_1_2_3_5_8_buckets(3000))
 
         # Metadata
         self.histogram_num_prompt_tokens_request = self._histogram_cls(
@@ -580,20 +510,10 @@ def _log_prometheus(self, stats: Stats) -> None:
         # System state data
         self._log_gauge(self.metrics.gauge_scheduler_running,
                         stats.num_running_sys)
-        if self.metrics.show_hidden_metrics:
-            self._log_gauge(self.metrics.gauge_scheduler_swapped,
-                            stats.num_swapped_sys)
         self._log_gauge(self.metrics.gauge_scheduler_waiting,
                         stats.num_waiting_sys)
         self._log_gauge(self.metrics.gauge_gpu_cache_usage,
                         stats.gpu_cache_usage_sys)
-        if self.metrics.show_hidden_metrics:
-            self._log_gauge(self.metrics.gauge_cpu_cache_usage,
-                            stats.cpu_cache_usage_sys)
-            self._log_gauge(self.metrics.gauge_cpu_prefix_cache_hit_rate,
-                            stats.cpu_prefix_cache_hit_rate)
-            self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate,
-                            stats.gpu_prefix_cache_hit_rate)
         # Including max-lora in metric, in future this property of lora
         # config maybe extended to be dynamic.
         lora_info = {
@@ -631,15 +551,6 @@ def _log_prometheus(self, stats: Stats) -> None:
                             stats.time_prefill_requests)
         self._log_histogram(self.metrics.histogram_decode_time_request,
                             stats.time_decode_requests)
-        if self.metrics.show_hidden_metrics:
-            self._log_histogram(self.metrics.histogram_time_in_queue_request,
-                                stats.time_in_queue_requests)
-            self._log_histogram(
-                self.metrics.histogram_model_forward_time_request,
-                stats.model_forward_time_requests)
-            self._log_histogram(
-                self.metrics.histogram_model_execute_time_request,
-                stats.model_execute_time_requests)
         # Metadata
         finished_reason_counter = CollectionsCounter(
             stats.finished_reason_requests)
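
Every removed block above follows one pattern: create the deprecated collector only when `show_hidden_metrics` is set, and guard the matching `_log_gauge`/`_log_histogram` call with the same flag. A minimal sketch of that pattern with plain `prometheus_client` rather than vLLM's `_gauge_cls` wrappers:

```python
from typing import List, Optional
from prometheus_client import Gauge

def make_hidden_gauge(show_hidden_metrics: bool,
                      labelnames: List[str]) -> Optional[Gauge]:
    """Register a deprecated gauge only when the operator opts back in."""
    if not show_hidden_metrics:
        return None  # hidden release: the collector is simply never created
    return Gauge(
        "vllm:num_requests_swapped",
        "Number of requests swapped to CPU. "
        "DEPRECATED: KV cache offloading is not used in V1",
        labelnames,
    )

# At logging time the same guard is repeated, e.g.:
# if gauge is not None:
#     gauge.labels(model_name="my-model").set(num_swapped)
```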

vllm/engine/metrics_types.py

Lines changed: 0 additions & 3 deletions
@@ -53,9 +53,6 @@ class Stats:
     time_inference_requests: List[float]
     time_prefill_requests: List[float]
     time_decode_requests: List[float]
-    time_in_queue_requests: List[float]
-    model_forward_time_requests: List[float]
-    model_execute_time_requests: List[float]
     # Metadata
     num_prompt_tokens_requests: List[int]
     num_generation_tokens_requests: List[int]
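
After this change the per-request timing section of `Stats` carries only the prefill/decode/inference lists. An illustrative trimmed-down dataclass (not the real `Stats` definition) showing what remains:

```python
from dataclasses import dataclass, field
from typing import List

@dataclass
class StatsSketch:
    # Request-level timings that remain after this commit.
    time_inference_requests: List[float] = field(default_factory=list)
    time_prefill_requests: List[float] = field(default_factory=list)
    time_decode_requests: List[float] = field(default_factory=list)
    # time_in_queue_requests, model_forward_time_requests and
    # model_execute_time_requests were removed; queue time is covered by
    # vllm:request_queue_time_seconds.
```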
