Skip to content

Commit 1cb26b7

Browse files
author
ea.pav
committed
remove deprecated metrics, fix namings
1 parent 6be53bd commit 1cb26b7

File tree

2 files changed

+18
-78
lines changed

2 files changed

+18
-78
lines changed

README.md

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -227,16 +227,8 @@ VLLM stats are reported by the metrics endpoint in fields that are prefixed with
227227
gauge_scheduler_running
228228
# Number of requests waiting to be processed.
229229
gauge_scheduler_waiting
230-
# Number of requests swapped to CPU.
231-
gauge_scheduler_swapped
232230
# GPU KV-cache usage. 1 means 100 percent usage.
233231
gauge_gpu_cache_usage
234-
# CPU KV-cache usage. 1 means 100 percent usage.
235-
gauge_cpu_cache_usage
236-
# CPU prefix cache block hit rate.
237-
gauge_cpu_prefix_cache_hit_rate
238-
# GPU prefix cache block hit rate.
239-
gauge_gpu_prefix_cache_hit_rate
240232
# Number of prefill tokens processed.
241233
counter_prompt_tokens
242234
# Number of generation tokens processed.
@@ -285,21 +277,9 @@ vllm:num_requests_running{model="vllm_model",version="1"} 0
285277
# HELP vllm:num_requests_waiting Number of requests waiting to be processed.
286278
# TYPE vllm:num_requests_waiting gauge
287279
vllm:num_requests_waiting{model="vllm_model",version="1"} 0
288-
# HELP vllm:num_requests_swapped Number of requests swapped to CPU.
289-
# TYPE vllm:num_requests_swapped gauge
290-
vllm:num_requests_swapped{model="vllm_model",version="1"} 0
291280
# HELP vllm:gpu_cache_usage_perc Gauge of gpu cache usage. 1 means 100 percent usage.
292281
# TYPE vllm:gpu_cache_usage_perc gauge
293282
vllm:gpu_cache_usage_perc{model="vllm_model",version="1"} 0
294-
# HELP vllm:cpu_cache_usage_perc Gauge of cpu cache usage. 1 means 100 percent usage.
295-
# TYPE vllm:cpu_cache_usage_perc gauge
296-
vllm:cpu_cache_usage_perc{model="vllm_model",version="1"} 0
297-
# HELP vllm:cpu_prefix_cache_hit_rate CPU prefix cache block hit rate.
298-
# TYPE vllm:cpu_prefix_cache_hit_rate gauge
299-
vllm:cpu_prefix_cache_hit_rate{model="vllm_model",version="1"} -1
300-
# HELP vllm:gpu_prefix_cache_hit_rate GPU prefix cache block hit rate.
301-
# TYPE vllm:gpu_prefix_cache_hit_rate gauge
302-
vllm:gpu_prefix_cache_hit_rate{model="vllm_model",version="1"} -1
303283
# HELP vllm:iteration_tokens_total Histogram of number of tokens per engine_step.
304284
# TYPE vllm:iteration_tokens_total histogram
305285
vllm:iteration_tokens_total_count{model="vllm_model",version="1"} 10

src/utils/metrics.py

Lines changed: 18 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -49,12 +49,12 @@ def __init__(self, labels: List[str], max_model_len: int):
4949
description="Number of generation tokens processed.",
5050
kind=pb_utils.MetricFamily.COUNTER,
5151
)
52-
self.counter_preemption_tokens_family = pb_utils.MetricFamily(
52+
self.counter_num_preemption_family = pb_utils.MetricFamily(
5353
name="vllm:num_preemptions_total",
5454
description="Number of preemption tokens processed.",
5555
kind=pb_utils.MetricFamily.COUNTER,
5656
)
57-
self.histogram_iteration_tokens_total_family = pb_utils.MetricFamily(
57+
self.histogram_iteration_tokens_family = pb_utils.MetricFamily(
5858
name="vllm:iteration_tokens_total",
5959
description="Histogram of number of tokens per engine_step.",
6060
kind=pb_utils.MetricFamily.HISTOGRAM,
@@ -124,33 +124,12 @@ def __init__(self, labels: List[str], max_model_len: int):
124124
description="Number of requests waiting to be processed.",
125125
kind=pb_utils.MetricFamily.GAUGE,
126126
)
127-
self.gauge_scheduler_swapped_family = pb_utils.MetricFamily(
128-
name="vllm:num_requests_swapped",
129-
description="Number of requests swapped to CPU.",
130-
kind=pb_utils.MetricFamily.GAUGE,
131-
)
132127
# KV Cache Usage in %
133128
self.gauge_gpu_cache_usage_family = pb_utils.MetricFamily(
134129
name="vllm:gpu_cache_usage_perc",
135130
description="GPU KV-cache usage. 1 means 100 percent usage.",
136131
kind=pb_utils.MetricFamily.GAUGE,
137132
)
138-
self.gauge_cpu_cache_usage_family = pb_utils.MetricFamily(
139-
name="vllm:cpu_cache_usage_perc",
140-
description="CPU KV-cache usage. 1 means 100 percent usage.",
141-
kind=pb_utils.MetricFamily.GAUGE,
142-
)
143-
# Prefix caching block hit rate
144-
self.gauge_cpu_prefix_cache_hit_rate_family = pb_utils.MetricFamily(
145-
name="vllm:cpu_prefix_cache_hit_rate",
146-
description="CPU prefix cache block hit rate.",
147-
kind=pb_utils.MetricFamily.GAUGE,
148-
)
149-
self.gauge_gpu_prefix_cache_hit_rate_family = pb_utils.MetricFamily(
150-
name="vllm:gpu_prefix_cache_hit_rate",
151-
description="GPU prefix cache block hit rate.",
152-
kind=pb_utils.MetricFamily.GAUGE,
153-
)
154133

155134
# Initialize metrics
156135
# Iteration stats
@@ -160,14 +139,14 @@ def __init__(self, labels: List[str], max_model_len: int):
160139
self.counter_generation_tokens = self.counter_generation_tokens_family.Metric(
161140
labels=labels
162141
)
163-
self.counter_preemption_tokens = self.counter_preemption_tokens_family.Metric(
142+
self.counter_num_preemption = self.counter_num_preemption_family.Metric(
164143
labels=labels
165144
)
166145

167146
# Use the same bucket boundaries from vLLM sample metrics as an example.
168147
# https://github.com/vllm-project/vllm/blob/21313e09e3f9448817016290da20d0db1adf3664/vllm/engine/metrics.py#L81-L96
169-
self.histogram_iteration_tokens_total = (
170-
self.histogram_iteration_tokens_total_family.Metric(
148+
self.histogram_iteration_tokens = (
149+
self.histogram_iteration_tokens_family.Metric(
171150
labels=labels,
172151
buckets=[1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
173152
)
@@ -218,32 +197,36 @@ def __init__(self, labels: List[str], max_model_len: int):
218197
)
219198
# Request stats
220199
# Latency
200+
request_latency_buckets = [
201+
0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
202+
40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0
203+
]
221204
self.histogram_e2e_time_request = self.histogram_e2e_time_request_family.Metric(
222205
labels=labels,
223-
buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0],
206+
buckets=request_latency_buckets,
224207
)
225208
self.histogram_prefill_time_request = (
226209
self.histogram_prefill_time_request_family.Metric(
227210
labels=labels,
228-
buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0],
211+
buckets=request_latency_buckets,
229212
)
230213
)
231214
self.histogram_decode_time_request = (
232215
self.histogram_decode_time_request_family.Metric(
233216
labels=labels,
234-
buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0],
217+
buckets=request_latency_buckets,
235218
)
236219
)
237220
self.histogram_inference_time_request = (
238221
self.histogram_inference_time_request_family.Metric(
239222
labels=labels,
240-
buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0],
223+
buckets=request_latency_buckets,
241224
)
242225
)
243226
self.histogram_queue_time_request = (
244227
self.histogram_queue_time_request_family.Metric(
245228
labels=labels,
246-
buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0],
229+
buckets=request_latency_buckets,
247230
)
248231
)
249232
# Metadata
@@ -265,29 +248,16 @@ def __init__(self, labels: List[str], max_model_len: int):
265248
)
266249
# System stats
267250
# Scheduler State
268-
self.gauge_num_requests_running = self.gauge_scheduler_running_family.Metric(
251+
self.gauge_scheduler_running = self.gauge_scheduler_running_family.Metric(
269252
labels=labels
270253
)
271-
self.gauge_num_requests_waiting = self.gauge_scheduler_waiting_family.Metric(
272-
labels=labels
273-
)
274-
self.gauge_num_requests_swapped = self.gauge_scheduler_swapped_family.Metric(
254+
self.gauge_scheduler_waiting = self.gauge_scheduler_waiting_family.Metric(
275255
labels=labels
276256
)
277257
# KV Cache Usage in %
278258
self.gauge_gpu_cache_usage = self.gauge_gpu_cache_usage_family.Metric(
279259
labels=labels
280260
)
281-
self.gauge_cpu_cache_usage = self.gauge_cpu_cache_usage_family.Metric(
282-
labels=labels
283-
)
284-
# Prefix caching block hit rate
285-
self.gauge_cpu_prefix_cache_hit_rate = (
286-
self.gauge_cpu_prefix_cache_hit_rate_family.Metric(labels=labels)
287-
)
288-
self.gauge_gpu_prefix_cache_hit_rate = (
289-
self.gauge_gpu_prefix_cache_hit_rate_family.Metric(labels=labels)
290-
)
291261

292262

293263
class VllmStatLogger(VllmStatLoggerBase):
@@ -394,19 +364,9 @@ def log(self, stats: VllmStats) -> None:
394364
(self.metrics.histogram_n_request, stats.n_requests),
395365
]
396366
gauge_metrics = [
397-
(self.metrics.gauge_num_requests_running, stats.num_running_sys),
398-
(self.metrics.gauge_num_requests_waiting, stats.num_waiting_sys),
399-
(self.metrics.gauge_num_requests_swapped, stats.num_swapped_sys),
367+
(self.metrics.gauge_scheduler_running, stats.num_running_sys),
368+
(self.metrics.gauge_scheduler_waiting, stats.num_waiting_sys),
400369
(self.metrics.gauge_gpu_cache_usage, stats.gpu_cache_usage_sys),
401-
(self.metrics.gauge_cpu_cache_usage, stats.cpu_cache_usage_sys),
402-
(
403-
self.metrics.gauge_cpu_prefix_cache_hit_rate,
404-
stats.cpu_prefix_cache_hit_rate,
405-
),
406-
(
407-
self.metrics.gauge_gpu_prefix_cache_hit_rate,
408-
stats.gpu_prefix_cache_hit_rate,
409-
),
410370
]
411371
for metric, data in counter_metrics:
412372
self._log_counter(metric, data)

0 commit comments

Comments
 (0)