Skip to content

Commit 9b96279

Browse files
author
ea.pav
committed
remove deprecated metrics, fix namings
1 parent 6be53bd commit 9b96279

File tree

2 files changed

+38
-81
lines changed

2 files changed

+38
-81
lines changed

README.md

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -227,16 +227,8 @@ VLLM stats are reported by the metrics endpoint in fields that are prefixed with
227227
gauge_scheduler_running
228228
# Number of requests waiting to be processed.
229229
gauge_scheduler_waiting
230-
# Number of requests swapped to CPU.
231-
gauge_scheduler_swapped
232230
# GPU KV-cache usage. 1 means 100 percent usage.
233231
gauge_gpu_cache_usage
234-
# CPU KV-cache usage. 1 means 100 percent usage.
235-
gauge_cpu_cache_usage
236-
# CPU prefix cache block hit rate.
237-
gauge_cpu_prefix_cache_hit_rate
238-
# GPU prefix cache block hit rate.
239-
gauge_gpu_prefix_cache_hit_rate
240232
# Number of prefill tokens processed.
241233
counter_prompt_tokens
242234
# Number of generation tokens processed.
@@ -285,21 +277,9 @@ vllm:num_requests_running{model="vllm_model",version="1"} 0
285277
# HELP vllm:num_requests_waiting Number of requests waiting to be processed.
286278
# TYPE vllm:num_requests_waiting gauge
287279
vllm:num_requests_waiting{model="vllm_model",version="1"} 0
288-
# HELP vllm:num_requests_swapped Number of requests swapped to CPU.
289-
# TYPE vllm:num_requests_swapped gauge
290-
vllm:num_requests_swapped{model="vllm_model",version="1"} 0
291280
# HELP vllm:gpu_cache_usage_perc Gauge of GPU cache usage. 1 means 100 percent usage.
292281
# TYPE vllm:gpu_cache_usage_perc gauge
293282
vllm:gpu_cache_usage_perc{model="vllm_model",version="1"} 0
294-
# HELP vllm:cpu_cache_usage_perc Gauge of CPU cache usage. 1 means 100 percent usage.
295-
# TYPE vllm:cpu_cache_usage_perc gauge
296-
vllm:cpu_cache_usage_perc{model="vllm_model",version="1"} 0
297-
# HELP vllm:cpu_prefix_cache_hit_rate CPU prefix cache block hit rate.
298-
# TYPE vllm:cpu_prefix_cache_hit_rate gauge
299-
vllm:cpu_prefix_cache_hit_rate{model="vllm_model",version="1"} -1
300-
# HELP vllm:gpu_prefix_cache_hit_rate GPU prefix cache block hit rate.
301-
# TYPE vllm:gpu_prefix_cache_hit_rate gauge
302-
vllm:gpu_prefix_cache_hit_rate{model="vllm_model",version="1"} -1
303283
# HELP vllm:iteration_tokens_total Histogram of number of tokens per engine_step.
304284
# TYPE vllm:iteration_tokens_total histogram
305285
vllm:iteration_tokens_total_count{model="vllm_model",version="1"} 10

src/utils/metrics.py

Lines changed: 38 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -49,12 +49,12 @@ def __init__(self, labels: List[str], max_model_len: int):
4949
description="Number of generation tokens processed.",
5050
kind=pb_utils.MetricFamily.COUNTER,
5151
)
52-
self.counter_preemption_tokens_family = pb_utils.MetricFamily(
52+
self.counter_num_preemption_family = pb_utils.MetricFamily(
5353
name="vllm:num_preemptions_total",
5454
description="Number of preemption tokens processed.",
5555
kind=pb_utils.MetricFamily.COUNTER,
5656
)
57-
self.histogram_iteration_tokens_total_family = pb_utils.MetricFamily(
57+
self.histogram_iteration_tokens_family = pb_utils.MetricFamily(
5858
name="vllm:iteration_tokens_total",
5959
description="Histogram of number of tokens per engine_step.",
6060
kind=pb_utils.MetricFamily.HISTOGRAM,
@@ -124,33 +124,12 @@ def __init__(self, labels: List[str], max_model_len: int):
124124
description="Number of requests waiting to be processed.",
125125
kind=pb_utils.MetricFamily.GAUGE,
126126
)
127-
self.gauge_scheduler_swapped_family = pb_utils.MetricFamily(
128-
name="vllm:num_requests_swapped",
129-
description="Number of requests swapped to CPU.",
130-
kind=pb_utils.MetricFamily.GAUGE,
131-
)
132127
# KV Cache Usage in %
133128
self.gauge_gpu_cache_usage_family = pb_utils.MetricFamily(
134129
name="vllm:gpu_cache_usage_perc",
135130
description="GPU KV-cache usage. 1 means 100 percent usage.",
136131
kind=pb_utils.MetricFamily.GAUGE,
137132
)
138-
self.gauge_cpu_cache_usage_family = pb_utils.MetricFamily(
139-
name="vllm:cpu_cache_usage_perc",
140-
description="CPU KV-cache usage. 1 means 100 percent usage.",
141-
kind=pb_utils.MetricFamily.GAUGE,
142-
)
143-
# Prefix caching block hit rate
144-
self.gauge_cpu_prefix_cache_hit_rate_family = pb_utils.MetricFamily(
145-
name="vllm:cpu_prefix_cache_hit_rate",
146-
description="CPU prefix cache block hit rate.",
147-
kind=pb_utils.MetricFamily.GAUGE,
148-
)
149-
self.gauge_gpu_prefix_cache_hit_rate_family = pb_utils.MetricFamily(
150-
name="vllm:gpu_prefix_cache_hit_rate",
151-
description="GPU prefix cache block hit rate.",
152-
kind=pb_utils.MetricFamily.GAUGE,
153-
)
154133

155134
# Initialize metrics
156135
# Iteration stats
@@ -160,17 +139,15 @@ def __init__(self, labels: List[str], max_model_len: int):
160139
self.counter_generation_tokens = self.counter_generation_tokens_family.Metric(
161140
labels=labels
162141
)
163-
self.counter_preemption_tokens = self.counter_preemption_tokens_family.Metric(
142+
self.counter_num_preemption = self.counter_num_preemption_family.Metric(
164143
labels=labels
165144
)
166145

167146
# Use the same bucket boundaries from vLLM sample metrics as an example.
168147
# https://github.com/vllm-project/vllm/blob/21313e09e3f9448817016290da20d0db1adf3664/vllm/engine/metrics.py#L81-L96
169-
self.histogram_iteration_tokens_total = (
170-
self.histogram_iteration_tokens_total_family.Metric(
171-
labels=labels,
172-
buckets=[1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
173-
)
148+
self.histogram_iteration_tokens = self.histogram_iteration_tokens_family.Metric(
149+
labels=labels,
150+
buckets=[1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
174151
)
175152

176153
self.histogram_time_to_first_token = (
@@ -218,32 +195,55 @@ def __init__(self, labels: List[str], max_model_len: int):
218195
)
219196
# Request stats
220197
# Latency
198+
request_latency_buckets = [
199+
0.3,
200+
0.5,
201+
0.8,
202+
1.0,
203+
1.5,
204+
2.0,
205+
2.5,
206+
5.0,
207+
10.0,
208+
15.0,
209+
20.0,
210+
30.0,
211+
40.0,
212+
50.0,
213+
60.0,
214+
120.0,
215+
240.0,
216+
480.0,
217+
960.0,
218+
1920.0,
219+
7680.0,
220+
]
221221
self.histogram_e2e_time_request = self.histogram_e2e_time_request_family.Metric(
222222
labels=labels,
223-
buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0],
223+
buckets=request_latency_buckets,
224224
)
225225
self.histogram_prefill_time_request = (
226226
self.histogram_prefill_time_request_family.Metric(
227227
labels=labels,
228-
buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0],
228+
buckets=request_latency_buckets,
229229
)
230230
)
231231
self.histogram_decode_time_request = (
232232
self.histogram_decode_time_request_family.Metric(
233233
labels=labels,
234-
buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0],
234+
buckets=request_latency_buckets,
235235
)
236236
)
237237
self.histogram_inference_time_request = (
238238
self.histogram_inference_time_request_family.Metric(
239239
labels=labels,
240-
buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0],
240+
buckets=request_latency_buckets,
241241
)
242242
)
243243
self.histogram_queue_time_request = (
244244
self.histogram_queue_time_request_family.Metric(
245245
labels=labels,
246-
buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0],
246+
buckets=request_latency_buckets,
247247
)
248248
)
249249
# Metadata
@@ -265,29 +265,16 @@ def __init__(self, labels: List[str], max_model_len: int):
265265
)
266266
# System stats
267267
# Scheduler State
268-
self.gauge_num_requests_running = self.gauge_scheduler_running_family.Metric(
269-
labels=labels
270-
)
271-
self.gauge_num_requests_waiting = self.gauge_scheduler_waiting_family.Metric(
268+
self.gauge_scheduler_running = self.gauge_scheduler_running_family.Metric(
272269
labels=labels
273270
)
274-
self.gauge_num_requests_swapped = self.gauge_scheduler_swapped_family.Metric(
271+
self.gauge_scheduler_waiting = self.gauge_scheduler_waiting_family.Metric(
275272
labels=labels
276273
)
277274
# KV Cache Usage in %
278275
self.gauge_gpu_cache_usage = self.gauge_gpu_cache_usage_family.Metric(
279276
labels=labels
280277
)
281-
self.gauge_cpu_cache_usage = self.gauge_cpu_cache_usage_family.Metric(
282-
labels=labels
283-
)
284-
# Prefix caching block hit rate
285-
self.gauge_cpu_prefix_cache_hit_rate = (
286-
self.gauge_cpu_prefix_cache_hit_rate_family.Metric(labels=labels)
287-
)
288-
self.gauge_gpu_prefix_cache_hit_rate = (
289-
self.gauge_gpu_prefix_cache_hit_rate_family.Metric(labels=labels)
290-
)
291278

292279

293280
class VllmStatLogger(VllmStatLoggerBase):
@@ -394,19 +381,9 @@ def log(self, stats: VllmStats) -> None:
394381
(self.metrics.histogram_n_request, stats.n_requests),
395382
]
396383
gauge_metrics = [
397-
(self.metrics.gauge_num_requests_running, stats.num_running_sys),
398-
(self.metrics.gauge_num_requests_waiting, stats.num_waiting_sys),
399-
(self.metrics.gauge_num_requests_swapped, stats.num_swapped_sys),
384+
(self.metrics.gauge_scheduler_running, stats.num_running_sys),
385+
(self.metrics.gauge_scheduler_waiting, stats.num_waiting_sys),
400386
(self.metrics.gauge_gpu_cache_usage, stats.gpu_cache_usage_sys),
401-
(self.metrics.gauge_cpu_cache_usage, stats.cpu_cache_usage_sys),
402-
(
403-
self.metrics.gauge_cpu_prefix_cache_hit_rate,
404-
stats.cpu_prefix_cache_hit_rate,
405-
),
406-
(
407-
self.metrics.gauge_gpu_prefix_cache_hit_rate,
408-
stats.gpu_prefix_cache_hit_rate,
409-
),
410387
]
411388
for metric, data in counter_metrics:
412389
self._log_counter(metric, data)

0 commit comments

Comments
 (0)