Skip to content

Commit b87a1cd

Browse files
committed
add new metrics reporting
1 parent 3564c3c commit b87a1cd

File tree

1 file changed

+171
-0
lines changed

1 file changed

+171
-0
lines changed

src/utils/metrics.py

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
from typing import Dict, List, Union
3030

3131
import triton_python_backend_utils as pb_utils
32+
3233
from vllm.config import VllmConfig
3334
from vllm.engine.metrics import StatLoggerBase as VllmStatLoggerBase
3435
from vllm.engine.metrics import Stats as VllmStats
@@ -49,6 +50,16 @@ def __init__(self, labels: List[str], max_model_len: int):
4950
description="Number of generation tokens processed.",
5051
kind=pb_utils.MetricFamily.COUNTER,
5152
)
53+
self.counter_preemption_tokens_family = pb_utils.MetricFamily(
54+
name="vllm:num_preemptions_total",
55+
description="Number of preemption tokens processed.",
56+
kind=pb_utils.MetricFamily.COUNTER,
57+
)
58+
self.histogram_iteration_tokens_total_family = pb_utils.MetricFamily(
59+
name="vllm:iteration_tokens_total",
60+
description="Histogram of number of tokens per engine_step.",
61+
kind=pb_utils.MetricFamily.HISTOGRAM,
62+
)
5263
self.histogram_time_to_first_token_family = pb_utils.MetricFamily(
5364
name="vllm:time_to_first_token_seconds",
5465
description="Histogram of time to first token in seconds.",
@@ -66,6 +77,26 @@ def __init__(self, labels: List[str], max_model_len: int):
6677
description="Histogram of end to end request latency in seconds.",
6778
kind=pb_utils.MetricFamily.HISTOGRAM,
6879
)
80+
self.histogram_queue_time_request_family = pb_utils.MetricFamily(
81+
name="vllm:request_queue_time_seconds",
82+
description="Histogram of time spent in WAITING phase for request.",
83+
kind=pb_utils.MetricFamily.HISTOGRAM,
84+
)
85+
self.histogram_inference_time_request_family = pb_utils.MetricFamily(
86+
name="vllm:request_inference_time_seconds",
87+
description="Histogram of time spent in RUNNING phase for request.",
88+
kind=pb_utils.MetricFamily.HISTOGRAM,
89+
)
90+
self.histogram_prefill_time_request_family = pb_utils.MetricFamily(
91+
name="vllm:request_prefill_time_seconds",
92+
description="Histogram of time spent in PREFILL phase for request.",
93+
kind=pb_utils.MetricFamily.HISTOGRAM,
94+
)
95+
self.histogram_decode_time_request_family = pb_utils.MetricFamily(
96+
name="vllm:request_decode_time_seconds",
97+
description="Histogram of time spent in DECODE phase for request.",
98+
kind=pb_utils.MetricFamily.HISTOGRAM,
99+
)
69100
# Metadata
70101
self.histogram_num_prompt_tokens_request_family = pb_utils.MetricFamily(
71102
name="vllm:request_prompt_tokens",
@@ -82,6 +113,45 @@ def __init__(self, labels: List[str], max_model_len: int):
82113
description="Histogram of the n request parameter.",
83114
kind=pb_utils.MetricFamily.HISTOGRAM,
84115
)
116+
# System stats
117+
# Scheduler State
118+
self.gauge_scheduler_running_family = pb_utils.MetricFamily(
119+
name="vllm:num_requests_running",
120+
description="Number of requests currently running on GPU.",
121+
kind=pb_utils.MetricFamily.GAUGE,
122+
)
123+
self.gauge_scheduler_waiting_family = pb_utils.MetricFamily(
124+
name="vllm:num_requests_waiting",
125+
description="Number of requests waiting to be processed.",
126+
kind=pb_utils.MetricFamily.GAUGE,
127+
)
128+
self.gauge_scheduler_swapped_family = pb_utils.MetricFamily(
129+
name="vllm:num_requests_swapped",
130+
description="Number of requests swapped to CPU.",
131+
kind=pb_utils.MetricFamily.GAUGE,
132+
)
133+
# KV Cache Usage in %
134+
self.gauge_gpu_cache_usage_family = pb_utils.MetricFamily(
135+
name="vllm:gpu_cache_usage_perc",
136+
description="GPU KV-cache usage. 1 means 100 percent usage.",
137+
kind=pb_utils.MetricFamily.GAUGE,
138+
)
139+
self.gauge_cpu_cache_usage_family = pb_utils.MetricFamily(
140+
name="vllm:cpu_cache_usage_perc",
141+
description="CPU KV-cache usage. 1 means 100 percent usage.",
142+
kind=pb_utils.MetricFamily.GAUGE,
143+
)
144+
# Prefix caching block hit rate
145+
self.gauge_cpu_prefix_cache_hit_rate_family = pb_utils.MetricFamily(
146+
name="vllm:cpu_prefix_cache_hit_rate",
147+
description="CPU prefix cache block hit rate.",
148+
kind=pb_utils.MetricFamily.GAUGE,
149+
)
150+
self.gauge_gpu_prefix_cache_hit_rate_family = pb_utils.MetricFamily(
151+
name="vllm:gpu_prefix_cache_hit_rate",
152+
description="GPU prefix cache block hit rate.",
153+
kind=pb_utils.MetricFamily.GAUGE,
154+
)
85155

86156
# Initialize metrics
87157
# Iteration stats
@@ -91,8 +161,19 @@ def __init__(self, labels: List[str], max_model_len: int):
91161
self.counter_generation_tokens = self.counter_generation_tokens_family.Metric(
92162
labels=labels
93163
)
164+
self.counter_preemption_tokens = self.counter_preemption_tokens_family.Metric(
165+
labels=labels
166+
)
167+
94168
# Use the same bucket boundaries from vLLM sample metrics as an example.
95169
# https://github.com/vllm-project/vllm/blob/21313e09e3f9448817016290da20d0db1adf3664/vllm/engine/metrics.py#L81-L96
170+
self.histogram_iteration_tokens_total = (
171+
self.histogram_iteration_tokens_total_family.Metric(
172+
labels=labels,
173+
buckets=[1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8096],
174+
)
175+
)
176+
96177
self.histogram_time_to_first_token = (
97178
self.histogram_time_to_first_token_family.Metric(
98179
labels=labels,
@@ -142,6 +223,30 @@ def __init__(self, labels: List[str], max_model_len: int):
142223
labels=labels,
143224
buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0],
144225
)
226+
self.histogram_prefill_time_request = (
227+
self.histogram_prefill_time_request_family.Metric(
228+
labels=labels,
229+
buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0],
230+
)
231+
)
232+
self.histogram_decode_time_request = (
233+
self.histogram_decode_time_request_family.Metric(
234+
labels=labels,
235+
buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0],
236+
)
237+
)
238+
self.histogram_inference_time_request = (
239+
self.histogram_inference_time_request_family.Metric(
240+
labels=labels,
241+
buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0],
242+
)
243+
)
244+
self.histogram_queue_time_request = (
245+
self.histogram_queue_time_request_family.Metric(
246+
labels=labels,
247+
buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0],
248+
)
249+
)
145250
# Metadata
146251
self.histogram_num_prompt_tokens_request = (
147252
self.histogram_num_prompt_tokens_request_family.Metric(
@@ -159,6 +264,31 @@ def __init__(self, labels: List[str], max_model_len: int):
159264
labels=labels,
160265
buckets=[1, 2, 5, 10, 20],
161266
)
267+
# System stats
268+
# Scheduler State
269+
self.gauge_num_requests_running = self.gauge_scheduler_running_family.Metric(
270+
labels=labels
271+
)
272+
self.gauge_num_requests_waiting = self.gauge_scheduler_waiting_family.Metric(
273+
labels=labels
274+
)
275+
self.gauge_num_requests_swapped = self.gauge_scheduler_swapped_family.Metric(
276+
labels=labels
277+
)
278+
# KV Cache Usage in %
279+
self.gauge_gpu_cache_usage = self.gauge_gpu_cache_usage_family.Metric(
280+
labels=labels
281+
)
282+
self.gauge_cpu_cache_usage = self.gauge_cpu_cache_usage_family.Metric(
283+
labels=labels
284+
)
285+
# Prefix caching block hit rate
286+
self.gauge_cpu_prefix_cache_hit_rate = (
287+
self.gauge_cpu_prefix_cache_hit_rate_family.Metric(labels=labels)
288+
)
289+
self.gauge_gpu_prefix_cache_hit_rate = (
290+
self.gauge_gpu_prefix_cache_hit_rate_family.Metric(labels=labels)
291+
)
162292

163293

164294
class VllmStatLogger(VllmStatLoggerBase):
@@ -182,6 +312,18 @@ def __init__(self, labels: Dict, vllm_config: VllmConfig, log_logger) -> None:
182312
def info(self, type: str, obj: SupportsMetricsInfo) -> None:
183313
pass
184314

315+
def _log_gauge(self, gauge, data: Union[int, float]) -> None:
316+
"""Convenience function for logging to gauge.
317+
318+
Args:
319+
gauge: A gauge metric instance.
320+
data: An int or float to set as the current gauge value.
321+
322+
Returns:
323+
None
324+
"""
325+
self._logger_queue.put_nowait((gauge, "set", data))
326+
185327
def _log_counter(self, counter, data: Union[int, float]) -> None:
186328
"""Convenience function for logging to counter.
187329
@@ -222,8 +364,10 @@ def log(self, stats: VllmStats) -> None:
222364
counter_metrics = [
223365
(self.metrics.counter_prompt_tokens, stats.num_prompt_tokens_iter),
224366
(self.metrics.counter_generation_tokens, stats.num_generation_tokens_iter),
367+
(self.metrics.counter_preemption_tokens, stats.num_preemption_iter),
225368
]
226369
histogram_metrics = [
370+
(self.metrics.histogram_iteration_tokens_total, [stats.num_tokens_iter]),
227371
(
228372
self.metrics.histogram_time_to_first_token,
229373
stats.time_to_first_tokens_iter,
@@ -233,6 +377,13 @@ def log(self, stats: VllmStats) -> None:
233377
stats.time_per_output_tokens_iter,
234378
),
235379
(self.metrics.histogram_e2e_time_request, stats.time_e2e_requests),
380+
(self.metrics.histogram_prefill_time_request, stats.time_prefill_requests),
381+
(self.metrics.histogram_decode_time_request, stats.time_decode_requests),
382+
(
383+
self.metrics.histogram_inference_time_request,
384+
stats.time_inference_requests,
385+
),
386+
(self.metrics.histogram_queue_time_request, stats.time_queue_requests),
236387
(
237388
self.metrics.histogram_num_prompt_tokens_request,
238389
stats.num_prompt_tokens_requests,
@@ -243,10 +394,27 @@ def log(self, stats: VllmStats) -> None:
243394
),
244395
(self.metrics.histogram_n_request, stats.n_requests),
245396
]
397+
gauge_metrics = [
398+
(self.metrics.gauge_num_requests_running, stats.num_running_sys),
399+
(self.metrics.gauge_num_requests_waiting, stats.num_waiting_sys),
400+
(self.metrics.gauge_num_requests_swapped, stats.num_swapped_sys),
401+
(self.metrics.gauge_gpu_cache_usage, stats.gpu_cache_usage_sys),
402+
(self.metrics.gauge_cpu_cache_usage, stats.cpu_cache_usage_sys),
403+
(
404+
self.metrics.gauge_cpu_prefix_cache_hit_rate,
405+
stats.cpu_prefix_cache_hit_rate,
406+
),
407+
(
408+
self.metrics.gauge_gpu_prefix_cache_hit_rate,
409+
stats.gpu_prefix_cache_hit_rate,
410+
),
411+
]
246412
for metric, data in counter_metrics:
247413
self._log_counter(metric, data)
248414
for metric, data in histogram_metrics:
249415
self._log_histogram(metric, data)
416+
for metric, data in gauge_metrics:
417+
self._log_gauge(metric, data)
250418

251419
def logger_loop(self):
252420
while True:
@@ -259,6 +427,8 @@ def logger_loop(self):
259427
metric.increment(data)
260428
elif command == "observe":
261429
metric.observe(data)
430+
elif command == "set":
431+
metric.set(data)
262432
else:
263433
self.log_logger.log_error(f"Undefined command name: {command}")
264434

@@ -268,3 +438,4 @@ def finalize(self):
268438
if self._logger_thread is not None:
269439
self._logger_thread.join()
270440
self._logger_thread = None
441+
self._logger_thread = None

0 commit comments

Comments
 (0)