2929from typing import Dict , List , Union
3030
3131import triton_python_backend_utils as pb_utils
32+
3233from vllm .config import VllmConfig
3334from vllm .engine .metrics import StatLoggerBase as VllmStatLoggerBase
3435from vllm .engine .metrics import Stats as VllmStats
@@ -49,6 +50,16 @@ def __init__(self, labels: List[str], max_model_len: int):
4950 description = "Number of generation tokens processed." ,
5051 kind = pb_utils .MetricFamily .COUNTER ,
5152 )
53+ self .counter_preemption_tokens_family = pb_utils .MetricFamily (
54+ name = "vllm:num_preemptions_total" ,
55+ description = "Number of preemption tokens processed." ,
56+ kind = pb_utils .MetricFamily .COUNTER ,
57+ )
58+ self .histogram_iteration_tokens_total_family = pb_utils .MetricFamily (
59+ name = "vllm:iteration_tokens_total" ,
60+ description = "Histogram of number of tokens per engine_step." ,
61+ kind = pb_utils .MetricFamily .HISTOGRAM ,
62+ )
5263 self .histogram_time_to_first_token_family = pb_utils .MetricFamily (
5364 name = "vllm:time_to_first_token_seconds" ,
5465 description = "Histogram of time to first token in seconds." ,
@@ -66,6 +77,26 @@ def __init__(self, labels: List[str], max_model_len: int):
6677 description = "Histogram of end to end request latency in seconds." ,
6778 kind = pb_utils .MetricFamily .HISTOGRAM ,
6879 )
80+ self .histogram_queue_time_request_family = pb_utils .MetricFamily (
81+ name = "vllm:request_queue_time_seconds" ,
82+ description = "Histogram of time spent in WAITING phase for request." ,
83+ kind = pb_utils .MetricFamily .HISTOGRAM ,
84+ )
85+ self .histogram_inference_time_request_family = pb_utils .MetricFamily (
86+ name = "vllm:request_inference_time_seconds" ,
87+ description = "Histogram of time spent in RUNNING phase for request." ,
88+ kind = pb_utils .MetricFamily .HISTOGRAM ,
89+ )
90+ self .histogram_prefill_time_request_family = pb_utils .MetricFamily (
91+ name = "vllm:request_prefill_time_seconds" ,
92+ description = "Histogram of time spent in PREFILL phase for request." ,
93+ kind = pb_utils .MetricFamily .HISTOGRAM ,
94+ )
95+ self .histogram_decode_time_request_family = pb_utils .MetricFamily (
96+ name = "vllm:request_decode_time_seconds" ,
97+ description = "Histogram of time spent in DECODE phase for request." ,
98+ kind = pb_utils .MetricFamily .HISTOGRAM ,
99+ )
69100 # Metadata
70101 self .histogram_num_prompt_tokens_request_family = pb_utils .MetricFamily (
71102 name = "vllm:request_prompt_tokens" ,
@@ -82,6 +113,45 @@ def __init__(self, labels: List[str], max_model_len: int):
82113 description = "Histogram of the n request parameter." ,
83114 kind = pb_utils .MetricFamily .HISTOGRAM ,
84115 )
116+ # System stats
117+ # Scheduler State
118+ self .gauge_scheduler_running_family = pb_utils .MetricFamily (
119+ name = "vllm:num_requests_running" ,
120+ description = "Number of requests currently running on GPU." ,
121+ kind = pb_utils .MetricFamily .GAUGE ,
122+ )
123+ self .gauge_scheduler_waiting_family = pb_utils .MetricFamily (
124+ name = "vllm:num_requests_waiting" ,
125+ description = "Number of requests waiting to be processed." ,
126+ kind = pb_utils .MetricFamily .GAUGE ,
127+ )
128+ self .gauge_scheduler_swapped_family = pb_utils .MetricFamily (
129+ name = "vllm:num_requests_swapped" ,
130+ description = "Number of requests swapped to CPU." ,
131+ kind = pb_utils .MetricFamily .GAUGE ,
132+ )
133+ # KV Cache Usage in %
134+ self .gauge_gpu_cache_usage_family = pb_utils .MetricFamily (
135+ name = "vllm:gpu_cache_usage_perc" ,
136+ description = "GPU KV-cache usage. 1 means 100 percent usage." ,
137+ kind = pb_utils .MetricFamily .GAUGE ,
138+ )
139+ self .gauge_cpu_cache_usage_family = pb_utils .MetricFamily (
140+ name = "vllm:cpu_cache_usage_perc" ,
141+ description = "CPU KV-cache usage. 1 means 100 percent usage." ,
142+ kind = pb_utils .MetricFamily .GAUGE ,
143+ )
144+ # Prefix caching block hit rate
145+ self .gauge_cpu_prefix_cache_hit_rate_family = pb_utils .MetricFamily (
146+ name = "vllm:cpu_prefix_cache_hit_rate" ,
147+ description = "CPU prefix cache block hit rate." ,
148+ kind = pb_utils .MetricFamily .GAUGE ,
149+ )
150+ self .gauge_gpu_prefix_cache_hit_rate_family = pb_utils .MetricFamily (
151+ name = "vllm:gpu_prefix_cache_hit_rate" ,
152+ description = "GPU prefix cache block hit rate." ,
153+ kind = pb_utils .MetricFamily .GAUGE ,
154+ )
85155
86156 # Initialize metrics
87157 # Iteration stats
@@ -91,8 +161,19 @@ def __init__(self, labels: List[str], max_model_len: int):
91161 self .counter_generation_tokens = self .counter_generation_tokens_family .Metric (
92162 labels = labels
93163 )
164+ self .counter_preemption_tokens = self .counter_preemption_tokens_family .Metric (
165+ labels = labels
166+ )
167+
94168 # Use the same bucket boundaries from vLLM sample metrics as an example.
95169 # https://github.com/vllm-project/vllm/blob/21313e09e3f9448817016290da20d0db1adf3664/vllm/engine/metrics.py#L81-L96
170+ self .histogram_iteration_tokens_total = (
171+ self .histogram_iteration_tokens_total_family .Metric (
172+ labels = labels ,
173+ buckets = [1 , 8 , 16 , 32 , 64 , 128 , 256 , 512 , 1024 , 2048 , 4096 , 8096 ],
174+ )
175+ )
176+
96177 self .histogram_time_to_first_token = (
97178 self .histogram_time_to_first_token_family .Metric (
98179 labels = labels ,
@@ -142,6 +223,30 @@ def __init__(self, labels: List[str], max_model_len: int):
142223 labels = labels ,
143224 buckets = [1.0 , 2.5 , 5.0 , 10.0 , 15.0 , 20.0 , 30.0 , 40.0 , 50.0 , 60.0 ],
144225 )
226+ self .histogram_prefill_time_request = (
227+ self .histogram_prefill_time_request_family .Metric (
228+ labels = labels ,
229+ buckets = [1.0 , 2.5 , 5.0 , 10.0 , 15.0 , 20.0 , 30.0 , 40.0 , 50.0 , 60.0 ],
230+ )
231+ )
232+ self .histogram_decode_time_request = (
233+ self .histogram_decode_time_request_family .Metric (
234+ labels = labels ,
235+ buckets = [1.0 , 2.5 , 5.0 , 10.0 , 15.0 , 20.0 , 30.0 , 40.0 , 50.0 , 60.0 ],
236+ )
237+ )
238+ self .histogram_inference_time_request = (
239+ self .histogram_inference_time_request_family .Metric (
240+ labels = labels ,
241+ buckets = [1.0 , 2.5 , 5.0 , 10.0 , 15.0 , 20.0 , 30.0 , 40.0 , 50.0 , 60.0 ],
242+ )
243+ )
244+ self .histogram_queue_time_request = (
245+ self .histogram_queue_time_request_family .Metric (
246+ labels = labels ,
247+ buckets = [1.0 , 2.5 , 5.0 , 10.0 , 15.0 , 20.0 , 30.0 , 40.0 , 50.0 , 60.0 ],
248+ )
249+ )
145250 # Metadata
146251 self .histogram_num_prompt_tokens_request = (
147252 self .histogram_num_prompt_tokens_request_family .Metric (
@@ -159,6 +264,31 @@ def __init__(self, labels: List[str], max_model_len: int):
159264 labels = labels ,
160265 buckets = [1 , 2 , 5 , 10 , 20 ],
161266 )
267+ # System stats
268+ # Scheduler State
269+ self .gauge_num_requests_running = self .gauge_scheduler_running_family .Metric (
270+ labels = labels
271+ )
272+ self .gauge_num_requests_waiting = self .gauge_scheduler_waiting_family .Metric (
273+ labels = labels
274+ )
275+ self .gauge_num_requests_swapped = self .gauge_scheduler_swapped_family .Metric (
276+ labels = labels
277+ )
278+ # KV Cache Usage in %
279+ self .gauge_gpu_cache_usage = self .gauge_gpu_cache_usage_family .Metric (
280+ labels = labels
281+ )
282+ self .gauge_cpu_cache_usage = self .gauge_cpu_cache_usage_family .Metric (
283+ labels = labels
284+ )
285+ # Prefix caching block hit rate
286+ self .gauge_cpu_prefix_cache_hit_rate = (
287+ self .gauge_cpu_prefix_cache_hit_rate_family .Metric (labels = labels )
288+ )
289+ self .gauge_gpu_prefix_cache_hit_rate = (
290+ self .gauge_gpu_prefix_cache_hit_rate_family .Metric (labels = labels )
291+ )
162292
163293
164294class VllmStatLogger (VllmStatLoggerBase ):
@@ -182,6 +312,18 @@ def __init__(self, labels: Dict, vllm_config: VllmConfig, log_logger) -> None:
182312 def info (self , type : str , obj : SupportsMetricsInfo ) -> None :
183313 pass
184314
315+ def _log_gauge (self , gauge , data : Union [int , float ]) -> None :
316+ """Convenience function for logging to gauge.
317+
318+ Args:
319+ gauge: A gauge metric instance.
320+ data: An int or float to set as the current gauge value.
321+
322+ Returns:
323+ None
324+ """
325+ self ._logger_queue .put_nowait ((gauge , "set" , data ))
326+
185327 def _log_counter (self , counter , data : Union [int , float ]) -> None :
186328 """Convenience function for logging to counter.
187329
@@ -222,8 +364,10 @@ def log(self, stats: VllmStats) -> None:
222364 counter_metrics = [
223365 (self .metrics .counter_prompt_tokens , stats .num_prompt_tokens_iter ),
224366 (self .metrics .counter_generation_tokens , stats .num_generation_tokens_iter ),
367+ (self .metrics .counter_preemption_tokens , stats .num_preemption_iter ),
225368 ]
226369 histogram_metrics = [
370+ (self .metrics .histogram_iteration_tokens_total , [stats .num_tokens_iter ]),
227371 (
228372 self .metrics .histogram_time_to_first_token ,
229373 stats .time_to_first_tokens_iter ,
@@ -233,6 +377,13 @@ def log(self, stats: VllmStats) -> None:
233377 stats .time_per_output_tokens_iter ,
234378 ),
235379 (self .metrics .histogram_e2e_time_request , stats .time_e2e_requests ),
380+ (self .metrics .histogram_prefill_time_request , stats .time_prefill_requests ),
381+ (self .metrics .histogram_decode_time_request , stats .time_decode_requests ),
382+ (
383+ self .metrics .histogram_inference_time_request ,
384+ stats .time_inference_requests ,
385+ ),
386+ (self .metrics .histogram_queue_time_request , stats .time_queue_requests ),
236387 (
237388 self .metrics .histogram_num_prompt_tokens_request ,
238389 stats .num_prompt_tokens_requests ,
@@ -243,10 +394,27 @@ def log(self, stats: VllmStats) -> None:
243394 ),
244395 (self .metrics .histogram_n_request , stats .n_requests ),
245396 ]
397+ gauge_metrics = [
398+ (self .metrics .gauge_num_requests_running , stats .num_running_sys ),
399+ (self .metrics .gauge_num_requests_waiting , stats .num_waiting_sys ),
400+ (self .metrics .gauge_num_requests_swapped , stats .num_swapped_sys ),
401+ (self .metrics .gauge_gpu_cache_usage , stats .gpu_cache_usage_sys ),
402+ (self .metrics .gauge_cpu_cache_usage , stats .cpu_cache_usage_sys ),
403+ (
404+ self .metrics .gauge_cpu_prefix_cache_hit_rate ,
405+ stats .cpu_prefix_cache_hit_rate ,
406+ ),
407+ (
408+ self .metrics .gauge_gpu_prefix_cache_hit_rate ,
409+ stats .gpu_prefix_cache_hit_rate ,
410+ ),
411+ ]
246412 for metric , data in counter_metrics :
247413 self ._log_counter (metric , data )
248414 for metric , data in histogram_metrics :
249415 self ._log_histogram (metric , data )
416+ for metric , data in gauge_metrics :
417+ self ._log_gauge (metric , data )
250418
251419 def logger_loop (self ):
252420 while True :
@@ -259,6 +427,8 @@ def logger_loop(self):
259427 metric .increment (data )
260428 elif command == "observe" :
261429 metric .observe (data )
430+ elif command == "set" :
431+ metric .set (data )
262432 else :
263433 self .log_logger .log_error (f"Undefined command name: { command } " )
264434
@@ -268,3 +438,4 @@ def finalize(self):
268438 if self ._logger_thread is not None :
269439 self ._logger_thread .join ()
270440 self ._logger_thread = None
441+ self ._logger_thread = None