@@ -227,12 +227,24 @@ VLLM stats are reported by the metrics endpoint in fields that are prefixed with
 counter_prompt_tokens
 # Number of generation tokens processed.
 counter_generation_tokens
+# Number of preemption tokens processed.
+counter_preemption_tokens
+# Histogram of number of tokens per engine_step.
+histogram_iteration_tokens
 # Histogram of time to first token in seconds.
 histogram_time_to_first_token
 # Histogram of time per output token in seconds.
 histogram_time_per_output_token
 # Histogram of end to end request latency in seconds.
 histogram_e2e_time_request
+# Histogram of time spent in WAITING phase for request.
+histogram_queue_time_request
+# Histogram of time spent in RUNNING phase for request.
+histogram_inference_time_request
+# Histogram of time spent in PREFILL phase for request.
+histogram_prefill_time_request
+# Histogram of time spent in DECODE phase for request.
+histogram_decode_time_request
 # Number of prefill tokens processed.
 histogram_num_prompt_tokens_request
 # Number of generation tokens processed.
@@ -241,6 +253,20 @@ histogram_num_generation_tokens_request
 histogram_best_of_request
 # Histogram of the n request parameter.
 histogram_n_request
+# Number of requests currently running on GPU.
+gauge_scheduler_running
+# Number of requests waiting to be processed.
+gauge_scheduler_waiting
+# Number of requests swapped to CPU.
+gauge_scheduler_swapped
+# GPU KV-cache usage. 1 means 100 percent usage.
+gauge_gpu_cache_usage
+# CPU KV-cache usage. 1 means 100 percent usage.
+gauge_cpu_cache_usage
+# CPU prefix cache block hit rate.
+gauge_cpu_prefix_cache_hit_rate
+# GPU prefix cache block hit rate.
+gauge_gpu_prefix_cache_hit_rate
 ```
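To confirm which of these fields your deployment actually exposes, you can scrape Triton's Prometheus metrics endpoint and filter for the `vllm:` prefix. This is a minimal sketch, assuming Triton's default metrics port of 8002 (yours may differ):

```bash
# Scrape the Triton metrics endpoint (default port 8002; adjust host/port for
# your deployment) and keep only the vLLM-prefixed series and their HELP/TYPE lines.
curl -s localhost:8002/metrics | grep "vllm:"
```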
 Your output for these fields should look similar to the following:
 ```bash
@@ -250,6 +276,37 @@ vllm:prompt_tokens_total{model="vllm_model",version="1"} 10
 # HELP vllm:generation_tokens_total Number of generation tokens processed.
 # TYPE vllm:generation_tokens_total counter
 vllm:generation_tokens_total{model="vllm_model",version="1"} 16
+# HELP vllm:num_preemptions_total Number of preemption tokens processed.
+# TYPE vllm:num_preemptions_total counter
+vllm:num_preemptions_total{model="vllm_model",version="1"} 0
+# HELP vllm:num_requests_running Number of requests currently running on GPU.
+# TYPE vllm:num_requests_running gauge
+vllm:num_requests_running{model="vllm_model",version="1"} 0
+# HELP vllm:num_requests_waiting Number of requests waiting to be processed.
+# TYPE vllm:num_requests_waiting gauge
+vllm:num_requests_waiting{model="vllm_model",version="1"} 0
+# HELP vllm:num_requests_swapped Number of requests swapped to CPU.
+# TYPE vllm:num_requests_swapped gauge
+vllm:num_requests_swapped{model="vllm_model",version="1"} 0
+# HELP vllm:gpu_cache_usage_perc Gauge of gpu cache usage. 1 means 100 percent usage.
+# TYPE vllm:gpu_cache_usage_perc gauge
+vllm:gpu_cache_usage_perc{model="vllm_model",version="1"} 0
+# HELP vllm:cpu_cache_usage_perc Gauge of cpu cache usage. 1 means 100 percent usage.
+# TYPE vllm:cpu_cache_usage_perc gauge
+vllm:cpu_cache_usage_perc{model="vllm_model",version="1"} 0
+# HELP vllm:cpu_prefix_cache_hit_rate CPU prefix cache block hit rate.
+# TYPE vllm:cpu_prefix_cache_hit_rate gauge
+vllm:cpu_prefix_cache_hit_rate{model="vllm_model",version="1"} -1
+# HELP vllm:gpu_prefix_cache_hit_rate GPU prefix cache block hit rate.
+# TYPE vllm:gpu_prefix_cache_hit_rate gauge
+vllm:gpu_prefix_cache_hit_rate{model="vllm_model",version="1"} -1
+# HELP vllm:iteration_tokens_total Histogram of number of tokens per engine_step.
+# TYPE vllm:iteration_tokens_total histogram
+vllm:iteration_tokens_total_count{model="vllm_model",version="1"} 10
+vllm:iteration_tokens_total_sum{model="vllm_model",version="1"} 12
+vllm:iteration_tokens_total_bucket{model="vllm_model",version="1",le="1"} 9
+...
+vllm:iteration_tokens_total_bucket{model="vllm_model",version="1",le="+Inf"} 10
 # HELP vllm:time_to_first_token_seconds Histogram of time to first token in seconds.
 # TYPE vllm:time_to_first_token_seconds histogram
 vllm:time_to_first_token_seconds_count{model="vllm_model",version="1"} 1
@@ -271,6 +328,34 @@ vllm:e2e_request_latency_seconds_sum{model="vllm_model",version="1"} 0.086861848
 vllm:e2e_request_latency_seconds_bucket{model="vllm_model",version="1",le="1"} 1
 ...
 vllm:e2e_request_latency_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1
+# HELP vllm:request_queue_time_seconds Histogram of time spent in WAITING phase for request.
+# TYPE vllm:request_queue_time_seconds histogram
+vllm:request_queue_time_seconds_count{model="vllm_model",version="1"} 1
+vllm:request_queue_time_seconds_sum{model="vllm_model",version="1"} 0.0045166015625
+vllm:request_queue_time_seconds_bucket{model="vllm_model",version="1",le="1"} 1
+...
+vllm:request_queue_time_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1
+# HELP vllm:request_inference_time_seconds Histogram of time spent in RUNNING phase for request.
+# TYPE vllm:request_inference_time_seconds histogram
+vllm:request_inference_time_seconds_count{model="vllm_model",version="1"} 1
+vllm:request_inference_time_seconds_sum{model="vllm_model",version="1"} 0.1418392658233643
+vllm:request_inference_time_seconds_bucket{model="vllm_model",version="1",le="1"} 1
+...
+vllm:request_inference_time_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1
+# HELP vllm:request_prefill_time_seconds Histogram of time spent in PREFILL phase for request.
+# TYPE vllm:request_prefill_time_seconds histogram
+vllm:request_prefill_time_seconds_count{model="vllm_model",version="1"} 1
+vllm:request_prefill_time_seconds_sum{model="vllm_model",version="1"} 0.05302977561950684
+vllm:request_prefill_time_seconds_bucket{model="vllm_model",version="1",le="1"} 1
+...
+vllm:request_prefill_time_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1
+# HELP vllm:request_decode_time_seconds Histogram of time spent in DECODE phase for request.
+# TYPE vllm:request_decode_time_seconds histogram
+vllm:request_decode_time_seconds_count{model="vllm_model",version="1"} 1
+vllm:request_decode_time_seconds_sum{model="vllm_model",version="1"} 0.08880949020385742
+vllm:request_decode_time_seconds_bucket{model="vllm_model",version="1",le="1"} 1
+...
+vllm:request_decode_time_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1
 # HELP vllm:request_prompt_tokens Number of prefill tokens processed.
 # TYPE vllm:request_prompt_tokens histogram
 vllm:request_prompt_tokens_count{model="vllm_model",version="1"} 1