
Commit 1a1b66e

update readme
1 parent b87a1cd commit 1a1b66e

2 files changed: +85 −2 lines changed

README.md

Lines changed: 85 additions & 0 deletions
@@ -227,12 +227,24 @@ VLLM stats are reported by the metrics endpoint in fields that are prefixed with
 counter_prompt_tokens
 # Number of generation tokens processed.
 counter_generation_tokens
+# Number of preemption tokens processed.
+counter_preemption_tokens
+# Histogram of number of tokens per engine_step.
+histogram_iteration_tokens
 # Histogram of time to first token in seconds.
 histogram_time_to_first_token
 # Histogram of time per output token in seconds.
 histogram_time_per_output_token
 # Histogram of end to end request latency in seconds.
 histogram_e2e_time_request
+# Histogram of time spent in WAITING phase for request.
+histogram_queue_time_request
+# Histogram of time spent in RUNNING phase for request.
+histogram_inference_time_request
+# Histogram of time spent in PREFILL phase for request.
+histogram_prefill_time_request
+# Histogram of time spent in DECODE phase for request.
+histogram_decode_time_request
 # Number of prefill tokens processed.
 histogram_num_prompt_tokens_request
 # Number of generation tokens processed.
@@ -241,6 +253,20 @@ histogram_num_generation_tokens_request
 histogram_best_of_request
 # Histogram of the n request parameter.
 histogram_n_request
+# Number of requests currently running on GPU.
+gauge_scheduler_running
+# Number of requests waiting to be processed.
+gauge_scheduler_waiting
+# Number of requests swapped to CPU.
+gauge_scheduler_swapped
+# GPU KV-cache usage. 1 means 100 percent usage.
+gauge_gpu_cache_usage
+# CPU KV-cache usage. 1 means 100 percent usage.
+gauge_cpu_cache_usage
+# CPU prefix cache block hit rate.
+gauge_cpu_prefix_cache_hit_rate
+# GPU prefix cache block hit rate.
+gauge_gpu_prefix_cache_hit_rate
 ```
 Your output for these fields should look similar to the following:
 ```bash
@@ -250,6 +276,37 @@ vllm:prompt_tokens_total{model="vllm_model",version="1"} 10
 # HELP vllm:generation_tokens_total Number of generation tokens processed.
 # TYPE vllm:generation_tokens_total counter
 vllm:generation_tokens_total{model="vllm_model",version="1"} 16
+# HELP vllm:num_preemptions_total Number of preemption tokens processed.
+# TYPE vllm:num_preemptions_total counter
+vllm:num_preemptions_total{model="vllm_model",version="1"} 0
+# HELP vllm:num_requests_running Number of requests currently running on GPU.
+# TYPE vllm:num_requests_running gauge
+vllm:num_requests_running{model="vllm_model",version="1"} 0
+# HELP vllm:num_requests_waiting Number of requests waiting to be processed.
+# TYPE vllm:num_requests_waiting gauge
+vllm:num_requests_waiting{model="vllm_model",version="1"} 0
+# HELP vllm:num_requests_swapped Number of requests swapped to CPU.
+# TYPE vllm:num_requests_swapped gauge
+vllm:num_requests_swapped{model="vllm_model",version="1"} 0
+# HELP vllm:gpu_cache_usage_perc Gauge of gpu cache usage. 1 means 100 percent usage.
+# TYPE vllm:gpu_cache_usage_perc gauge
+vllm:gpu_cache_usage_perc{model="vllm_model",version="1"} 0
+# HELP vllm:cpu_cache_usage_perc Gauge of cpu cache usage. 1 means 100 percent usage.
+# TYPE vllm:cpu_cache_usage_perc gauge
+vllm:cpu_cache_usage_perc{model="vllm_model",version="1"} 0
+# HELP vllm:cpu_prefix_cache_hit_rate CPU prefix cache block hit rate.
+# TYPE vllm:cpu_prefix_cache_hit_rate gauge
+vllm:cpu_prefix_cache_hit_rate{model="vllm_model",version="1"} -1
+# HELP vllm:gpu_prefix_cache_hit_rate GPU prefix cache block hit rate.
+# TYPE vllm:gpu_prefix_cache_hit_rate gauge
+vllm:gpu_prefix_cache_hit_rate{model="vllm_model",version="1"} -1
+# HELP vllm:iteration_tokens_total Histogram of number of tokens per engine_step.
+# TYPE vllm:iteration_tokens_total histogram
+vllm:iteration_tokens_total_count{model="vllm_model",version="1"} 10
+vllm:iteration_tokens_total_sum{model="vllm_model",version="1"} 12
+vllm:iteration_tokens_total_bucket{model="vllm_model",version="1",le="1"} 9
+...
+vllm:iteration_tokens_total_bucket{model="vllm_model",version="1",le="+Inf"} 10
 # HELP vllm:time_to_first_token_seconds Histogram of time to first token in seconds.
 # TYPE vllm:time_to_first_token_seconds histogram
 vllm:time_to_first_token_seconds_count{model="vllm_model",version="1"} 1
@@ -271,6 +328,34 @@ vllm:e2e_request_latency_seconds_sum{model="vllm_model",version="1"} 0.086861848
 vllm:e2e_request_latency_seconds_bucket{model="vllm_model",version="1",le="1"} 1
 ...
 vllm:e2e_request_latency_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1
+# HELP vllm:request_queue_time_seconds Histogram of time spent in WAITING phase for request.
+# TYPE vllm:request_queue_time_seconds histogram
+vllm:request_queue_time_seconds_count{model="vllm_model",version="1"} 1
+vllm:request_queue_time_seconds_sum{model="vllm_model",version="1"} 0.0045166015625
+vllm:request_queue_time_seconds_bucket{model="vllm_model",version="1",le="1"} 1
+...
+vllm:request_queue_time_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1
+# HELP vllm:request_inference_time_seconds Histogram of time spent in RUNNING phase for request.
+# TYPE vllm:request_inference_time_seconds histogram
+vllm:request_inference_time_seconds_count{model="vllm_model",version="1"} 1
+vllm:request_inference_time_seconds_sum{model="vllm_model",version="1"} 0.1418392658233643
+vllm:request_inference_time_seconds_bucket{model="vllm_model",version="1",le="1"} 1
+...
+vllm:request_inference_time_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1
+# HELP vllm:request_prefill_time_seconds Histogram of time spent in PREFILL phase for request.
+# TYPE vllm:request_prefill_time_seconds histogram
+vllm:request_prefill_time_seconds_count{model="vllm_model",version="1"} 1
+vllm:request_prefill_time_seconds_sum{model="vllm_model",version="1"} 0.05302977561950684
+vllm:request_prefill_time_seconds_bucket{model="vllm_model",version="1",le="1"} 1
+...
+vllm:request_prefill_time_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1
+# HELP vllm:request_decode_time_seconds Histogram of time spent in DECODE phase for request.
+# TYPE vllm:request_decode_time_seconds histogram
+vllm:request_decode_time_seconds_count{model="vllm_model",version="1"} 1
+vllm:request_decode_time_seconds_sum{model="vllm_model",version="1"} 0.08880949020385742
+vllm:request_decode_time_seconds_bucket{model="vllm_model",version="1",le="1"} 1
+...
+vllm:request_decode_time_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1
 # HELP vllm:request_prompt_tokens Number of prefill tokens processed.
 # TYPE vllm:request_prompt_tokens histogram
 vllm:request_prompt_tokens_count{model="vllm_model",version="1"} 1
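For a quick check outside of Prometheus, the fields above can also be scraped directly over HTTP. Below is a minimal sketch, assuming Triton's default metrics address of localhost:8002/metrics (adjust host and port to your deployment); the helper name is illustrative and not part of the backend:

```python
# Sketch: fetch Triton's metrics endpoint and print only the vllm:-prefixed series.
# Assumes Prometheus-format metrics are exposed at localhost:8002/metrics.
from urllib.request import urlopen

METRICS_URL = "http://localhost:8002/metrics"  # assumed default; adjust as needed


def vllm_metrics(url: str = METRICS_URL) -> dict:
    """Return a {series: value} dict for every vllm:* sample line."""
    text = urlopen(url).read().decode("utf-8")
    samples = {}
    for line in text.splitlines():
        # Skip "# HELP" / "# TYPE" comments and non-vLLM metric families.
        if line.startswith("#") or not line.startswith("vllm:"):
            continue
        series, _, value = line.rpartition(" ")
        samples[series] = float(value)
    return samples


if __name__ == "__main__":
    for series, value in sorted(vllm_metrics().items()):
        print(series, "=", value)
```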

src/utils/metrics.py

Lines changed: 0 additions & 2 deletions
@@ -29,7 +29,6 @@
 from typing import Dict, List, Union

 import triton_python_backend_utils as pb_utils
-
 from vllm.config import VllmConfig
 from vllm.engine.metrics import StatLoggerBase as VllmStatLoggerBase
 from vllm.engine.metrics import Stats as VllmStats
@@ -438,4 +437,3 @@ def finalize(self):
         if self._logger_thread is not None:
             self._logger_thread.join()
             self._logger_thread = None
-            self._logger_thread = None
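The hunk above drops a duplicated `self._logger_thread = None`; what remains is the standard join-once cleanup for a background logger thread. A minimal sketch of that pattern follows (the class name and worker target are hypothetical stand-ins, not the backend's actual logger):

```python
# Sketch of the join-once cleanup pattern kept by this commit. Only the shape
# of finalize() mirrors the diff; everything else here is illustrative.
import threading


class LoggerLifecycleSketch:
    def __init__(self):
        # Background worker standing in for the metrics logger thread.
        self._logger_thread = threading.Thread(target=lambda: None, daemon=True)
        self._logger_thread.start()

    def finalize(self):
        # Join the thread once, then drop the reference so repeated finalize()
        # calls become harmless no-ops; a second assignment to None adds nothing.
        if self._logger_thread is not None:
            self._logger_thread.join()
            self._logger_thread = None
```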
