|
| 1 | +from aioprometheus import Gauge |
| 2 | + |
| 3 | +# The begin-* and end* here are used by the documentation generator |
| 4 | +# to extract the metrics definitions. |
| 5 | + |
# begin-metrics-definitions
gauge_avg_prompt_throughput = Gauge("vllm:avg_prompt_throughput_toks_per_s",
                                    "Average prefill throughput in tokens/s.")
gauge_avg_generation_throughput = Gauge(
    "vllm:avg_generation_throughput_toks_per_s",
    "Average generation throughput in tokens/s.")

# Scheduler queue depths, sampled each engine step.
gauge_scheduler_running = Gauge(
    "vllm:num_requests_running",
    # Fixed grammar in the help text ("that is" -> "that are").
    "Number of requests that are currently running for inference.")
gauge_scheduler_swapped = Gauge("vllm:num_requests_swapped",
                                # Fixed grammar ("Number requests" -> "Number of requests").
                                "Number of requests swapped to CPU.")
gauge_scheduler_waiting = Gauge("vllm:num_requests_waiting",
                                "Number of requests waiting to be processed.")

# KV-cache utilization, reported as a fraction in [0, 1].
gauge_gpu_cache_usage = Gauge(
    "vllm:gpu_cache_usage_perc",
    "GPU KV-cache usage. 1 means 100 percent usage.")
gauge_cpu_cache_usage = Gauge(
    "vllm:cpu_cache_usage_perc",
    "CPU KV-cache usage. 1 means 100 percent usage.")
# end-metrics-definitions
| 28 | + |
# Labels attached to every metric sample when it is recorded.
labels = {}


def add_global_metrics_labels(**kwargs):
    """Merge the given keyword arguments into the global metrics labels.

    A later call with the same label name overwrites the earlier value.
    """
    for label_name, label_value in kwargs.items():
        labels[label_name] = label_value
| 34 | + |
| 35 | + |
def record_metrics(
    avg_prompt_throughput: float,
    avg_generation_throughput: float,
    scheduler_running: int,
    scheduler_swapped: int,
    scheduler_waiting: int,
    gpu_cache_usage: float,
    cpu_cache_usage: float,
):
    """Publish one sample of each engine metric, tagged with the global labels.

    Each value is written to its module-level gauge via ``Gauge.set``; the
    shared ``labels`` dict set up by ``add_global_metrics_labels`` is applied
    to every sample.
    """
    # Pair each gauge with its new value so a single loop does all the sets.
    samples = (
        (gauge_avg_prompt_throughput, avg_prompt_throughput),
        (gauge_avg_generation_throughput, avg_generation_throughput),
        (gauge_scheduler_running, scheduler_running),
        (gauge_scheduler_swapped, scheduler_swapped),
        (gauge_scheduler_waiting, scheduler_waiting),
        (gauge_gpu_cache_usage, gpu_cache_usage),
        (gauge_cpu_cache_usage, cpu_cache_usage),
    )
    for gauge, value in samples:
        gauge.set(labels, value)