Commit 5313c2c

Add Production Metrics in Prometheus format (#1890)
1 parent 5f09cbd commit 5313c2c

6 files changed: +89 −2 lines

docs/source/index.rst

Lines changed: 1 addition & 0 deletions
@@ -67,6 +67,7 @@ Documentation
    serving/deploying_with_triton
    serving/deploying_with_docker
    serving/serving_with_langchain
+   serving/metrics
 
 .. toctree::
    :maxdepth: 1

docs/source/serving/metrics.rst

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+Production Metrics
+==================
+
+vLLM exposes a number of metrics that can be used to monitor the health of the
+system. These metrics are exposed via the `/metrics` endpoint on the vLLM
+OpenAI compatible API server.
+
+The following metrics are exposed:
+
+.. literalinclude:: ../../../vllm/engine/metrics.py
+    :language: python
+    :start-after: begin-metrics-definitions
+    :end-before: end-metrics-definitions
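
For a quick check that the endpoint is live, a minimal sketch (not part of this commit, and assuming the OpenAI-compatible server is running locally on its default port 8000):

    # Hypothetical smoke test: fetch the Prometheus exposition text from the
    # new /metrics endpoint and print only the vLLM gauges defined in this commit.
    from urllib.request import urlopen

    with urlopen("http://localhost:8000/metrics") as resp:
        body = resp.read().decode()

    for line in body.splitlines():
        if line.startswith("vllm:"):
            print(line)

Each gauge appears in the response body as a standard Prometheus exposition line, e.g. vllm:num_requests_running{model_name="..."} 0.0.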

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -12,3 +12,4 @@ xformers >= 0.0.22.post7 # Required for CUDA 12.1.
 fastapi
 uvicorn[standard]
 pydantic == 1.10.13 # Required for OpenAI server.
+aioprometheus[starlette]

vllm/engine/llm_engine.py

Lines changed: 13 additions & 2 deletions
@@ -7,6 +7,7 @@
                          SchedulerConfig)
 from vllm.core.scheduler import Scheduler, SchedulerOutputs
 from vllm.engine.arg_utils import EngineArgs
+from vllm.engine.metrics import record_metrics
 from vllm.engine.ray_utils import RayWorkerVllm, initialize_cluster, ray
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
@@ -591,8 +592,8 @@ def _log_system_stats(
         else:
             self.num_generation_tokens.append((now, num_batched_tokens))
 
-        elapsed_time = now - self.last_logging_time
-        if elapsed_time < _LOGGING_INTERVAL_SEC:
+        should_log = now - self.last_logging_time >= _LOGGING_INTERVAL_SEC
+        if not should_log:
             return
 
         # Discard the old stats.
@@ -631,6 +632,16 @@ def _log_system_stats(
         else:
             cpu_cache_usage = 0.0
 
+        record_metrics(
+            avg_prompt_throughput=avg_prompt_throughput,
+            avg_generation_throughput=avg_generation_throughput,
+            scheduler_running=len(self.scheduler.running),
+            scheduler_swapped=len(self.scheduler.swapped),
+            scheduler_waiting=len(self.scheduler.waiting),
+            gpu_cache_usage=gpu_cache_usage,
+            cpu_cache_usage=cpu_cache_usage,
+        )
+
         logger.info("Avg prompt throughput: "
                     f"{avg_prompt_throughput:.1f} tokens/s, "
                     "Avg generation throughput: "

vllm/engine/metrics.py

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
+from aioprometheus import Gauge
+
+# The begin-* and end* here are used by the documentation generator
+# to extract the metrics definitions.
+
+# begin-metrics-definitions
+gauge_avg_prompt_throughput = Gauge("vllm:avg_prompt_throughput_toks_per_s",
+                                    "Average prefill throughput in tokens/s.")
+gauge_avg_generation_throughput = Gauge(
+    "vllm:avg_generation_throughput_toks_per_s",
+    "Average generation throughput in tokens/s.")
+
+gauge_scheduler_running = Gauge(
+    "vllm:num_requests_running",
+    "Number of requests that is currently running for inference.")
+gauge_scheduler_swapped = Gauge("vllm:num_requests_swapped",
+                                "Number requests swapped to CPU.")
+gauge_scheduler_waiting = Gauge("vllm:num_requests_waiting",
+                                "Number of requests waiting to be processed.")
+
+gauge_gpu_cache_usage = Gauge(
+    "vllm:gpu_cache_usage_perc",
+    "GPU KV-cache usage. 1 means 100 percent usage.")
+gauge_cpu_cache_usage = Gauge(
+    "vllm:cpu_cache_usage_perc",
+    "CPU KV-cache usage. 1 means 100 percent usage.")
+# end-metrics-definitions
+
+labels = {}
+
+
+def add_global_metrics_labels(**kwargs):
+    labels.update(kwargs)
+
+
+def record_metrics(
+    avg_prompt_throughput: float,
+    avg_generation_throughput: float,
+    scheduler_running: int,
+    scheduler_swapped: int,
+    scheduler_waiting: int,
+    gpu_cache_usage: float,
+    cpu_cache_usage: float,
+):
+    gauge_avg_prompt_throughput.set(labels, avg_prompt_throughput)
+    gauge_avg_generation_throughput.set(labels, avg_generation_throughput)
+    gauge_scheduler_running.set(labels, scheduler_running)
+    gauge_scheduler_swapped.set(labels, scheduler_swapped)
+    gauge_scheduler_waiting.set(labels, scheduler_waiting)
+    gauge_gpu_cache_usage.set(labels, gpu_cache_usage)
+    gauge_cpu_cache_usage.set(labels, cpu_cache_usage)
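
A usage sketch of the helpers above (the stats and model name are made up; in this commit the label is set once by the API server and record_metrics is driven by LLMEngine._log_system_stats):

    # Illustrative only: how the label and gauge helpers fit together.
    from vllm.engine.metrics import add_global_metrics_labels, record_metrics

    # Registered once at startup; every gauge sample then carries this label.
    add_global_metrics_labels(model_name="facebook/opt-125m")  # hypothetical model

    # The engine passes the same values it already logs to the console.
    record_metrics(
        avg_prompt_throughput=512.0,
        avg_generation_throughput=128.0,
        scheduler_running=4,
        scheduler_swapped=0,
        scheduler_waiting=2,
        gpu_cache_usage=0.35,
        cpu_cache_usage=0.0,
    )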

vllm/entrypoints/openai/api_server.py

Lines changed: 10 additions & 0 deletions
@@ -9,6 +9,8 @@
 from http import HTTPStatus
 from typing import AsyncGenerator, Dict, List, Optional, Tuple, Union
 
+from aioprometheus import MetricsMiddleware
+from aioprometheus.asgi.starlette import metrics
 import fastapi
 import uvicorn
 from fastapi import Request
@@ -18,6 +20,7 @@
 
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.engine.metrics import add_global_metrics_labels
 from vllm.entrypoints.openai.protocol import (
     CompletionRequest, CompletionResponse, CompletionResponseChoice,
     CompletionResponseStreamChoice, CompletionStreamResponse,
@@ -82,6 +85,10 @@ def parse_args():
     return parser.parse_args()
 
 
+app.add_middleware(MetricsMiddleware)  # Trace HTTP server metrics
+app.add_route("/metrics", metrics)  # Exposes HTTP metrics
+
+
 def create_error_response(status_code: HTTPStatus,
                           message: str) -> JSONResponse:
     return JSONResponse(ErrorResponse(message=message,
@@ -722,6 +729,9 @@ async def fake_stream_generator() -> AsyncGenerator[str, None]:
         trust_remote_code=engine_model_config.trust_remote_code)
     load_chat_template(args, tokenizer)
 
+    # Register labels for metrics
+    add_global_metrics_labels(model_name=engine_args.model)
+
     uvicorn.run(app,
                 host=args.host,
                 port=args.port,
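
The same two aioprometheus calls can be reproduced on a bare FastAPI app; a minimal standalone sketch (not part of the commit, assuming aioprometheus[starlette] is installed as added to requirements.txt above):

    # Standalone sketch: the middleware records per-route HTTP metrics and
    # the /metrics route serves the Prometheus exposition text.
    import fastapi
    import uvicorn
    from aioprometheus import MetricsMiddleware
    from aioprometheus.asgi.starlette import metrics

    app = fastapi.FastAPI()
    app.add_middleware(MetricsMiddleware)
    app.add_route("/metrics", metrics)

    if __name__ == "__main__":
        uvicorn.run(app, host="0.0.0.0", port=8000)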
