
Commit 3ae056d

vllm metrics pulled from backend
Signed-off-by: Daiyaan <[email protected]>
1 parent ae4b08a commit 3ae056d

2 files changed: +209, -31 lines


components/src/dynamo/planner/utils/planner_core.py

Lines changed: 7 additions & 0 deletions
@@ -89,9 +89,16 @@ def __init__(
         else:
             raise ValueError(f"Invalid environment: {args.environment}")
 
+        # Use backend metrics for vLLM (queries vllm:* metrics directly from workers)
+        # Use frontend metrics for other backends (queries dynamo_frontend_* metrics)
+        metric_source = "backend" if args.backend.lower() == "vllm" else "frontend"
+        logger.info(
+            f"Initializing Prometheus client with metric_source='{metric_source}' for backend '{args.backend}'"
+        )
         self.prometheus_api_client = PrometheusAPIClient(
             SLAPlannerDefaults.prometheus_endpoint,
             args.namespace,
+            metric_source=metric_source,
         )
 
         self.num_req_predictor = LOAD_PREDICTORS[args.load_predictor](
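
Taken on its own, the new selection logic boils down to a single conditional. A minimal sketch of the same decision (the helper name below is hypothetical, not part of the commit):

    # Minimal sketch of the metric-source selection added above; helper name is hypothetical.
    def choose_metric_source(backend: str) -> str:
        # vLLM workers expose vllm:* metrics directly, so query the backend;
        # every other backend falls back to the frontend's dynamo_frontend_* metrics.
        return "backend" if backend.lower() == "vllm" else "frontend"

    assert choose_metric_source("vLLM") == "backend"
    assert choose_metric_source("sglang") == "frontend"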

components/src/dynamo/planner/utils/prometheus.py

Lines changed: 202 additions & 31 deletions
@@ -32,9 +32,11 @@ class FrontendMetric(BaseModel):
     endpoint: typing.Optional[str] = None
     instance: typing.Optional[str] = None
     job: typing.Optional[str] = None
-    model: typing.Optional[str] = None
-    namespace: typing.Optional[str] = None
-    pod: typing.Optional[str] = None
+    model: typing.Optional[str] = None  # Frontend uses this label
+    model_name: typing.Optional[str] = None  # Backend (vLLM) uses this label
+    namespace: typing.Optional[str] = None  # Kubernetes namespace
+    pod: typing.Optional[str] = None  # Pod name (used for backend filtering)
+    engine: typing.Optional[str] = None  # vLLM engine index
 
 
 class FrontendMetricContainer(BaseModel):
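
To illustrate why the model fields are all optional, a frontend sample and a vLLM worker sample carry different label sets; the values below are invented for illustration only:

    # Hypothetical Prometheus label sets (illustrative values, not taken from a real deployment).
    frontend_sample_labels = {
        "model": "llama-3-8b",              # frontend lowercases the model name
        "dynamo_namespace": "my-deployment",
        "pod": "frontend-abc12",
    }
    backend_sample_labels = {
        "model_name": "llama-3-8b",         # vLLM's label for the served model
        "engine": "0",                      # vLLM engine index
        "pod": "my-deployment-worker-0",    # pod name is what the namespace filter inspects
    }
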
@@ -43,9 +45,54 @@ class FrontendMetricContainer(BaseModel):
 
 
 class PrometheusAPIClient:
-    def __init__(self, url: str, dynamo_namespace: str):
+    """
+    Client for querying Dynamo metrics from Prometheus.
+
+    Supports querying both frontend and backend metrics:
+    - Frontend metrics: dynamo_frontend_* (from Dynamo HTTP frontend)
+    - Backend metrics: vllm:* (from vLLM engine workers)
+
+    Usage:
+        # Query frontend metrics (default)
+        frontend_client = PrometheusAPIClient(url="http://prometheus:9090",
+                                              dynamo_namespace="my-deployment")
+        ttft = frontend_client.get_avg_time_to_first_token("60s", "llama-3-8b")
+
+        # Query backend worker metrics
+        backend_client = PrometheusAPIClient(url="http://prometheus:9090",
+                                             dynamo_namespace="my-deployment",
+                                             metric_source="backend")
+        ttft = backend_client.get_avg_time_to_first_token("60s", "llama-3-8b")
+    """
+
+    # Metric name mapping for backend (vLLM) metrics
+    # Maps from frontend metric concept to actual vLLM metric name
+    BACKEND_METRIC_MAP = {
+        "time_to_first_token_seconds": "time_to_first_token_seconds",  # histogram
+        "inter_token_latency_seconds": "inter_token_latency_seconds",  # histogram
+        "request_duration_seconds": "e2e_request_latency_seconds",  # histogram - vLLM's e2e latency
+        "input_sequence_tokens": "prompt_tokens_total",  # counter - total prompt tokens
+        "output_sequence_tokens": "generation_tokens_total",  # counter - total generation tokens
+        "requests_total": "request_success_total",  # counter
+    }
+
+    def __init__(self, url: str, dynamo_namespace: str, metric_source: str = "frontend"):
+        """
+        Initialize Prometheus API client.
+
+        Args:
+            url: Prometheus server URL
+            dynamo_namespace: Dynamo namespace to filter metrics
+            metric_source: Either "frontend" or "backend".
+                "frontend" queries dynamo_frontend_* metrics
+                "backend" queries vllm:* metrics from workers
+        """
+        if metric_source not in ["frontend", "backend"]:
+            raise ValueError(f"metric_source must be 'frontend' or 'backend', got: {metric_source}")
+
         self.prom = PrometheusConnect(url=url, disable_ssl=True)
         self.dynamo_namespace = dynamo_namespace
+        self.metric_source = metric_source
 
     def _get_average_metric(
         self, full_metric_name: str, interval: str, operation_name: str, model_name: str
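
BACKEND_METRIC_MAP translates the planner's frontend-oriented metric concepts into the names vLLM actually exports. For example, resolving the request-duration concept:

    # Looking up the vLLM metric name for a frontend concept, using the map defined above.
    concept = "request_duration_seconds"
    vllm_metric = f"vllm:{PrometheusAPIClient.BACKEND_METRIC_MAP[concept]}"
    # vllm_metric == "vllm:e2e_request_latency_seconds"
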
@@ -55,19 +102,28 @@ def _get_average_metric(
         increase(metric_sum[interval])/increase(metric_count[interval])
 
         Args:
-            full_metric_name: Full metric name (e.g., 'dynamo_frontend_inter_token_latency_seconds')
+            full_metric_name: Full metric name (e.g., 'dynamo_frontend_inter_token_latency_seconds' or 'time_to_first_token_seconds')
             interval: Time interval for the query (e.g., '60s')
             operation_name: Human-readable name for error logging
+            model_name: Model name to filter by
 
         Returns:
             Average metric value or 0 if no data/error
         """
         try:
-            # Prepend the frontend metric prefix if not already present
-            if not full_metric_name.startswith(prometheus_names.name_prefix.FRONTEND):
-                full_metric_name = (
-                    f"{prometheus_names.name_prefix.FRONTEND}_{full_metric_name}"
-                )
+            # Apply metric prefix based on source
+            if self.metric_source == "frontend":
+                # Prepend the frontend metric prefix if not already present
+                if not full_metric_name.startswith(prometheus_names.name_prefix.FRONTEND):
+                    full_metric_name = (
+                        f"{prometheus_names.name_prefix.FRONTEND}_{full_metric_name}"
+                    )
+            else: # backend
+                # Backend uses vllm: prefix
+                # Check if it's already a full vllm metric name (from our mapping)
+                if not full_metric_name.startswith("vllm:"):
+                    full_metric_name = f"vllm:{full_metric_name}"
+
             query = f"increase({full_metric_name}_sum[{interval}])/increase({full_metric_name}_count[{interval}])"
             result = self.prom.custom_query(query=query)
             if not result:
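
With the prefix handling above, the same _sum/_count ratio expands to different PromQL depending on metric_source. Assuming the frontend prefix resolves to dynamo_frontend (as the class docstring suggests), the time-to-first-token query for a 60s window would look roughly like:

    interval = "60s"
    frontend_query = (
        f"increase(dynamo_frontend_time_to_first_token_seconds_sum[{interval}])"
        f"/increase(dynamo_frontend_time_to_first_token_seconds_count[{interval}])"
    )
    backend_query = (
        f"increase(vllm:time_to_first_token_seconds_sum[{interval}])"
        f"/increase(vllm:time_to_first_token_seconds_count[{interval}])"
    )
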
@@ -80,12 +136,27 @@ def _get_average_metric(
 
             values = []
             for container in metrics_containers:
-                # Frontend lowercases model names for Prometheus labels so we need to do case-insensitive comparison
-                if (
-                    container.metric.model
-                    and container.metric.model.lower() == model_name.lower()
-                    and container.metric.dynamo_namespace == self.dynamo_namespace
-                ):
+                # Determine which label to use for model filtering
+                if self.metric_source == "frontend":
+                    # Frontend uses 'model' label and lowercases model names
+                    model_match = (
+                        container.metric.model
+                        and container.metric.model.lower() == model_name.lower()
+                    )
+                    namespace_match = container.metric.dynamo_namespace == self.dynamo_namespace
+                else: # backend
+                    # Backend (vLLM) uses 'model_name' label - check both for compatibility
+                    backend_model = getattr(container.metric, 'model_name', container.metric.model)
+                    model_match = (
+                        backend_model
+                        and backend_model.lower() == model_name.lower()
+                    )
+                    # Backend metrics don't have dynamo_namespace, filter by pod name containing dynamo namespace
+                    pod_name = getattr(container.metric, 'pod', '')
+                    namespace_match = self.dynamo_namespace in pod_name if pod_name else True
+
+                # Filter by model and namespace
+                if model_match and namespace_match:
                     values.append(container.value[1])
 
             if not values:
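
The backend namespace check is a one-line conditional expression; spelled out, it is equivalent to:

    # Explicit form of: namespace_match = self.dynamo_namespace in pod_name if pod_name else True
    if pod_name:
        namespace_match = dynamo_namespace in pod_name  # keep samples whose pod belongs to this deployment
    else:
        namespace_match = True  # no pod label on the sample: do not filter it out
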
@@ -99,6 +170,64 @@ def _get_average_metric(
             logger.error(f"Error getting {operation_name}: {e}")
             return 0
 
+    def _get_counter_average(self, counter_metric: str, interval: str, model_name: str, operation_name: str) -> float:
+        """
+        Get average value from a counter metric by dividing total increase by request count increase.
+        Used for vLLM token counters (prompt_tokens_total, generation_tokens_total).
+
+        Formula: increase(counter_total[interval]) / increase(request_success_total[interval])
+        """
+        try:
+            full_metric_name = f"vllm:{counter_metric}"
+            requests_metric = "vllm:request_success_total"
+
+            # Query both the counter and request count
+            counter_query = f"increase({full_metric_name}[{interval}])"
+            requests_query = f"increase({requests_metric}[{interval}])"
+
+            counter_result = self.prom.custom_query(query=counter_query)
+            requests_result = self.prom.custom_query(query=requests_query)
+
+            if not counter_result or not requests_result:
+                logger.warning(
+                    f"No prometheus metric data available for {full_metric_name}, use 0 instead"
+                )
+                return 0
+
+            counter_containers = parse_frontend_metric_containers(counter_result)
+            requests_containers = parse_frontend_metric_containers(requests_result)
+
+            # Sum up values for matching pods
+            total_counter = 0.0
+            total_requests = 0.0
+
+            for container in counter_containers:
+                backend_model = getattr(container.metric, 'model_name', container.metric.model)
+                if backend_model and backend_model.lower() == model_name.lower():
+                    pod_name = getattr(container.metric, 'pod', '')
+                    if self.dynamo_namespace in pod_name if pod_name else True:
+                        total_counter += container.value[1]
+
+            for container in requests_containers:
+                backend_model = getattr(container.metric, 'model_name', container.metric.model)
+                if backend_model and backend_model.lower() == model_name.lower():
+                    pod_name = getattr(container.metric, 'pod', '')
+                    if self.dynamo_namespace in pod_name if pod_name else True:
+                        total_requests += container.value[1]
+
+            if total_requests == 0:
+                logger.warning(
+                    f"No requests for {operation_name} calculation, use 0 instead"
+                )
+                return 0
+
+            average = total_counter / total_requests
+            return average
+
+        except Exception as e:
+            logger.error(f"Error getting {operation_name}: {e}")
+            return 0
+
     def get_avg_inter_token_latency(self, interval: str, model_name: str):
         return self._get_average_metric(
             prometheus_names.frontend_service.INTER_TOKEN_LATENCY_SECONDS,
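
As a worked example of the counter-average formula with made-up numbers:

    # Hypothetical values: over the interval, vllm:prompt_tokens_total grew by 12000
    # and vllm:request_success_total grew by 50 across the matching pods.
    total_counter = 12000.0
    total_requests = 50.0
    avg_input_sequence_tokens = total_counter / total_requests  # 240.0 tokens per request
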
@@ -116,6 +245,15 @@ def get_avg_time_to_first_token(self, interval: str, model_name: str):
         )
 
     def get_avg_request_duration(self, interval: str, model_name: str):
+        if self.metric_source == "backend":
+            # Backend uses e2e_request_latency_seconds instead of request_duration_seconds
+            metric_name = self.BACKEND_METRIC_MAP["request_duration_seconds"]
+            return self._get_average_metric(
+                metric_name,
+                interval,
+                "avg e2e request latency",
+                model_name,
+            )
         return self._get_average_metric(
             prometheus_names.frontend_service.REQUEST_DURATION_SECONDS,
             interval,
@@ -124,35 +262,64 @@ def get_avg_request_duration(self, interval: str, model_name: str):
         )
 
     def get_avg_request_count(self, interval: str, model_name: str):
-        # This function follows a different query pattern than the other metrics
+        """
+        Get request count over the specified interval.
+
+        For frontend: queries dynamo_frontend_requests_total
+        For backend: queries vllm:request_success_total
+        """
         try:
-            requests_total_metric = prometheus_names.frontend_service.REQUESTS_TOTAL
-            # Prepend the frontend metric prefix if not already present
-            if not requests_total_metric.startswith(
-                prometheus_names.name_prefix.FRONTEND
-            ):
-                requests_total_metric = (
-                    f"{prometheus_names.name_prefix.FRONTEND}_{requests_total_metric}"
-                )
+            if self.metric_source == "frontend":
+                requests_total_metric = prometheus_names.frontend_service.REQUESTS_TOTAL
+                # Prepend the frontend metric prefix if not already present
+                if not requests_total_metric.startswith(
+                    prometheus_names.name_prefix.FRONTEND
+                ):
+                    requests_total_metric = (
+                        f"{prometheus_names.name_prefix.FRONTEND}_{requests_total_metric}"
+                    )
+            else: # backend
+                # Backend uses vllm:request_success_total
+                requests_total_metric = "vllm:request_success_total"
+
             raw_res = self.prom.custom_query(
                 query=f"increase({requests_total_metric}[{interval}])"
             )
             metrics_containers = parse_frontend_metric_containers(raw_res)
             total_count = 0.0
             for container in metrics_containers:
-                # Frontend lowercases model names for Prometheus labels so we need to do case-insensitive comparison
-                if (
-                    container.metric.model
-                    and container.metric.model.lower() == model_name.lower()
-                    and container.metric.dynamo_namespace == self.dynamo_namespace
-                ):
+                # Determine which label to use for model filtering
+                if self.metric_source == "frontend":
+                    # Frontend uses 'model' label and lowercases model names
+                    model_match = (
+                        container.metric.model
+                        and container.metric.model.lower() == model_name.lower()
+                    )
+                    namespace_match = container.metric.dynamo_namespace == self.dynamo_namespace
+                else: # backend
+                    # Backend (vLLM) uses 'model_name' label - check both for compatibility
+                    backend_model = getattr(container.metric, 'model_name', container.metric.model)
+                    model_match = (
+                        backend_model
+                        and backend_model.lower() == model_name.lower()
+                    )
+                    # Backend metrics don't have dynamo_namespace, filter by pod name containing dynamo namespace
+                    pod_name = getattr(container.metric, 'pod', '')
+                    namespace_match = self.dynamo_namespace in pod_name if pod_name else True
+
+                # Filter by model and namespace
+                if model_match and namespace_match:
                     total_count += container.value[1]
             return total_count
         except Exception as e:
             logger.error(f"Error getting avg request count: {e}")
             return 0
 
     def get_avg_input_sequence_tokens(self, interval: str, model_name: str):
+        if self.metric_source == "backend":
+            # Backend uses prompt_tokens counter (not histogram)
+            metric_name = self.BACKEND_METRIC_MAP["input_sequence_tokens"]
+            return self._get_counter_average(metric_name, interval, model_name, "input_sequence_tokens")
         return self._get_average_metric(
             prometheus_names.frontend_service.INPUT_SEQUENCE_TOKENS,
             interval,
@@ -161,6 +328,10 @@ def get_avg_input_sequence_tokens(self, interval: str, model_name: str):
         )
 
     def get_avg_output_sequence_tokens(self, interval: str, model_name: str):
+        if self.metric_source == "backend":
+            # Backend uses generation_tokens counter (not histogram)
+            metric_name = self.BACKEND_METRIC_MAP["output_sequence_tokens"]
+            return self._get_counter_average(metric_name, interval, model_name, "output_sequence_tokens")
         return self._get_average_metric(
             prometheus_names.frontend_service.OUTPUT_SEQUENCE_TOKENS,
             interval,
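
Putting the pieces together, a backend-mode client could be exercised roughly like this. The import path is inferred from the file location, and the endpoint, namespace, and model name are placeholders:

    from dynamo.planner.utils.prometheus import PrometheusAPIClient

    client = PrometheusAPIClient(
        url="http://prometheus:9090",      # placeholder Prometheus endpoint
        dynamo_namespace="my-deployment",  # placeholder Dynamo namespace
        metric_source="backend",           # query vllm:* worker metrics
    )

    interval, model = "60s", "llama-3-8b"
    ttft = client.get_avg_time_to_first_token(interval, model)
    itl = client.get_avg_inter_token_latency(interval, model)
    e2e = client.get_avg_request_duration(interval, model)        # vllm:e2e_request_latency_seconds
    isl = client.get_avg_input_sequence_tokens(interval, model)   # counter-based in backend mode
    osl = client.get_avg_output_sequence_tokens(interval, model)  # counter-based in backend mode
    reqs = client.get_avg_request_count(interval, model)          # vllm:request_success_total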
