@@ -32,9 +32,11 @@ class FrontendMetric(BaseModel):
3232 endpoint : typing .Optional [str ] = None
3333 instance : typing .Optional [str ] = None
3434 job : typing .Optional [str ] = None
35- model : typing .Optional [str ] = None
36- namespace : typing .Optional [str ] = None
37- pod : typing .Optional [str ] = None
35+ model : typing .Optional [str ] = None # Frontend uses this label
36+ model_name : typing .Optional [str ] = None # Backend (vLLM) uses this label
37+ namespace : typing .Optional [str ] = None # Kubernetes namespace
38+ pod : typing .Optional [str ] = None # Pod name (used for backend filtering)
39+ engine : typing .Optional [str ] = None # vLLM engine index
3840
3941
4042class FrontendMetricContainer (BaseModel ):
@@ -43,9 +45,54 @@ class FrontendMetricContainer(BaseModel):
4345
4446
4547class PrometheusAPIClient :
46- def __init__ (self , url : str , dynamo_namespace : str ):
48+ """
49+ Client for querying Dynamo metrics from Prometheus.
50+
51+ Supports querying both frontend and backend metrics:
52+ - Frontend metrics: dynamo_frontend_* (from Dynamo HTTP frontend)
53+ - Backend metrics: vllm:* (from vLLM engine workers)
54+
55+ Usage:
56+ # Query frontend metrics (default)
57+ frontend_client = PrometheusAPIClient(url="http://prometheus:9090",
58+ dynamo_namespace="my-deployment")
59+ ttft = frontend_client.get_avg_time_to_first_token("60s", "llama-3-8b")
60+
61+ # Query backend worker metrics
62+ backend_client = PrometheusAPIClient(url="http://prometheus:9090",
63+ dynamo_namespace="my-deployment",
64+ metric_source="backend")
65+ ttft = backend_client.get_avg_time_to_first_token("60s", "llama-3-8b")
66+ """
67+
68+ # Metric name mapping for backend (vLLM) metrics
69+ # Maps from frontend metric concept to actual vLLM metric name
70+ BACKEND_METRIC_MAP = {
71+ "time_to_first_token_seconds" : "time_to_first_token_seconds" , # histogram
72+ "inter_token_latency_seconds" : "inter_token_latency_seconds" , # histogram
73+ "request_duration_seconds" : "e2e_request_latency_seconds" , # histogram - vLLM's e2e latency
74+ "input_sequence_tokens" : "prompt_tokens_total" , # counter - total prompt tokens
75+ "output_sequence_tokens" : "generation_tokens_total" , # counter - total generation tokens
76+ "requests_total" : "request_success_total" , # counter
77+ }
78+
79+ def __init__ (self , url : str , dynamo_namespace : str , metric_source : str = "frontend" ):
80+ """
81+ Initialize Prometheus API client.
82+
83+ Args:
84+ url: Prometheus server URL
85+ dynamo_namespace: Dynamo namespace to filter metrics
86+ metric_source: Either "frontend" or "backend".
87+ "frontend" queries dynamo_frontend_* metrics
88+ "backend" queries vllm:* metrics from workers
89+ """
90+ if metric_source not in ["frontend" , "backend" ]:
91+ raise ValueError (f"metric_source must be 'frontend' or 'backend', got: { metric_source } " )
92+
4793 self .prom = PrometheusConnect (url = url , disable_ssl = True )
4894 self .dynamo_namespace = dynamo_namespace
95+ self .metric_source = metric_source
4996
5097 def _get_average_metric (
5198 self , full_metric_name : str , interval : str , operation_name : str , model_name : str
@@ -55,19 +102,28 @@ def _get_average_metric(
55102 increase(metric_sum[interval])/increase(metric_count[interval])
56103
57104 Args:
58- full_metric_name: Full metric name (e.g., 'dynamo_frontend_inter_token_latency_seconds')
105+ full_metric_name: Full metric name (e.g., 'dynamo_frontend_inter_token_latency_seconds' or 'time_to_first_token_seconds' )
59106 interval: Time interval for the query (e.g., '60s')
60107 operation_name: Human-readable name for error logging
108+ model_name: Model name to filter by
61109
62110 Returns:
63111 Average metric value or 0 if no data/error
64112 """
65113 try :
66- # Prepend the frontend metric prefix if not already present
67- if not full_metric_name .startswith (prometheus_names .name_prefix .FRONTEND ):
68- full_metric_name = (
69- f"{ prometheus_names .name_prefix .FRONTEND } _{ full_metric_name } "
70- )
114+ # Apply metric prefix based on source
115+ if self .metric_source == "frontend" :
116+ # Prepend the frontend metric prefix if not already present
117+ if not full_metric_name .startswith (prometheus_names .name_prefix .FRONTEND ):
118+ full_metric_name = (
119+ f"{ prometheus_names .name_prefix .FRONTEND } _{ full_metric_name } "
120+ )
121+ else : # backend
122+ # Backend uses vllm: prefix
123+ # Check if it's already a full vllm metric name (from our mapping)
124+ if not full_metric_name .startswith ("vllm:" ):
125+ full_metric_name = f"vllm:{ full_metric_name } "
126+
71127 query = f"increase({ full_metric_name } _sum[{ interval } ])/increase({ full_metric_name } _count[{ interval } ])"
72128 result = self .prom .custom_query (query = query )
73129 if not result :
@@ -80,12 +136,27 @@ def _get_average_metric(
80136
81137 values = []
82138 for container in metrics_containers :
83- # Frontend lowercases model names for Prometheus labels so we need to do case-insensitive comparison
84- if (
85- container .metric .model
86- and container .metric .model .lower () == model_name .lower ()
87- and container .metric .dynamo_namespace == self .dynamo_namespace
88- ):
139+ # Determine which label to use for model filtering
140+ if self .metric_source == "frontend" :
141+ # Frontend uses 'model' label and lowercases model names
142+ model_match = (
143+ container .metric .model
144+ and container .metric .model .lower () == model_name .lower ()
145+ )
146+ namespace_match = container .metric .dynamo_namespace == self .dynamo_namespace
147+ else : # backend
148+ # Backend (vLLM) uses 'model_name' label - check both for compatibility
149+ backend_model = getattr (container .metric , 'model_name' , container .metric .model )
150+ model_match = (
151+ backend_model
152+ and backend_model .lower () == model_name .lower ()
153+ )
154+ # Backend metrics don't have dynamo_namespace, filter by pod name containing dynamo namespace
155+ pod_name = getattr (container .metric , 'pod' , '' )
156+ namespace_match = self .dynamo_namespace in pod_name if pod_name else True
157+
158+ # Filter by model and namespace
159+ if model_match and namespace_match :
89160 values .append (container .value [1 ])
90161
91162 if not values :
@@ -99,6 +170,64 @@ def _get_average_metric(
99170 logger .error (f"Error getting { operation_name } : { e } " )
100171 return 0
101172
173+ def _get_counter_average (self , counter_metric : str , interval : str , model_name : str , operation_name : str ) -> float :
174+ """
175+ Get average value from a counter metric by dividing total increase by request count increase.
176+ Used for vLLM token counters (prompt_tokens_total, generation_tokens_total).
177+
178+ Formula: increase(counter_total[interval]) / increase(request_success_total[interval])
179+ """
180+ try :
181+ full_metric_name = f"vllm:{ counter_metric } "
182+ requests_metric = "vllm:request_success_total"
183+
184+ # Query both the counter and request count
185+ counter_query = f"increase({ full_metric_name } [{ interval } ])"
186+ requests_query = f"increase({ requests_metric } [{ interval } ])"
187+
188+ counter_result = self .prom .custom_query (query = counter_query )
189+ requests_result = self .prom .custom_query (query = requests_query )
190+
191+ if not counter_result or not requests_result :
192+ logger .warning (
193+ f"No prometheus metric data available for { full_metric_name } , use 0 instead"
194+ )
195+ return 0
196+
197+ counter_containers = parse_frontend_metric_containers (counter_result )
198+ requests_containers = parse_frontend_metric_containers (requests_result )
199+
200+ # Sum up values for matching pods
201+ total_counter = 0.0
202+ total_requests = 0.0
203+
204+ for container in counter_containers :
205+ backend_model = getattr (container .metric , 'model_name' , container .metric .model )
206+ if backend_model and backend_model .lower () == model_name .lower ():
207+ pod_name = getattr (container .metric , 'pod' , '' )
208+ if self .dynamo_namespace in pod_name if pod_name else True :
209+ total_counter += container .value [1 ]
210+
211+ for container in requests_containers :
212+ backend_model = getattr (container .metric , 'model_name' , container .metric .model )
213+ if backend_model and backend_model .lower () == model_name .lower ():
214+ pod_name = getattr (container .metric , 'pod' , '' )
215+ if self .dynamo_namespace in pod_name if pod_name else True :
216+ total_requests += container .value [1 ]
217+
218+ if total_requests == 0 :
219+ logger .warning (
220+ f"No requests for { operation_name } calculation, use 0 instead"
221+ )
222+ return 0
223+
224+ average = total_counter / total_requests
225+ return average
226+
227+ except Exception as e :
228+ logger .error (f"Error getting { operation_name } : { e } " )
229+ return 0
230+
102231 def get_avg_inter_token_latency (self , interval : str , model_name : str ):
103232 return self ._get_average_metric (
104233 prometheus_names .frontend_service .INTER_TOKEN_LATENCY_SECONDS ,
@@ -116,6 +245,15 @@ def get_avg_time_to_first_token(self, interval: str, model_name: str):
116245 )
117246
118247 def get_avg_request_duration (self , interval : str , model_name : str ):
248+ if self .metric_source == "backend" :
249+ # Backend uses e2e_request_latency_seconds instead of request_duration_seconds
250+ metric_name = self .BACKEND_METRIC_MAP ["request_duration_seconds" ]
251+ return self ._get_average_metric (
252+ metric_name ,
253+ interval ,
254+ "avg e2e request latency" ,
255+ model_name ,
256+ )
119257 return self ._get_average_metric (
120258 prometheus_names .frontend_service .REQUEST_DURATION_SECONDS ,
121259 interval ,
@@ -124,35 +262,64 @@ def get_avg_request_duration(self, interval: str, model_name: str):
124262 )
125263
126264 def get_avg_request_count (self , interval : str , model_name : str ):
127- # This function follows a different query pattern than the other metrics
265+ """
266+ Get request count over the specified interval.
267+
268+ For frontend: queries dynamo_frontend_requests_total
269+ For backend: queries vllm:request_success_total
270+ """
128271 try :
129- requests_total_metric = prometheus_names .frontend_service .REQUESTS_TOTAL
130- # Prepend the frontend metric prefix if not already present
131- if not requests_total_metric .startswith (
132- prometheus_names .name_prefix .FRONTEND
133- ):
134- requests_total_metric = (
135- f"{ prometheus_names .name_prefix .FRONTEND } _{ requests_total_metric } "
136- )
272+ if self .metric_source == "frontend" :
273+ requests_total_metric = prometheus_names .frontend_service .REQUESTS_TOTAL
274+ # Prepend the frontend metric prefix if not already present
275+ if not requests_total_metric .startswith (
276+ prometheus_names .name_prefix .FRONTEND
277+ ):
278+ requests_total_metric = (
279+ f"{ prometheus_names .name_prefix .FRONTEND } _{ requests_total_metric } "
280+ )
281+ else : # backend
282+ # Backend uses vllm:request_success_total
283+ requests_total_metric = "vllm:request_success_total"
284+
137285 raw_res = self .prom .custom_query (
138286 query = f"increase({ requests_total_metric } [{ interval } ])"
139287 )
140288 metrics_containers = parse_frontend_metric_containers (raw_res )
141289 total_count = 0.0
142290 for container in metrics_containers :
143- # Frontend lowercases model names for Prometheus labels so we need to do case-insensitive comparison
144- if (
145- container .metric .model
146- and container .metric .model .lower () == model_name .lower ()
147- and container .metric .dynamo_namespace == self .dynamo_namespace
148- ):
291+ # Determine which label to use for model filtering
292+ if self .metric_source == "frontend" :
293+ # Frontend uses 'model' label and lowercases model names
294+ model_match = (
295+ container .metric .model
296+ and container .metric .model .lower () == model_name .lower ()
297+ )
298+ namespace_match = container .metric .dynamo_namespace == self .dynamo_namespace
299+ else : # backend
300+ # Backend (vLLM) uses 'model_name' label - check both for compatibility
301+ backend_model = getattr (container .metric , 'model_name' , container .metric .model )
302+ model_match = (
303+ backend_model
304+ and backend_model .lower () == model_name .lower ()
305+ )
306+ # Backend metrics don't have dynamo_namespace, filter by pod name containing dynamo namespace
307+ pod_name = getattr (container .metric , 'pod' , '' )
308+ namespace_match = self .dynamo_namespace in pod_name if pod_name else True
309+
310+ # Filter by model and namespace
311+ if model_match and namespace_match :
149312 total_count += container .value [1 ]
150313 return total_count
151314 except Exception as e :
152315 logger .error (f"Error getting avg request count: { e } " )
153316 return 0
154317
155318 def get_avg_input_sequence_tokens (self , interval : str , model_name : str ):
319+ if self .metric_source == "backend" :
320+ # Backend uses prompt_tokens counter (not histogram)
321+ metric_name = self .BACKEND_METRIC_MAP ["input_sequence_tokens" ]
322+ return self ._get_counter_average (metric_name , interval , model_name , "input_sequence_tokens" )
156323 return self ._get_average_metric (
157324 prometheus_names .frontend_service .INPUT_SEQUENCE_TOKENS ,
158325 interval ,
@@ -161,6 +328,10 @@ def get_avg_input_sequence_tokens(self, interval: str, model_name: str):
161328 )
162329
163330 def get_avg_output_sequence_tokens (self , interval : str , model_name : str ):
331+ if self .metric_source == "backend" :
332+ # Backend uses generation_tokens counter (not histogram)
333+ metric_name = self .BACKEND_METRIC_MAP ["output_sequence_tokens" ]
334+ return self ._get_counter_average (metric_name , interval , model_name , "output_sequence_tokens" )
164335 return self ._get_average_metric (
165336 prometheus_names .frontend_service .OUTPUT_SEQUENCE_TOKENS ,
166337 interval ,
0 commit comments