Commit 85e9525

googs1025 committed
feat(metrics): add request prompt, generation, max_tokens and success metrics
Signed-off-by: googs1025 <[email protected]>
1 parent 699452c commit 85e9525

File tree

pkg/llm-d-inference-sim/metrics.go
pkg/llm-d-inference-sim/simulator.go
pkg/vllm-api/vllm-models.go

3 files changed: +80 -1 lines changed

pkg/llm-d-inference-sim/metrics.go

Lines changed: 67 additions & 0 deletions
@@ -94,6 +94,61 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 		return err
 	}
 
+	s.requestPromptTokens = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Subsystem: "",
+			Name:      "vllm:request_prompt_tokens",
+			Help:      "Number of input prompt tokens in the request.",
+			Buckets:   []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000},
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+	if err := s.registry.Register(s.requestPromptTokens); err != nil {
+		s.logger.Error(err, "Prometheus request_prompt_tokens histogram register failed")
+		return err
+	}
+
+	s.requestGenerationTokens = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Subsystem: "",
+			Name:      "vllm:request_generation_tokens",
+			Help:      "Number of generated tokens so far in the request.",
+			Buckets:   []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000},
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+	if err := s.registry.Register(s.requestGenerationTokens); err != nil {
+		s.logger.Error(err, "Prometheus request_generation_tokens histogram register failed")
+		return err
+	}
+
+	s.requestParamsMaxTokens = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Subsystem: "",
+			Name:      "vllm:request_params_max_tokens",
+			Help:      "The 'max_tokens' parameter from the request.",
+			Buckets:   []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000},
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+	if err := s.registry.Register(s.requestParamsMaxTokens); err != nil {
+		s.logger.Error(err, "Prometheus request_params_max_tokens histogram register failed")
+		return err
+	}
+
+	s.requestSuccessTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: "",
+			Name:      "vllm:request_success_total",
+			Help:      "Total number of successful inference requests.",
+		},
+		[]string{vllmapi.PromLabelModelName, vllmapi.PromLabelFinishReason},
+	)
+	if err := s.registry.Register(s.requestSuccessTotal); err != nil {
+		s.logger.Error(err, "Prometheus request_success_total counter register failed")
+		return err
+	}
+
 	s.setInitialPrometheusMetrics()
 
 	return nil
@@ -282,3 +337,15 @@ func (s *VllmSimulator) decrementLoraRefCount(lora string, theMap *sync.Map) {
 		s.logger.Error(nil, "Zero model reference", "model", lora)
 	}
 }
+
+// recordRequestMetricsOnSuccess records metrics for a successfully completed request
+func (s *VllmSimulator) recordRequestMetricsOnSuccess(promptTokens,
+	generationTokens int, maxTokens *int64, finishReason string) {
+	modelName := s.getDisplayedModelName(s.config.Model)
+	s.requestPromptTokens.WithLabelValues(modelName).Observe(float64(promptTokens))
+	s.requestGenerationTokens.WithLabelValues(modelName).Observe(float64(generationTokens))
+	if maxTokens != nil {
+		s.requestParamsMaxTokens.WithLabelValues(modelName).Observe(float64(*maxTokens))
+	}
+	s.requestSuccessTotal.WithLabelValues(modelName, finishReason).Inc()
+}

pkg/llm-d-inference-sim/simulator.go

Lines changed: 12 additions & 1 deletion
@@ -108,6 +108,14 @@ type VllmSimulator struct {
 	waitingRequests *prometheus.GaugeVec
 	// kvCacheUsagePercentage is prometheus gauge
 	kvCacheUsagePercentage *prometheus.GaugeVec
+	// requestPromptTokens is prometheus histogram for number of input (prompt) tokens in request
+	requestPromptTokens *prometheus.HistogramVec
+	// requestGenerationTokens is prometheus histogram for number of generated tokens in request
+	requestGenerationTokens *prometheus.HistogramVec
+	// requestParamsMaxTokens is prometheus histogram for 'max_tokens' parameter in request
+	requestParamsMaxTokens *prometheus.HistogramVec
+	// requestSuccessTotal is prometheus counter for total number of successful requests
+	requestSuccessTotal *prometheus.CounterVec
 	// channel for requeasts to be passed to workers
 	reqChan chan *openaiserverapi.CompletionReqCtx
 	// schema validator for tools parameters
@@ -597,9 +605,12 @@ func (s *VllmSimulator) reqProcessingWorker(ctx context.Context, id int) {
 				// in case this is prefill pod processing, return special finish reason
 				finishReason = common.RemoteDecodeFinishReason
 			}
-
 			s.sendResponse(reqCtx, responseTokens, toolCalls, displayModel, finishReason, &usageData)
 		}
+
+		maxTokens, _ := common.GetMaxTokens(reqCtx.CompletionReq.GetMaxCompletionTokens(), nil)
+		s.recordRequestMetricsOnSuccess(usageData.PromptTokens, usageData.CompletionTokens,
+			maxTokens, finishReason)
 	}
 	reqCtx.Wg.Done()
 }
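Note that recordRequestMetricsOnSuccess only observes vllm:request_params_max_tokens when maxTokens is non-nil, so the call site above relies on common.GetMaxTokens returning a nil pointer when the request does not set the parameter. Below is a tiny standalone sketch of that assumed contract; getMaxTokens is a hypothetical stand-in, not the simulator's helper.

package main

import "fmt"

// getMaxTokens is a hypothetical stand-in for common.GetMaxTokens: it returns
// a nil pointer when the request did not set max_tokens, so the caller can
// skip the vllm:request_params_max_tokens observation entirely.
func getMaxTokens(maxCompletionTokens *int64) (*int64, error) {
	if maxCompletionTokens == nil {
		return nil, nil // parameter absent: nothing to observe
	}
	v := *maxCompletionTokens
	return &v, nil
}

func main() {
	// Request without max_tokens: the histogram would receive no sample.
	if mt, _ := getMaxTokens(nil); mt == nil {
		fmt.Println("max_tokens unset, skipping observation")
	}

	// Request with max_tokens=256: the histogram would observe 256.
	limit := int64(256)
	if mt, _ := getMaxTokens(&limit); mt != nil {
		fmt.Printf("observing max_tokens=%d\n", *mt)
	}
}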

pkg/vllm-api/vllm-models.go

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@ const (
 	PromLabelRunningLoraAdapters = "running_lora_adapters"
 	PromLabelMaxLora             = "max_lora"
 	PromLabelModelName           = "model_name"
+	PromLabelFinishReason        = "finish_reason"
 
 	VllmLoraRequestInfo    = "vllm:lora_requests_info"
 	VllmNumRequestsRunning = "vllm:num_requests_running"
