Skip to content

Commit f578dc0

Browse files
committed
Add e2e request latency histogram to prometheus metrics.
Add a reportHistogramValue function to be used for reporting values in histogram metrics.

Signed-off-by: Maya Barnea <[email protected]>
1 parent 7e913bc commit f578dc0

File tree

4 files changed

+59
-18
lines changed

4 files changed

+59
-18
lines changed

pkg/common/config.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,13 @@ type Metrics struct {
253253
RequestParamsMaxTokens []int `yaml:"request-params-max-tokens" json:"request-params-max-tokens"` // max_tokens parameter samples
254254
// RequestSuccessTotal is the number of successful requests, key: finish-reason (stop, length, etc.).
255255
RequestSuccessTotal map[string]int64 `yaml:"request-success-total" json:"request-success-total"`
256+
// E2ERequestLatencyBucketValues is an array of values for e2e request latency buckets,
257+
// each value in this array is a value for the corresponding bucket.
258+
// The array may contain fewer values than the number of buckets; all trailing missing values are assumed to be 0.
259+
// Buckets upper boundaries in seconds are:
260+
// 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
261+
// 20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0, +Inf
262+
E2ERequestLatencyBucketValues []int `yaml:"e2erl-buckets-values" json:"e2erl-buckets-values"`
256263
}
257264

258265
type LorasMetrics struct {

pkg/common/utils.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ var TTFTBucketsBoundaries = []float64{0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08
3232
var TPOTBucketsBoundaries = []float64{0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
3333
1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0}
3434

35+
// E2ERequestLatencyBucketsBoundaries holds the upper boundaries, in seconds, of the
// buckets of the end-to-end request latency histogram (vllm:e2e_request_latency_seconds).
// Prometheus implicitly appends a final +Inf bucket after the last boundary.
var E2ERequestLatencyBucketsBoundaries = []float64{0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
	20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0}
37+
3538
// ValidateContextWindow checks if the request fits within the model's context window
3639
// Returns validation result, actual completion tokens, and total tokens
3740
func ValidateContextWindow(promptTokens int, maxCompletionTokens *int64, maxModelLen int) (bool, int64, int64) {

pkg/llm-d-inference-sim/metrics.go

Lines changed: 39 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,21 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
111111
return err
112112
}
113113

114+
s.metrics.e2eReqLatency = prometheus.NewHistogramVec(
115+
prometheus.HistogramOpts{
116+
Subsystem: "",
117+
Name: "vllm:e2e_request_latency_seconds",
118+
Help: "Histogram of end to end request latency in seconds.",
119+
Buckets: common.E2ERequestLatencyBucketsBoundaries,
120+
},
121+
[]string{vllmapi.PromLabelModelName},
122+
)
123+
124+
if err := s.metrics.registry.Register(s.metrics.e2eReqLatency); err != nil {
125+
s.logger.Error(err, "Prometheus end to end request latency histogram register failed")
126+
return err
127+
}
128+
114129
s.metrics.kvCacheUsagePercentage = prometheus.NewGaugeVec(
115130
prometheus.GaugeOpts{
116131
Subsystem: "",
@@ -215,6 +230,10 @@ func (s *VllmSimulator) setInitialPrometheusMetrics() {
215230
for reason, requestSuccessTotal := range s.config.FakeMetrics.RequestSuccessTotal {
216231
s.metrics.requestSuccessTotal.WithLabelValues(modelName, reason).Add(float64(requestSuccessTotal))
217232
}
233+
234+
// Seed the e2e request latency histogram from the configured fake bucket values.
// BUG FIX: the original passed s.metrics.tpot here (copy-paste from the TPOT
// initialization), which would load the e2e-latency fake values into the wrong
// histogram and leave e2eReqLatency empty. The target must be s.metrics.e2eReqLatency.
if s.config.FakeMetrics.E2ERequestLatencyBucketValues != nil {
	s.initFakeHistogram(s.metrics.e2eReqLatency, common.E2ERequestLatencyBucketsBoundaries, s.config.FakeMetrics.E2ERequestLatencyBucketValues)
}
218237
}
219238

220239
s.metrics.runningRequests.WithLabelValues(modelName).Set(nRunningReqs)
@@ -317,25 +336,14 @@ func (s *VllmSimulator) reportWaitingRequests() {
317336
}
318337
}
319338

320-
// reportTTFT sets information about time to first token
321-
func (s *VllmSimulator) reportTTFT(ttftInSecs float64) {
322-
if s.config.FakeMetrics != nil {
323-
return
324-
}
325-
if s.metrics.ttft != nil {
326-
s.metrics.ttft.WithLabelValues(
327-
s.getDisplayedModelName(s.config.Model)).Observe(ttftInSecs)
328-
}
329-
}
330-
331-
// reportTPOT sets information about time per output token
332-
func (s *VllmSimulator) reportTPOT(tpotInSecs float64) {
339+
// reportHistogramValue sets the given value in the given histogram
340+
func (s *VllmSimulator) reportHistogramValue(hist *prometheus.HistogramVec, val float64) {
333341
if s.config.FakeMetrics != nil {
334342
return
335343
}
336-
if s.metrics.tpot != nil {
337-
s.metrics.tpot.WithLabelValues(
338-
s.getDisplayedModelName(s.config.Model)).Observe(tpotInSecs)
344+
if hist != nil {
345+
hist.WithLabelValues(
346+
s.getDisplayedModelName(s.config.Model)).Observe(val)
339347
}
340348
}
341349

@@ -359,6 +367,7 @@ func (s *VllmSimulator) startMetricsUpdaters(ctx context.Context) {
359367
go s.ttftUpdater(ctx)
360368
go s.tpotUpdater(ctx)
361369
go s.recordRequestUpdater(ctx)
370+
go s.e2eReqLatencyUpdater(ctx)
362371
}
363372

364373
// waitingRequestsUpdater updates the waiting requests metric by listening on the relevant channel
@@ -406,7 +415,7 @@ func (s *VllmSimulator) ttftUpdater(ctx context.Context) {
406415
case <-ctx.Done():
407416
return
408417
case value := <-s.metrics.ttftChan:
409-
s.reportTTFT(value)
418+
s.reportHistogramValue(s.metrics.ttft, value)
410419
}
411420
}
412421
}
@@ -418,7 +427,19 @@ func (s *VllmSimulator) tpotUpdater(ctx context.Context) {
418427
case <-ctx.Done():
419428
return
420429
case value := <-s.metrics.tpotChan:
421-
s.reportTPOT(value)
430+
s.reportHistogramValue(s.metrics.tpot, value)
431+
}
432+
}
433+
}
434+
435+
// tpotUpdater updates the time per output token metric by listening on the relevant channel
436+
func (s *VllmSimulator) e2eReqLatencyUpdater(ctx context.Context) {
437+
for {
438+
select {
439+
case <-ctx.Done():
440+
return
441+
case value := <-s.metrics.e2eReqLatencyChan:
442+
s.reportHistogramValue(s.metrics.e2eReqLatency, value)
422443
}
423444
}
424445
}

pkg/llm-d-inference-sim/simulator.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@ type metricsData struct {
9393
ttftChan chan float64
9494
// tpotChan is a channel to update time per output token
9595
tpotChan chan float64
96+
// e2eReqLatencyChan is a channel to update request e2e latency
97+
e2eReqLatencyChan chan float64
9698
// kvCacheUsageChan is a channel to update kvCacheUsagePercentage
9799
kvCacheUsageChan chan float64
98100
// registry is a Prometheus registry
@@ -107,6 +109,8 @@ type metricsData struct {
107109
ttft *prometheus.HistogramVec
108110
// tpot is prometheus histogram for time per output token in seconds
109111
tpot *prometheus.HistogramVec
112+
// e2eReqLatency is prometheus histogram of end to end request latency in seconds
113+
e2eReqLatency *prometheus.HistogramVec
110114
// kvCacheUsagePercentage is prometheus gauge
111115
kvCacheUsagePercentage *prometheus.GaugeVec
112116
// requestPromptTokens is prometheus histogram for number of input (prompt) tokens in request
@@ -271,6 +275,7 @@ func (s *VllmSimulator) initializeSim(ctx context.Context) error {
271275
s.metrics.kvCacheUsageChan = make(chan float64, maxNumberOfRequests)
272276
s.metrics.ttftChan = make(chan float64, maxNumberOfRequests)
273277
s.metrics.tpotChan = make(chan float64, maxNumberOfRequests)
278+
s.metrics.e2eReqLatencyChan = make(chan float64, maxNumberOfRequests)
274279
s.metrics.requestSuccessChan = make(chan requestSuccessEvent, maxNumberOfRequests)
275280

276281
s.newRequests = make(chan *openaiserverapi.CompletionReqCtx, maxNumberOfRequests)
@@ -460,6 +465,11 @@ func (s *VllmSimulator) addRequestToQueue(reqCtx *openaiserverapi.CompletionReqC
460465

461466
// handleCompletions general completion requests handler, support both text and chat completion APIs
462467
func (s *VllmSimulator) handleCompletions(ctx *fasthttp.RequestCtx, isChatCompletion bool) {
468+
startTime := time.Now()
469+
defer func() {
470+
s.metrics.e2eReqLatencyChan <- time.Since(startTime).Seconds()
471+
}()
472+
463473
// Check if we should inject a failure
464474
if shouldInjectFailure(s.config) {
465475
failure := getRandomFailure(s.config)

0 commit comments

Comments
 (0)