@@ -111,6 +111,21 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
111111 return err
112112 }
113113
114+ s .metrics .e2eReqLatency = prometheus .NewHistogramVec (
115+ prometheus.HistogramOpts {
116+ Subsystem : "" ,
117+ Name : "vllm:e2e_request_latency_seconds" ,
118+ Help : "Histogram of end to end request latency in seconds." ,
119+ Buckets : common .E2ERequestLatencyBucketsBoundaries ,
120+ },
121+ []string {vllmapi .PromLabelModelName },
122+ )
123+
124+ if err := s .metrics .registry .Register (s .metrics .e2eReqLatency ); err != nil {
125+ s .logger .Error (err , "Prometheus end to end request latency histogram register failed" )
126+ return err
127+ }
128+
114129 s .metrics .kvCacheUsagePercentage = prometheus .NewGaugeVec (
115130 prometheus.GaugeOpts {
116131 Subsystem : "" ,
@@ -215,6 +230,10 @@ func (s *VllmSimulator) setInitialPrometheusMetrics() {
215230 for reason , requestSuccessTotal := range s .config .FakeMetrics .RequestSuccessTotal {
216231 s .metrics .requestSuccessTotal .WithLabelValues (modelName , reason ).Add (float64 (requestSuccessTotal ))
217232 }
233+
234+ if s .config .FakeMetrics .E2ERequestLatencyBucketValues != nil { // fake end to end latency bucket values were configured
235+ s .initFakeHistogram (s .metrics .e2eReqLatency , common .E2ERequestLatencyBucketsBoundaries , s .config .FakeMetrics .E2ERequestLatencyBucketValues )
236+ }
218237 }
219238
220239 s .metrics .runningRequests .WithLabelValues (modelName ).Set (nRunningReqs )
@@ -317,25 +336,14 @@ func (s *VllmSimulator) reportWaitingRequests() {
317336 }
318337}
319338
320- // reportTTFT sets information about time to first token
321- func (s * VllmSimulator ) reportTTFT (ttftInSecs float64 ) {
322- if s .config .FakeMetrics != nil {
323- return
324- }
325- if s .metrics .ttft != nil {
326- s .metrics .ttft .WithLabelValues (
327- s .getDisplayedModelName (s .config .Model )).Observe (ttftInSecs )
328- }
329- }
330-
331- // reportTPOT sets information about time per output token
332- func (s * VllmSimulator ) reportTPOT (tpotInSecs float64 ) {
339+ // reportHistogramValue observes val in hist under the displayed model name label; it does nothing when fake metrics are configured or hist is nil
340+ func (s * VllmSimulator ) reportHistogramValue (hist * prometheus.HistogramVec , val float64 ) {
333341 if s .config .FakeMetrics != nil {
334342 return
335343 }
336- if s . metrics . tpot != nil {
337- s . metrics . tpot .WithLabelValues (
338- s .getDisplayedModelName (s .config .Model )).Observe (tpotInSecs )
344+ if hist != nil {
345+ hist .WithLabelValues (
346+ s .getDisplayedModelName (s .config .Model )).Observe (val )
339347 }
340348}
341349
@@ -359,6 +367,7 @@ func (s *VllmSimulator) startMetricsUpdaters(ctx context.Context) {
359367 go s .ttftUpdater (ctx )
360368 go s .tpotUpdater (ctx )
361369 go s .recordRequestUpdater (ctx )
370+ go s .e2eReqLatencyUpdater (ctx )
362371}
363372
364373// waitingRequestsUpdater updates the waiting requests metric by listening on the relevant channel
@@ -406,7 +415,7 @@ func (s *VllmSimulator) ttftUpdater(ctx context.Context) {
406415 case <- ctx .Done ():
407416 return
408417 case value := <- s .metrics .ttftChan :
409- s .reportTTFT ( value )
418+ s .reportHistogramValue ( s . metrics . ttft , value )
410419 }
411420 }
412421}
@@ -418,7 +427,19 @@ func (s *VllmSimulator) tpotUpdater(ctx context.Context) {
418427 case <- ctx .Done ():
419428 return
420429 case value := <- s .metrics .tpotChan :
421- s .reportTPOT (value )
430+ s .reportHistogramValue (s .metrics .tpot , value )
431+ }
432+ }
433+ }
434+
435+ // e2eReqLatencyUpdater updates the end to end request latency metric by listening on the relevant channel
436+ func (s * VllmSimulator ) e2eReqLatencyUpdater (ctx context.Context ) {
437+ for {
438+ select {
439+ case <- ctx .Done ():
440+ return
441+ case value := <- s .metrics .e2eReqLatencyChan :
442+ s .reportHistogramValue (s .metrics .e2eReqLatency , value )
422443 }
423444 }
424445}
0 commit comments