Skip to content

Commit 202efc4

Browse files
authored
Add vllm:time_per_output_token_seconds and vllm:time_to_first_token_seconds metrics (#217)
* Add vllm:time_per_output_token_seconds and vllm:time_to_first_token_seconds histogram metrics, including support in fake metrics, and update of readme Signed-off-by: Maya Barnea <[email protected]> * Add test for ttft kae metrics command line parameter with value for the last bucket Signed-off-by: Maya Barnea <[email protected]> * move calculating model name from a loop Signed-off-by: Maya Barnea <[email protected]> * Changes according the PR review Signed-off-by: Maya Barnea <[email protected]> * according review comments Signed-off-by: Maya Barnea <[email protected]> --------- Signed-off-by: Maya Barnea <[email protected]>
1 parent 1adeeb3 commit 202efc4

File tree

9 files changed

+410
-8
lines changed

9 files changed

+410
-8
lines changed

README.md

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -143,10 +143,12 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
143143
- `running-requests`
144144
- `waiting-requests`
145145
- `kv-cache-usage`
146-
- `loras` - an array containing LoRA information objects, each with the fields: `running` (a comma-separated list of LoRAs in use by running requests), `waiting` (a comma-separated list of LoRAs to be used by waiting requests), and `timestamp` (seconds since Jan 1 1970, the timestamp of this metric).
147-
148-
Example:
149-
{"running-requests":10,"waiting-requests":30,"kv-cache-usage":0.4,"loras":[{"running":"lora4,lora2","waiting":"lora3","timestamp":1257894567},{"running":"lora4,lora3","waiting":"","timestamp":1257894569}]}
146+
- `loras` - an array containing LoRA information objects, each with the fields: `running` (a comma-separated list of LoRAs in use by running requests), `waiting` (a comma-separated list of LoRAs to be used by waiting requests), and `timestamp` (seconds since Jan 1 1970, the timestamp of this metric).
147+
- `ttft-buckets-values` - an array of values for the time-to-first-token buckets; each value in this array is the value for the corresponding bucket. The array may contain fewer values than the number of buckets; all trailing missing values are assumed to be 0. The buckets' upper boundaries are: 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0, 2560.0, +Inf.
148+
- `tpot-buckets-values` - an array of values for the time-per-output-token buckets; each value in this array is the value for the corresponding bucket. The array may contain fewer values than the number of buckets; all trailing missing values are assumed to be 0. The buckets' upper boundaries are: 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, +Inf.
149+
<br>
150+
Example:<br>
151+
--fake-metrics '{"running-requests":10,"waiting-requests":30,"kv-cache-usage":0.4,"loras":[{"running":"lora4,lora2","waiting":"lora3","timestamp":1257894567},{"running":"lora4,lora3","waiting":"","timestamp":1257894569}]}'
150152
---
151153
- `data-parallel-size`: number of ranks to run in Data Parallel deployment, from 1 to 8, default is 1. The ports will be assigned as follows: rank 0 will run on the configured `port`, rank 1 on `port`+1, etc.
152154
---

manifests/config_with_fake.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,5 @@ fake-metrics:
1414
loras:
1515
- '{"running":"lora1,lora2","waiting":"lora3","timestamp":1257894567}'
1616
- '{"running":"lora1,lora3","waiting":"","timestamp":1257894569}'
17+
ttft-buckets-values: [10, 20, 30, 10]
18+
tpot-buckets-values: [0, 0, 10, 20, 30]

pkg/common/config.go

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,20 @@ type Metrics struct {
209209
WaitingRequests int64 `yaml:"waiting-requests" json:"waiting-requests"`
210210
// KVCacheUsagePercentage is the fraction of KV-cache blocks currently in use (from 0 to 1)
211211
KVCacheUsagePercentage float32 `yaml:"kv-cache-usage" json:"kv-cache-usage"`
212+
// TTFTBucketValues is an array of values for time-to-first-token buckets,
213+
// each value in this array is a value for the corresponding bucket.
214+
// The array may contain fewer values than the number of buckets; all trailing missing values are assumed to be 0.
215+
// Buckets upper boundaries in seconds are:
216+
// 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
217+
// 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0, 2560.0, +Inf
218+
TTFTBucketValues []int `yaml:"ttft-buckets-values" json:"ttft-buckets-values"`
219+
// TPOTBucketValues is an array of values for time-per-output-token buckets,
220+
// each value in this array is a value for the corresponding bucket.
221+
// The array may contain fewer values than the number of buckets; all trailing missing values are assumed to be 0.
222+
// Buckets upper boundaries in seconds are:
223+
// 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
224+
// 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, +Inf
225+
TPOTBucketValues []int `yaml:"tpot-buckets-values" json:"tpot-buckets-values"`
212226
}
213227

214228
type LorasMetrics struct {
@@ -487,6 +501,26 @@ func (c *Configuration) validate() error {
487501
if c.FakeMetrics.KVCacheUsagePercentage < 0 || c.FakeMetrics.KVCacheUsagePercentage > 1 {
488502
return errors.New("fake metrics KV cache usage must be between 0 ans 1")
489503
}
504+
// Validate the fake time-to-first-token histogram values: there may be at most
// one value per bucket (len(boundaries) finite buckets plus the +Inf bucket),
// and every value must be non-negative.
if c.FakeMetrics.TTFTBucketValues != nil {
	if len(c.FakeMetrics.TTFTBucketValues) > len(TTFTBucketsBoundaries)+1 {
		return errors.New("fake time-to-first-token array is too long")
	}
	// A single-variable range yields the index, which is never negative;
	// the element must be taken from the second range variable.
	for _, v := range c.FakeMetrics.TTFTBucketValues {
		if v < 0 {
			return errors.New("time-to-first-token fake metrics should contain only non-negative values")
		}
	}
}
// Same checks for the fake time-per-output-token histogram values.
if c.FakeMetrics.TPOTBucketValues != nil {
	if len(c.FakeMetrics.TPOTBucketValues) > len(TPOTBucketsBoundaries)+1 {
		return errors.New("fake time-per-output-token array is too long")
	}
	// Iterate over the elements, not the indices (see the note above the TTFT loop).
	for _, v := range c.FakeMetrics.TPOTBucketValues {
		if v < 0 {
			return errors.New("time-per-output-token fake metrics should contain only non-negative values")
		}
	}
}
490524
}
491525

492526
if c.DPSize < 1 || c.DPSize > 8 {

pkg/common/config_test.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,8 @@ var _ = Describe("Simulator configuration", func() {
203203
"{\"running\":\"lora1,lora2\",\"waiting\":\"lora3\",\"timestamp\":1257894567}",
204204
"{\"running\":\"lora1,lora3\",\"waiting\":\"\",\"timestamp\":1257894569}",
205205
},
206+
TTFTBucketValues: []int{10, 20, 30, 10},
207+
TPOTBucketValues: []int{0, 0, 10, 20, 30},
206208
}
207209
test = testCase{
208210
name: "config with fake metrics file",
@@ -451,6 +453,16 @@ var _ = Describe("Simulator configuration", func() {
451453
args: []string{"cmd", "--time-factor-under-load", "-1",
452454
"--config", "../../manifests/config.yaml"},
453455
},
456+
{
457+
name: "invalid ttft",
458+
args: []string{"cmd", "--ttft-buckets-values", "[1, 2, -10, 1]",
459+
"--config", "../../manifests/config.yaml"},
460+
},
461+
{
462+
name: "invalid tpot",
463+
args: []string{"cmd", "--tpot-buckets-values", "[1, 2, -10, 1]",
464+
"--config", "../../manifests/config.yaml"},
465+
},
454466
}
455467

456468
for _, test := range invalidTests {

pkg/common/utils.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,13 @@ import (
2424
"github.com/google/uuid"
2525
)
2626

27+
// Histogram bucket definitions for the time-to-first-token and
// time-per-output-token metrics. Every value is the upper boundary of a bucket.
var (
	// TTFTBucketsBoundaries holds the upper boundaries of the time-to-first-token buckets.
	TTFTBucketsBoundaries = []float64{
		0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
		0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0,
		2560.0,
	}

	// TPOTBucketsBoundaries holds the upper boundaries of the time-per-output-token buckets.
	TPOTBucketsBoundaries = []float64{
		0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
		1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0,
	}
)
33+
2734
// ValidateContextWindow checks if the request fits within the model's context window
2835
// Returns validation result, actual completion tokens, and total tokens
2936
func ValidateContextWindow(promptTokens int, maxCompletionTokens *int64, maxModelLen int) (bool, int64, int64) {

pkg/llm-d-inference-sim/metrics.go

Lines changed: 116 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import (
2727

2828
"github.com/prometheus/client_golang/prometheus"
2929

30+
"github.com/llm-d/llm-d-inference-sim/pkg/common"
3031
vllmapi "github.com/llm-d/llm-d-inference-sim/pkg/vllm-api"
3132
)
3233

@@ -64,7 +65,6 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
6465
return err
6566
}
6667

67-
// not supported for now, reports constant value
6868
s.waitingRequests = prometheus.NewGaugeVec(
6969
prometheus.GaugeOpts{
7070
Subsystem: "",
@@ -79,7 +79,36 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
7979
return err
8080
}
8181

82-
// not supported for now, reports constant value
82+
s.ttft = prometheus.NewHistogramVec(
83+
prometheus.HistogramOpts{
84+
Subsystem: "",
85+
Name: "vllm:time_to_first_token_seconds",
86+
Help: "Histogram of time to first token in seconds.",
87+
Buckets: common.TTFTBucketsBoundaries,
88+
},
89+
[]string{vllmapi.PromLabelModelName},
90+
)
91+
92+
if err := s.registry.Register(s.ttft); err != nil {
93+
s.logger.Error(err, "Prometheus time to first token histogram register failed")
94+
return err
95+
}
96+
97+
s.tpot = prometheus.NewHistogramVec(
98+
prometheus.HistogramOpts{
99+
Subsystem: "",
100+
Name: "vllm:time_per_output_token_seconds",
101+
Help: "Histogram of time per output token in seconds.",
102+
Buckets: common.TPOTBucketsBoundaries,
103+
},
104+
[]string{vllmapi.PromLabelModelName},
105+
)
106+
107+
if err := s.registry.Register(s.tpot); err != nil {
108+
s.logger.Error(err, "Prometheus time per output token histogram register failed")
109+
return err
110+
}
111+
83112
s.kvCacheUsagePercentage = prometheus.NewGaugeVec(
84113
prometheus.GaugeOpts{
85114
Subsystem: "",
@@ -107,7 +136,16 @@ func (s *VllmSimulator) setInitialPrometheusMetrics() {
107136
nRunningReqs = float64(s.config.FakeMetrics.RunningRequests)
108137
nWaitingReqs = float64(s.config.FakeMetrics.WaitingRequests)
109138
kvCacheUsage = float64(s.config.FakeMetrics.KVCacheUsagePercentage)
139+
140+
if s.config.FakeMetrics.TTFTBucketValues != nil {
141+
s.initFakeHistogram(s.ttft, common.TTFTBucketsBoundaries, s.config.FakeMetrics.TTFTBucketValues)
142+
}
143+
144+
if s.config.FakeMetrics.TPOTBucketValues != nil {
145+
s.initFakeHistogram(s.tpot, common.TPOTBucketsBoundaries, s.config.FakeMetrics.TPOTBucketValues)
146+
}
110147
}
148+
111149
modelName := s.getDisplayedModelName(s.config.Model)
112150
s.runningRequests.WithLabelValues(modelName).Set(nRunningReqs)
113151
s.waitingRequests.WithLabelValues(modelName).Set(nWaitingReqs)
@@ -128,6 +166,34 @@ func (s *VllmSimulator) setInitialPrometheusMetrics() {
128166
}
129167
}
130168

169+
// initFakeHistogram initializes the given histogram values based on the input
170+
// bucketsBoundaries - upper boudaries of all buckets except the last one. Actual number of buckets is len(bucketsBoundaries)+1.
171+
// This includes the last bucket (last_boundary, +Inf].
172+
// bucketsSamplesCount - array containing number of samples per bucket, starting from the first bucket.
173+
// Trailing empty buckets are not included in this array, so its length can be <= len(bucketsBoundaries)+1
174+
func (s *VllmSimulator) initFakeHistogram(hist *prometheus.HistogramVec, bucketsBoundaries []float64, bucketsSamplesCount []int) {
175+
var valueToObserve float64
176+
numOfBoundaries := len(bucketsBoundaries)
177+
modelName := s.getDisplayedModelName(s.config.Model)
178+
179+
for i, bucketSamplesCount := range bucketsSamplesCount {
180+
// for each bucket calculate value to use for Observe function
181+
// for all buckets except the last one it will be the upper boundary (which is included in the bucket)
182+
// for the last bucket it will be top boundary of the previous bucket + 1
183+
if i < numOfBoundaries {
184+
valueToObserve = bucketsBoundaries[i]
185+
} else {
186+
// this is last bucket - use number larger than the upper bound of the previous bucket
187+
valueToObserve = bucketsBoundaries[numOfBoundaries-1] + 1
188+
}
189+
190+
for range bucketSamplesCount {
191+
// create required number of observations for the calculated sample
192+
hist.WithLabelValues(modelName).Observe(valueToObserve)
193+
}
194+
}
195+
}
196+
131197
// reportLoras sets information about loaded LoRA adapters
132198
func (s *VllmSimulator) reportLoras() {
133199
if s.config.FakeMetrics != nil {
@@ -181,6 +247,28 @@ func (s *VllmSimulator) reportWaitingRequests() {
181247
}
182248
}
183249

250+
// reportTTFT sets information about time to first token
251+
func (s *VllmSimulator) reportTTFT(ttftInSecs float64) {
252+
if s.config.FakeMetrics != nil {
253+
return
254+
}
255+
if s.ttft != nil {
256+
s.ttft.WithLabelValues(
257+
s.getDisplayedModelName(s.config.Model)).Observe(ttftInSecs)
258+
}
259+
}
260+
261+
// reportTPOT sets information about time per output token
262+
func (s *VllmSimulator) reportTPOT(tpotInSecs float64) {
263+
if s.config.FakeMetrics != nil {
264+
return
265+
}
266+
if s.tpot != nil {
267+
s.tpot.WithLabelValues(
268+
s.getDisplayedModelName(s.config.Model)).Observe(tpotInSecs)
269+
}
270+
}
271+
184272
// reportKVCacheUsage sets information about kv cache usage
185273
func (s *VllmSimulator) reportKVCacheUsage(value float64) {
186274
if s.config.FakeMetrics != nil {
@@ -198,6 +286,8 @@ func (s *VllmSimulator) startMetricsUpdaters(ctx context.Context) {
198286
go s.runningRequestsUpdater(ctx)
199287
go s.lorasUpdater(ctx)
200288
go s.kvCacheUsageUpdater(ctx)
289+
go s.ttftUpdater(ctx)
290+
go s.tpotUpdater(ctx)
201291
}
202292

203293
// waitingRequestsUpdater updates the waiting requests metric by listening on the relevant channel
@@ -238,6 +328,30 @@ func (s *VllmSimulator) kvCacheUsageUpdater(ctx context.Context) {
238328
}
239329
}
240330

331+
// ttftUpdater updates the time to first token metric by listening on the relevant channel
332+
func (s *VllmSimulator) ttftUpdater(ctx context.Context) {
333+
for {
334+
select {
335+
case <-ctx.Done():
336+
return
337+
case value := <-s.ttftChan:
338+
s.reportTTFT(value)
339+
}
340+
}
341+
}
342+
343+
// tpotUpdater updates the time per output token metric by listening on the relevant channel
344+
func (s *VllmSimulator) tpotUpdater(ctx context.Context) {
345+
for {
346+
select {
347+
case <-ctx.Done():
348+
return
349+
case value := <-s.tpotChan:
350+
s.reportTPOT(value)
351+
}
352+
}
353+
}
354+
241355
// lorasUpdater updates the running loras metric by listening on the relevant channel
242356
// one function updates both waiting and running loras since they are part of the same prometheus gauge
243357
func (s *VllmSimulator) lorasUpdater(ctx context.Context) {

0 commit comments

Comments
 (0)