Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -143,10 +143,12 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
- `running-requests`
- `waiting-requests`
- `kv-cache-usage`
- `loras` - an array containing LoRA information objects, each with the fields: `running` (a comma-separated list of LoRAs in use by running requests), `waiting` (a comma-separated list of LoRAs to be used by waiting requests), and `timestamp` (seconds since Jan 1 1970, the timestamp of this metric).

Example:
{"running-requests":10,"waiting-requests":30,"kv-cache-usage":0.4,"loras":[{"running":"lora4,lora2","waiting":"lora3","timestamp":1257894567},{"running":"lora4,lora3","waiting":"","timestamp":1257894569}]}
- `loras` - an array containing LoRA information objects, each with the fields: `running` (a comma-separated list of LoRAs in use by running requests), `waiting` (a comma-separated list of LoRAs to be used by waiting requests), and `timestamp` (seconds since Jan 1 1970, the timestamp of this metric).
- `ttft-buckets-values` - array of values for time-to-first-token buckets; each value in this array is the value for the corresponding bucket. The array may contain fewer values than the number of buckets; all trailing missing values are assumed to be 0. Bucket upper boundaries are: 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0, 2560.0, +Inf.
- `tpot-buckets-values` - array of values for time-per-output-token buckets; each value in this array is the value for the corresponding bucket. The array may contain fewer values than the number of buckets; all trailing missing values are assumed to be 0. Bucket upper boundaries are: 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, +Inf.
<br>
Example:<br>
--fake-metrics '{"running-requests":10,"waiting-requests":30,"kv-cache-usage":0.4,"loras":[{"running":"lora4,lora2","waiting":"lora3","timestamp":1257894567},{"running":"lora4,lora3","waiting":"","timestamp":1257894569}]}'
---
- `data-parallel-size`: number of ranks to run in Data Parallel deployment, from 1 to 8, default is 1. The ports will be assigned as follows: rank 0 will run on the configured `port`, rank 1 on `port`+1, etc.
---
Expand Down
2 changes: 2 additions & 0 deletions manifests/config_with_fake.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,5 @@ fake-metrics:
loras:
- '{"running":"lora1,lora2","waiting":"lora3","timestamp":1257894567}'
- '{"running":"lora1,lora3","waiting":"","timestamp":1257894569}'
ttft-buckets-values: [10, 20, 30, 10]
tpot-buckets-values: [0, 0, 10, 20, 30]
34 changes: 34 additions & 0 deletions pkg/common/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,20 @@ type Metrics struct {
WaitingRequests int64 `yaml:"waiting-requests" json:"waiting-requests"`
// KVCacheUsagePercentage is the fraction of KV-cache blocks currently in use (from 0 to 1)
KVCacheUsagePercentage float32 `yaml:"kv-cache-usage" json:"kv-cache-usage"`
// TTFTBucketValues is an array of values for time-to-first-token buckets,
// each value in this array is a value for the corresponding bucket.
// Array may contain less values than number of buckets, all trailing missing values assumed as 0.
// Buckets upper boundaries in seconds are:
// 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
// 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0, 2560.0, +Inf
TTFTBucketValues []int `yaml:"ttft-buckets-values" json:"ttft-buckets-values"`
// TPOTBucketValues is an array of values for time-per-output-token buckets,
// each value in this array is a value for the corresponding bucket.
// Array may contain less values than number of buckets, all trailing missing values assumed as 0.
// Buckets upper boundaries in seconds are:
// 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
// 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, +Inf
TPOTBucketValues []int `yaml:"tpot-buckets-values" json:"tpot-buckets-values"`
}

type LorasMetrics struct {
Expand Down Expand Up @@ -487,6 +501,26 @@ func (c *Configuration) validate() error {
if c.FakeMetrics.KVCacheUsagePercentage < 0 || c.FakeMetrics.KVCacheUsagePercentage > 1 {
return errors.New("fake metrics KV cache usage must be between 0 ans 1")
}
// Validate the fake time-to-first-token histogram values: the array may not
// hold more entries than there are buckets (all boundaries plus the +Inf
// bucket), and every per-bucket value must be non-negative.
if c.FakeMetrics.TTFTBucketValues != nil {
	if len(c.FakeMetrics.TTFTBucketValues) > len(TTFTBucketsBoundaries)+1 {
		return errors.New("fake time-to-first-token array is too long")
	}
	// A single-variable range over a slice yields the index, which is never
	// negative; the element itself has to be inspected.
	for _, v := range c.FakeMetrics.TTFTBucketValues {
		if v < 0 {
			return errors.New("time-to-first-token fake metrics should contain only non-negative values")
		}
	}
}
// Same checks for the fake time-per-output-token histogram values.
if c.FakeMetrics.TPOTBucketValues != nil {
	if len(c.FakeMetrics.TPOTBucketValues) > len(TPOTBucketsBoundaries)+1 {
		return errors.New("fake time-per-output-token array is too long")
	}
	// Check the element, not the index (see the note on the TTFT loop).
	for _, v := range c.FakeMetrics.TPOTBucketValues {
		if v < 0 {
			return errors.New("time-per-output-token fake metrics should contain only non-negative values")
		}
	}
}
}

if c.DPSize < 1 || c.DPSize > 8 {
Expand Down
12 changes: 12 additions & 0 deletions pkg/common/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,8 @@ var _ = Describe("Simulator configuration", func() {
"{\"running\":\"lora1,lora2\",\"waiting\":\"lora3\",\"timestamp\":1257894567}",
"{\"running\":\"lora1,lora3\",\"waiting\":\"\",\"timestamp\":1257894569}",
},
TTFTBucketValues: []int{10, 20, 30, 10},
TPOTBucketValues: []int{0, 0, 10, 20, 30},
}
test = testCase{
name: "config with fake metrics file",
Expand Down Expand Up @@ -451,6 +453,16 @@ var _ = Describe("Simulator configuration", func() {
args: []string{"cmd", "--time-factor-under-load", "-1",
"--config", "../../manifests/config.yaml"},
},
{
name: "invalid ttft",
args: []string{"cmd", "--ttft-buckets-values", "[1, 2, -10, 1]",
"--config", "../../manifests/config.yaml"},
},
{
name: "invalid tpot",
args: []string{"cmd", "--tpot-buckets-values", "[1, 2, -10, 1]",
"--config", "../../manifests/config.yaml"},
},
}

for _, test := range invalidTests {
Expand Down
7 changes: 7 additions & 0 deletions pkg/common/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,13 @@ import (
"github.com/google/uuid"
)

// Bucket definitions for the time-to-first-token and time-per-output-token
// histograms. Every entry is the upper boundary of one bucket.
var (
	// TTFTBucketsBoundaries lists the upper boundaries of the
	// time-to-first-token buckets.
	TTFTBucketsBoundaries = []float64{
		0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
		0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0,
		2560.0,
	}

	// TPOTBucketsBoundaries lists the upper boundaries of the
	// time-per-output-token buckets.
	TPOTBucketsBoundaries = []float64{
		0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
		1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0,
	}
)

// ValidateContextWindow checks if the request fits within the model's context window
// Returns validation result, actual completion tokens, and total tokens
func ValidateContextWindow(promptTokens int, maxCompletionTokens *int64, maxModelLen int) (bool, int64, int64) {
Expand Down
118 changes: 116 additions & 2 deletions pkg/llm-d-inference-sim/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (

"github.com/prometheus/client_golang/prometheus"

"github.com/llm-d/llm-d-inference-sim/pkg/common"
vllmapi "github.com/llm-d/llm-d-inference-sim/pkg/vllm-api"
)

Expand Down Expand Up @@ -64,7 +65,6 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
return err
}

// not supported for now, reports constant value
s.waitingRequests = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: "",
Expand All @@ -79,7 +79,36 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
return err
}

// not supported for now, reports constant value
s.ttft = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Subsystem: "",
Name: "vllm:time_to_first_token_seconds",
Help: "Histogram of time to first token in seconds.",
Buckets: common.TTFTBucketsBoundaries,
},
[]string{vllmapi.PromLabelModelName},
)

if err := s.registry.Register(s.ttft); err != nil {
s.logger.Error(err, "Prometheus time to first token histogram register failed")
return err
}

s.tpot = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Subsystem: "",
Name: "vllm:time_per_output_token_seconds",
Help: "Histogram of time per output token in seconds.",
Buckets: common.TPOTBucketsBoundaries,
},
[]string{vllmapi.PromLabelModelName},
)

if err := s.registry.Register(s.tpot); err != nil {
s.logger.Error(err, "Prometheus time per output token histogram register failed")
return err
}

s.kvCacheUsagePercentage = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: "",
Expand Down Expand Up @@ -107,7 +136,16 @@ func (s *VllmSimulator) setInitialPrometheusMetrics() {
nRunningReqs = float64(s.config.FakeMetrics.RunningRequests)
nWaitingReqs = float64(s.config.FakeMetrics.WaitingRequests)
kvCacheUsage = float64(s.config.FakeMetrics.KVCacheUsagePercentage)

if s.config.FakeMetrics.TTFTBucketValues != nil {
s.initFakeHistogram(s.ttft, common.TTFTBucketsBoundaries, s.config.FakeMetrics.TTFTBucketValues)
}

if s.config.FakeMetrics.TPOTBucketValues != nil {
s.initFakeHistogram(s.tpot, common.TPOTBucketsBoundaries, s.config.FakeMetrics.TPOTBucketValues)
}
}

modelName := s.getDisplayedModelName(s.config.Model)
s.runningRequests.WithLabelValues(modelName).Set(nRunningReqs)
s.waitingRequests.WithLabelValues(modelName).Set(nWaitingReqs)
Expand All @@ -128,6 +166,34 @@ func (s *VllmSimulator) setInitialPrometheusMetrics() {
}
}

// initFakeHistogram populates hist with synthetic observations so that each
// bucket ends up holding the requested number of samples.
// bucketsBoundaries - upper boundaries of every bucket except the last one, so
// the histogram has len(bucketsBoundaries)+1 buckets; the final bucket is
// (last_boundary, +Inf].
// bucketsSamplesCount - number of samples per bucket, in bucket order starting
// from the first bucket. Trailing empty buckets may be left out, so its length
// can be <= len(bucketsBoundaries)+1.
func (s *VllmSimulator) initFakeHistogram(hist *prometheus.HistogramVec, bucketsBoundaries []float64, bucketsSamplesCount []int) {
	boundaryCount := len(bucketsBoundaries)
	label := s.getDisplayedModelName(s.config.Model)

	for bucketIdx, samples := range bucketsSamplesCount {
		// Choose a value to pass to Observe for this bucket.
		var sample float64
		if bucketIdx >= boundaryCount {
			// No boundary exists at this index: use a number above the last
			// finite upper boundary.
			sample = bucketsBoundaries[boundaryCount-1] + 1
		} else {
			// The bucket's own upper boundary, which is included in the bucket.
			sample = bucketsBoundaries[bucketIdx]
		}

		// Emit one observation per requested sample.
		for n := 0; n < samples; n++ {
			hist.WithLabelValues(label).Observe(sample)
		}
	}
}

// reportLoras sets information about loaded LoRA adapters
func (s *VllmSimulator) reportLoras() {
if s.config.FakeMetrics != nil {
Expand Down Expand Up @@ -181,6 +247,28 @@ func (s *VllmSimulator) reportWaitingRequests() {
}
}

// reportTTFT records a single time-to-first-token observation, given in
// seconds, in the ttft histogram under the displayed model name.
// Nothing is recorded when fake metrics are configured or when the histogram
// has not been created.
func (s *VllmSimulator) reportTTFT(ttftInSecs float64) {
	if s.config.FakeMetrics != nil || s.ttft == nil {
		return
	}
	modelName := s.getDisplayedModelName(s.config.Model)
	s.ttft.WithLabelValues(modelName).Observe(ttftInSecs)
}

// reportTPOT records a single time-per-output-token observation, given in
// seconds, in the tpot histogram under the displayed model name.
// Nothing is recorded when fake metrics are configured or when the histogram
// has not been created.
func (s *VllmSimulator) reportTPOT(tpotInSecs float64) {
	if s.config.FakeMetrics != nil || s.tpot == nil {
		return
	}
	modelName := s.getDisplayedModelName(s.config.Model)
	s.tpot.WithLabelValues(modelName).Observe(tpotInSecs)
}

// reportKVCacheUsage sets information about kv cache usage
func (s *VllmSimulator) reportKVCacheUsage(value float64) {
if s.config.FakeMetrics != nil {
Expand All @@ -198,6 +286,8 @@ func (s *VllmSimulator) startMetricsUpdaters(ctx context.Context) {
go s.runningRequestsUpdater(ctx)
go s.lorasUpdater(ctx)
go s.kvCacheUsageUpdater(ctx)
go s.ttftUpdater(ctx)
go s.tpotUpdater(ctx)
}

// waitingRequestsUpdater updates the waiting requests metric by listening on the relevant channel
Expand Down Expand Up @@ -238,6 +328,30 @@ func (s *VllmSimulator) kvCacheUsageUpdater(ctx context.Context) {
}
}

// ttftUpdater forwards every value received on ttftChan to reportTTFT and
// returns once ctx is done.
func (s *VllmSimulator) ttftUpdater(ctx context.Context) {
	done := ctx.Done()
	for {
		select {
		case ttft := <-s.ttftChan:
			s.reportTTFT(ttft)
		case <-done:
			return
		}
	}
}

// tpotUpdater forwards every value received on tpotChan to reportTPOT and
// returns once ctx is done.
func (s *VllmSimulator) tpotUpdater(ctx context.Context) {
	done := ctx.Done()
	for {
		select {
		case tpot := <-s.tpotChan:
			s.reportTPOT(tpot)
		case <-done:
			return
		}
	}
}

// lorasUpdater updates the running loras metric by listening on the relevant channel
// one function updates both waiting and running loras since they are part of the same prometheus gauge
func (s *VllmSimulator) lorasUpdater(ctx context.Context) {
Expand Down
Loading
Loading