10 changes: 6 additions & 4 deletions README.md
@@ -143,10 +143,12 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
- `running-requests`
- `waiting-requests`
- `kv-cache-usage`
- `loras` - an array containing LoRA information objects, each with the fields: `running` (a comma-separated list of LoRAs in use by running requests), `waiting` (a comma-separated list of LoRAs to be used by waiting requests), and `timestamp` (seconds since Jan 1 1970, the timestamp of this metric).

Example:
{"running-requests":10,"waiting-requests":30,"kv-cache-usage":0.4,"loras":[{"running":"lora4,lora2","waiting":"lora3","timestamp":1257894567},{"running":"lora4,lora3","waiting":"","timestamp":1257894569}]}
- `loras` - an array containing LoRA information objects, each with the fields: `running` (a comma-separated list of LoRAs in use by running requests), `waiting` (a comma-separated list of LoRAs to be used by waiting requests), and `timestamp` (seconds since Jan 1 1970, the timestamp of this metric).
- `ttft-buckets-values` - an array of values for the time-to-first-token histogram buckets; each value is the number of observations in the corresponding bucket. The array may contain fewer values than the number of buckets; missing values are assumed to be 0. The buckets' upper boundaries are: 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0, 2560.0, +Inf.
- `tpot-buckets-values` - an array of values for the time-per-output-token histogram buckets; each value is the number of observations in the corresponding bucket. The array may contain fewer values than the number of buckets; missing values are assumed to be 0. The buckets' upper boundaries are: 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, +Inf.
<br>
Example:<br>
--fake-metrics '{"running-requests":10,"waiting-requests":30,"kv-cache-usage":0.4,"loras":[{"running":"lora4,lora2","waiting":"lora3","timestamp":1257894567},{"running":"lora4,lora3","waiting":"","timestamp":1257894569}]}'
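For illustration (derived from the tests added in this PR): Prometheus reports histogram buckets cumulatively, so with `"ttft-buckets-values":[1,2,3]` and a served model named `my_model` the `/metrics` endpoint is expected to contain lines such as:

    vllm:time_to_first_token_seconds_bucket{model_name="my_model",le="0.001"} 1
    vllm:time_to_first_token_seconds_bucket{model_name="my_model",le="0.005"} 3
    vllm:time_to_first_token_seconds_bucket{model_name="my_model",le="0.01"} 6

Standard PromQL, e.g. `histogram_quantile(0.9, rate(vllm:time_to_first_token_seconds_bucket[5m]))`, can then be applied to these fake histograms.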
---
- `data-parallel-size`: number of ranks to run in Data Parallel deployment, from 1 to 8, default is 1. The ports will be assigned as follows: rank 0 will run on the configured `port`, rank 1 on `port`+1, etc.
---
2 changes: 2 additions & 0 deletions manifests/config_with_fake.yaml
@@ -14,3 +14,5 @@ fake-metrics:
loras:
- '{"running":"lora1,lora2","waiting":"lora3","timestamp":1257894567}'
- '{"running":"lora1,lora3","waiting":"","timestamp":1257894569}'
ttft-buckets-values: [10, 20, 30, 10]
tpot-buckets-values: [0, 0, 10, 20, 30]
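As a reading aid for this example (interpreting the values per the field descriptions in the README and `config.go`): the fake TTFT histogram gets 10 observations in the bucket with upper boundary 0.001s, 20 in the 0.005s bucket, 30 in the 0.01s bucket and 10 in the 0.02s bucket, with all remaining buckets left at 0; the `tpot-buckets-values` entries map onto the TPOT boundaries in the same way.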
34 changes: 34 additions & 0 deletions pkg/common/config.go
@@ -209,6 +209,20 @@ type Metrics struct {
WaitingRequests int64 `yaml:"waiting-requests" json:"waiting-requests"`
// KVCacheUsagePercentage is the fraction of KV-cache blocks currently in use (from 0 to 1)
KVCacheUsagePercentage float32 `yaml:"kv-cache-usage" json:"kv-cache-usage"`
// TTFTBucketValues is an array of values for the time-to-first-token histogram buckets;
// each value in this array is the number of observations in the corresponding bucket.
// The array may contain fewer values than the number of buckets; missing values are assumed to be 0.
// The buckets' upper boundaries are:
// 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
// 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0, 2560.0, +Inf
TTFTBucketValues []int `yaml:"ttft-buckets-values" json:"ttft-buckets-values"`
// TPOTBucketValues is an array of values for the time-per-output-token histogram buckets;
// each value in this array is the number of observations in the corresponding bucket.
// The array may contain fewer values than the number of buckets; missing values are assumed to be 0.
// The buckets' upper boundaries are:
// 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
// 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, +Inf
TPOTBucketValues []int `yaml:"tpot-buckets-values" json:"tpot-buckets-values"`
}

type LorasMetrics struct {
@@ -487,6 +501,26 @@ func (c *Configuration) validate() error {
if c.FakeMetrics.KVCacheUsagePercentage < 0 || c.FakeMetrics.KVCacheUsagePercentage > 1 {
return errors.New("fake metrics KV cache usage must be between 0 ans 1")
}
if c.FakeMetrics.TTFTBucketValues != nil {
if len(c.FakeMetrics.TTFTBucketValues) > len(TTFTBucketsBoundaries)+1 {
return errors.New("fake time-to-first-token array is too long")
}
for _, v := range c.FakeMetrics.TTFTBucketValues {
if v < 0 {
return errors.New("time-to-first-token fake metrics should contain only non-negative values")
}
}
}
if c.FakeMetrics.TPOTBucketValues != nil {
if len(c.FakeMetrics.TPOTBucketValues) > len(TPOTBucketsBoundaries)+1 {
return errors.New("fake time-per-output-token array is too long")
}
for _, v := range c.FakeMetrics.TPOTBucketValues {
if v < 0 {
return errors.New("time-per-output-token fake metrics should contain only non-negative values")
}
}
}
}

if c.DPSize < 1 || c.DPSize > 8 {
7 changes: 7 additions & 0 deletions pkg/common/config_test.go
@@ -203,6 +203,8 @@ var _ = Describe("Simulator configuration", func() {
"{\"running\":\"lora1,lora2\",\"waiting\":\"lora3\",\"timestamp\":1257894567}",
"{\"running\":\"lora1,lora3\",\"waiting\":\"\",\"timestamp\":1257894569}",
},
TTFTBucketValues: []int{10, 20, 30, 10},
TPOTBucketValues: []int{0, 0, 10, 20, 30},
}
test = testCase{
name: "config with fake metrics file",
@@ -451,6 +453,11 @@ var _ = Describe("Simulator configuration", func() {
args: []string{"cmd", "--time-factor-under-load", "-1",
"--config", "../../manifests/config.yaml"},
},
{
name: "invalid ttft",
args: []string{"cmd", "--ttft-buckets-values", "[1, 2, -10, 1]",
"--config", "../../manifests/config.yaml"},
},
}

for _, test := range invalidTests {
6 changes: 6 additions & 0 deletions pkg/common/utils.go
@@ -24,6 +24,12 @@ import (
"github.com/google/uuid"
)

var TTFTBucketsBoundaries = []float64{0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0,
2560.0}
var TPOTBucketsBoundaries = []float64{0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0}

// ValidateContextWindow checks if the request fits within the model's context window
// Returns validation result, actual completion tokens, and total tokens
func ValidateContextWindow(promptTokens int, maxCompletionTokens *int64, maxModelLen int) (bool, int64, int64) {
109 changes: 107 additions & 2 deletions pkg/llm-d-inference-sim/metrics.go
@@ -27,6 +27,7 @@ import (

"github.com/prometheus/client_golang/prometheus"

"github.com/llm-d/llm-d-inference-sim/pkg/common"
vllmapi "github.com/llm-d/llm-d-inference-sim/pkg/vllm-api"
)

@@ -64,7 +65,6 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
return err
}

// not supported for now, reports constant value
s.waitingRequests = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: "",
@@ -79,7 +79,36 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
return err
}

// not supported for now, reports constant value
s.ttft = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Subsystem: "",
Name: "vllm:time_to_first_token_seconds",
Help: "Histogram of time to first token in seconds.",
Buckets: common.TTFTBucketsBoundaries,
},
[]string{vllmapi.PromLabelModelName},
)

if err := s.registry.Register(s.ttft); err != nil {
s.logger.Error(err, "Prometheus time to first token histogram register failed")
return err
}

s.tpot = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Subsystem: "",
Name: "vllm:time_per_output_token_seconds",
Help: "Histogram of time per output token in seconds.",
Buckets: common.TPOTBucketsBoundaries,
},
[]string{vllmapi.PromLabelModelName},
)

if err := s.registry.Register(s.tpot); err != nil {
s.logger.Error(err, "Prometheus time per output token histogram register failed")
return err
}

s.kvCacheUsagePercentage = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: "",
@@ -107,7 +136,16 @@ func (s *VllmSimulator) setInitialPrometheusMetrics() {
nRunningReqs = float64(s.config.FakeMetrics.RunningRequests)
nWaitingReqs = float64(s.config.FakeMetrics.WaitingRequests)
kvCacheUsage = float64(s.config.FakeMetrics.KVCacheUsagePercentage)

if s.config.FakeMetrics.TTFTBucketValues != nil {
s.initFakeHistogram(s.ttft, common.TTFTBucketsBoundaries, s.config.FakeMetrics.TTFTBucketValues)
}

if s.config.FakeMetrics.TPOTBucketValues != nil {
s.initFakeHistogram(s.tpot, common.TPOTBucketsBoundaries, s.config.FakeMetrics.TPOTBucketValues)
}
}

modelName := s.getDisplayedModelName(s.config.Model)
s.runningRequests.WithLabelValues(modelName).Set(nRunningReqs)
s.waitingRequests.WithLabelValues(modelName).Set(nWaitingReqs)
@@ -128,6 +166,25 @@ func (s *VllmSimulator) setInitialPrometheusMetrics() {
}
}

func (s *VllmSimulator) initFakeHistogram(hist *prometheus.HistogramVec, bucketsBoundaries []float64, bucketValues []int) {
var valueToObserve float64
numOfBuckets := len(bucketsBoundaries)
modelName := s.getDisplayedModelName(s.config.Model)

for i, bucketVal := range bucketValues {
if i < numOfBuckets {
valueToObserve = bucketsBoundaries[i]
} else {
// this is the last (+Inf) bucket - observe a value larger than the last finite upper boundary
valueToObserve = bucketsBoundaries[len(bucketsBoundaries)-1] + 1
}

for range bucketVal {
hist.WithLabelValues(modelName).Observe(valueToObserve)
}
}
}
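To make the bucket arithmetic above concrete, here is a small, self-contained sketch (not part of this PR; names and boundaries abbreviated) showing how observing each bucket's upper boundary, as `initFakeHistogram` does, yields cumulative Prometheus bucket counts:

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// First four TTFT boundaries only, to keep the output short.
	boundaries := []float64{0.001, 0.005, 0.01, 0.02}

	hist := prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Name:    "vllm:time_to_first_token_seconds",
		Help:    "Histogram of time to first token in seconds.",
		Buckets: boundaries,
	}, []string{"model_name"})

	reg := prometheus.NewRegistry()
	reg.MustRegister(hist)

	// Fake bucket values [1, 2, 3]: observe each bucket's upper boundary that many times,
	// mirroring what initFakeHistogram does with the configured values.
	for i, count := range []int{1, 2, 3} {
		for j := 0; j < count; j++ {
			hist.WithLabelValues("my_model").Observe(boundaries[i])
		}
	}

	// Gather and print the cumulative per-bucket counts:
	// le=0.001 -> 1, le=0.005 -> 3, le=0.01 -> 6, le=0.02 -> 6.
	families, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range families {
		for _, m := range mf.GetMetric() {
			for _, b := range m.GetHistogram().GetBucket() {
				fmt.Printf("le=%g count=%d\n", b.GetUpperBound(), b.GetCumulativeCount())
			}
		}
	}
}
```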

// reportLoras sets information about loaded LoRA adapters
func (s *VllmSimulator) reportLoras() {
if s.config.FakeMetrics != nil {
@@ -181,6 +238,28 @@ func (s *VllmSimulator) reportWaitingRequests() {
}
}

// reportTTFT sets information about time to first token
func (s *VllmSimulator) reportTTFT(ttftInSecs float64) {
if s.config.FakeMetrics != nil {
return
}
if s.ttft != nil {
s.ttft.WithLabelValues(
s.getDisplayedModelName(s.config.Model)).Observe(ttftInSecs)
}
}

// reportTPOT sets information about time per output token
func (s *VllmSimulator) reportTPOT(tpotInSecs float64) {
if s.config.FakeMetrics != nil {
return
}
if s.tpot != nil {
s.tpot.WithLabelValues(
s.getDisplayedModelName(s.config.Model)).Observe(tpotInSecs)
}
}

// reportKVCacheUsage sets information about kv cache usage
func (s *VllmSimulator) reportKVCacheUsage(value float64) {
if s.config.FakeMetrics != nil {
@@ -198,6 +277,8 @@ func (s *VllmSimulator) startMetricsUpdaters(ctx context.Context) {
go s.runningRequestsUpdater(ctx)
go s.lorasUpdater(ctx)
go s.kvCacheUsageUpdater(ctx)
go s.ttftUpdater(ctx)
go s.tpotUpdater(ctx)
}

// waitingRequestsUpdater updates the waiting requests metric by listening on the relevant channel
@@ -238,6 +319,30 @@ func (s *VllmSimulator) kvCacheUsageUpdater(ctx context.Context) {
}
}

// ttftUpdater updates the time to first token metric by listening on the relevant channel
func (s *VllmSimulator) ttftUpdater(ctx context.Context) {
for {
select {
case <-ctx.Done():
return
case value := <-s.ttftChan:
s.reportTTFT(value)
}
}
}

// tpotUpdater updates the time per output token metric by listening on the relevant channel
func (s *VllmSimulator) tpotUpdater(ctx context.Context) {
for {
select {
case <-ctx.Done():
return
case value := <-s.tpotChan:
s.reportTPOT(value)
}
}
}

// lorasUpdater updates the running loras metric by listening on the relevant channel
// one function updates both waiting and running loras since they are part of the same prometheus gauge
func (s *VllmSimulator) lorasUpdater(ctx context.Context) {
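The updaters above only drain `ttftChan` and `tpotChan`; the code that feeds those channels is outside this diff. A minimal, self-contained sketch of the channel/updater pattern, with hypothetical names standing in for the simulator's request path and histogram:

```go
package main

import (
	"context"
	"fmt"
	"time"
)

// observeTTFT stands in for s.ttft.WithLabelValues(model).Observe(v).
func observeTTFT(v float64) {
	fmt.Printf("observe ttft %.3fs\n", v)
}

func main() {
	ttftChan := make(chan float64, 16) // hypothetical stand-in for s.ttftChan
	ctx, cancel := context.WithCancel(context.Background())

	done := make(chan struct{})
	// Updater goroutine, mirroring ttftUpdater: receive values and record them.
	go func() {
		defer close(done)
		for {
			select {
			case <-ctx.Done():
				return
			case v := <-ttftChan:
				observeTTFT(v)
			}
		}
	}()

	// Request path: measure the time to the first token and send it to the updater.
	start := time.Now()
	time.Sleep(12 * time.Millisecond) // pretend to generate the first token
	ttftChan <- time.Since(start).Seconds()

	time.Sleep(50 * time.Millisecond) // give the updater time to drain the channel
	cancel()
	<-done
}
```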
63 changes: 62 additions & 1 deletion pkg/llm-d-inference-sim/metrics_test.go
@@ -19,6 +19,7 @@ package llmdinferencesim
import (
"context"
"errors"
"fmt"
"io"
"net/http"
"os"
@@ -464,7 +465,7 @@ var _ = Describe("Simulator metrics", Ordered, func() {
ctx := context.TODO()
args := []string{"cmd", "--model", model, "--mode", common.ModeRandom,
"--fake-metrics",
"{\"running-requests\":10,\"waiting-requests\":30,\"kv-cache-usage\":0.4,\"loras\":[{\"running\":\"lora4,lora2\",\"waiting\":\"lora3\",\"timestamp\":1257894567},{\"running\":\"lora4,lora3\",\"waiting\":\"\",\"timestamp\":1257894569}]}",
"{\"running-requests\":10,\"waiting-requests\":30,\"kv-cache-usage\":0.4,\"loras\":[{\"running\":\"lora4,lora2\",\"waiting\":\"lora3\",\"timestamp\":1257894567},{\"running\":\"lora4,lora3\",\"waiting\":\"\",\"timestamp\":1257894569}],\"ttft-buckets-values\":[1, 2, 3],\"tpot-buckets-values\": [0, 0, 1, 2, 3]}",
}

client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil)
@@ -482,6 +483,66 @@ var _ = Describe("Simulator metrics", Ordered, func() {
Expect(metrics).To(ContainSubstring("vllm:gpu_cache_usage_perc{model_name=\"my_model\"} 0.4"))
Expect(metrics).To(ContainSubstring("vllm:lora_requests_info{max_lora=\"1\",running_lora_adapters=\"lora4,lora2\",waiting_lora_adapters=\"lora3\"} 1.257894567e+09"))
Expect(metrics).To(ContainSubstring("vllm:lora_requests_info{max_lora=\"1\",running_lora_adapters=\"lora4,lora3\",waiting_lora_adapters=\"\"} 1.257894569e+09"))

Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.001\"} 1"))
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.005\"} 3"))
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.01\"} 6"))
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.02\"} 6"))

Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.01\"} 0"))
Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.025\"} 0"))
Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.05\"} 1"))
Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.075\"} 3"))
Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.1\"} 6"))
Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.15\"} 6"))
})
})

Context("fake ttft metrics", func() {
It("Should respond with fake ttft metrics to /metrics", func() {
ctx := context.TODO()
args := []string{"cmd", "--model", model, "--mode", common.ModeRandom,
"--fake-metrics",
"{\"ttft-buckets-values\":[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]}",
}

client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil)
Expect(err).NotTo(HaveOccurred())

resp, err := client.Get(metricsUrl)
Expect(err).NotTo(HaveOccurred())
Expect(resp.StatusCode).To(Equal(http.StatusOK))

data, err := io.ReadAll(resp.Body)
Expect(err).NotTo(HaveOccurred())
metrics := string(data)

fmt.Println("---MAYA---")
fmt.Println(metrics)

Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.001\"} 0"))
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.005\"} 0"))
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.01\"} 0"))
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.02\"} 0"))
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.04\"} 0"))
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.06\"} 0"))
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.08\"} 0"))
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.1\"} 0"))
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.25\"} 0"))
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.5\"} 0"))
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.75\"} 0"))
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"1\"} 0"))
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"2.5\"} 0"))
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"5\"} 0"))
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"7.5\"} 0"))
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"10\"} 0"))
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"20\"} 0"))
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"40\"} 0"))
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"80\"} 0"))
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"160\"} 0"))
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"640\"} 0"))
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"2560\"} 0"))
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"+Inf\"} 1"))
})
})
})