13 changes: 12 additions & 1 deletion README.md
@@ -26,7 +26,18 @@ In addition, it supports a subset of vLLM's Prometheus metrics. These metrics ar
| vllm:lora_requests_info | Running stats on LoRA requests |
| vllm:num_requests_running | Number of requests currently running on GPU |
| vllm:num_requests_waiting | Prometheus metric for the number of queued requests |
| vllm:e2e_request_latency_seconds | Histogram of end to end request latency in seconds |
| vllm:request_inference_time_seconds | Histogram of time spent in RUNNING phase for request |
| vllm:request_queue_time_seconds | Histogram of time spent in WAITING phase for request |
| vllm:request_prefill_time_seconds | Histogram of time spent in PREFILL phase for request |
| vllm:request_decode_time_seconds | Histogram of time spent in DECODE phase for request |
| vllm:time_to_first_token_seconds | Histogram of time to first token in seconds |
| vllm:time_per_output_token_seconds | Histogram of time per output token in seconds |
| vllm:request_generation_tokens | Number of generation tokens processed |
| vllm:request_params_max_tokens | Histogram of the max_tokens request parameter |
| vllm:request_prompt_tokens | Number of prefill tokens processed |
| vllm:request_success_total | Count of successfully processed requests |
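
As an illustration of how these histograms show up on the wire, a Prometheus scrape of the metrics endpoint exposes each of them in the standard text format. The label set and numbers below are assumptions made up for the example (intermediate buckets omitted), not output captured from the simulator:

```
# HELP vllm:e2e_request_latency_seconds Histogram of end to end request latency in seconds.
# TYPE vllm:e2e_request_latency_seconds histogram
vllm:e2e_request_latency_seconds_bucket{model_name="my-model",le="0.3"} 0
vllm:e2e_request_latency_seconds_bucket{model_name="my-model",le="0.5"} 2
vllm:e2e_request_latency_seconds_bucket{model_name="my-model",le="1.0"} 5
vllm:e2e_request_latency_seconds_bucket{model_name="my-model",le="+Inf"} 7
vllm:e2e_request_latency_seconds_sum{model_name="my-model"} 9.4
vllm:e2e_request_latency_seconds_count{model_name="my-model"} 7
```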

The simulated inference has no connection with the model and LoRA adapters specified in the command line parameters or via the /v1/load_lora_adapter HTTP REST endpoint. The /v1/models endpoint returns simulated results based on those same command line parameters and those loaded via the /v1/load_lora_adapter HTTP REST endpoint.

The simulator supports two modes of operation:
58 changes: 53 additions & 5 deletions pkg/common/config.go
@@ -232,16 +232,17 @@ type Metrics struct {
WaitingRequests int64 `yaml:"waiting-requests" json:"waiting-requests"`
// KVCacheUsagePercentage is the fraction of KV-cache blocks currently in use (from 0 to 1)
KVCacheUsagePercentage float32 `yaml:"kv-cache-usage" json:"kv-cache-usage"`
// TTFTBuckets is an array of values for time-to-first-token buckets,
// each value in this array is a value for the corresponding bucket.

// Histogram metrics - defined by arrays of values.
// Each element in the array is the value for the corresponding bucket.
// An array may contain fewer values than the number of buckets; missing trailing values are assumed to be 0.

// TTFTBuckets is an array of values for time-to-first-token buckets.
// Buckets upper boundaries in seconds are:
// 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
// 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0, 2560.0, +Inf
TTFTBucketValues []int `yaml:"ttft-buckets-values" json:"ttft-buckets-values"`
// TPOTBuckets is an array of values for time-per-output-token buckets,
// each value in this array is a value for the corresponding bucket.
// Array may contain less values than number of buckets, all trailing missing values assumed as 0.
// TPOTBuckets is an array of values for time-per-output-token buckets.
// Buckets upper boundaries in seconds are:
// 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
// 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, +Inf
@@ -253,6 +254,21 @@ type Metrics struct {
RequestParamsMaxTokens []int `yaml:"request-params-max-tokens" json:"request-params-max-tokens"` // max_tokens parameter samples
// RequestSuccessTotal is the number of successful requests, key: finish-reason (stop, length, etc.).
RequestSuccessTotal map[string]int64 `yaml:"request-success-total" json:"request-success-total"`

// Latency histograms - all have the same bucket upper boundaries in seconds:
// 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
// 20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0, +Inf

// E2ERequestLatencyBucketValues is an array of values for e2e request latency buckets.
E2ERequestLatencyBucketValues []int `yaml:"e2erl-buckets-values" json:"e2erl-buckets-values"`
// ReqQueueTimeBucketValues is an array of values for request queue time buckets.
ReqQueueTimeBucketValues []int `yaml:"queue-time-buckets-values" json:"queue-time-buckets-values"`
// ReqInfTimeBucketValues is an array of values for request inference time buckets.
ReqInfTimeBucketValues []int `yaml:"inf-time-buckets-values" json:"inf-time-buckets-values"`
// ReqPrefillTimeBucketValues is an array of values for request prefill time buckets.
ReqPrefillTimeBucketValues []int `yaml:"prefill-time-buckets-values" json:"prefill-time-buckets-values"`
// ReqDecodeTimeBucketValues is an array of values for request decode time buckets.
ReqDecodeTimeBucketValues []int `yaml:"decode-time-buckets-values" json:"decode-time-buckets-values"`
}
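
To illustrate how these new fields map onto a fake-metrics YAML fragment, here is a minimal, self-contained Go sketch. The mirror struct below copies only the new fields and their `yaml` tags from the diff; the use of gopkg.in/yaml.v3 and the sample numbers are assumptions, not taken from the simulator's own config loading code:

```go
package main

import (
	"fmt"

	"gopkg.in/yaml.v3"
)

// fakeHistograms mirrors only the new bucket-value fields (and their yaml
// tags) added to the Metrics struct above; all other fields are omitted.
type fakeHistograms struct {
	E2ERequestLatencyBucketValues []int `yaml:"e2erl-buckets-values"`
	ReqQueueTimeBucketValues      []int `yaml:"queue-time-buckets-values"`
	ReqInfTimeBucketValues        []int `yaml:"inf-time-buckets-values"`
	ReqPrefillTimeBucketValues    []int `yaml:"prefill-time-buckets-values"`
	ReqDecodeTimeBucketValues     []int `yaml:"decode-time-buckets-values"`
}

func main() {
	// Arrays may be shorter than the number of buckets; missing trailing
	// values are treated as 0, as described in the comment above.
	doc := `
e2erl-buckets-values: [0, 1, 3, 10]
queue-time-buckets-values: [5, 2]
inf-time-buckets-values: [0, 0, 4]
prefill-time-buckets-values: [1]
decode-time-buckets-values: [2, 2, 2]
`
	var m fakeHistograms
	if err := yaml.Unmarshal([]byte(doc), &m); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", m)
}
```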

type LorasMetrics struct {
@@ -588,6 +604,38 @@ func (c *Configuration) validate() error {
return errors.New("fake metrics request-params-max-tokens cannot contain negative values")
}
}

for _, v := range c.FakeMetrics.E2ERequestLatencyBucketValues {
if v < 0 {
return errors.New("fake metrics e2erl-buckets-values cannot contain negative values")
}
}
for _, v := range c.FakeMetrics.ReqQueueTimeBucketValues {
if v < 0 {
return errors.New("fake metrics queue-time-buckets-values cannot contain negative values")
}
}
for _, v := range c.FakeMetrics.ReqInfTimeBucketValues {
if v < 0 {
return errors.New("fake metrics inf-time-buckets-values cannot contain negative values")
}
}
for _, v := range c.FakeMetrics.ReqPrefillTimeBucketValues {
if v < 0 {
return errors.New("fake metrics prefill-time-buckets-values cannot contain negative values")
}
}
for _, v := range c.FakeMetrics.ReqDecodeTimeBucketValues {
if v < 0 {
return errors.New("fake metrics decode-time-buckets-values cannot contain negative values")
}
}
}

if c.DPSize < 1 || c.DPSize > 8 {
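
The five new loops above all check the same non-negativity condition; as a sketch only (not part of this PR), a small helper could express the pattern once:

```go
package common

import "fmt"

// validateNonNegative is a hypothetical helper that collapses the repeated
// per-field loops in validate(): it rejects any negative bucket value.
func validateNonNegative(name string, values []int) error {
	for _, v := range values {
		if v < 0 {
			return fmt.Errorf("fake metrics %s cannot contain negative values", name)
		}
	}
	return nil
}
```

validate() could then call it once per field, e.g. `if err := validateNonNegative("e2erl-buckets-values", c.FakeMetrics.E2ERequestLatencyBucketValues); err != nil { return err }`.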
3 changes: 3 additions & 0 deletions pkg/common/utils.go
@@ -32,6 +32,9 @@ var TTFTBucketsBoundaries = []float64{0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08
var TPOTBucketsBoundaries = []float64{0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0}

var RequestLatencyBucketsBoundaries = []float64{0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0}
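
These boundaries pair with the bucket-value arrays defined in config.go. The sketch below shows one way such an array could be rendered as a constant Prometheus histogram, assuming the standard client_golang API; it is an illustration, not the simulator's actual metrics code, and the reported sum is a placeholder since it cannot be recovered from bucket counts alone:

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	dto "github.com/prometheus/client_model/go"
)

// Bucket upper boundaries for the latency histograms (same values as
// RequestLatencyBucketsBoundaries above).
var latencyBoundaries = []float64{0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
	20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0}

// constHistogram turns per-bucket counts into a constant histogram metric.
// values may be shorter than boundaries; missing trailing buckets count as 0.
func constHistogram(name, help string, boundaries []float64, values []int) prometheus.Metric {
	buckets := make(map[float64]uint64, len(boundaries))
	var cumulative uint64
	for i, ub := range boundaries {
		if i < len(values) {
			cumulative += uint64(values[i])
		}
		buckets[ub] = cumulative // exposition buckets hold cumulative counts
	}
	desc := prometheus.NewDesc(name, help, nil, nil)
	// Sum is unknown for fake data, so 0 is used as a placeholder.
	return prometheus.MustNewConstHistogram(desc, cumulative, 0, buckets)
}

func main() {
	m := constHistogram("vllm:e2e_request_latency_seconds",
		"Histogram of end to end request latency in seconds.",
		latencyBoundaries, []int{0, 2, 3, 2})

	var out dto.Metric
	if err := m.Write(&out); err != nil {
		panic(err)
	}
	fmt.Println(out.GetHistogram().GetSampleCount()) // 7
}
```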

// ValidateContextWindow checks if the request fits within the model's context window
// Returns validation result, actual completion tokens, and total tokens
func ValidateContextWindow(promptTokens int, maxCompletionTokens *int64, maxModelLen int) (bool, int64, int64) {
Expand Down
46 changes: 23 additions & 23 deletions pkg/llm-d-inference-sim/failures_test.go
@@ -126,15 +126,15 @@ var _ = Describe("Failures", func() {
BeforeEach(func() {
ctx = context.Background()
var err error
client, err = startServerWithArgs(ctx, "", []string{
"cmd", "--model", model,
client, err = startServerWithArgs(ctx, []string{
"cmd", "--model", testModel,
"--failure-injection-rate", "100",
}, nil)
})
Expect(err).ToNot(HaveOccurred())
})

It("should always return an error response for chat completions", func() {
openaiClient, params := getOpenAIClientAndChatParams(client, model, userMessage, false)
openaiClient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false)
_, err := openaiClient.Chat.Completions.New(ctx, params)
Expect(err).To(HaveOccurred())

@@ -147,7 +147,7 @@ var _ = Describe("Failures", func() {
})

It("should always return an error response for text completions", func() {
openaiClient, params := getOpenAIClientAndChatParams(client, model, userMessage, false)
openaiClient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false)
_, err := openaiClient.Chat.Completions.New(ctx, params)
Expect(err).To(HaveOccurred())

@@ -164,16 +164,16 @@
BeforeEach(func() {
ctx = context.Background()
var err error
client, err = startServerWithArgs(ctx, "", []string{
"cmd", "--model", model,
client, err = startServerWithArgs(ctx, []string{
"cmd", "--model", testModel,
"--failure-injection-rate", "100",
"--failure-types", common.FailureTypeRateLimit,
}, nil)
})
Expect(err).ToNot(HaveOccurred())
})

It("should return only rate limit errors", func() {
openaiClient, params := getOpenAIClientAndChatParams(client, model, userMessage, false)
openaiClient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false)
_, err := openaiClient.Chat.Completions.New(ctx, params)
Expect(err).To(HaveOccurred())

@@ -182,24 +182,24 @@
Expect(ok).To(BeTrue())
Expect(openaiError.StatusCode).To(Equal(429))
Expect(openaiError.Type).To(Equal(openaiserverapi.ErrorCodeToType(429)))
Expect(strings.Contains(openaiError.Message, model)).To(BeTrue())
Expect(strings.Contains(openaiError.Message, testModel)).To(BeTrue())
})
})

Context("with multiple specific failure types", func() {
BeforeEach(func() {
ctx = context.Background()
var err error
client, err = startServerWithArgs(ctx, "", []string{
"cmd", "--model", model,
client, err = startServerWithArgs(ctx, []string{
"cmd", "--model", testModel,
"--failure-injection-rate", "100",
"--failure-types", common.FailureTypeInvalidAPIKey, common.FailureTypeServerError,
}, nil)
})
Expect(err).ToNot(HaveOccurred())
})

It("should return only specified error types", func() {
openaiClient, params := getOpenAIClientAndChatParams(client, model, userMessage, false)
openaiClient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false)

// Make multiple requests to verify we get the expected error types
for i := 0; i < 10; i++ {
@@ -222,35 +222,35 @@
BeforeEach(func() {
ctx = context.Background()
var err error
client, err = startServerWithArgs(ctx, "", []string{
"cmd", "--model", model,
client, err = startServerWithArgs(ctx, []string{
"cmd", "--model", testModel,
"--failure-injection-rate", "0",
}, nil)
})
Expect(err).ToNot(HaveOccurred())
})

It("should never return errors and behave like random mode", func() {
openaiClient, params := getOpenAIClientAndChatParams(client, model, userMessage, false)
openaiClient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false)
resp, err := openaiClient.Chat.Completions.New(ctx, params)
Expect(err).ToNot(HaveOccurred())
Expect(resp.Choices).To(HaveLen(1))
Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
Expect(resp.Model).To(Equal(model))
Expect(resp.Model).To(Equal(testModel))
})
})

Context("testing all predefined failure types", func() {
DescribeTable("should return correct error for each failure type",
func(failureType string, expectedStatusCode int, expectedErrorType string) {
ctx := context.Background()
client, err := startServerWithArgs(ctx, "", []string{
"cmd", "--model", model,
client, err := startServerWithArgs(ctx, []string{
"cmd", "--model", testModel,
"--failure-injection-rate", "100",
"--failure-types", failureType,
}, nil)
})
Expect(err).ToNot(HaveOccurred())

openaiClient, params := getOpenAIClientAndChatParams(client, model, userMessage, false)
openaiClient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false)
_, err = openaiClient.Chat.Completions.New(ctx, params)
Expect(err).To(HaveOccurred())

12 changes: 6 additions & 6 deletions pkg/llm-d-inference-sim/lora_test.go
@@ -34,22 +34,22 @@ var _ = Describe("LoRAs", func() {
Context("LoRAs config and load", func() {
It("Should config, load and load LoRAs correctly", func() {
ctx := context.TODO()
client, err := startServerWithArgs(ctx, "",
[]string{"cmd", "--model", model, "--mode", common.ModeEcho,
client, err := startServerWithArgs(ctx,
[]string{"cmd", "--model", testModel, "--mode", common.ModeEcho,
"--lora-modules", "{\"name\":\"lora3\",\"path\":\"/path/to/lora3\"}",
"{\"name\":\"lora4\",\"path\":\"/path/to/lora4\"}"}, nil)
"{\"name\":\"lora4\",\"path\":\"/path/to/lora4\"}"})
Expect(err).NotTo(HaveOccurred())

// Request to lora3
openaiclient, params := getOpenAIClientAndChatParams(client, "lora3", userMessage, false)
openaiclient, params := getOpenAIClientAndChatParams(client, "lora3", testUserMessage, false)
resp, err := openaiclient.Chat.Completions.New(ctx, params)
Expect(err).ToNot(HaveOccurred())

Expect(resp.Choices).ShouldNot(BeEmpty())
Expect(string(resp.Object)).To(Equal(chatCompletionObject))

msg := resp.Choices[0].Message.Content
Expect(msg).Should(Equal(userMessage))
Expect(msg).Should(Equal(testUserMessage))

// Unknown model, should return 404
params.Model = "lora1"
@@ -88,7 +88,7 @@ var _ = Describe("LoRAs", func() {
Expect(string(resp.Object)).To(Equal(chatCompletionObject))

msg = resp.Choices[0].Message.Content
Expect(msg).Should(Equal(userMessage))
Expect(msg).Should(Equal(testUserMessage))

// Unload lora3
payload = map[string]string{