Merged (29 commits):
- `638d0f7` Add definition of new action input (#123) (shmuelk, Aug 6, 2025)
- `9ffe957` KV cache and tokenization related configuration (#125) (irar2, Aug 7, 2025)
- `a5a7d81` Another attempt at adding a latest tag only on release builds (#124) (shmuelk, Aug 7, 2025)
- `951f4a3` Publish kv-cache events (#126) (irar2, Aug 12, 2025)
- `6930192` Add failure injection mode to simulator (smarunich, Aug 13, 2025)
- `5ec92b8` Refactor failure injection and update simulator error handling (smarunich, Aug 14, 2025)
- `8e0eefa` Make tokenizer version configurable from Dockerfile (smarunich, Aug 14, 2025)
- `75dcb72` Add failure injection mode to simulator (smarunich, Aug 13, 2025)
- `d7bb175` Refactor failure injection and update simulator error handling (smarunich, Aug 14, 2025)
- `c35dbca` KV cache and tokenization related configuration (#125) (irar2, Aug 7, 2025)
- `2eca8e6` Publish kv-cache events (#126) (irar2, Aug 12, 2025)
- `28fb65b` Use same version of tokenizer in both Dockerfile and Makefile (#132) (mayabar, Aug 14, 2025)
- `3ae7113` Clarify failure injection rate documentation (smarunich, Aug 14, 2025)
- `f5ae85b` Set default failure injection rate to 0 (smarunich, Aug 14, 2025)
- `9dbb689` rebase duplicates (smarunich, Aug 14, 2025)
- `106e276` re-base the changes (irar2, Aug 7, 2025)
- `5162226` Update option constructors in simulator tests (smarunich, Aug 14, 2025)
- `7bd69e8` Merge branch 'main' into failure-mode (smarunich, Aug 14, 2025)
- `5182187` Document failure injection options in README (smarunich, Aug 14, 2025)
- `b68115f` Set FailureInjectionRate default to 0 in config (smarunich, Aug 14, 2025)
- `bfa02ff` Refactor failure type usage and error response format (smarunich, Aug 25, 2025)
- `700e36f` Refactor failure type flag handling and code formatting (smarunich, Aug 25, 2025)
- `14860b3` Merge branch 'main' into failure-mode (smarunich, Aug 25, 2025)
- `8f6d56c` Fix config validation and simulator test argument handling (smarunich, Aug 25, 2025)
- `e0183b7` remove duplicate (smarunich, Aug 26, 2025)
- `178a594` Refactor failure handling to use CompletionError struct (smarunich, Aug 26, 2025)
- `72dde24` Use one type for all errors. Map code to type (irar2, Aug 27, 2025)
- `13492fc` Merge branch 'main' into failure-mode (irar2, Aug 27, 2025)
- `7994048` Review comments (irar2, Aug 27, 2025)
2 changes: 2 additions & 0 deletions README.md
@@ -124,6 +124,8 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
- `hash-seed`: seed for hash generation (if not set, is read from PYTHONHASHSEED environment variable)
- `zmq-endpoint`: ZMQ address to publish events
- `event-batch-size`: the maximum number of kv-cache events to be sent together, defaults to 16
- `failure-injection-rate`: probability (0-100) of injecting failures, optional, default is 0
- `failure-types`: list of specific failure types to inject (rate_limit, invalid_api_key, context_length, server_error, invalid_request, model_not_found), optional, if empty all types are used
-->
In addition, as we are using klog, the following parameters are available:
- `add_dir_header`: if true, adds the file directory to the header of the log messages
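For illustration, here is how the two new options could look in a YAML config file. This is a sketch, not part of the diff: the keys mirror the `yaml` struct tags on `Configuration` in `pkg/common/config.go`, and the values are arbitrary examples.

```yaml
# Hypothetical config snippet; keys taken from the yaml struct tags.
failure-injection-rate: 25   # inject an error response on ~25% of requests
failure-types:               # restrict injection to these types (empty = all)
  - rate_limit
  - server_error
```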
70 changes: 64 additions & 6 deletions pkg/common/config.go
@@ -34,6 +34,14 @@
vLLMDefaultPort = 8000
ModeRandom = "random"
ModeEcho = "echo"

[CI check failure, line 37 in pkg/common/config.go (GitHub Actions / lint-and-test): File is not properly formatted (gofmt)]
// Failure type constants
FailureTypeRateLimit = "rate_limit"
FailureTypeInvalidAPIKey = "invalid_api_key"
FailureTypeContextLength = "context_length"
FailureTypeServerError = "server_error"
FailureTypeInvalidRequest = "invalid_request"
FailureTypeModelNotFound = "model_not_found"
)

type Configuration struct {
@@ -127,6 +135,11 @@
ZMQEndpoint string `yaml:"zmq-endpoint"`
// EventBatchSize is the maximum number of kv-cache events to be sent together, defaults to 16
EventBatchSize int `yaml:"event-batch-size"`

// FailureInjectionRate is the probability (0-100) of injecting failures
FailureInjectionRate int `yaml:"failure-injection-rate"`
[Review comment (Collaborator): please add json annotation. A sketch follows the struct below.]

// FailureTypes is a list of specific failure types to inject (empty means all types)
FailureTypes []string `yaml:"failure-types"`
}
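A minimal sketch of what the review comment asks for; the `json` tag names are an assumption, mirrored from the existing `yaml` tags:

```go
// Sketch only: the two new fields with both yaml and json annotations.
type failureConfigSketch struct {
	// FailureInjectionRate is the probability (0-100) of injecting failures
	FailureInjectionRate int `yaml:"failure-injection-rate" json:"failure-injection-rate"`
	// FailureTypes is a list of specific failure types to inject (empty means all types)
	FailureTypes []string `yaml:"failure-types" json:"failure-types"`
}
```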

type LoraModule struct {
@@ -182,10 +195,12 @@
MinToolCallArrayParamLength: 1,
ToolCallNotRequiredParamProbability: 50,
ObjectToolCallNotRequiredParamProbability: 50,
KVCacheSize: 1024,
TokenBlockSize: 16,
ZMQEndpoint: "tcp://localhost:5557",
EventBatchSize: 16,
FailureInjectionRate: 0,
FailureTypes: []string{},
}
}

@@ -299,6 +314,24 @@
if c.EventBatchSize < 1 {
return errors.New("event batch size cannot be less than 1")
}

if c.FailureInjectionRate < 0 || c.FailureInjectionRate > 100 {
return errors.New("failure injection rate should be between 0 and 100")
}

validFailureTypes := map[string]bool{
FailureTypeRateLimit: true,
FailureTypeInvalidAPIKey: true,
FailureTypeContextLength: true,
FailureTypeServerError: true,
FailureTypeInvalidRequest: true,
FailureTypeModelNotFound: true,
}
for _, failureType := range c.FailureTypes {
if !validFailureTypes[failureType] {
return fmt.Errorf("invalid failure type '%s', valid types are: rate_limit, invalid_api_key, context_length, server_error, invalid_request, model_not_found", failureType)
}
}
return nil
}

@@ -326,7 +359,7 @@
f.IntVar(&config.MaxCPULoras, "max-cpu-loras", config.MaxCPULoras, "Maximum number of LoRAs to store in CPU memory")
f.IntVar(&config.MaxModelLen, "max-model-len", config.MaxModelLen, "Model's context window, maximum number of tokens in a single request including input and output")

f.StringVar(&config.Mode, "mode", config.Mode, "Simulator mode: echo - returns the same text that was sent in the request, for chat completion returns the last message; random - returns random sentence from a bank of pre-defined sentences")
f.IntVar(&config.InterTokenLatency, "inter-token-latency", config.InterTokenLatency, "Time to generate one token (in milliseconds)")
f.IntVar(&config.TimeToFirstToken, "time-to-first-token", config.TimeToFirstToken, "Time to first token (in milliseconds)")
f.IntVar(&config.KVCacheTransferLatency, "kv-cache-transfer-latency", config.KVCacheTransferLatency, "Time for KV-cache transfer from a remote vLLM (in milliseconds)")
@@ -351,6 +384,14 @@
f.StringVar(&config.HashSeed, "hash-seed", config.HashSeed, "Seed for hash generation (if not set, is read from PYTHONHASHSEED environment variable)")
f.StringVar(&config.ZMQEndpoint, "zmq-endpoint", config.ZMQEndpoint, "ZMQ address to publish events")
f.IntVar(&config.EventBatchSize, "event-batch-size", config.EventBatchSize, "Maximum number of kv-cache events to be sent together")

f.IntVar(&config.FailureInjectionRate, "failure-injection-rate", config.FailureInjectionRate, "Probability (0-100) of injecting failures")

failureTypes := getParamValueFromArgs("failure-types")
var dummyFailureTypes multiString
f.Var(&dummyFailureTypes, "failure-types", "List of specific failure types to inject (rate_limit, invalid_api_key, context_length, server_error, invalid_request, model_not_found)")
f.Lookup("failure-types").NoOptDefVal = "dummy"

[CI check failure, line 393 in pkg/common/config.go (GitHub Actions / lint-and-test): string `dummy` has 3 occurrences, make it a constant (goconst)]
// These values were manually parsed above in getParamValueFromArgs, we leave this in order to get these flags in --help
var dummyString string
@@ -384,6 +425,23 @@
if servedModelNames != nil {
config.ServedModelNames = servedModelNames
}
if failureTypes != nil {
config.FailureTypes = failureTypes
}

if config.HashSeed == "" {
hashSeed := os.Getenv("PYTHONHASHSEED")
if hashSeed != "" {
config.HashSeed = hashSeed
}
}

@@ -422,4 +480,4 @@
}
}
return values
}
126 changes: 126 additions & 0 deletions pkg/llm-d-inference-sim/failures.go
@@ -0,0 +1,126 @@
/*
Copyright 2025 The llm-d-inference-sim Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package llmdinferencesim

import (
"fmt"

"github.com/llm-d/llm-d-inference-sim/pkg/common"
)

const (
// Error message templates
RateLimitMessageTemplate = "Rate limit reached for %s in organization org-xxx on requests per min (RPM): Limit 3, Used 3, Requested 1."

[CI check failure, line 27 in pkg/llm-d-inference-sim/failures.go (GitHub Actions / lint-and-test): File is not properly formatted (gofmt)]
ModelNotFoundMessageTemplate = "The model '%s-nonexistent' does not exist"
)

type FailureSpec struct {
StatusCode int
ErrorType string
ErrorCode string
Message string
Param *string
}

var predefinedFailures = map[string]FailureSpec{
common.FailureTypeRateLimit: {
StatusCode: 429,
ErrorType: "rate_limit_exceeded",
ErrorCode: "rate_limit_exceeded",
Message: "Rate limit reached for model in organization org-xxx on requests per min (RPM): Limit 3, Used 3, Requested 1.",
[Review comment (Collaborator): Please use RateLimitMessageTemplate. A sketch follows this map.]
Param: nil,
},
common.FailureTypeInvalidAPIKey: {
StatusCode: 401,
ErrorType: "invalid_request_error",
ErrorCode: "invalid_api_key",
Message: "Incorrect API key provided",
Param: nil,
},
common.FailureTypeContextLength: {
StatusCode: 400,
ErrorType: "invalid_request_error",
ErrorCode: "context_length_exceeded",
Message: "This model's maximum context length is 4096 tokens. However, your messages resulted in 4500 tokens.",
Param: stringPtr("messages"),
},
common.FailureTypeServerError: {
StatusCode: 503,
ErrorType: "server_error",
ErrorCode: "server_error",
Message: "The server is overloaded or not ready yet.",
Param: nil,
},
common.FailureTypeInvalidRequest: {
StatusCode: 400,
ErrorType: "invalid_request_error",
ErrorCode: "invalid_request_error",
Message: "Invalid request: missing required parameter 'model'.",
Param: stringPtr("model"),
},
common.FailureTypeModelNotFound: {
StatusCode: 404,
ErrorType: "invalid_request_error",
ErrorCode: "model_not_found",
Message: "The model 'gpt-nonexistent' does not exist",
[Review comment (Collaborator): Please use ModelNotFoundMessageTemplate. A sketch follows this map.]
Param: stringPtr("model"),
},
}
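A sketch of the change the two review comments above ask for: seed the default messages from the templates instead of duplicating the literals. The placeholder arguments (`"model"`, `"gpt"`) reproduce the current hard-coded strings; `GetRandomFailure` substitutes the configured model at request time anyway.

```go
// Sketch only: map entries derived from the message templates.
var sketchedFailures = map[string]FailureSpec{
	common.FailureTypeRateLimit: {
		StatusCode: 429,
		ErrorType:  "rate_limit_exceeded",
		ErrorCode:  "rate_limit_exceeded",
		// "model" is a placeholder; GetRandomFailure replaces it with config.Model.
		Message: fmt.Sprintf(RateLimitMessageTemplate, "model"),
	},
	common.FailureTypeModelNotFound: {
		StatusCode: 404,
		ErrorType:  "invalid_request_error",
		ErrorCode:  "model_not_found",
		// "gpt" reproduces the current literal "The model 'gpt-nonexistent' does not exist".
		Message: fmt.Sprintf(ModelNotFoundMessageTemplate, "gpt"),
		Param:   stringPtr("model"),
	},
}
```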

// ShouldInjectFailure determines whether to inject a failure based on configuration
func ShouldInjectFailure(config *common.Configuration) bool {
[Review comment (Collaborator): Can be private. A sketch follows this function.]
if config.FailureInjectionRate == 0 {
return false
}

return common.RandomInt(1, 100) <= config.FailureInjectionRate
}
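The reviewer's "can be private" suggestion would look like the sketch below. Note that the external test package (`llmdinferencesim_test`) currently calls the exported name, so the tests would need an in-package shim (e.g. an `export_test.go`) or to move in-package.

```go
// Sketch only: unexported variant of the helper above.
// RandomInt(1, 100) yields 100 equally likely values, so the comparison
// succeeds with probability FailureInjectionRate/100.
func shouldInjectFailure(config *common.Configuration) bool {
	if config.FailureInjectionRate == 0 {
		return false
	}
	return common.RandomInt(1, 100) <= config.FailureInjectionRate
}
```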

// GetRandomFailure returns a random failure from configured types or all types if none specified
func GetRandomFailure(config *common.Configuration) FailureSpec {
var availableFailures []string
if len(config.FailureTypes) == 0 {
// Use all failure types if none specified
for failureType := range predefinedFailures {
availableFailures = append(availableFailures, failureType)
}
} else {
availableFailures = config.FailureTypes
}

if len(availableFailures) == 0 {
// Fallback to server_error if no valid types
return predefinedFailures[common.FailureTypeServerError]
}

randomIndex := common.RandomInt(0, len(availableFailures)-1)
randomType := availableFailures[randomIndex]

// Customize message with current model name
failure := predefinedFailures[randomType]
if randomType == common.FailureTypeRateLimit && config.Model != "" {
failure.Message = fmt.Sprintf(RateLimitMessageTemplate, config.Model)
} else if randomType == common.FailureTypeModelNotFound && config.Model != "" {
failure.Message = fmt.Sprintf(ModelNotFoundMessageTemplate, config.Model)
}

return failure
}

func stringPtr(s string) *string {
return &s
}
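To show how the two helpers compose, here is a self-contained sketch of a request path. The `net/http` server and the JSON error envelope are illustrative assumptions; the simulator's real server wiring is not part of this diff.

```go
package main

import (
	"encoding/json"
	"net/http"

	"github.com/llm-d/llm-d-inference-sim/pkg/common"
	sim "github.com/llm-d/llm-d-inference-sim/pkg/llm-d-inference-sim"
)

func main() {
	cfg := &common.Configuration{
		Model:                "test-model",
		FailureInjectionRate: 50, // fail roughly half of the requests
		FailureTypes:         []string{common.FailureTypeRateLimit},
	}
	http.HandleFunc("/v1/completions", func(w http.ResponseWriter, r *http.Request) {
		if sim.ShouldInjectFailure(cfg) {
			f := sim.GetRandomFailure(cfg)
			w.Header().Set("Content-Type", "application/json")
			w.WriteHeader(f.StatusCode)
			// OpenAI-style error envelope (shape assumed for illustration).
			_ = json.NewEncoder(w).Encode(map[string]any{"error": map[string]any{
				"message": f.Message,
				"type":    f.ErrorType,
				"code":    f.ErrorCode,
				"param":   f.Param,
			}})
			return
		}
		_, _ = w.Write([]byte(`{"object":"text_completion"}`)) // placeholder success path
	})
	_ = http.ListenAndServe(":8000", nil)
}
```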
128 changes: 128 additions & 0 deletions pkg/llm-d-inference-sim/failures_test.go
@@ -0,0 +1,128 @@
/*
Copyright 2025 The llm-d-inference-sim Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package llmdinferencesim_test

import (
"strings"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"

"github.com/llm-d/llm-d-inference-sim/pkg/common"
llmdinferencesim "github.com/llm-d/llm-d-inference-sim/pkg/llm-d-inference-sim"
)

var _ = Describe("Failures", func() {
Describe("ShouldInjectFailure", func() {
It("should not inject failure when injection rate is 0", func() {
config := &common.Configuration{
Mode: common.ModeRandom,
FailureInjectionRate: 0,
}
Expect(llmdinferencesim.ShouldInjectFailure(config)).To(BeFalse())
})

It("should inject failure when injection rate is 100", func() {
config := &common.Configuration{
Mode: common.ModeRandom,
FailureInjectionRate: 100,
}
Expect(llmdinferencesim.ShouldInjectFailure(config)).To(BeTrue())
})

})

Describe("GetRandomFailure", func() {
It("should return a failure from all types when none specified", func() {
config := &common.Configuration{
Model: "test-model",
FailureTypes: []string{},
}
failure := llmdinferencesim.GetRandomFailure(config)
Expect(failure.StatusCode).To(BeNumerically(">=", 400))
Expect(failure.Message).ToNot(BeEmpty())
Expect(failure.ErrorType).ToNot(BeEmpty())
})

It("should return rate limit failure when specified", func() {
config := &common.Configuration{
Model: "test-model",
FailureTypes: []string{common.FailureTypeRateLimit},
}
failure := llmdinferencesim.GetRandomFailure(config)
Expect(failure.StatusCode).To(Equal(429))
Expect(failure.ErrorType).To(Equal("rate_limit_exceeded"))
Expect(failure.ErrorCode).To(Equal("rate_limit_exceeded"))
Expect(strings.Contains(failure.Message, "test-model")).To(BeTrue())
})

It("should return invalid API key failure when specified", func() {
config := &common.Configuration{
FailureTypes: []string{common.FailureTypeInvalidAPIKey},
}
failure := llmdinferencesim.GetRandomFailure(config)
Expect(failure.StatusCode).To(Equal(401))
Expect(failure.ErrorType).To(Equal("invalid_request_error"))
Expect(failure.ErrorCode).To(Equal("invalid_api_key"))
Expect(failure.Message).To(Equal("Incorrect API key provided"))
})

It("should return context length failure when specified", func() {
config := &common.Configuration{
FailureTypes: []string{common.FailureTypeContextLength},
}
failure := llmdinferencesim.GetRandomFailure(config)
Expect(failure.StatusCode).To(Equal(400))
Expect(failure.ErrorType).To(Equal("invalid_request_error"))
Expect(failure.ErrorCode).To(Equal("context_length_exceeded"))
Expect(failure.Param).ToNot(BeNil())
Expect(*failure.Param).To(Equal("messages"))
})

It("should return server error when specified", func() {
config := &common.Configuration{
FailureTypes: []string{common.FailureTypeServerError},
}
failure := llmdinferencesim.GetRandomFailure(config)
Expect(failure.StatusCode).To(Equal(503))
Expect(failure.ErrorType).To(Equal("server_error"))
Expect(failure.ErrorCode).To(Equal("server_error"))
})

It("should return model not found failure when specified", func() {
config := &common.Configuration{
Model: "test-model",
FailureTypes: []string{common.FailureTypeModelNotFound},
}
failure := llmdinferencesim.GetRandomFailure(config)
Expect(failure.StatusCode).To(Equal(404))
Expect(failure.ErrorType).To(Equal("invalid_request_error"))
Expect(failure.ErrorCode).To(Equal("model_not_found"))
Expect(strings.Contains(failure.Message, "test-model-nonexistent")).To(BeTrue())
})

It("should return server error as fallback for empty types", func() {
config := &common.Configuration{
FailureTypes: []string{},
}
// This test is probabilistic since it randomly selects, but we can test structure
failure := llmdinferencesim.GetRandomFailure(config)
Expect(failure.StatusCode).To(BeNumerically(">=", 400))
Expect(failure.ErrorType).ToNot(BeEmpty())
})
})
})
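The tests above pin only the endpoints (rates 0 and 100). A statistical check for an intermediate rate is sketched below; it would live inside the `ShouldInjectFailure` Describe block, and the loose bounds keep it deterministic in practice.

```go
It("should inject roughly half the time at rate 50", func() {
	config := &common.Configuration{
		Mode:                 common.ModeRandom,
		FailureInjectionRate: 50,
	}
	injected := 0
	const trials = 1000
	for i := 0; i < trials; i++ {
		if llmdinferencesim.ShouldInjectFailure(config) {
			injected++
		}
	}
	// With p=0.5 and n=1000, landing outside [400, 600] is vanishingly unlikely.
	Expect(injected).To(BeNumerically(">", 400))
	Expect(injected).To(BeNumerically("<", 600))
})
```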