
Commit 2097aeb

Refactor failure injection and update simulator error handling
Failure injection is now controlled by a dedicated 'failure-injection-rate' parameter instead of a separate 'failure' mode. Failure type constants are centralized, and error handling in the simulator is refactored to use a unified method for sending error responses. Documentation and tests are updated to reflect these changes, and the OpenAI error response format now includes an 'object' field.

Signed-off-by: Sergey Marunich <[email protected]>
1 parent 09ab37b commit 2097aeb
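
The commit message states that the OpenAI-compatible error response now carries an 'object' field. As a minimal sketch of what that envelope might look like in Go (the struct and field names here are assumptions for illustration; the actual definition lives elsewhere in the repository):

	// Hypothetical error envelope; exact names in the repository may differ.
	type ErrorResponse struct {
		Object string    `json:"object"` // the newly added field, e.g. "error"
		Error  ErrorInfo `json:"error"`
	}

	type ErrorInfo struct {
		Message string  `json:"message"`
		Type    string  `json:"type"`
		Param   *string `json:"param"`
		Code    string  `json:"code"`
	}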

File tree

9 files changed (+133, -111 lines)

Dockerfile

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ COPY . .
 
 # HuggingFace tokenizer bindings
 RUN mkdir -p lib
-RUN curl -L https://github.com/daulet/tokenizers/releases/download/v1.20.2/libtokenizers.${TARGETOS}-${TARGETARCH}.tar.gz | tar -xz -C lib
+RUN curl -L https://github.com/daulet/tokenizers/releases/download/v1.22.1/libtokenizers.${TARGETOS}-${TARGETARCH}.tar.gz | tar -xz -C lib
 RUN ranlib lib/*.a
 
 # Build

README.md

Lines changed: 3 additions & 3 deletions
@@ -29,10 +29,11 @@ In addition, it supports a subset of vLLM's Prometheus metrics. These metrics ar
 
 The simulated inference has no connection with the model and LoRA adapters specified in the command line parameters or via the /v1/load_lora_adapter HTTP REST endpoint. The /v1/models endpoint returns simulated results based on those same command line parameters and those loaded via the /v1/load_lora_adapter HTTP REST endpoint.
 
-The simulator supports three modes of operation:
+The simulator supports two modes of operation:
 - `echo` mode: the response contains the same text that was received in the request. For `/v1/chat/completions` the last message for the role=`user` is used.
 - `random` mode: the response is randomly chosen from a set of pre-defined sentences.
-- `failure` mode: randomly injects OpenAI API compatible error responses for testing error handling.
+
+Additionally, the simulator can inject OpenAI API compatible error responses for testing error handling using the `failure-injection-rate` parameter.
 
 Timing of the response is defined by the `time-to-first-token` and `inter-token-latency` parameters. In case P/D is enabled for a request, `kv-cache-transfer-latency` will be used instead of `time-to-first-token`.
 
@@ -102,7 +103,6 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
 - `mode`: the simulator mode, optional, by default `random`
   - `echo`: returns the same text that was sent in the request
   - `random`: returns a sentence chosen at random from a set of pre-defined sentences
-  - `failure`: randomly injects OpenAI API compatible error responses
 - `time-to-first-token`: the time to the first token (in milliseconds), optional, by default zero
 - `time-to-first-token-std-dev`: standard deviation for time before the first token will be returned, in milliseconds, optional, default is 0, can't be more than 30% of `time-to-first-token`, will not cause the actual time to first token to differ by more than 70% from `time-to-first-token`
 - `inter-token-latency`: the time to 'generate' each additional token (in milliseconds), optional, by default zero
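
As a usage sketch, failure injection now composes with either mode rather than replacing them. A minimal example of the resulting configuration in Go, using the fields and constants introduced in this commit (the specific values are illustrative):

	// Inject OpenAI-style errors into ~20% of responses while running in
	// random mode; an empty FailureTypes slice would allow all failure types.
	config := &common.Configuration{
		Mode:                 common.ModeRandom,
		FailureInjectionRate: 20,
		FailureTypes: []string{
			common.FailureTypeRateLimit,
			common.FailureTypeServerError,
		},
	}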

go.sum

Lines changed: 4 additions & 4 deletions
@@ -1,3 +1,5 @@
+github.com/alicebob/miniredis/v2 v2.35.0 h1:QwLphYqCEAo1eu1TqPRN2jgVMPBweeQcR21jeqDCONI=
+github.com/alicebob/miniredis/v2 v2.35.0/go.mod h1:TcL7YfarKPGDAthEtl5NBeHZfeUQj6OXMm/+iu5cLMM=
 github.com/andybalholm/brotli v1.1.1 h1:PR2pgnyFznKEugtsUo0xLdDop5SKXd5Qf5ysW+7XdTA=
 github.com/andybalholm/brotli v1.1.1/go.mod h1:05ib4cKhjx3OQYUY22hTVd34Bc8upXjOLL2rKwwZBoA=
 github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
@@ -11,8 +13,6 @@ github.com/buaazp/fasthttprouter v0.1.1/go.mod h1:h/Ap5oRVLeItGKTVBb+heQPks+HdIU
 github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
 github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
 github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
-github.com/daulet/tokenizers v1.20.2 h1:tlq/vIOiBTKDPets3596aFvmJYLn3XI6LFKq4q9LKhQ=
-github.com/daulet/tokenizers v1.20.2/go.mod h1:tGnMdZthXdcWY6DGD07IygpwJqiPvG85FQUnhs/wSCs=
 github.com/daulet/tokenizers v1.22.1 h1:3wzAFIxfgRuqGKka8xdkeTbctDmmqOOs12GofqdorpM=
 github.com/daulet/tokenizers v1.22.1/go.mod h1:tGnMdZthXdcWY6DGD07IygpwJqiPvG85FQUnhs/wSCs=
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@@ -68,8 +68,6 @@ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
 github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
 github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
 github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
-github.com/llm-d/llm-d-kv-cache-manager v0.2.0 h1:7MXFPjy3P8nZ7HbB1LWhhVLHvNTLbZglkD/ZcT7UU1k=
-github.com/llm-d/llm-d-kv-cache-manager v0.2.0/go.mod h1:ZTqwsnIVC6R5YuTUrYofPIUnCeZ9RvXn1UQAdxLYl1Y=
 github.com/llm-d/llm-d-kv-cache-manager v0.2.2-0.20250810103202-0adf0940f60a h1:PXR37HLgYYfolzWQA2uQOEiJlj3IV9YSvgaEFqCRSa8=
 github.com/llm-d/llm-d-kv-cache-manager v0.2.2-0.20250810103202-0adf0940f60a/go.mod h1:g2UlYKNJ4S860SAQ/QoRnytAFfnp8f1luW4IuZSMwCE=
 github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
@@ -147,6 +145,8 @@ github.com/xyproto/randomstring v1.0.5 h1:YtlWPoRdgMu3NZtP45drfy1GKoojuR7hmRcnhZ
 github.com/xyproto/randomstring v1.0.5/go.mod h1:rgmS5DeNXLivK7YprL0pY+lTuhNQW3iGxZ18UQApw/E=
 github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
 github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
+github.com/yuin/gopher-lua v1.1.1 h1:kYKnWBjvbNP4XLT3+bPEwAXJx262OhaHDWDVOPjL46M=
+github.com/yuin/gopher-lua v1.1.1/go.mod h1:GBR0iDaNXjAgGg9zfCvksxSRnQx76gclCIb7kdAd1Pw=
 go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs=
 go.uber.org/automaxprocs v1.6.0/go.mod h1:ifeIMSnPZuznNm6jmdzmU3/bfk01Fe2fotchwEFJ8r8=
 go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=

pkg/common/config.go

Lines changed: 17 additions & 10 deletions
@@ -34,7 +34,14 @@ const (
 	vLLMDefaultPort = 8000
 	ModeRandom      = "random"
 	ModeEcho        = "echo"
-	ModeFailure     = "failure"
+
+	// Failure type constants
+	FailureTypeRateLimit      = "rate_limit"
+	FailureTypeInvalidAPIKey  = "invalid_api_key"
+	FailureTypeContextLength  = "context_length"
+	FailureTypeServerError    = "server_error"
+	FailureTypeInvalidRequest = "invalid_request"
+	FailureTypeModelNotFound  = "model_not_found"
 )
 
 type Configuration struct {
@@ -221,8 +228,8 @@ func (c *Configuration) validate() error {
 		c.ServedModelNames = []string{c.Model}
 	}
 
-	if c.Mode != ModeEcho && c.Mode != ModeRandom && c.Mode != ModeFailure {
-		return fmt.Errorf("invalid mode '%s', valid values are 'random', 'echo', and 'failure'", c.Mode)
+	if c.Mode != ModeEcho && c.Mode != ModeRandom {
+		return fmt.Errorf("invalid mode '%s', valid values are 'random' and 'echo'", c.Mode)
 	}
 	if c.Port <= 0 {
 		return fmt.Errorf("invalid port '%d'", c.Port)
@@ -313,12 +320,12 @@ func (c *Configuration) validate() error {
 	}
 
 	validFailureTypes := map[string]bool{
-		"rate_limit":      true,
-		"invalid_api_key": true,
-		"context_length":  true,
-		"server_error":    true,
-		"invalid_request": true,
-		"model_not_found": true,
+		FailureTypeRateLimit:      true,
+		FailureTypeInvalidAPIKey:  true,
+		FailureTypeContextLength:  true,
+		FailureTypeServerError:    true,
+		FailureTypeInvalidRequest: true,
+		FailureTypeModelNotFound:  true,
 	}
 	for _, failureType := range c.FailureTypes {
 		if !validFailureTypes[failureType] {
@@ -353,7 +360,7 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
 	f.IntVar(&config.MaxCPULoras, "max-cpu-loras", config.MaxCPULoras, "Maximum number of LoRAs to store in CPU memory")
 	f.IntVar(&config.MaxModelLen, "max-model-len", config.MaxModelLen, "Model's context window, maximum number of tokens in a single request including input and output")
 
-	f.StringVar(&config.Mode, "mode", config.Mode, "Simulator mode: echo - returns the same text that was sent in the request, for chat completion returns the last message; random - returns random sentence from a bank of pre-defined sentences; failure - randomly injects API errors")
+	f.StringVar(&config.Mode, "mode", config.Mode, "Simulator mode: echo - returns the same text that was sent in the request, for chat completion returns the last message; random - returns random sentence from a bank of pre-defined sentences")
 	f.IntVar(&config.InterTokenLatency, "inter-token-latency", config.InterTokenLatency, "Time to generate one token (in milliseconds)")
 	f.IntVar(&config.TimeToFirstToken, "time-to-first-token", config.TimeToFirstToken, "Time to first token (in milliseconds)")
 	f.IntVar(&config.KVCacheTransferLatency, "kv-cache-transfer-latency", config.KVCacheTransferLatency, "Time for KV-cache transfer from a remote vLLM (in milliseconds)")
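
The hunk above only removes `failure` from the mode help text; the flag registration for the new parameter is not shown in this diff. A plausible sketch of how it would sit alongside the other `f.IntVar` calls (assumed, not taken from this commit):

	// Assumed registration of the dedicated failure-injection parameter.
	f.IntVar(&config.FailureInjectionRate, "failure-injection-rate",
		config.FailureInjectionRate,
		"Probability (0-100) of injecting an OpenAI API compatible error response")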

pkg/common/failures.go renamed to pkg/llm-d-inference-sim/failures.go

Lines changed: 26 additions & 22 deletions
@@ -14,12 +14,18 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-package common
+package llmdinferencesim
 
 import (
 	"fmt"
-	"math/rand"
-	"time"
+
+	"github.com/llm-d/llm-d-inference-sim/pkg/common"
+)
+
+const (
+	// Error message templates
+	RateLimitMessageTemplate     = "Rate limit reached for %s in organization org-xxx on requests per min (RPM): Limit 3, Used 3, Requested 1."
+	ModelNotFoundMessageTemplate = "The model '%s-nonexistent' does not exist"
 )
 
 type FailureSpec struct {
@@ -31,42 +37,42 @@ type FailureSpec struct {
 }
 
 var predefinedFailures = map[string]FailureSpec{
-	"rate_limit": {
+	common.FailureTypeRateLimit: {
 		StatusCode: 429,
 		ErrorType:  "rate_limit_exceeded",
 		ErrorCode:  "rate_limit_exceeded",
 		Message:    "Rate limit reached for model in organization org-xxx on requests per min (RPM): Limit 3, Used 3, Requested 1.",
 		Param:      nil,
 	},
-	"invalid_api_key": {
+	common.FailureTypeInvalidAPIKey: {
 		StatusCode: 401,
 		ErrorType:  "invalid_request_error",
 		ErrorCode:  "invalid_api_key",
 		Message:    "Incorrect API key provided",
 		Param:      nil,
 	},
-	"context_length": {
+	common.FailureTypeContextLength: {
 		StatusCode: 400,
 		ErrorType:  "invalid_request_error",
 		ErrorCode:  "context_length_exceeded",
 		Message:    "This model's maximum context length is 4096 tokens. However, your messages resulted in 4500 tokens.",
 		Param:      stringPtr("messages"),
 	},
-	"server_error": {
+	common.FailureTypeServerError: {
 		StatusCode: 503,
 		ErrorType:  "server_error",
 		ErrorCode:  "server_error",
 		Message:    "The server is overloaded or not ready yet.",
 		Param:      nil,
 	},
-	"invalid_request": {
+	common.FailureTypeInvalidRequest: {
 		StatusCode: 400,
 		ErrorType:  "invalid_request_error",
 		ErrorCode:  "invalid_request_error",
 		Message:    "Invalid request: missing required parameter 'model'.",
 		Param:      stringPtr("model"),
 	},
-	"model_not_found": {
+	common.FailureTypeModelNotFound: {
 		StatusCode: 404,
 		ErrorType:  "invalid_request_error",
 		ErrorCode:  "model_not_found",
@@ -76,19 +82,16 @@ var predefinedFailures = map[string]FailureSpec{
 }
 
 // ShouldInjectFailure determines whether to inject a failure based on configuration
-func ShouldInjectFailure(config *Configuration) bool {
-	if config.Mode != ModeFailure {
+func ShouldInjectFailure(config *common.Configuration) bool {
+	if config.FailureInjectionRate == 0 {
 		return false
 	}
 
-	rand.Seed(time.Now().UnixNano())
-	return rand.Intn(100) < config.FailureInjectionRate
+	return common.RandomInt(1, 100) <= config.FailureInjectionRate
}
 
 // GetRandomFailure returns a random failure from configured types or all types if none specified
-func GetRandomFailure(config *Configuration) FailureSpec {
-	rand.Seed(time.Now().UnixNano())
-
+func GetRandomFailure(config *common.Configuration) FailureSpec {
 	var availableFailures []string
 	if len(config.FailureTypes) == 0 {
 		// Use all failure types if none specified
@@ -101,17 +104,18 @@ func GetRandomFailure(config *common.Configuration) FailureSpec {
 
 	if len(availableFailures) == 0 {
 		// Fallback to server_error if no valid types
-		return predefinedFailures["server_error"]
+		return predefinedFailures[common.FailureTypeServerError]
 	}
 
-	randomType := availableFailures[rand.Intn(len(availableFailures))]
+	randomIndex := common.RandomInt(0, len(availableFailures)-1)
+	randomType := availableFailures[randomIndex]
 
 	// Customize message with current model name
 	failure := predefinedFailures[randomType]
-	if randomType == "rate_limit" && config.Model != "" {
-		failure.Message = fmt.Sprintf("Rate limit reached for %s in organization org-xxx on requests per min (RPM): Limit 3, Used 3, Requested 1.", config.Model)
-	} else if randomType == "model_not_found" && config.Model != "" {
-		failure.Message = fmt.Sprintf("The model '%s-nonexistent' does not exist", config.Model)
+	if randomType == common.FailureTypeRateLimit && config.Model != "" {
+		failure.Message = fmt.Sprintf(RateLimitMessageTemplate, config.Model)
+	} else if randomType == common.FailureTypeModelNotFound && config.Model != "" {
+		failure.Message = fmt.Sprintf(ModelNotFoundMessageTemplate, config.Model)
 	}
 
 	return failure
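
Note the probability semantics: common.RandomInt(1, 100) <= FailureInjectionRate fires with probability rate/100, so a rate of 0 never injects and 100 always does. For orientation, a hypothetical call site combining these helpers with the unified error-sending method the commit message mentions (the simulator type and sendErrorResponse are assumptions, not code from this commit):

	// Hypothetical wiring; only ShouldInjectFailure and GetRandomFailure
	// come from this file, the rest is assumed.
	func (s *simulator) handleRequest(ctx *fasthttp.RequestCtx) {
		if ShouldInjectFailure(s.config) {
			failure := GetRandomFailure(s.config)
			// Unified error path: writes failure.StatusCode and an
			// OpenAI-compatible JSON error body built from the FailureSpec.
			s.sendErrorResponse(ctx, failure)
			return
		}
		// ... normal echo/random handling ...
	}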

pkg/common/failures_test.go renamed to pkg/llm-d-inference-sim/failures_test.go

Lines changed: 20 additions & 26 deletions
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-package common_test
+package llmdinferencesim_test
 
 import (
 	"strings"
@@ -23,33 +23,27 @@ import (
 	. "github.com/onsi/gomega"
 
 	"github.com/llm-d/llm-d-inference-sim/pkg/common"
+	llmdinferencesim "github.com/llm-d/llm-d-inference-sim/pkg/llm-d-inference-sim"
 )
 
 var _ = Describe("Failures", func() {
 	Describe("ShouldInjectFailure", func() {
-		It("should not inject failure when not in failure mode", func() {
+		It("should not inject failure when injection rate is 0", func() {
 			config := &common.Configuration{
 				Mode:                 common.ModeRandom,
-				FailureInjectionRate: 100,
-			}
-			Expect(common.ShouldInjectFailure(config)).To(BeFalse())
-		})
-
-		It("should not inject failure when rate is 0", func() {
-			config := &common.Configuration{
-				Mode:                 common.ModeFailure,
 				FailureInjectionRate: 0,
 			}
-			Expect(common.ShouldInjectFailure(config)).To(BeFalse())
+			Expect(llmdinferencesim.ShouldInjectFailure(config)).To(BeFalse())
 		})
 
-		It("should inject failure when in failure mode with 100% rate", func() {
+		It("should inject failure when injection rate is 100", func() {
 			config := &common.Configuration{
-				Mode:                 common.ModeFailure,
+				Mode:                 common.ModeRandom,
 				FailureInjectionRate: 100,
 			}
-			Expect(common.ShouldInjectFailure(config)).To(BeTrue())
+			Expect(llmdinferencesim.ShouldInjectFailure(config)).To(BeTrue())
 		})
+
 	})
 
 	Describe("GetRandomFailure", func() {
@@ -58,7 +52,7 @@ var _ = Describe("Failures", func() {
 				Model:        "test-model",
 				FailureTypes: []string{},
 			}
-			failure := common.GetRandomFailure(config)
+			failure := llmdinferencesim.GetRandomFailure(config)
 			Expect(failure.StatusCode).To(BeNumerically(">=", 400))
 			Expect(failure.Message).ToNot(BeEmpty())
 			Expect(failure.ErrorType).ToNot(BeEmpty())
@@ -67,9 +61,9 @@ var _ = Describe("Failures", func() {
 		It("should return rate limit failure when specified", func() {
 			config := &common.Configuration{
 				Model:        "test-model",
-				FailureTypes: []string{"rate_limit"},
+				FailureTypes: []string{common.FailureTypeRateLimit},
 			}
-			failure := common.GetRandomFailure(config)
+			failure := llmdinferencesim.GetRandomFailure(config)
 			Expect(failure.StatusCode).To(Equal(429))
 			Expect(failure.ErrorType).To(Equal("rate_limit_exceeded"))
 			Expect(failure.ErrorCode).To(Equal("rate_limit_exceeded"))
@@ -78,9 +72,9 @@ var _ = Describe("Failures", func() {
 
 		It("should return invalid API key failure when specified", func() {
 			config := &common.Configuration{
-				FailureTypes: []string{"invalid_api_key"},
+				FailureTypes: []string{common.FailureTypeInvalidAPIKey},
 			}
-			failure := common.GetRandomFailure(config)
+			failure := llmdinferencesim.GetRandomFailure(config)
 			Expect(failure.StatusCode).To(Equal(401))
 			Expect(failure.ErrorType).To(Equal("invalid_request_error"))
 			Expect(failure.ErrorCode).To(Equal("invalid_api_key"))
@@ -89,9 +83,9 @@ var _ = Describe("Failures", func() {
 
 		It("should return context length failure when specified", func() {
 			config := &common.Configuration{
-				FailureTypes: []string{"context_length"},
+				FailureTypes: []string{common.FailureTypeContextLength},
 			}
-			failure := common.GetRandomFailure(config)
+			failure := llmdinferencesim.GetRandomFailure(config)
 			Expect(failure.StatusCode).To(Equal(400))
 			Expect(failure.ErrorType).To(Equal("invalid_request_error"))
 			Expect(failure.ErrorCode).To(Equal("context_length_exceeded"))
@@ -101,9 +95,9 @@ var _ = Describe("Failures", func() {
 
 		It("should return server error when specified", func() {
 			config := &common.Configuration{
-				FailureTypes: []string{"server_error"},
+				FailureTypes: []string{common.FailureTypeServerError},
 			}
-			failure := common.GetRandomFailure(config)
+			failure := llmdinferencesim.GetRandomFailure(config)
 			Expect(failure.StatusCode).To(Equal(503))
 			Expect(failure.ErrorType).To(Equal("server_error"))
 			Expect(failure.ErrorCode).To(Equal("server_error"))
@@ -112,9 +106,9 @@ var _ = Describe("Failures", func() {
 		It("should return model not found failure when specified", func() {
 			config := &common.Configuration{
 				Model:        "test-model",
-				FailureTypes: []string{"model_not_found"},
+				FailureTypes: []string{common.FailureTypeModelNotFound},
 			}
-			failure := common.GetRandomFailure(config)
+			failure := llmdinferencesim.GetRandomFailure(config)
 			Expect(failure.StatusCode).To(Equal(404))
 			Expect(failure.ErrorType).To(Equal("invalid_request_error"))
 			Expect(failure.ErrorCode).To(Equal("model_not_found"))
@@ -126,7 +120,7 @@ var _ = Describe("Failures", func() {
 				FailureTypes: []string{},
 			}
 			// This test is probabilistic since it randomly selects, but we can test structure
-			failure := common.GetRandomFailure(config)
+			failure := llmdinferencesim.GetRandomFailure(config)
 			Expect(failure.StatusCode).To(BeNumerically(">=", 400))
 			Expect(failure.ErrorType).ToNot(BeEmpty())
 		})
