Refactor failure injection and update simulator error handling
Failure injection is now controlled by a dedicated `failure-injection-rate` parameter instead of a separate `failure` mode. Failure type constants are centralized, error handling in the simulator is refactored to use a single unified method for sending error responses, and the OpenAI-compatible error response format now includes an `object` field. Documentation and tests are updated to reflect these changes.
Signed-off-by: Sergey Marunich <[email protected]>
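The commit message describes a unified method for sending OpenAI API compatible error responses that now carry an `object` field. As a rough illustration only, such a helper might look like the sketch below; the type and function names (`errorResponse`, `sendError`) and the exact field set are assumptions, not the commit's actual identifiers:

```go
// Illustrative sketch only: a single helper for OpenAI API compatible
// error responses. All names and the exact field set are assumptions.
package main

import (
	"encoding/json"
	"net/http"
)

// errorResponse mirrors an OpenAI-style error body; the "object" field
// is the one the commit message says was added to the format.
type errorResponse struct {
	Object  string `json:"object"`  // e.g. "error"
	Message string `json:"message"` // human-readable description
	Type    string `json:"type"`    // e.g. "rate_limit_exceeded"
	Code    int    `json:"code"`    // HTTP status echoed in the body
}

// sendError writes the error body with a matching HTTP status, so every
// failure path in the simulator can share one code path.
func sendError(w http.ResponseWriter, status int, errType, msg string) {
	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(status)
	_ = json.NewEncoder(w).Encode(errorResponse{
		Object:  "error",
		Message: msg,
		Type:    errType,
		Code:    status,
	})
}

func main() {
	// Demo endpoint that always responds with an injected error.
	http.HandleFunc("/v1/completions", func(w http.ResponseWriter, _ *http.Request) {
		sendError(w, http.StatusTooManyRequests, "rate_limit_exceeded", "injected failure")
	})
	_ = http.ListenAndServe(":8080", nil)
}
```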
README.md (3 additions, 3 deletions)
```diff
@@ -29,10 +29,11 @@ In addition, it supports a subset of vLLM's Prometheus metrics. These metrics ar
 
 The simulated inference has no connection with the model and LoRA adapters specified in the command line parameters or via the /v1/load_lora_adapter HTTP REST endpoint. The /v1/models endpoint returns simulated results based on those same command line parameters and those loaded via the /v1/load_lora_adapter HTTP REST endpoint.
 
-The simulator supports three modes of operation:
+The simulator supports two modes of operation:
 - `echo` mode: the response contains the same text that was received in the request. For `/v1/chat/completions` the last message for the role=`user` is used.
 - `random` mode: the response is randomly chosen from a set of pre-defined sentences.
-- `failure` mode: randomly injects OpenAI API compatible error responses for testing error handling.
+
+Additionally, the simulator can inject OpenAI API compatible error responses for testing error handling using the `failure-injection-rate` parameter.
 
```
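To make the new parameter concrete: a rate-based injection decision could be as simple as the sketch below. This assumes `failure-injection-rate` is a percentage in the range 0-100; the helper name and exact semantics are assumptions, not taken from the commit.

```go
// Sketch of a rate-based failure-injection check, assuming the rate is a
// percentage in [0, 100]. Illustrative only; names are not from the commit.
package sim

import "math/rand"

// shouldInjectFailure returns true on roughly rate% of calls.
func shouldInjectFailure(rate int) bool {
	if rate <= 0 {
		return false
	}
	return rand.Intn(100) < rate
}
```

On a hit, a request handler would pick one of the centralized failure types and send it through the unified error path described above.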
```diff
@@ -104,7 +105,6 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
 - `mode`: the simulator mode, optional, by default `random`
   - `echo`: returns the same text that was sent in the request
   - `random`: returns a sentence chosen at random from a set of pre-defined sentences
-  - `failure`: randomly injects OpenAI API compatible error responses
 - `time-to-first-token`: the time to the first token (in milliseconds), optional, by default zero
 - `time-to-first-token-std-dev`: standard deviation for time before the first token will be returned, in milliseconds, optional, default is 0, can't be more than 30% of `time-to-first-token`, will not cause the actual time to first token to differ by more than 70% from `time-to-first-token`
 - `inter-token-latency`: the time to 'generate' each additional token (in milliseconds), optional, by default zero
```
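The constraints on `time-to-first-token-std-dev` (at most 30% of the mean, with the sampled value never differing from the mean by more than 70%) read as a clamped normal sample. A sketch of that reading, purely illustrative and not the repo's actual code:

```go
// Sketch of sampling time-to-first-token per the README's constraints:
// stdDev <= 0.3*mean, and the result is clamped to within 70% of mean.
// This is one reading of the documented behavior, not the repo's code.
package sim

import (
	"math"
	"math/rand"
)

func sampleTimeToFirstToken(mean, stdDev float64, r *rand.Rand) float64 {
	v := r.NormFloat64()*stdDev + mean
	return math.Min(math.Max(v, 0.3*mean), 1.7*mean)
}
```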
Go flag definitions:

```diff
 f.IntVar(&config.MaxCPULoras, "max-cpu-loras", config.MaxCPULoras, "Maximum number of LoRAs to store in CPU memory")
 f.IntVar(&config.MaxModelLen, "max-model-len", config.MaxModelLen, "Model's context window, maximum number of tokens in a single request including input and output")
-f.StringVar(&config.Mode, "mode", config.Mode, "Simulator mode: echo - returns the same text that was sent in the request, for chat completion returns the last message; random - returns random sentence from a bank of pre-defined sentences; failure - randomly injects API errors")
+f.StringVar(&config.Mode, "mode", config.Mode, "Simulator mode: echo - returns the same text that was sent in the request, for chat completion returns the last message; random - returns random sentence from a bank of pre-defined sentences")
 f.IntVar(&config.InterTokenLatency, "inter-token-latency", config.InterTokenLatency, "Time to generate one token (in milliseconds)")
 f.IntVar(&config.TimeToFirstToken, "time-to-first-token", config.TimeToFirstToken, "Time to first token (in milliseconds)")
 f.IntVar(&config.KVCacheTransferLatency, "kv-cache-transfer-latency", config.KVCacheTransferLatency, "Time for KV-cache transfer from a remote vLLM (in milliseconds)")
```
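The hunk above only shows the updated `mode` usage string; registering the new parameter would presumably follow the same pattern. A hypothetical sketch using a standard-library-style `FlagSet` (the `FailureInjectionRate` field name and the usage text are assumptions, not the commit's actual code):

```go
// Hypothetical registration of the new flag, in the style of the
// definitions above. Field name and usage string are assumptions.
package sim

import "flag"

type Config struct {
	FailureInjectionRate int // percentage of requests that receive an injected error
}

func registerFailureFlag(f *flag.FlagSet, config *Config) {
	f.IntVar(&config.FailureInjectionRate, "failure-injection-rate",
		config.FailureInjectionRate,
		"Probability (0-100) of injecting an OpenAI API compatible error response")
}
```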