Commit 4ae89f2

Merge branch 'main' into dev/prefill-overhead
Signed-off-by: Qifan Deng <[email protected]>
2 parents: 9886b94 + b98882a

File tree

13 files changed: +1403 -310 lines

.github/workflows/re-run-action.yml

Lines changed: 0 additions & 16 deletions
This file was deleted.

README.md

Lines changed: 3 additions & 1 deletion
@@ -135,6 +135,9 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
 - `zmq-max-connect-attempts`: the maximum number of ZMQ connection attempts, defaults to 0, maximum: 10
 - `event-batch-size`: the maximum number of kv-cache events to be sent together, defaults to 16
 ---
+- `failure-injection-rate`: probability (0-100) of injecting failures, optional, default is 0
+- `failure-types`: list of specific failure types to inject (rate_limit, invalid_api_key, context_length, server_error, invalid_request, model_not_found), optional, if empty all types are used
+---
 - `fake-metrics`: represents a predefined set of metrics to be sent to Prometheus as a substitute for the real metrics. When specified, only these fake metrics will be reported — real metrics and fake metrics will never be reported together. The set should include values for
 - `running-requests`
 - `waiting-requests`
@@ -143,7 +146,6 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
 
 Example:
 {"running-requests":10,"waiting-requests":30,"kv-cache-usage":0.4,"loras":[{"running":"lora4,lora2","waiting":"lora3","timestamp":1257894567},{"running":"lora4,lora3","waiting":"","timestamp":1257894569}]}
-
 
 In addition, as we are using klog, the following parameters are available:
 - `add_dir_header`: if true, adds the file directory to the header of the log messages
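
To make the semantics of `failure-injection-rate` concrete: a value of 30 means roughly 30% of requests receive an injected failure instead of a normal response. Below is a minimal standalone sketch of such a percentage gate (illustration only; the commit's actual check, shown in the new failures file further down, is `common.RandomInt(1, 100) <= rate`):

```go
package main

import (
	"fmt"
	"math/rand"
)

// shouldFail returns true with probability rate/100.
// rng.Intn(100) is uniform over 0..99, so "< rate" succeeds rate% of the time.
func shouldFail(rate int, rng *rand.Rand) bool {
	if rate <= 0 {
		return false
	}
	return rng.Intn(100) < rate
}

func main() {
	rng := rand.New(rand.NewSource(42))
	hits := 0
	for i := 0; i < 10000; i++ {
		if shouldFail(30, rng) {
			hits++
		}
	}
	fmt.Printf("injected %d of 10000 requests (expected ~3000)\n", hits)
}
```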

pkg/common/config.go

Lines changed: 45 additions & 2 deletions
@@ -34,7 +34,14 @@ const (
 	vLLMDefaultPort = 8000
 	ModeRandom      = "random"
 	ModeEcho        = "echo"
-	dummy = "dummy"
+	// Failure type constants
+	FailureTypeRateLimit      = "rate_limit"
+	FailureTypeInvalidAPIKey  = "invalid_api_key"
+	FailureTypeContextLength  = "context_length"
+	FailureTypeServerError    = "server_error"
+	FailureTypeInvalidRequest = "invalid_request"
+	FailureTypeModelNotFound  = "model_not_found"
+	dummy = "dummy"
 )
 
 type Configuration struct {
@@ -150,6 +157,11 @@ type Configuration struct {
 
 	// FakeMetrics is a set of metrics to send to Prometheus instead of the real data
 	FakeMetrics *Metrics `yaml:"fake-metrics" json:"fake-metrics"`
+
+	// FailureInjectionRate is the probability (0-100) of injecting failures
+	FailureInjectionRate int `yaml:"failure-injection-rate" json:"failure-injection-rate"`
+	// FailureTypes is a list of specific failure types to inject (empty means all types)
+	FailureTypes []string `yaml:"failure-types" json:"failure-types"`
 }
 
 type Metrics struct {
@@ -392,6 +404,27 @@ func (c *Configuration) validate() error {
 	if c.EventBatchSize < 1 {
 		return errors.New("event batch size cannot less than 1")
 	}
+
+	if c.FailureInjectionRate < 0 || c.FailureInjectionRate > 100 {
+		return errors.New("failure injection rate should be between 0 and 100")
+	}
+
+	validFailureTypes := map[string]bool{
+		FailureTypeRateLimit:      true,
+		FailureTypeInvalidAPIKey:  true,
+		FailureTypeContextLength:  true,
+		FailureTypeServerError:    true,
+		FailureTypeInvalidRequest: true,
+		FailureTypeModelNotFound:  true,
+	}
+	for _, failureType := range c.FailureTypes {
+		if !validFailureTypes[failureType] {
+			return fmt.Errorf("invalid failure type '%s', valid types are: %s, %s, %s, %s, %s, %s", failureType,
+				FailureTypeRateLimit, FailureTypeInvalidAPIKey, FailureTypeContextLength,
+				FailureTypeServerError, FailureTypeInvalidRequest, FailureTypeModelNotFound)
+		}
+	}
+
 	if c.ZMQMaxConnectAttempts > 10 {
 		return errors.New("zmq retries times cannot be more than 10")
 	}
@@ -432,7 +465,7 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
 	f.IntVar(&config.MaxCPULoras, "max-cpu-loras", config.MaxCPULoras, "Maximum number of LoRAs to store in CPU memory")
 	f.IntVar(&config.MaxModelLen, "max-model-len", config.MaxModelLen, "Model's context window, maximum number of tokens in a single request including input and output")
 
-	f.StringVar(&config.Mode, "mode", config.Mode, "Simulator mode, echo - returns the same text that was sent in the request, for chat completion returns the last message, random - returns random sentence from a bank of pre-defined sentences")
+	f.StringVar(&config.Mode, "mode", config.Mode, "Simulator mode: echo - returns the same text that was sent in the request, for chat completion returns the last message; random - returns random sentence from a bank of pre-defined sentences")
 	f.IntVar(&config.InterTokenLatency, "inter-token-latency", config.InterTokenLatency, "Time to generate one token (in milliseconds)")
 	f.IntVar(&config.TimeToFirstToken, "time-to-first-token", config.TimeToFirstToken, "Time to first token (in milliseconds)")
 
@@ -466,6 +499,13 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
 	f.UintVar(&config.ZMQMaxConnectAttempts, "zmq-max-connect-attempts", config.ZMQMaxConnectAttempts, "Maximum number of times to try ZMQ connect")
 	f.IntVar(&config.EventBatchSize, "event-batch-size", config.EventBatchSize, "Maximum number of kv-cache events to be sent together")
 
+	f.IntVar(&config.FailureInjectionRate, "failure-injection-rate", config.FailureInjectionRate, "Probability (0-100) of injecting failures")
+
+	failureTypes := getParamValueFromArgs("failure-types")
+	var dummyFailureTypes multiString
+	f.Var(&dummyFailureTypes, "failure-types", "List of specific failure types to inject (rate_limit, invalid_api_key, context_length, server_error, invalid_request, model_not_found)")
+	f.Lookup("failure-types").NoOptDefVal = dummy
+
 	// These values were manually parsed above in getParamValueFromArgs, we leave this in order to get these flags in --help
 	var dummyString string
 	f.StringVar(&dummyString, "config", "", "The path to a yaml configuration file. The command line values overwrite the configuration file values")
@@ -505,6 +545,9 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
 	if servedModelNames != nil {
 		config.ServedModelNames = servedModelNames
 	}
+	if failureTypes != nil {
+		config.FailureTypes = failureTypes
+	}
 
 	if config.HashSeed == "" {
 		hashSeed := os.Getenv("PYTHONHASHSEED")
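
The new validation accepts any subset of the six predefined type names and rejects everything else. A self-contained sketch of the same check (hypothetical helper name; the real logic is the unexported `Configuration.validate()` shown above):

```go
package main

import "fmt"

// validateFailureTypes mirrors the commit's check: every configured type
// must be one of the six predefined failure type names.
func validateFailureTypes(types []string) error {
	valid := map[string]bool{
		"rate_limit": true, "invalid_api_key": true, "context_length": true,
		"server_error": true, "invalid_request": true, "model_not_found": true,
	}
	for _, t := range types {
		if !valid[t] {
			return fmt.Errorf("invalid failure type '%s'", t)
		}
	}
	return nil
}

func main() {
	fmt.Println(validateFailureTypes([]string{"rate_limit", "server_error"})) // <nil>
	fmt.Println(validateFailureTypes([]string{"invalid_type"}))               // error
}
```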

pkg/common/config_test.go

Lines changed: 13 additions & 0 deletions
@@ -370,6 +370,19 @@ var _ = Describe("Simulator configuration", func() {
 			args: []string{"cmd", "--event-batch-size", "-35",
 				"--config", "../../manifests/config.yaml"},
 		},
+		{
+			name: "invalid failure injection rate > 100",
+			args: []string{"cmd", "--model", "test-model", "--failure-injection-rate", "150"},
+		},
+		{
+			name: "invalid failure injection rate < 0",
+			args: []string{"cmd", "--model", "test-model", "--failure-injection-rate", "-10"},
+		},
+		{
+			name: "invalid failure type",
+			args: []string{"cmd", "--model", "test-model", "--failure-injection-rate", "50",
+				"--failure-types", "invalid_type"},
+		},
 		{
 			name: "invalid fake metrics: negative running requests",
 			args: []string{"cmd", "--fake-metrics", "{\"running-requests\":-10,\"waiting-requests\":30,\"kv-cache-usage\":0.4}",

pkg/common/utils.go

Lines changed: 69 additions & 2 deletions
@@ -39,6 +39,10 @@ const (
 	RemoteDecodeFinishReason = "remote_decode"
 )
 
+// this array defines the probabilities for the buckets to be used for the generation of number of tokens in response
+var respLenBucketsProbabilities = [...]float64{0.2, 0.3, 0.2, 0.05, 0.1, 0.15}
+var cumulativeBucketsProbabilities []float64
+
 // list of responses to use in random mode for comepltion requests
 var chatCompletionFakeResponses = []string{
 	`Testing@, #testing 1$ ,2%,3^, [4&*5], 6~, 7-_ + (8 : 9) / \ < > .`,
@@ -54,6 +58,16 @@ var chatCompletionFakeResponses = []string{
 	`Give a man a fish and you feed him for a day; teach a man to fish and you feed him for a lifetime`,
 }
 
+func init() {
+	cumulativeBucketsProbabilities = make([]float64, len(respLenBucketsProbabilities))
+	sum := 0.0
+
+	for i, val := range respLenBucketsProbabilities {
+		sum += val
+		cumulativeBucketsProbabilities[i] = sum
+	}
+}
+
 // returns the max tokens or error if incorrect
 func GetMaxTokens(maxCompletionTokens *int64, maxTokens *int64) (*int64, error) {
 	var typeToken string
@@ -154,14 +168,67 @@ func GetRandomResponseText(maxCompletionTokens *int64) (string, string) {
 	if maxCompletionTokens == nil {
 		numOfTokens = GetRandomResponseLen()
 	} else {
-		numOfTokens = int(*maxCompletionTokens)
-		finishReason = GetRandomFinishReason()
+		maxTokens := int(*maxCompletionTokens)
+		// max tokens is defined - generate real length of the response based on it
+		numOfTokens = getResponseLengthByHistogram(maxTokens)
+		if numOfTokens == maxTokens {
+			// if response should be create with maximum number of tokens - finish reason will be 'length'
+			finishReason = LengthFinishReason
+		}
 	}
 
 	text := GetRandomText(numOfTokens)
 	return text, finishReason
 }
 
+// getResponseLengthByHistogram calculates the number of tokens to be returned in a response based on the max tokens value and the pre-defined buckets.
+// The response length is distributed according to the probabilities, defined in respLenBucketsProbabilities.
+// The histogram contains equally sized buckets and the last special bucket, which contains only the maxTokens value.
+// The last element of respLenBucketsProbabilities defines the probability of a reposnse with maxToken tokens.
+// Other values define probabilities for the equally sized buckets.
+// If maxToken is small (smaller than number of buckets) - the response length is randomly selected from the range [1, maxTokens]
+func getResponseLengthByHistogram(maxTokens int) int {
+	if maxTokens <= 1 {
+		return maxTokens
+	}
+	// maxTokens is small - no need to use the histogram of probabilities, just select a random value in the range [1, maxTokens]
+	if maxTokens <= len(cumulativeBucketsProbabilities) {
+		res := RandomInt(1, maxTokens)
+		return res
+	}
+
+	r := RandomFloat(0, 1)
+
+	// check if r is in the last bucket, then maxTokens should be returned
+	if r > cumulativeBucketsProbabilities[len(cumulativeBucketsProbabilities)-2] {
+		return maxTokens
+	}
+
+	// determine which bucket to use, the bucket with a cumulative probability larger than r is the bucket to use
+	// initialize bucketIndex with the last bucket to handle the case (which should not happen) when the probabilities sum is less than 1
+	bucketIndex := len(cumulativeBucketsProbabilities) - 1
+	for i, c := range cumulativeBucketsProbabilities {
+		if r <= c {
+			bucketIndex = i
+			break
+		}
+	}
+
+	// calculate the size of all of the buckets (except the special last bucket)
+	bucketSize := float64(maxTokens-1) / float64(len(cumulativeBucketsProbabilities)-1)
+	// start is the minimum number in the required bucket
+	start := int(bucketSize*float64(bucketIndex)) + 1
+	// end is the maximum number in the required bucket
+	end := int(bucketSize * float64(bucketIndex+1))
+	// sometimes end could be maxTokens because of rounding, change the value to maxToken-1
+	if end >= maxTokens {
+		end = maxTokens - 1
+	}
+
+	// pick uniformly within the bucket's range
+	return RandomInt(start, end)
+}
+
 // GetResponseText returns response text, from a given text
 // considering max completion tokens if it is not nil, and a finish reason (stop or length)
 func GetResponseText(maxCompletionTokens *int64, text string) (string, string) {
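
A worked example of the histogram, assuming `maxTokens = 100` and the probabilities above: there are five equal-width buckets over [1, 99] plus the special last bucket holding exactly 100 (probability 0.15). The following standalone sketch reproduces the boundary arithmetic from `getResponseLengthByHistogram`:

```go
package main

import "fmt"

// Bucket probabilities from the commit: the last entry is the chance of
// returning exactly maxTokens; the rest cover equal-width ranges below it.
var probs = []float64{0.2, 0.3, 0.2, 0.05, 0.1, 0.15}

func main() {
	maxTokens := 100
	n := len(probs) - 1 // number of equal-width buckets
	bucketSize := float64(maxTokens-1) / float64(n)
	for i := 0; i < n; i++ {
		start := int(bucketSize*float64(i)) + 1
		end := int(bucketSize * float64(i+1))
		if end >= maxTokens { // rounding guard, as in the commit
			end = maxTokens - 1
		}
		fmt.Printf("bucket %d: [%d, %d], probability %.2f\n", i, start, end, probs[i])
	}
	fmt.Printf("bucket %d: exactly %d, probability %.2f\n", n, maxTokens, probs[n])
}
```

For `maxTokens = 100` this prints [1, 19], [20, 39], [40, 59], [60, 79], [80, 99], and "exactly 100": for example, 30% of responses fall in the 20-39 token range, and 15% hit the maximum and therefore finish with reason 'length'.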

pkg/common/utils_test.go

Lines changed: 16 additions & 4 deletions
@@ -38,16 +38,28 @@ var _ = Describe("Utils", Ordered, func() {
 		It("should return short text", func() {
 			maxCompletionTokens := int64(2)
 			text, finishReason := GetRandomResponseText(&maxCompletionTokens)
-			Expect(int64(len(Tokenize(text)))).Should(Equal(maxCompletionTokens))
-			Expect([]string{StopFinishReason, LengthFinishReason}).Should(ContainElement(finishReason))
+			tokensCnt := int64(len(Tokenize(text)))
+			Expect(tokensCnt).Should(BeNumerically("<=", maxCompletionTokens))
+			if tokensCnt == maxCompletionTokens {
+				Expect(finishReason).To(Equal(LengthFinishReason))
+			} else {
+				Expect(tokensCnt).To(BeNumerically("<", maxCompletionTokens))
+				Expect(finishReason).To(Equal(StopFinishReason))
+			}
 		})
 		It("should return long text", func() {
 			// return required number of tokens although it is higher than ResponseLenMax
 			maxCompletionTokens := int64(ResponseLenMax * 5)
 			text, finishReason := GetRandomResponseText(&maxCompletionTokens)
-			Expect(int64(len(Tokenize(text)))).Should(Equal(maxCompletionTokens))
+			tokensCnt := int64(len(Tokenize(text)))
+			Expect(tokensCnt).Should(BeNumerically("<=", maxCompletionTokens))
 			Expect(IsValidText(text)).To(BeTrue())
-			Expect([]string{StopFinishReason, LengthFinishReason}).Should(ContainElement(finishReason))
+			if tokensCnt == maxCompletionTokens {
+				Expect(finishReason).To(Equal(LengthFinishReason))
+			} else {
+				Expect(tokensCnt).To(BeNumerically("<", maxCompletionTokens))
+				Expect(finishReason).To(Equal(StopFinishReason))
+			}
 		})
 	})
 
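
The updated assertions encode the contract that follows from the histogram change: the generated response may now be shorter than the requested maximum, and the finish reason must agree with the length. A standalone restatement of that invariant (hypothetical helper; assumes the usual finish-reason strings "stop" and "length"):

```go
package main

import "fmt"

// responseInvariant restates what the updated tests assert: token count
// never exceeds the maximum, finish reason is "length" exactly when the
// maximum is reached, and "stop" otherwise.
func responseInvariant(tokens, maxTokens int64, finishReason string) bool {
	switch {
	case tokens > maxTokens:
		return false
	case tokens == maxTokens:
		return finishReason == "length"
	default:
		return finishReason == "stop"
	}
}

func main() {
	fmt.Println(responseInvariant(2, 2, "length")) // true
	fmt.Println(responseInvariant(1, 2, "stop"))   // true
	fmt.Println(responseInvariant(3, 2, "stop"))   // false
}
```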

Lines changed: 88 additions & 0 deletions (new file)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
/*
2+
Copyright 2025 The llm-d-inference-sim Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package llmdinferencesim
18+
19+
import (
20+
"fmt"
21+
22+
"github.com/llm-d/llm-d-inference-sim/pkg/common"
23+
openaiserverapi "github.com/llm-d/llm-d-inference-sim/pkg/openai-server-api"
24+
)
25+
26+
const (
27+
// Error message templates
28+
rateLimitMessageTemplate = "Rate limit reached for %s in organization org-xxx on requests per min (RPM): Limit 3, Used 3, Requested 1."
29+
modelNotFoundMessageTemplate = "The model '%s-nonexistent' does not exist"
30+
)
31+
32+
var predefinedFailures = map[string]openaiserverapi.CompletionError{
33+
common.FailureTypeRateLimit: openaiserverapi.NewCompletionError(rateLimitMessageTemplate, 429, nil),
34+
common.FailureTypeInvalidAPIKey: openaiserverapi.NewCompletionError("Incorrect API key provided.", 401, nil),
35+
common.FailureTypeContextLength: openaiserverapi.NewCompletionError(
36+
"This model's maximum context length is 4096 tokens. However, your messages resulted in 4500 tokens.",
37+
400, stringPtr("messages")),
38+
common.FailureTypeServerError: openaiserverapi.NewCompletionError(
39+
"The server is overloaded or not ready yet.", 503, nil),
40+
common.FailureTypeInvalidRequest: openaiserverapi.NewCompletionError(
41+
"Invalid request: missing required parameter 'model'.", 400, stringPtr("model")),
42+
common.FailureTypeModelNotFound: openaiserverapi.NewCompletionError(modelNotFoundMessageTemplate,
43+
404, stringPtr("model")),
44+
}
45+
46+
// shouldInjectFailure determines whether to inject a failure based on configuration
47+
func shouldInjectFailure(config *common.Configuration) bool {
48+
if config.FailureInjectionRate == 0 {
49+
return false
50+
}
51+
52+
return common.RandomInt(1, 100) <= config.FailureInjectionRate
53+
}
54+
55+
// getRandomFailure returns a random failure from configured types or all types if none specified
56+
func getRandomFailure(config *common.Configuration) openaiserverapi.CompletionError {
57+
var availableFailures []string
58+
if len(config.FailureTypes) == 0 {
59+
// Use all failure types if none specified
60+
for failureType := range predefinedFailures {
61+
availableFailures = append(availableFailures, failureType)
62+
}
63+
} else {
64+
availableFailures = config.FailureTypes
65+
}
66+
67+
if len(availableFailures) == 0 {
68+
// Fallback to server_error if no valid types
69+
return predefinedFailures[common.FailureTypeServerError]
70+
}
71+
72+
randomIndex := common.RandomInt(0, len(availableFailures)-1)
73+
randomType := availableFailures[randomIndex]
74+
75+
// Customize message with current model name
76+
failure := predefinedFailures[randomType]
77+
if randomType == common.FailureTypeRateLimit && config.Model != "" {
78+
failure.Message = fmt.Sprintf(rateLimitMessageTemplate, config.Model)
79+
} else if randomType == common.FailureTypeModelNotFound && config.Model != "" {
80+
failure.Message = fmt.Sprintf(modelNotFoundMessageTemplate, config.Model)
81+
}
82+
83+
return failure
84+
}
85+
86+
func stringPtr(s string) *string {
87+
return &s
88+
}
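
The selection strategy in `getRandomFailure` is straightforward: restrict the pool to the configured types when any are given, otherwise draw uniformly from all six predefined failures. A self-contained sketch of that pool logic (hypothetical names, plain `math/rand` in place of the repo's `common.RandomInt`):

```go
package main

import (
	"fmt"
	"math/rand"
)

// pickFailureType restricts the draw to the configured types if any are
// set; otherwise it draws uniformly from all predefined types.
func pickFailureType(configured, all []string, rng *rand.Rand) string {
	pool := configured
	if len(pool) == 0 {
		pool = all
	}
	return pool[rng.Intn(len(pool))]
}

func main() {
	all := []string{"rate_limit", "invalid_api_key", "context_length",
		"server_error", "invalid_request", "model_not_found"}
	rng := rand.New(rand.NewSource(1))
	fmt.Println(pickFailureType(nil, all, rng))                                    // any of the six
	fmt.Println(pickFailureType([]string{"rate_limit", "server_error"}, all, rng)) // one of the two
}
```

Note that `shouldInjectFailure` treats a rate of 0 as "never", while `common.RandomInt(1, 100) <= rate` makes a rate of 100 fire on every request.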
