Signed-off-by: Sergey Marunich <[email protected]>
KV cache and tokenization related configuration (#125)
Signed-off-by: Ira <[email protected]>
Publish kv-cache events (#126)
* Publish kv-cache events
Signed-off-by: Ira <[email protected]>
* Fix lint errors
Signed-off-by: Ira <[email protected]>
* Review fixes
Signed-off-by: Ira <[email protected]>
* Sleep to allow previous sub to close
Signed-off-by: Ira <[email protected]>
---------
Signed-off-by: Ira <[email protected]>
Signed-off-by: Sergey Marunich <[email protected]>
Use same version of tokenizer in both Dockerfile and Makefile (#132)
* - Use same version of tokenizer in both Dockerfile and Makefile
- Fixes in readme file
Signed-off-by: Maya Barnea <[email protected]>
* Updates according to PR review
Signed-off-by: Maya Barnea <[email protected]>
---------
Signed-off-by: Maya Barnea <[email protected]>
Signed-off-by: Sergey Marunich <[email protected]>
Replace param.NewOpt with openai.Int for MaxTokens, and openai.Bool with param.NewOpt for IncludeUsage, in simulator_test.go to align with updated API usage.
Signed-off-by: Sergey Marunich <[email protected]>
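The optional-parameter helpers swapped in that change follow a common value-plus-presence-flag pattern. Below is a toy generic sketch of the pattern only; this `Opt` type and these constructors are illustrative, not the openai-go SDK's actual definitions:

```go
package main

import "fmt"

// Opt wraps a value together with a flag recording whether it was set,
// so an unset option can be distinguished from a zero value.
type Opt[T any] struct {
	Value T
	Set   bool
}

// NewOpt marks any value as explicitly set.
func NewOpt[T any](v T) Opt[T] { return Opt[T]{Value: v, Set: true} }

// Int and Bool mirror the idea of typed convenience constructors.
func Int(v int64) Opt[int64] { return NewOpt(v) }
func Bool(v bool) Opt[bool]  { return NewOpt(v) }

func main() {
	maxTokens := Int(128)
	includeUsage := Bool(true)
	fmt.Println(maxTokens.Set, maxTokens.Value, includeUsage.Value)
}
```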
README.md: 1 addition, 4 deletions
@@ -33,8 +33,6 @@ The simulator supports two modes of operation:
-`echo` mode: the response contains the same text that was received in the request. For `/v1/chat/completions` the last message for the role=`user` is used.
-`random` mode: the response is randomly chosen from a set of pre-defined sentences.

Additionally, the simulator can inject OpenAI API compatible error responses for testing error handling using the `failure-injection-rate` parameter.

Timing of the response is defined by the `time-to-first-token` and `inter-token-latency` parameters. In case P/D is enabled for a request, `kv-cache-transfer-latency` will be used instead of `time-to-first-token`.

For a request with `stream=true`: `time-to-first-token` or `kv-cache-transfer-latency` defines the delay before the first token is returned, `inter-token-latency` defines the delay between subsequent tokens in the stream.
@@ -118,14 +116,13 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
-`min-tool-call-array-param-length`: the minimum possible length of array parameters in a tool call, optional, defaults to 1
-`tool-call-not-required-param-probability`: the probability of adding a parameter that is not required in a tool call, optional, defaults to 50
-`object-tool-call-not-required-field-probability`: the probability of adding a field that is not required in an object in a tool call, optional, defaults to 50

<!--
- `enable-kvcache`: if true, KV cache support will be enabled in the simulator. In this case the KV cache will be simulated, and ZMQ events will be published when a KV cache block is added or evicted.
- `kv-cache-size`: the maximum number of token blocks in the KV cache
- `block-size`: token block size for contiguous chunks of tokens, possible values: 8, 16, 32, 64, 128
- `tokenizers-cache-dir`: the directory for caching tokenizers
- `hash-seed`: seed for hash generation (if not set, it is read from the PYTHONHASHSEED environment variable)
- `zmq-endpoint`: ZMQ address to publish events
- `event-batch-size`: the maximum number of kv-cache events to be sent together, defaults to 16
-->

-`failure-injection-rate`: probability (0-100) of injecting failures, optional, default is 0
-`failure-types`: list of specific failure types to inject (rate_limit, invalid_api_key, context_length, server_error, invalid_request, model_not_found), optional; if empty, all types are used
In addition, as we are using klog, the following parameters are available:
```go
f.IntVar(&config.MaxCPULoras, "max-cpu-loras", config.MaxCPULoras, "Maximum number of LoRAs to store in CPU memory")
f.IntVar(&config.MaxModelLen, "max-model-len", config.MaxModelLen, "Model's context window, maximum number of tokens in a single request including input and output")
f.StringVar(&config.Mode, "mode", config.Mode, "Simulator mode, echo - returns the same text that was sent in the request, for chat completion returns the last message, random - returns random sentence from a bank of pre-defined sentences")
f.IntVar(&config.InterTokenLatency, "inter-token-latency", config.InterTokenLatency, "Time to generate one token (in milliseconds)")
f.IntVar(&config.TimeToFirstToken, "time-to-first-token", config.TimeToFirstToken, "Time to first token (in milliseconds)")
f.IntVar(&config.KVCacheTransferLatency, "kv-cache-transfer-latency", config.KVCacheTransferLatency, "Time for KV-cache transfer from a remote vLLM (in milliseconds)")

f.Var(&dummyFailureTypes, "failure-types", "List of specific failure types to inject (rate_limit, invalid_api_key, context_length, server_error, invalid_request, model_not_found)")
f.Lookup("failure-types").NoOptDefVal = "dummy"

// These values were manually parsed above in getParamValueFromArgs, we leave this in order to get these flags in --help
```
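The `f.Var(&dummyFailureTypes, ...)` registration above relies on a custom flag value type. A minimal sketch of such a comma-separated list value is shown below; the real `dummyFailureTypes` implementation may differ:

```go
package main

import (
	"fmt"
	"strings"
)

// failureTypesValue is a toy flag.Value-style holder for a comma-separated
// --failure-types flag. Set may be called repeatedly for repeated flags.
type failureTypesValue []string

// String renders the accumulated values, as required by the flag.Value interface.
func (f *failureTypesValue) String() string { return strings.Join(*f, ",") }

// Set parses one flag occurrence, splitting on commas and trimming whitespace.
func (f *failureTypesValue) Set(s string) error {
	for _, t := range strings.Split(s, ",") {
		if t = strings.TrimSpace(t); t != "" {
			*f = append(*f, t)
		}
	}
	return nil
}

func main() {
	var v failureTypesValue
	_ = v.Set("rate_limit, server_error")
	fmt.Println(v.String()) // rate_limit,server_error
}
```

With spf13/pflag-style flags, setting `NoOptDefVal` (as the code above does) lets the flag be passed with no argument, in which case `Set` receives that default string instead.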