
Commit 106e276

irar2 authored and smarunich committed

re-base the changes

Signed-off-by: Sergey Marunich <[email protected]>

KV cache and tokenization related configuration (#125)

Signed-off-by: Ira <[email protected]>

Publish kv-cache events (#126)

* Publish kv-cache events
* Fix lint errors
* Review fixes
* Sleep to allow previous sub to close

Signed-off-by: Ira <[email protected]>
Signed-off-by: Sergey Marunich <[email protected]>

Use same version of tokenizer in both Dockerfile and Makefile (#132)

* Use same version of tokenizer in both Dockerfile and Makefile; fixes in readme file
* Updates according to the PR's review

Signed-off-by: Maya Barnea <[email protected]>
Signed-off-by: Sergey Marunich <[email protected]>

Replaces usage of openai.Int with param.NewOpt for MaxTokens and openai.Bool with param.NewOpt for IncludeUsage in simulator_test.go to align with updated API usage.

Signed-off-by: Sergey Marunich <[email protected]>
1 parent 9dbb689 · commit 106e276

File tree

3 files changed: +14 -61 lines changed


README.md

Lines changed: 1 addition & 4 deletions
@@ -33,8 +33,6 @@ The simulator supports two modes of operation:
 - `echo` mode: the response contains the same text that was received in the request. For `/v1/chat/completions` the last message for the role=`user` is used.
 - `random` mode: the response is randomly chosen from a set of pre-defined sentences.
 
-Additionally, the simulator can inject OpenAI API compatible error responses for testing error handling using the `failure-injection-rate` parameter.
-
 Timing of the response is defined by the `time-to-first-token` and `inter-token-latency` parameters. In case P/D is enabled for a request, `kv-cache-transfer-latency` will be used instead of `time-to-first-token`.
 
 For a request with `stream=true`: `time-to-first-token` or `kv-cache-transfer-latency` defines the delay before the first token is returned, `inter-token-latency` defines the delay between subsequent tokens in the stream.
@@ -118,14 +116,13 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
 - `min-tool-call-array-param-length`: the minimum possible length of array parameters in a tool call, optional, defaults to 1
 - `tool-call-not-required-param-probability`: the probability to add a parameter, that is not required, in a tool call, optional, defaults to 50
 - `object-tool-call-not-required-field-probability`: the probability to add a field, that is not required, in an object in a tool call, optional, defaults to 50
+<!--
 - `enable-kvcache`: if true, the KV cache support will be enabled in the simulator. In this case, the KV cache will be simulated, and ZMQ events will be published when a KV cache block is added or evicted.
 - `kv-cache-size`: the maximum number of token blocks in kv cache
 - `block-size`: token block size for contiguous chunks of tokens, possible values: 8,16,32,64,128
 - `tokenizers-cache-dir`: the directory for caching tokenizers
 - `hash-seed`: seed for hash generation (if not set, is read from PYTHONHASHSEED environment variable)
 - `zmq-endpoint`: ZMQ address to publish events
-- `failure-injection-rate`: probability (0-100) of injecting failures, optional, default is 0
-- `failure-types`: list of specific failure types to inject (rate_limit, invalid_api_key, context_length, server_error, invalid_request, model_not_found), optional, if empty all types are used
 - `event-batch-size`: the maximum number of kv-cache events to be sent together, defaults to 16
 -->
 In addition, as we are using klog, the following parameters are available:
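Editor's note: the timing semantics the README describes (one delay before the first token, then a fixed delay between tokens) can be pictured with a small standalone Go sketch. This is an illustration of the documented behavior, not the simulator's actual code, and all names in it are hypothetical:

package main

import (
	"fmt"
	"time"
)

// streamTokens applies the documented timing: wait time-to-first-token
// (or kv-cache-transfer-latency when P/D is enabled) before the first
// token, then inter-token-latency before each subsequent token.
func streamTokens(tokens []string, ttft, interToken time.Duration, send func(string)) {
	for i, tok := range tokens {
		if i == 0 {
			time.Sleep(ttft)
		} else {
			time.Sleep(interToken)
		}
		send(tok)
	}
}

func main() {
	streamTokens([]string{"Hello", ",", " world"},
		200*time.Millisecond, // time-to-first-token
		50*time.Millisecond,  // inter-token-latency
		func(tok string) { fmt.Print(tok) })
	fmt.Println()
}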

pkg/common/config.go

Lines changed: 5 additions & 49 deletions
@@ -34,14 +34,6 @@ const (
 	vLLMDefaultPort = 8000
 	ModeRandom      = "random"
 	ModeEcho        = "echo"
-
-	// Failure type constants
-	FailureTypeRateLimit      = "rate_limit"
-	FailureTypeInvalidAPIKey  = "invalid_api_key"
-	FailureTypeContextLength  = "context_length"
-	FailureTypeServerError    = "server_error"
-	FailureTypeInvalidRequest = "invalid_request"
-	FailureTypeModelNotFound  = "model_not_found"
 )
 
 type Configuration struct {
@@ -135,11 +127,6 @@ type Configuration struct {
 	ZMQEndpoint string `yaml:"zmq-endpoint"`
 	// EventBatchSize is the maximum number of kv-cache events to be sent together, defaults to 16
 	EventBatchSize int `yaml:"event-batch-size"`
-
-	// FailureInjectionRate is the probability (0-100) of injecting failures
-	FailureInjectionRate int `yaml:"failure-injection-rate"`
-	// FailureTypes is a list of specific failure types to inject (empty means all types)
-	FailureTypes []string `yaml:"failure-types"`
 }
 
 type LoraModule struct {
@@ -195,12 +182,10 @@ func newConfig() *Configuration {
 		MinToolCallArrayParamLength:         1,
 		ToolCallNotRequiredParamProbability: 50,
 		ObjectToolCallNotRequiredParamProbability: 50,
-		KVCacheSize:          1024,
-		TokenBlockSize:       16,
-		ZMQEndpoint:          "tcp://localhost:5557",
-		EventBatchSize:       16,
-		FailureInjectionRate: 0,
-		FailureTypes:         []string{},
+		KVCacheSize:    1024,
+		TokenBlockSize: 16,
+		ZMQEndpoint:    "tcp://localhost:5557",
+		EventBatchSize: 16,
 	}
 }
 
@@ -314,25 +299,6 @@ func (c *Configuration) validate() error {
 	if c.EventBatchSize < 1 {
 		return errors.New("event batch size cannot less than 1")
 	}
-
-	if c.FailureInjectionRate < 0 || c.FailureInjectionRate > 100 {
-		return errors.New("failure injection rate should be between 0 and 100")
-	}
-
-	validFailureTypes := map[string]bool{
-		FailureTypeRateLimit:      true,
-		FailureTypeInvalidAPIKey:  true,
-		FailureTypeContextLength:  true,
-		FailureTypeServerError:    true,
-		FailureTypeInvalidRequest: true,
-		FailureTypeModelNotFound:  true,
-	}
-	for _, failureType := range c.FailureTypes {
-		if !validFailureTypes[failureType] {
-			return fmt.Errorf("invalid failure type '%s', valid types are: rate_limit, invalid_api_key, context_length, server_error, invalid_request, model_not_found", failureType)
-		}
-	}
-
 	return nil
 }
 
@@ -360,7 +326,7 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
 	f.IntVar(&config.MaxCPULoras, "max-cpu-loras", config.MaxCPULoras, "Maximum number of LoRAs to store in CPU memory")
 	f.IntVar(&config.MaxModelLen, "max-model-len", config.MaxModelLen, "Model's context window, maximum number of tokens in a single request including input and output")
 
-	f.StringVar(&config.Mode, "mode", config.Mode, "Simulator mode: echo - returns the same text that was sent in the request, for chat completion returns the last message; random - returns random sentence from a bank of pre-defined sentences")
+	f.StringVar(&config.Mode, "mode", config.Mode, "Simulator mode, echo - returns the same text that was sent in the request, for chat completion returns the last message, random - returns random sentence from a bank of pre-defined sentences")
 	f.IntVar(&config.InterTokenLatency, "inter-token-latency", config.InterTokenLatency, "Time to generate one token (in milliseconds)")
 	f.IntVar(&config.TimeToFirstToken, "time-to-first-token", config.TimeToFirstToken, "Time to first token (in milliseconds)")
 	f.IntVar(&config.KVCacheTransferLatency, "kv-cache-transfer-latency", config.KVCacheTransferLatency, "Time for KV-cache transfer from a remote vLLM (in milliseconds)")
@@ -385,13 +351,6 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
 	f.StringVar(&config.HashSeed, "hash-seed", config.HashSeed, "Seed for hash generation (if not set, is read from PYTHONHASHSEED environment variable)")
 	f.StringVar(&config.ZMQEndpoint, "zmq-endpoint", config.ZMQEndpoint, "ZMQ address to publish events")
 	f.IntVar(&config.EventBatchSize, "event-batch-size", config.EventBatchSize, "Maximum number of kv-cache events to be sent together")
-
-	f.IntVar(&config.FailureInjectionRate, "failure-injection-rate", config.FailureInjectionRate, "Probability (0-100) of injecting failures")
-
-	failureTypes := getParamValueFromArgs("failure-types")
-	var dummyFailureTypes multiString
-	f.Var(&dummyFailureTypes, "failure-types", "List of specific failure types to inject (rate_limit, invalid_api_key, context_length, server_error, invalid_request, model_not_found)")
-	f.Lookup("failure-types").NoOptDefVal = "dummy"
 
 	// These values were manually parsed above in getParamValueFromArgs, we leave this in order to get these flags in --help
 	var dummyString string
@@ -425,9 +384,6 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
 	if servedModelNames != nil {
 		config.ServedModelNames = servedModelNames
 	}
-	if failureTypes != nil {
-		config.FailureTypes = failureTypes
-	}
 
 	if config.HashSeed == "" {
 		hashSeed := os.Getenv("PYTHONHASHSEED")
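Editor's note: the flag registrations removed above follow the pattern used throughout ParseCommandParamsAndLoadConfig: defaults come from newConfig(), and each flag writes back into the matching struct field, so unset flags keep their defaults. A minimal self-contained sketch of that pattern using the standard library flag package (the field and flag names here are illustrative, not the simulator's full set):

package main

import (
	"flag"
	"fmt"
	"os"
)

// Config mimics the pattern above: construct defaults first,
// then let flags overwrite the same fields.
type Config struct {
	Mode           string
	EventBatchSize int
}

func newConfig() *Config {
	return &Config{Mode: "random", EventBatchSize: 16}
}

func main() {
	config := newConfig()
	f := flag.NewFlagSet("sim", flag.ExitOnError)
	// Passing the current field value as the flag default means a
	// flag that is not supplied leaves the struct default intact.
	f.StringVar(&config.Mode, "mode", config.Mode, "Simulator mode: echo or random")
	f.IntVar(&config.EventBatchSize, "event-batch-size", config.EventBatchSize,
		"Maximum number of kv-cache events to be sent together")
	_ = f.Parse(os.Args[1:]) // ExitOnError: exits on a bad flag
	fmt.Printf("%+v\n", *config)
}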

pkg/llm-d-inference-sim/simulator_test.go

Lines changed: 8 additions & 8 deletions
@@ -31,6 +31,7 @@ import (
 	. "github.com/onsi/gomega"
 	"github.com/openai/openai-go"
 	"github.com/openai/openai-go/option"
+	"github.com/openai/openai-go/packages/param"
 	"github.com/valyala/fasthttp/fasthttputil"
 	"k8s.io/klog/v2"
 )
@@ -43,8 +44,7 @@ const invalidMaxTokensErrMsg = "Max completion tokens and max tokens should be p
 var userMsgTokens int64
 
 func startServer(ctx context.Context, mode string) (*http.Client, error) {
-	// Disable failure injection for tests by default
-	return startServerWithArgs(ctx, mode, []string{"cmd", "--model", model, "--mode", mode, "--failure-injection-rate", "0"})
+	return startServerWithArgs(ctx, mode, nil)
 }
 
 func startServerWithArgs(ctx context.Context, mode string, args []string) (*http.Client, error) {
@@ -56,7 +56,7 @@ func startServerWithArgs(ctx context.Context, mode string, args []string) (*http.Client, error) {
 	if args != nil {
 		os.Args = args
 	} else {
-		os.Args = []string{"cmd", "--model", model, "--mode", mode, "--failure-injection-rate", "0"}
+		os.Args = []string{"cmd", "--model", model, "--mode", mode}
 	}
 	logger := klog.Background()
 
@@ -120,7 +120,7 @@ var _ = Describe("Simulator", func() {
 				openai.UserMessage(userMessage),
 			},
 			Model:         model,
-			StreamOptions: openai.ChatCompletionStreamOptionsParam{IncludeUsage: openai.Bool(true)},
+			StreamOptions: openai.ChatCompletionStreamOptionsParam{IncludeUsage: param.NewOpt(true)},
 		}
 		stream := openaiclient.Chat.Completions.NewStreaming(ctx, params)
 		defer func() {
@@ -183,7 +183,7 @@ var _ = Describe("Simulator", func() {
 				OfString: openai.String(userMessage),
 			},
 			Model:         openai.CompletionNewParamsModel(model),
-			StreamOptions: openai.ChatCompletionStreamOptionsParam{IncludeUsage: openai.Bool(true)},
+			StreamOptions: openai.ChatCompletionStreamOptionsParam{IncludeUsage: param.NewOpt(true)},
 		}
 		stream := openaiclient.Completions.NewStreaming(ctx, params)
 		defer func() {
@@ -246,11 +246,11 @@ var _ = Describe("Simulator", func() {
 			// if maxTokens and maxCompletionTokens are passsed
 			// maxCompletionTokens is used
 			if maxTokens != 0 {
-				params.MaxTokens = openai.Int(int64(maxTokens))
+				params.MaxTokens = param.NewOpt(int64(maxTokens))
 				numTokens = maxTokens
 			}
 			if maxCompletionTokens != 0 {
-				params.MaxCompletionTokens = openai.Int(int64(maxCompletionTokens))
+				params.MaxCompletionTokens = param.NewOpt(int64(maxCompletionTokens))
 				numTokens = maxCompletionTokens
 			}
 			resp, err := openaiclient.Chat.Completions.New(ctx, params)
@@ -329,7 +329,7 @@ var _ = Describe("Simulator", func() {
 			}
 			numTokens := 0
 			if maxTokens != 0 {
-				params.MaxTokens = openai.Int(int64(maxTokens))
+				params.MaxTokens = param.NewOpt(int64(maxTokens))
 				numTokens = maxTokens
 			}
 			resp, err := openaiclient.Completions.New(ctx, params)
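Editor's note: the test edits above replace the openai.Int and openai.Bool convenience constructors with the generic param.NewOpt from github.com/openai/openai-go/packages/param; both produce the same optional-value wrapper for request fields. A minimal sketch of the updated usage, assuming the openai-go version these tests import (the model name and token count are hypothetical):

package main

import (
	"fmt"

	"github.com/openai/openai-go"
	"github.com/openai/openai-go/packages/param"
)

func main() {
	// param.NewOpt wraps a value for an optional request field,
	// as openai.Int and openai.Bool did in the older test code.
	params := openai.ChatCompletionNewParams{
		Model:         "my-model", // hypothetical model name
		MaxTokens:     param.NewOpt(int64(25)),
		StreamOptions: openai.ChatCompletionStreamOptionsParam{IncludeUsage: param.NewOpt(true)},
	}
	fmt.Printf("MaxTokens: %+v, IncludeUsage: %+v\n",
		params.MaxTokens, params.StreamOptions.IncludeUsage)
}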
