Skip to content

Commit 9187884

Browse files
irar2 and smarunich
authored and committed
KV cache and tokenization related configuration (#125)
Signed-off-by: Ira <[email protected]>
1 parent 885c5ee commit 9187884

File tree

4 files changed

+7
-73
lines changed

4 files changed

+7
-73
lines changed

README.md

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,6 @@ The simulator supports two modes of operation:
3333
- `echo` mode: the response contains the same text that was received in the request. For `/v1/chat/completions` the last message for the role=`user` is used.
3434
- `random` mode: the response is randomly chosen from a set of pre-defined sentences.
3535

36-
Additionally, the simulator can inject OpenAI API compatible error responses for testing error handling using the `failure-injection-rate` parameter.
37-
38-
Additionally, the simulator can inject OpenAI API compatible error responses for testing error handling using the `failure-injection-rate` parameter.
39-
4036
Timing of the response is defined by the `time-to-first-token` and `inter-token-latency` parameters. In case P/D is enabled for a request, `kv-cache-transfer-latency` will be used instead of `time-to-first-token`.
4137

4238
For a request with `stream=true`: `time-to-first-token` or `kv-cache-transfer-latency` defines the delay before the first token is returned, `inter-token-latency` defines the delay between subsequent tokens in the stream.
@@ -126,8 +122,6 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
126122
- `tokenizers-cache-dir`: the directory for caching tokenizers
127123
- `hash-seed`: seed for hash generation (if not set, is read from PYTHONHASHSEED environment variable)
128124
- `zmq-endpoint`: ZMQ address to publish events
129-
- `failure-injection-rate`: probability (0-100) of injecting failures when in failure mode, optional, default is 10
130-
- `failure-types`: list of specific failure types to inject (rate_limit, invalid_api_key, context_length, server_error, invalid_request, model_not_found), optional, if empty all types are used
131125

132126
In addition, as we are using klog, the following parameters are available:
133127
- `add_dir_header`: if true, adds the file directory to the header of the log messages

pkg/common/config.go

Lines changed: 4 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -34,14 +34,6 @@ const (
3434
vLLMDefaultPort = 8000
3535
ModeRandom = "random"
3636
ModeEcho = "echo"
37-
38-
// Failure type constants
39-
FailureTypeRateLimit = "rate_limit"
40-
FailureTypeInvalidAPIKey = "invalid_api_key"
41-
FailureTypeContextLength = "context_length"
42-
FailureTypeServerError = "server_error"
43-
FailureTypeInvalidRequest = "invalid_request"
44-
FailureTypeModelNotFound = "model_not_found"
4537
)
4638

4739
type Configuration struct {
@@ -133,13 +125,6 @@ type Configuration struct {
133125

134126
// ZMQEndpoint is the ZMQ address to publish events, the default value is tcp://localhost:5557
135127
ZMQEndpoint string `yaml:"zmq-endpoint"`
136-
// EventBatchSize is the maximum number of kv-cache events to be sent together, defaults to 16
137-
EventBatchSize int `yaml:"event-batch-size"`
138-
139-
// FailureInjectionRate is the probability (0-100) of injecting failures when in failure mode
140-
FailureInjectionRate int `yaml:"failure-injection-rate"`
141-
// FailureTypes is a list of specific failure types to inject (empty means all types)
142-
FailureTypes []string `yaml:"failure-types"`
143128
}
144129

145130
type LoraModule struct {
@@ -195,12 +180,9 @@ func newConfig() *Configuration {
195180
MinToolCallArrayParamLength: 1,
196181
ToolCallNotRequiredParamProbability: 50,
197182
ObjectToolCallNotRequiredParamProbability: 50,
198-
KVCacheSize: 1024,
199-
TokenBlockSize: 16,
200-
ZMQEndpoint: "tcp://localhost:5557",
201-
EventBatchSize: 16,
202-
FailureInjectionRate: 10,
203-
FailureTypes: []string{},
183+
KVCacheSize: 1024,
184+
TokenBlockSize: 16,
185+
ZMQEndpoint: "tcp://localhost:5557",
204186
}
205187
}
206188

@@ -311,28 +293,6 @@ func (c *Configuration) validate() error {
311293
if c.KVCacheSize < 0 {
312294
return errors.New("KV cache size cannot be negative")
313295
}
314-
if c.EventBatchSize < 1 {
315-
return errors.New("event batch size cannot less than 1")
316-
}
317-
318-
if c.FailureInjectionRate < 0 || c.FailureInjectionRate > 100 {
319-
return errors.New("failure injection rate should be between 0 and 100")
320-
}
321-
322-
validFailureTypes := map[string]bool{
323-
FailureTypeRateLimit: true,
324-
FailureTypeInvalidAPIKey: true,
325-
FailureTypeContextLength: true,
326-
FailureTypeServerError: true,
327-
FailureTypeInvalidRequest: true,
328-
FailureTypeModelNotFound: true,
329-
}
330-
for _, failureType := range c.FailureTypes {
331-
if !validFailureTypes[failureType] {
332-
return fmt.Errorf("invalid failure type '%s', valid types are: rate_limit, invalid_api_key, context_length, server_error, invalid_request, model_not_found", failureType)
333-
}
334-
}
335-
336296
return nil
337297
}
338298

@@ -360,7 +320,7 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
360320
f.IntVar(&config.MaxCPULoras, "max-cpu-loras", config.MaxCPULoras, "Maximum number of LoRAs to store in CPU memory")
361321
f.IntVar(&config.MaxModelLen, "max-model-len", config.MaxModelLen, "Model's context window, maximum number of tokens in a single request including input and output")
362322

363-
f.StringVar(&config.Mode, "mode", config.Mode, "Simulator mode: echo - returns the same text that was sent in the request, for chat completion returns the last message; random - returns random sentence from a bank of pre-defined sentences")
323+
f.StringVar(&config.Mode, "mode", config.Mode, "Simulator mode, echo - returns the same text that was sent in the request, for chat completion returns the last message, random - returns random sentence from a bank of pre-defined sentences")
364324
f.IntVar(&config.InterTokenLatency, "inter-token-latency", config.InterTokenLatency, "Time to generate one token (in milliseconds)")
365325
f.IntVar(&config.TimeToFirstToken, "time-to-first-token", config.TimeToFirstToken, "Time to first token (in milliseconds)")
366326
f.IntVar(&config.KVCacheTransferLatency, "kv-cache-transfer-latency", config.KVCacheTransferLatency, "Time for KV-cache transfer from a remote vLLM (in milliseconds)")
@@ -384,14 +344,6 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
384344
f.StringVar(&config.TokenizersCacheDir, "tokenizers-cache-dir", config.TokenizersCacheDir, "Directory for caching tokenizers")
385345
f.StringVar(&config.HashSeed, "hash-seed", config.HashSeed, "Seed for hash generation (if not set, is read from PYTHONHASHSEED environment variable)")
386346
f.StringVar(&config.ZMQEndpoint, "zmq-endpoint", config.ZMQEndpoint, "ZMQ address to publish events")
387-
f.IntVar(&config.EventBatchSize, "event-batch-size", config.EventBatchSize, "Maximum number of kv-cache events to be sent together")
388-
389-
f.IntVar(&config.FailureInjectionRate, "failure-injection-rate", config.FailureInjectionRate, "Probability (0-100) of injecting failures when in failure mode")
390-
391-
failureTypes := getParamValueFromArgs("failure-types")
392-
var dummyFailureTypes multiString
393-
f.Var(&dummyFailureTypes, "failure-types", "List of specific failure types to inject (rate_limit, invalid_api_key, context_length, server_error, invalid_request, model_not_found)")
394-
f.Lookup("failure-types").NoOptDefVal = "dummy"
395347

396348
// These values were manually parsed above in getParamValueFromArgs, we leave this in order to get these flags in --help
397349
var dummyString string
@@ -425,9 +377,6 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
425377
if servedModelNames != nil {
426378
config.ServedModelNames = servedModelNames
427379
}
428-
if failureTypes != nil {
429-
config.FailureTypes = failureTypes
430-
}
431380

432381
if config.HashSeed == "" {
433382
hashSeed := os.Getenv("PYTHONHASHSEED")

pkg/common/config_test.go

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -103,13 +103,11 @@ var _ = Describe("Simulator configuration", func() {
103103
"{\"name\":\"lora3\",\"path\":\"/path/to/lora3\"}",
104104
"{\"name\":\"lora4\",\"path\":\"/path/to/lora4\"}",
105105
}
106-
c.EventBatchSize = 5
107106
test = testCase{
108107
name: "config file with command line args",
109108
args: []string{"cmd", "--model", model, "--config", "../../manifests/config.yaml", "--port", "8002",
110109
"--served-model-name", "alias1", "alias2", "--seed", "100",
111110
"--lora-modules", "{\"name\":\"lora3\",\"path\":\"/path/to/lora3\"}", "{\"name\":\"lora4\",\"path\":\"/path/to/lora4\"}",
112-
"--event-batch-size", "5",
113111
},
114112
expectedConfig: c,
115113
}
@@ -293,11 +291,6 @@ var _ = Describe("Simulator configuration", func() {
293291
args: []string{"cmd", "--block-size", "35",
294292
"--config", "../../manifests/config.yaml"},
295293
},
296-
{
297-
name: "invalid (negative) event-batch-size",
298-
args: []string{"cmd", "--event-batch-size", "-35",
299-
"--config", "../../manifests/config.yaml"},
300-
},
301294
}
302295

303296
for _, test := range invalidTests {

pkg/kv-cache/kv_cache.go

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -47,17 +47,15 @@ func NewKVCacheHelper(config *common.Configuration, logger logr.Logger) (*KVCach
4747
tokenizationConfig.TokenizersCacheDir = config.TokenizersCacheDir
4848
}
4949
tokenizer, err := tokenization.NewCachedHFTokenizer(tokenizationConfig.HFTokenizerConfig)
50+
5051
if err != nil {
5152
return nil, fmt.Errorf("failed to create tokenizer: %w", err)
5253
}
53-
blockCache, err := newBlockCache(config, logger)
54-
if err != nil {
55-
return nil, fmt.Errorf("failed to create block cache: %w", err)
56-
}
54+
5755
return &KVCacheHelper{
5856
tokenizer: tokenizer,
5957
tokensProcessor: tokensProcessor,
60-
blockCache: blockCache,
58+
blockCache: newBlockCache(config.KVCacheSize, logger),
6159
logger: logger,
6260
}, nil
6361
}

0 commit comments

Comments
 (0)