Skip to content

Commit e1e27ea

Browse files
authored
Support vllm:max_num_generation_tokens metrics (#250)
* Add initial support for the vllm:max_num_generation_tokens metric. Since we never return responses with more than one choice, the implementation is basic; once the 'n' request property is supported, it will need to change to report the real maximum. Added support in fake metrics. Tests added too.
  Signed-off-by: Maya Barnea <[email protected]>
* Update readme.
  Signed-off-by: Maya Barnea <[email protected]>
* Fix and extend the 'Simulator configuration' test: add the expected error message and fix the arguments in the invalid-configuration tests. Fix validation of the ttft and tpot fake definitions.
  Signed-off-by: Maya Barnea <[email protected]>
* Change zmq-max-connect-attempts to int to get a nicer error message, fix the invalid lora test in config, add missing comments.
  Signed-off-by: Maya Barnea <[email protected]>
* Fix typo.
  Signed-off-by: Maya Barnea <[email protected]>
* Fix typo.
  Signed-off-by: Maya Barnea <[email protected]>
* Fix compilation error.
  Signed-off-by: Maya Barnea <[email protected]>
---------
Signed-off-by: Maya Barnea <[email protected]>
1 parent a9dd5c5 commit e1e27ea

File tree

10 files changed

+168
-61
lines changed

10 files changed

+168
-61
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ In addition, it supports a subset of vLLM's Prometheus metrics. These metrics ar
3434
| vllm:time_to_first_token_seconds | Histogram of time to first token in seconds |
3535
| vllm:time_per_output_token_seconds | Histogram of time per output token in seconds |
3636
| vllm:request_generation_tokens | Number of generation tokens processed |
37+
| vllm:max_num_generation_tokens | Maximum number of requested generation tokens. Currently the same as `vllm:request_generation_tokens`, since only one choice is ever returned |
3738
| vllm:request_params_max_tokens | Histogram of the max_tokens request parameter |
3839
| vllm:request_prompt_tokens | Number of prefill tokens processed |
3940
| vllm:request_success_total | Count of successfully processed requests |
@@ -235,6 +236,7 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
235236
960.0, 1920.0, 7680.0, +Inf.
236237
- `request-prompt-tokens` - array of values for prompt-length buckets
237238
- `request-generation-tokens` - array of values for generation-length buckets
239+
- `request-max-generation-tokens` - array of values for max_num_generation_tokens buckets
238240
- `request-params-max-tokens` - array of values for max_tokens parameter buckets
239241
- `request-success-total` - number of successful requests per finish reason, key: finish-reason (stop, length, etc.).
240242
<br>

manifests/config_with_fake.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ fake-metrics:
1616
request-prompt-tokens: [ 10, 20, 30, 15 ]
1717
request-generation-tokens: [ 50, 60, 40 ]
1818
request-params-max-tokens: [ 128, 256, 512 ]
19+
request-max-generation-tokens: [0, 0, 10, 20]
1920
loras:
2021
- '{"running":"lora1,lora2","waiting":"lora3","timestamp":1257894567}'
2122
- '{"running":"lora1,lora3","waiting":"","timestamp":1257894569}'

pkg/common/config.go

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ type Configuration struct {
182182
// ZMQEndpoint is the ZMQ address to publish events, the default value is tcp://localhost:5557
183183
ZMQEndpoint string `yaml:"zmq-endpoint" json:"zmq-endpoint"`
184184
// ZMQMaxConnectAttempts defines the maximum number of retries when the ZMQ connection fails (cannot be more than 10)
185-
ZMQMaxConnectAttempts uint `yaml:"zmq-max-connect-attempts" json:"zmq-max-connect-attempts"`
185+
ZMQMaxConnectAttempts int `yaml:"zmq-max-connect-attempts" json:"zmq-max-connect-attempts"`
186186

187187
// EventBatchSize is the maximum number of kv-cache events to be sent together, defaults to 16
188188
EventBatchSize int `yaml:"event-batch-size" json:"event-batch-size"`
@@ -249,9 +249,10 @@ type Metrics struct {
249249
TPOTBucketValues []int `yaml:"tpot-buckets-values" json:"tpot-buckets-values"`
250250
// RequestPromptTokens, RequestGenerationTokens, RequestParamsMaxTokens and RequestMaxGenerationTokens are histogram fake-observation arrays for init.
251251
// Each value will be passed to Observe() once at start-up.
252-
RequestPromptTokens []int `yaml:"request-prompt-tokens" json:"request-prompt-tokens"` // prompt-length samples
253-
RequestGenerationTokens []int `yaml:"request-generation-tokens" json:"request-generation-tokens"` // generation-length samples
254-
RequestParamsMaxTokens []int `yaml:"request-params-max-tokens" json:"request-params-max-tokens"` // max_tokens parameter samples
252+
RequestPromptTokens []int `yaml:"request-prompt-tokens" json:"request-prompt-tokens"` // prompt-length samples
253+
RequestGenerationTokens []int `yaml:"request-generation-tokens" json:"request-generation-tokens"` // generation-length samples
254+
RequestParamsMaxTokens []int `yaml:"request-params-max-tokens" json:"request-params-max-tokens"` // max_tokens parameter samples
255+
RequestMaxGenerationTokens []int `yaml:"request-max-generation-tokens" json:"request-max-generation-tokens"` // request_max_num_generation_tokens samples
255256
// RequestSuccessTotal is the number of successful requests, key: finish-reason (stop, length, etc.).
256257
RequestSuccessTotal map[string]int64 `yaml:"request-success-total" json:"request-success-total"`
257258

@@ -544,19 +545,22 @@ func (c *Configuration) validate() error {
544545
if c.ZMQMaxConnectAttempts > 10 {
545546
return errors.New("zmq retries times cannot be more than 10")
546547
}
548+
if c.ZMQMaxConnectAttempts < 0 {
549+
return errors.New("zmq retries times cannot be negative")
550+
}
547551

548552
if c.FakeMetrics != nil {
549553
if c.FakeMetrics.RunningRequests < 0 || c.FakeMetrics.WaitingRequests < 0 {
550554
return errors.New("fake metrics request counters cannot be negative")
551555
}
552556
if c.FakeMetrics.KVCacheUsagePercentage < 0 || c.FakeMetrics.KVCacheUsagePercentage > 1 {
553-
return errors.New("fake metrics KV cache usage must be between 0 ans 1")
557+
return errors.New("fake metrics KV cache usage must be between 0 and 1")
554558
}
555559
if c.FakeMetrics.TTFTBucketValues != nil {
556560
if len(c.FakeMetrics.TTFTBucketValues) > len(TTFTBucketsBoundaries)+1 {
557561
return errors.New("fake time-to-first-token array is too long")
558562
}
559-
for v := range c.FakeMetrics.TTFTBucketValues {
563+
for _, v := range c.FakeMetrics.TTFTBucketValues {
560564
if v < 0 {
561565
return errors.New("time-to-first-token fake metrics should contain only non-negative values")
562566
}
@@ -566,7 +570,7 @@ func (c *Configuration) validate() error {
566570
if len(c.FakeMetrics.TPOTBucketValues) > len(TPOTBucketsBoundaries)+1 {
567571
return errors.New("fake time-per-output-token array is too long")
568572
}
569-
for v := range c.FakeMetrics.TPOTBucketValues {
573+
for _, v := range c.FakeMetrics.TPOTBucketValues {
570574
if v < 0 {
571575
return errors.New("time-per-output-token fake metrics should contain only non-negative values")
572576
}
@@ -604,10 +608,9 @@ func (c *Configuration) validate() error {
604608
return errors.New("fake metrics request-params-max-tokens cannot contain negative values")
605609
}
606610
}
607-
608-
for _, v := range c.FakeMetrics.RequestParamsMaxTokens {
611+
for _, v := range c.FakeMetrics.RequestMaxGenerationTokens {
609612
if v < 0 {
610-
return errors.New("fake metrics request-params-max-tokens cannot contain negative values")
613+
return errors.New("fake metrics request-max-generation-tokens cannot contain negative values")
611614
}
612615
}
613616

@@ -730,7 +733,7 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
730733
f.StringVar(&config.TokenizersCacheDir, "tokenizers-cache-dir", config.TokenizersCacheDir, "Directory for caching tokenizers")
731734
f.StringVar(&config.HashSeed, "hash-seed", config.HashSeed, "Seed for hash generation (if not set, is read from PYTHONHASHSEED environment variable)")
732735
f.StringVar(&config.ZMQEndpoint, "zmq-endpoint", config.ZMQEndpoint, "ZMQ address to publish events")
733-
f.UintVar(&config.ZMQMaxConnectAttempts, "zmq-max-connect-attempts", config.ZMQMaxConnectAttempts, "Maximum number of times to try ZMQ connect")
736+
f.IntVar(&config.ZMQMaxConnectAttempts, "zmq-max-connect-attempts", config.ZMQMaxConnectAttempts, "Maximum number of times to try ZMQ connect")
734737
f.IntVar(&config.EventBatchSize, "event-batch-size", config.EventBatchSize, "Maximum number of kv-cache events to be sent together")
735738
f.IntVar(&config.DPSize, "data-parallel-size", config.DPSize, "Number of ranks to run")
736739

0 commit comments

Comments
 (0)