Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ In addition, it supports a subset of vLLM's Prometheus metrics. These metrics ar
| vllm:time_to_first_token_seconds | Histogram of time to first token in seconds |
| vllm:time_per_output_token_seconds | Histogram of time per output token in seconds |
| vllm:request_generation_tokens | Number of generation tokens processed |
| vllm:max_num_generation_tokens | Maximum number of requested generation tokens. Currently the same as `vllm:request_generation_tokens`, because only one choice is ever returned |
| vllm:request_params_max_tokens | Histogram of the max_tokens request parameter |
| vllm:request_prompt_tokens | Number of prefill tokens processed |
| vllm:request_success_total | Count of successfully processed requests |
Expand Down Expand Up @@ -235,6 +236,7 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
960.0, 1920.0, 7680.0, +Inf.
- `request-prompt-tokens` - array of values for prompt-length buckets
- `request-generation-tokens` - array of values for generation-length buckets
- `request-max-generation-tokens` - array of values for max_num_generation_tokens buckets
- `request-params-max-tokens` - array of values for max_tokens parameter buckets
- `request-success-total` - number of successful requests per finish reason, key: finish-reason (stop, length, etc.).
<br>
Expand Down
1 change: 1 addition & 0 deletions manifests/config_with_fake.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ fake-metrics:
request-prompt-tokens: [ 10, 20, 30, 15 ]
request-generation-tokens: [ 50, 60, 40 ]
request-params-max-tokens: [ 128, 256, 512 ]
request-max-generation-tokens: [0, 0, 10, 20]
loras:
- '{"running":"lora1,lora2","waiting":"lora3","timestamp":1257894567}'
- '{"running":"lora1,lora3","waiting":"","timestamp":1257894569}'
Expand Down
25 changes: 14 additions & 11 deletions pkg/common/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ type Configuration struct {
// ZMQEndpoint is the ZMQ address to publish events, the default value is tcp://localhost:5557
ZMQEndpoint string `yaml:"zmq-endpoint" json:"zmq-endpoint"`
// ZMQMaxConnectAttempts defines the maximum number (10) of retries when ZMQ connection fails
ZMQMaxConnectAttempts uint `yaml:"zmq-max-connect-attempts" json:"zmq-max-connect-attempts"`
ZMQMaxConnectAttempts int `yaml:"zmq-max-connect-attempts" json:"zmq-max-connect-attempts"`

// EventBatchSize is the maximum number of kv-cache events to be sent together, defaults to 16
EventBatchSize int `yaml:"event-batch-size" json:"event-batch-size"`
Expand Down Expand Up @@ -249,9 +249,10 @@ type Metrics struct {
TPOTBucketValues []int `yaml:"tpot-buckets-values" json:"tpot-buckets-values"`
// RequestPromptTokens, RequestGenerationTokens, RequestParamsMaxTokens and RequestMaxGenerationTokens are histogram fake-observation arrays used at init.
// Each value will be passed to Observe() once at start-up.
RequestPromptTokens []int `yaml:"request-prompt-tokens" json:"request-prompt-tokens"` // prompt-length samples
RequestGenerationTokens []int `yaml:"request-generation-tokens" json:"request-generation-tokens"` // generation-length samples
RequestParamsMaxTokens []int `yaml:"request-params-max-tokens" json:"request-params-max-tokens"` // max_tokens parameter samples
RequestPromptTokens []int `yaml:"request-prompt-tokens" json:"request-prompt-tokens"` // prompt-length samples
RequestGenerationTokens []int `yaml:"request-generation-tokens" json:"request-generation-tokens"` // generation-length samples
RequestParamsMaxTokens []int `yaml:"request-params-max-tokens" json:"request-params-max-tokens"` // max_tokens parameter samples
RequestMaxGenerationTokens []int `yaml:"request-max-generation-tokens" json:"request-max-generation-tokens"` // request_max_num_generation_tokens samples
// RequestSuccessTotal is the number of successful requests, key: finish-reason (stop, length, etc.).
RequestSuccessTotal map[string]int64 `yaml:"request-success-total" json:"request-success-total"`

Expand Down Expand Up @@ -544,19 +545,22 @@ func (c *Configuration) validate() error {
if c.ZMQMaxConnectAttempts > 10 {
return errors.New("zmq retries times cannot be more than 10")
}
if c.ZMQMaxConnectAttempts < 0 {
return errors.New("zmq retries times cannot be negative")
}

if c.FakeMetrics != nil {
if c.FakeMetrics.RunningRequests < 0 || c.FakeMetrics.WaitingRequests < 0 {
return errors.New("fake metrics request counters cannot be negative")
}
if c.FakeMetrics.KVCacheUsagePercentage < 0 || c.FakeMetrics.KVCacheUsagePercentage > 1 {
return errors.New("fake metrics KV cache usage must be between 0 ans 1")
return errors.New("fake metrics KV cache usage must be between 0 and 1")
}
if c.FakeMetrics.TTFTBucketValues != nil {
if len(c.FakeMetrics.TTFTBucketValues) > len(TTFTBucketsBoundaries)+1 {
return errors.New("fake time-to-first-token array is too long")
}
for v := range c.FakeMetrics.TTFTBucketValues {
for _, v := range c.FakeMetrics.TTFTBucketValues {
if v < 0 {
return errors.New("time-to-first-token fake metrics should contain only non-negative values")
}
Expand All @@ -566,7 +570,7 @@ func (c *Configuration) validate() error {
if len(c.FakeMetrics.TPOTBucketValues) > len(TPOTBucketsBoundaries)+1 {
return errors.New("fake time-per-output-token array is too long")
}
for v := range c.FakeMetrics.TPOTBucketValues {
for _, v := range c.FakeMetrics.TPOTBucketValues {
if v < 0 {
return errors.New("time-per-output-token fake metrics should contain only non-negative values")
}
Expand Down Expand Up @@ -604,10 +608,9 @@ func (c *Configuration) validate() error {
return errors.New("fake metrics request-params-max-tokens cannot contain negative values")
}
}

for _, v := range c.FakeMetrics.RequestParamsMaxTokens {
for _, v := range c.FakeMetrics.RequestMaxGenerationTokens {
if v < 0 {
return errors.New("fake metrics request-params-max-tokens cannot contain negative values")
return errors.New("fake metrics request-max-generation-tokens cannot contain negative values")
}
}

Expand Down Expand Up @@ -730,7 +733,7 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
f.StringVar(&config.TokenizersCacheDir, "tokenizers-cache-dir", config.TokenizersCacheDir, "Directory for caching tokenizers")
f.StringVar(&config.HashSeed, "hash-seed", config.HashSeed, "Seed for hash generation (if not set, is read from PYTHONHASHSEED environment variable)")
f.StringVar(&config.ZMQEndpoint, "zmq-endpoint", config.ZMQEndpoint, "ZMQ address to publish events")
f.UintVar(&config.ZMQMaxConnectAttempts, "zmq-max-connect-attempts", config.ZMQMaxConnectAttempts, "Maximum number of times to try ZMQ connect")
f.IntVar(&config.ZMQMaxConnectAttempts, "zmq-max-connect-attempts", config.ZMQMaxConnectAttempts, "Maximum number of times to try ZMQ connect")
f.IntVar(&config.EventBatchSize, "event-batch-size", config.EventBatchSize, "Maximum number of kv-cache events to be sent together")
f.IntVar(&config.DPSize, "data-parallel-size", config.DPSize, "Number of ranks to run")

Expand Down
Loading
Loading