llm-d
diff --git a/‎README.md‎
Lines changed: 2 additions & 0 deletions b/‎README.md‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎manifests/config_with_fake.yaml‎
Lines changed: 1 addition & 0 deletions b/‎manifests/config_with_fake.yaml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎pkg/common/config.go‎
Lines changed: 14 additions & 11 deletions b/‎pkg/common/config.go‎
Lines changed: 14 additions & 11 deletions
@@ -34,6 +34,7 @@ In addition, it supports a subset of vLLM's Prometheus metrics. These metrics ar
 | vllm:time_to_first_token_seconds | Histogram of time to first token in seconds |
 | vllm:time_per_output_token_seconds | Histogram of time per output token in seconds |
 | vllm:request_generation_tokens | Number of generation tokens processed |
+| vllm:max_num_generation_tokens | Maximum number of requested generation tokens. Currently the same as `vllm:request_generation_tokens`, because only one choice is ever returned |
 | vllm:request_params_max_tokens | Histogram of the max_tokens request parameter | 
 | vllm:request_prompt_tokens | Number of prefill tokens processed |
 | vllm:request_success_total | Count of successfully processed requests |
@@ -235,6 +236,7 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
     960.0, 1920.0, 7680.0, +Inf.
     - `request-prompt-tokens` - array of values for prompt-length buckets
     - `request-generation-tokens` - array of values for generation-length buckets
+    - `request-max-generation-tokens` - array of values for max_num_generation_tokens buckets
     - `request-params-max-tokens` - array of values for max_tokens parameter buckets
     - `request-success-total` - number of successful requests per finish reason, key: finish-reason (stop, length, etc.).
     <br>
 
@@ -16,6 +16,7 @@ fake-metrics:
   request-prompt-tokens: [ 10, 20, 30, 15 ]
   request-generation-tokens: [ 50, 60, 40 ]
   request-params-max-tokens: [ 128, 256, 512 ]
+  request-max-generation-tokens: [0, 0, 10, 20]
   loras:
   - '{"running":"lora1,lora2","waiting":"lora3","timestamp":1257894567}'
   - '{"running":"lora1,lora3","waiting":"","timestamp":1257894569}'
 
@@ -182,7 +182,7 @@ type Configuration struct {
 	// ZMQEndpoint is the ZMQ address to publish events, the default value is tcp://localhost:5557
 	ZMQEndpoint string `yaml:"zmq-endpoint" json:"zmq-endpoint"`
 	// ZMQMaxConnectAttempts defines the maximum number (10) of retries when ZMQ connection fails
-	ZMQMaxConnectAttempts uint `yaml:"zmq-max-connect-attempts" json:"zmq-max-connect-attempts"`
+	ZMQMaxConnectAttempts int `yaml:"zmq-max-connect-attempts" json:"zmq-max-connect-attempts"`
 
 	// EventBatchSize is the maximum number of kv-cache events to be sent together, defaults to 16
 	EventBatchSize int `yaml:"event-batch-size" json:"event-batch-size"`
@@ -249,9 +249,10 @@ type Metrics struct {
 	TPOTBucketValues []int `yaml:"tpot-buckets-values" json:"tpot-buckets-values"`
 	// RequestPromptTokens, RequestGenerationTokens, RequestParamsMaxTokens and RequestMaxGenerationTokens are histogram fake-observation arrays for init.
 	// Each value will be passed to Observe() once at start-up.
-	RequestPromptTokens     []int `yaml:"request-prompt-tokens" json:"request-prompt-tokens"`         // prompt-length samples
-	RequestGenerationTokens []int `yaml:"request-generation-tokens" json:"request-generation-tokens"` // generation-length samples
-	RequestParamsMaxTokens  []int `yaml:"request-params-max-tokens" json:"request-params-max-tokens"` // max_tokens parameter samples
+	RequestPromptTokens        []int `yaml:"request-prompt-tokens" json:"request-prompt-tokens"`                 // prompt-length samples
+	RequestGenerationTokens    []int `yaml:"request-generation-tokens" json:"request-generation-tokens"`         // generation-length samples
+	RequestParamsMaxTokens     []int `yaml:"request-params-max-tokens" json:"request-params-max-tokens"`         // max_tokens parameter samples
+	RequestMaxGenerationTokens []int `yaml:"request-max-generation-tokens" json:"request-max-generation-tokens"` // request_max_num_generation_tokens samples
 	// RequestSuccessTotal is the number of successful requests, key: finish-reason (stop, length, etc.).
 	RequestSuccessTotal map[string]int64 `yaml:"request-success-total" json:"request-success-total"`
 
@@ -544,19 +545,22 @@ func (c *Configuration) validate() error {
 	if c.ZMQMaxConnectAttempts > 10 {
 		return errors.New("zmq retries times cannot be more than 10")
 	}
+	if c.ZMQMaxConnectAttempts < 0 {
+		return errors.New("zmq retries times cannot be negative")
+	}
 
 	if c.FakeMetrics != nil {
 		if c.FakeMetrics.RunningRequests < 0 || c.FakeMetrics.WaitingRequests < 0 {
 			return errors.New("fake metrics request counters cannot be negative")
 		}
 		if c.FakeMetrics.KVCacheUsagePercentage < 0 || c.FakeMetrics.KVCacheUsagePercentage > 1 {
-			return errors.New("fake metrics KV cache usage must be between 0 ans 1")
+			return errors.New("fake metrics KV cache usage must be between 0 and 1")
 		}
 		if c.FakeMetrics.TTFTBucketValues != nil {
 			if len(c.FakeMetrics.TTFTBucketValues) > len(TTFTBucketsBoundaries)+1 {
 				return errors.New("fake time-to-first-token array is too long")
 			}
-			for v := range c.FakeMetrics.TTFTBucketValues {
+			for _, v := range c.FakeMetrics.TTFTBucketValues {
 				if v < 0 {
 					return errors.New("time-to-first-token fake metrics should contain only non-negative values")
 				}
@@ -566,7 +570,7 @@ func (c *Configuration) validate() error {
 			if len(c.FakeMetrics.TPOTBucketValues) > len(TPOTBucketsBoundaries)+1 {
 				return errors.New("fake time-per-output-token array is too long")
 			}
-			for v := range c.FakeMetrics.TPOTBucketValues {
+			for _, v := range c.FakeMetrics.TPOTBucketValues {
 				if v < 0 {
 					return errors.New("time-per-output-token fake metrics should contain only non-negative values")
 				}
@@ -604,10 +608,9 @@ func (c *Configuration) validate() error {
 				return errors.New("fake metrics request-params-max-tokens cannot contain negative values")
 			}
 		}
-
-		for _, v := range c.FakeMetrics.RequestParamsMaxTokens {
+		for _, v := range c.FakeMetrics.RequestMaxGenerationTokens {
 			if v < 0 {
-				return errors.New("fake metrics request-params-max-tokens cannot contain negative values")
+				return errors.New("fake metrics request-max-generation-tokens cannot contain negative values")
 			}
 		}
 
@@ -730,7 +733,7 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
 	f.StringVar(&config.TokenizersCacheDir, "tokenizers-cache-dir", config.TokenizersCacheDir, "Directory for caching tokenizers")
 	f.StringVar(&config.HashSeed, "hash-seed", config.HashSeed, "Seed for hash generation (if not set, is read from PYTHONHASHSEED environment variable)")
 	f.StringVar(&config.ZMQEndpoint, "zmq-endpoint", config.ZMQEndpoint, "ZMQ address to publish events")
-	f.UintVar(&config.ZMQMaxConnectAttempts, "zmq-max-connect-attempts", config.ZMQMaxConnectAttempts, "Maximum number of times to try ZMQ connect")
+	f.IntVar(&config.ZMQMaxConnectAttempts, "zmq-max-connect-attempts", config.ZMQMaxConnectAttempts, "Maximum number of times to try ZMQ connect")
 	f.IntVar(&config.EventBatchSize, "event-batch-size", config.EventBatchSize, "Maximum number of kv-cache events to be sent together")
 	f.IntVar(&config.DPSize, "data-parallel-size", config.DPSize, "Number of ranks to run")