diff --git a/README.md b/README.md index f57f148b..b162c7b9 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,7 @@ In addition, it supports a subset of vLLM's Prometheus metrics. These metrics ar | vllm:time_to_first_token_seconds | Histogram of time to first token in seconds | | vllm:time_per_output_token_seconds | Histogram of time per output token in seconds | | vllm:request_generation_tokens | Number of generation tokens processed | +| vllm:max_num_generation_tokens | Maximum number of requested generation tokens. Currently same as `vllm:request_generation_tokens` since always only one choice is returned | | vllm:request_params_max_tokens | Histogram of the max_tokens request parameter | | vllm:request_prompt_tokens | Number of prefill tokens processed | | vllm:request_success_total | Count of successfully processed requests | @@ -235,6 +236,7 @@ For more details see the 10 { return errors.New("zmq retries times cannot be more than 10") } + if c.ZMQMaxConnectAttempts < 0 { + return errors.New("zmq retries times cannot be negative") + } if c.FakeMetrics != nil { if c.FakeMetrics.RunningRequests < 0 || c.FakeMetrics.WaitingRequests < 0 { return errors.New("fake metrics request counters cannot be negative") } if c.FakeMetrics.KVCacheUsagePercentage < 0 || c.FakeMetrics.KVCacheUsagePercentage > 1 { - return errors.New("fake metrics KV cache usage must be between 0 ans 1") + return errors.New("fake metrics KV cache usage must be between 0 and 1") } if c.FakeMetrics.TTFTBucketValues != nil { if len(c.FakeMetrics.TTFTBucketValues) > len(TTFTBucketsBoundaries)+1 { return errors.New("fake time-to-first-token array is too long") } - for v := range c.FakeMetrics.TTFTBucketValues { + for _, v := range c.FakeMetrics.TTFTBucketValues { if v < 0 { return errors.New("time-to-first-token fake metrics should contain only non-negative values") } @@ -566,7 +570,7 @@ func (c *Configuration) validate() error { if len(c.FakeMetrics.TPOTBucketValues) > len(TPOTBucketsBoundaries)+1 { 
return errors.New("fake time-per-output-token array is too long") } - for v := range c.FakeMetrics.TPOTBucketValues { + for _, v := range c.FakeMetrics.TPOTBucketValues { if v < 0 { return errors.New("time-per-output-token fake metrics should contain only non-negative values") } @@ -604,10 +608,9 @@ func (c *Configuration) validate() error { return errors.New("fake metrics request-params-max-tokens cannot contain negative values") } } - - for _, v := range c.FakeMetrics.RequestParamsMaxTokens { + for _, v := range c.FakeMetrics.RequestMaxGenerationTokens { if v < 0 { - return errors.New("fake metrics request-params-max-tokens cannot contain negative values") + return errors.New("fake metrics request-max-generation-tokens cannot contain negative values") } } @@ -730,7 +733,7 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) { f.StringVar(&config.TokenizersCacheDir, "tokenizers-cache-dir", config.TokenizersCacheDir, "Directory for caching tokenizers") f.StringVar(&config.HashSeed, "hash-seed", config.HashSeed, "Seed for hash generation (if not set, is read from PYTHONHASHSEED environment variable)") f.StringVar(&config.ZMQEndpoint, "zmq-endpoint", config.ZMQEndpoint, "ZMQ address to publish events") - f.UintVar(&config.ZMQMaxConnectAttempts, "zmq-max-connect-attempts", config.ZMQMaxConnectAttempts, "Maximum number of times to try ZMQ connect") + f.IntVar(&config.ZMQMaxConnectAttempts, "zmq-max-connect-attempts", config.ZMQMaxConnectAttempts, "Maximum number of times to try ZMQ connect") f.IntVar(&config.EventBatchSize, "event-batch-size", config.EventBatchSize, "Maximum number of kv-cache events to be sent together") f.IntVar(&config.DPSize, "data-parallel-size", config.DPSize, "Number of ranks to run") diff --git a/pkg/common/config_test.go b/pkg/common/config_test.go index 6f106e89..ac5a42e2 100644 --- a/pkg/common/config_test.go +++ b/pkg/common/config_test.go @@ -57,6 +57,7 @@ func createDefaultConfig(model string) *Configuration { type testCase 
struct { name string args []string + expectedError string expectedConfig *Configuration } @@ -203,11 +204,12 @@ var _ = Describe("Simulator configuration", func() { "{\"running\":\"lora1,lora2\",\"waiting\":\"lora3\",\"timestamp\":1257894567}", "{\"running\":\"lora1,lora3\",\"waiting\":\"\",\"timestamp\":1257894569}", }, - TTFTBucketValues: []int{10, 20, 30, 10}, - TPOTBucketValues: []int{0, 0, 10, 20, 30}, - RequestPromptTokens: []int{10, 20, 30, 15}, - RequestGenerationTokens: []int{50, 60, 40}, - RequestParamsMaxTokens: []int{128, 256, 512}, + TTFTBucketValues: []int{10, 20, 30, 10}, + TPOTBucketValues: []int{0, 0, 10, 20, 30}, + RequestPromptTokens: []int{10, 20, 30, 15}, + RequestGenerationTokens: []int{50, 60, 40}, + RequestParamsMaxTokens: []int{128, 256, 512}, + RequestMaxGenerationTokens: []int{0, 0, 10, 20}, RequestSuccessTotal: map[string]int64{ StopFinishReason: 20, LengthFinishReason: 0, @@ -283,204 +285,252 @@ var _ = Describe("Simulator configuration", func() { // Invalid configurations invalidTests := []testCase{ { - name: "invalid model", - args: []string{"cmd", "--model", "", "--config", "../../manifests/config.yaml"}, + name: "invalid model", + args: []string{"cmd", "--model", "", "--config", "../../manifests/config.yaml"}, + expectedError: "model parameter is empty", }, { - name: "invalid port", - args: []string{"cmd", "--port", "-50", "--config", "../../manifests/config.yaml"}, + name: "invalid port", + args: []string{"cmd", "--port", "-50", "--config", "../../manifests/config.yaml"}, + expectedError: "invalid port", }, { - name: "invalid max-loras", - args: []string{"cmd", "--max-loras", "15", "--config", "../../manifests/config.yaml"}, + name: "invalid max-loras", + args: []string{"cmd", "--max-loras", "15", "--config", "../../manifests/config.yaml"}, + expectedError: "max CPU LoRAs cannot be less than max LoRAs", }, { - name: "invalid mode", - args: []string{"cmd", "--mode", "hello", "--config", "../../manifests/config.yaml"}, + name: 
"invalid mode", + args: []string{"cmd", "--mode", "hello", "--config", "../../manifests/config.yaml"}, + expectedError: "invalid mode ", }, { name: "invalid lora", args: []string{"cmd", "--config", "../../manifests/config.yaml", - "--lora-modules", "[{\"path\":\"/path/to/lora15\"}]"}, + "--lora-modules", "{\"path\":\"/path/to/lora15\"}"}, + expectedError: "empty LoRA name", }, { - name: "invalid max-model-len", - args: []string{"cmd", "--max-model-len", "0", "--config", "../../manifests/config.yaml"}, + name: "invalid max-model-len", + args: []string{"cmd", "--max-model-len", "0", "--config", "../../manifests/config.yaml"}, + expectedError: "max model len cannot be less than 1", }, { - name: "invalid tool-call-not-required-param-probability", - args: []string{"cmd", "--tool-call-not-required-param-probability", "-10", "--config", "../../manifests/config.yaml"}, + name: "invalid tool-call-not-required-param-probability", + args: []string{"cmd", "--tool-call-not-required-param-probability", "-10", "--config", "../../manifests/config.yaml"}, + expectedError: "ToolCallNotRequiredParamProbability should be between 0 and 100", }, { name: "invalid max-tool-call-number-param", args: []string{"cmd", "--max-tool-call-number-param", "-10", "--min-tool-call-number-param", "0", "--config", "../../manifests/config.yaml"}, + expectedError: "MaxToolCallNumberParam cannot be less than MinToolCallNumberParam", }, { name: "invalid max-tool-call-integer-param", args: []string{"cmd", "--max-tool-call-integer-param", "-10", "--min-tool-call-integer-param", "0", "--config", "../../manifests/config.yaml"}, + expectedError: "MaxToolCallIntegerParam cannot be less than MinToolCallIntegerParam", }, { name: "invalid max-tool-call-array-param-length", args: []string{"cmd", "--max-tool-call-array-param-length", "-10", "--min-tool-call-array-param-length", "0", "--config", "../../manifests/config.yaml"}, + expectedError: "MaxToolCallArrayParamLength cannot be less than 
MinToolCallArrayParamLength", }, { name: "invalid tool-call-not-required-param-probability", args: []string{"cmd", "--tool-call-not-required-param-probability", "-10", "--config", "../../manifests/config.yaml"}, + expectedError: "ToolCallNotRequiredParamProbability should be between 0 and 100", }, { name: "invalid object-tool-call-not-required-field-probability", args: []string{"cmd", "--object-tool-call-not-required-field-probability", "1210", "--config", "../../manifests/config.yaml"}, + expectedError: "ObjectToolCallNotRequiredParamProbability should be between 0 and 100", }, { name: "invalid time-to-first-token-std-dev", args: []string{"cmd", "--time-to-first-token-std-dev", "3000", "--config", "../../manifests/config.yaml"}, + expectedError: "time to first token standard deviation cannot be more than 30%", }, { name: "invalid (negative) time-to-first-token-std-dev", args: []string{"cmd", "--time-to-first-token-std-dev", "10", "--time-to-first-token-std-dev", "-1", "--config", "../../manifests/config.yaml"}, + expectedError: "time to first token standard deviation cannot be negative", }, { name: "invalid inter-token-latency-std-dev", - args: []string{"cmd", "--inter-token-latency", " 1000", "--inter-token-latency-std-dev", "301", + args: []string{"cmd", "--inter-token-latency", "1000", "--inter-token-latency-std-dev", "301", "--config", "../../manifests/config.yaml"}, + expectedError: "inter token latency standard deviation cannot be more than 30%", }, { name: "invalid (negative) inter-token-latency-std-dev", - args: []string{"cmd", "--inter-token-latency", " 1000", "--inter-token-latency-std-dev", "-1", + args: []string{"cmd", "--inter-token-latency", "1000", "--inter-token-latency-std-dev", "-1", "--config", "../../manifests/config.yaml"}, + expectedError: "inter token latency standard deviation cannot be negative", }, { name: "invalid kv-cache-transfer-latency-std-dev", args: []string{"cmd", "--kv-cache-transfer-latency", "70", 
"--kv-cache-transfer-latency-std-dev", "35", "--config", "../../manifests/config.yaml"}, + expectedError: "kv-cache tranfer standard deviation cannot be more than 30% of kv-cache tranfer", }, { name: "invalid (negative) kv-cache-transfer-latency-std-dev", args: []string{"cmd", "--kv-cache-transfer-latency-std-dev", "-35", "--config", "../../manifests/config.yaml"}, + expectedError: "kv-cache tranfer time standard deviation cannot be negative", }, { name: "invalid (negative) kv-cache-size", args: []string{"cmd", "--kv-cache-size", "-35", "--config", "../../manifests/config.yaml"}, + expectedError: "KV cache size cannot be negative", }, { name: "invalid block-size", args: []string{"cmd", "--block-size", "35", "--config", "../../manifests/config.yaml"}, + expectedError: "token block size should be one of the following", }, { name: "invalid (negative) event-batch-size", args: []string{"cmd", "--event-batch-size", "-35", "--config", "../../manifests/config.yaml"}, + expectedError: "event batch size cannot less than 1", }, { - name: "invalid failure injection rate > 100", - args: []string{"cmd", "--model", "test-model", "--failure-injection-rate", "150"}, + name: "invalid failure injection rate > 100", + args: []string{"cmd", "--model", "test-model", "--failure-injection-rate", "150"}, + expectedError: "failure injection rate should be between 0 and 100", }, { - name: "invalid failure injection rate < 0", - args: []string{"cmd", "--model", "test-model", "--failure-injection-rate", "-10"}, + name: "invalid failure injection rate < 0", + args: []string{"cmd", "--model", "test-model", "--failure-injection-rate", "-10"}, + expectedError: "failure injection rate should be between 0 and 100", }, { name: "invalid failure type", args: []string{"cmd", "--model", "test-model", "--failure-injection-rate", "50", "--failure-types", "invalid_type"}, + expectedError: "invalid failure type", }, { name: "invalid fake metrics: negative running requests", args: []string{"cmd", 
"--fake-metrics", "{\"running-requests\":-10,\"waiting-requests\":30,\"kv-cache-usage\":0.4}", "--config", "../../manifests/config.yaml"}, + expectedError: "fake metrics request counters cannot be negative", }, { name: "invalid fake metrics: kv cache usage", args: []string{"cmd", "--fake-metrics", "{\"running-requests\":10,\"waiting-requests\":30,\"kv-cache-usage\":40}", "--config", "../../manifests/config.yaml"}, + expectedError: "fake metrics KV cache usage must be between 0 and 1", }, { - name: "invalid (negative) zmq-max-connect-attempts for argument", - args: []string{"cmd", "zmq-max-connect-attempts", "-1", "--config", "../../manifests/config.yaml"}, + name: "invalid (negative) zmq-max-connect-attempts for argument", + args: []string{"cmd", "--zmq-max-connect-attempts", "-1", "--config", "../../manifests/config.yaml"}, + expectedError: "zmq retries times cannot be negative", }, { - name: "invalid (negative) zmq-max-connect-attempts for config file", - args: []string{"cmd", "--config", "../../manifests/invalid-config.yaml"}, + name: "invalid (negative) zmq-max-connect-attempts for config file", + args: []string{"cmd", "--config", "../../manifests/invalid-config.yaml"}, + expectedError: "zmq retries times cannot be negative", }, { name: "invalid (negative) prefill-overhead", args: []string{"cmd", "--prefill-overhead", "-1", "--config", "../../manifests/config.yaml"}, + expectedError: "prefill overhead cannot be negative", }, { name: "invalid (negative) prefill-time-per-token", args: []string{"cmd", "--prefill-time-per-token", "-1", "--config", "../../manifests/config.yaml"}, + expectedError: "prefill time per token cannot be negative", }, { name: "invalid (negative) prefill-time-std-dev", args: []string{"cmd", "--prefill-time-std-dev", "-1", "--config", "../../manifests/config.yaml"}, + expectedError: "prefill time standard deviation cannot be negative", }, { name: "invalid (negative) kv-cache-transfer-time-per-token", args: []string{"cmd", 
"--kv-cache-transfer-time-per-token", "-1", "--config", "../../manifests/config.yaml"}, + expectedError: "kv-cache tranfer time per token cannot be negative", }, { name: "invalid (negative) kv-cache-transfer-time-std-dev", args: []string{"cmd", "--kv-cache-transfer-time-std-dev", "-1", "--config", "../../manifests/config.yaml"}, + expectedError: "kv-cache tranfer time standard deviation cannot be negative", }, { name: "invalid data-parallel-size", args: []string{"cmd", "--data-parallel-size", "15", "--config", "../../manifests/config.yaml"}, + expectedError: "data parallel size must be between 1 ans 8", }, { name: "invalid max-num-seqs", args: []string{"cmd", "--max-num-seqs", "0", "--config", "../../manifests/config.yaml"}, + expectedError: "max num seqs cannot be less than 1", }, { name: "invalid max-num-seqs", args: []string{"cmd", "--max-num-seqs", "-1", "--config", "../../manifests/config.yaml"}, + expectedError: "max num seqs cannot be less than 1", }, { name: "invalid max-waiting-queue-length", args: []string{"cmd", "--max-waiting-queue-length", "0", "--config", "../../manifests/config.yaml"}, + expectedError: "max waiting queue size cannot be less than 1", }, { name: "invalid max-waiting-queue-length", args: []string{"cmd", "--max-waiting-queue-length", "-1", "--config", "../../manifests/config.yaml"}, + expectedError: "max waiting queue size cannot be less than 1", }, { name: "invalid time-factor-under-load", args: []string{"cmd", "--time-factor-under-load", "0", "--config", "../../manifests/config.yaml"}, + expectedError: "time factor under load cannot be less than 1.0", }, { name: "invalid time-factor-under-load", args: []string{"cmd", "--time-factor-under-load", "-1", "--config", "../../manifests/config.yaml"}, + expectedError: "time factor under load cannot be less than 1.0", }, { name: "invalid ttft", - args: []string{"cmd", "--ttft-buckets-values", "[1, 2, -10, 1]", + args: []string{"cmd", "--fake-metrics", "{\"ttft-buckets-values\":[1, 2, -10, 1]}", 
"--config", "../../manifests/config.yaml"}, + expectedError: "time-to-first-token fake metrics should contain only non-negative values", }, { name: "invalid tpot", - args: []string{"cmd", "--tpot-buckets-values", "[1, 2, -10, 1]", + args: []string{"cmd", "--fake-metrics", "{\"tpot-buckets-values\":[1, 2, -10, 1]}", + "--config", "../../manifests/config.yaml"}, + expectedError: "time-per-output-token fake metrics should contain only non-negative values", + }, + { + name: "invalid request-max-generation-tokens", + args: []string{"cmd", "--fake-metrics", "{\"request-max-generation-tokens\": [1, -1, 2]}", "--config", "../../manifests/config.yaml"}, + expectedError: "fake metrics request-max-generation-tokens cannot contain negative values", }, } @@ -488,7 +538,10 @@ var _ = Describe("Simulator configuration", func() { When(test.name, func() { It("should fail for invalid configuration", func() { _, err := createSimConfig(test.args) + // ensure that error occurred Expect(err).To(HaveOccurred()) + // ensure that an expected error occurred + Expect(err.Error()).To(ContainSubstring(test.expectedError)) }) }) } diff --git a/pkg/common/utils.go b/pkg/common/utils.go index 692707a0..2ac0cd5a 100644 --- a/pkg/common/utils.go +++ b/pkg/common/utils.go @@ -17,6 +17,7 @@ limitations under the License. 
package common import ( + "errors" "fmt" "math/rand" "regexp" @@ -161,3 +162,18 @@ func WriteToChannel[T any](channel chan T, object T, logger logr.Logger, channel logger.V(logging.WARN).Info("failed to write to", "channel", channelName) } } + +// MaxIntSlice receives a slice of ints, returns the maximum value in the slice if not empty, +// or an error if the slice is empty +func MaxIntSlice(numbers []int) (int, error) { + if len(numbers) == 0 { + return 0, errors.New("cannot return maximum of an empty slice") + } + max := numbers[0] + for _, num := range numbers[1:] { + if num > max { + max = num + } + } + return max, nil +} diff --git a/pkg/kv-cache/block_cache.go b/pkg/kv-cache/block_cache.go index 75517ef5..833b6c33 100644 --- a/pkg/kv-cache/block_cache.go +++ b/pkg/kv-cache/block_cache.go @@ -52,7 +52,7 @@ func newBlockCache(config *common.Configuration, logger logr.Logger, usageChan c var publisher *common.Publisher var err error if config.ZMQEndpoint != "" { - publisher, err = common.NewPublisher(config.ZMQEndpoint, config.ZMQMaxConnectAttempts) + publisher, err = common.NewPublisher(config.ZMQEndpoint, uint(config.ZMQMaxConnectAttempts)) if err != nil { return nil, err } diff --git a/pkg/llm-d-inference-sim/metrics.go b/pkg/llm-d-inference-sim/metrics.go index 8607289d..70d4e062 100644 --- a/pkg/llm-d-inference-sim/metrics.go +++ b/pkg/llm-d-inference-sim/metrics.go @@ -33,21 +33,22 @@ import ( ) const ( - e2eReqLatencyMetricName = "vllm:e2e_request_latency_seconds" - reqQueueTimeMetricName = "vllm:request_queue_time_seconds" - reqInferenceTimeMetricName = "vllm:request_inference_time_seconds" - prefillTimeMetricName = "vllm:request_prefill_time_seconds" - decodeTimeMetricName = "vllm:request_decode_time_seconds" - ttftMetricName = "vllm:time_to_first_token_seconds" - tpotMetricName = "vllm:time_per_output_token_seconds" - generationTokensMetricName = "vllm:request_generation_tokens" - paramMaxTokensMetricName = "vllm:request_params_max_tokens" -
promptTokensMetricName = "vllm:request_prompt_tokens" - successTotalMetricName = "vllm:request_success_total" - loraRequestsMetricName = "vllm:lora_requests_info" - reqRunningMetricName = "vllm:num_requests_running" - reqWaitingMetricName = "vllm:num_requests_waiting" - gpuCacheUsageMetricName = "vllm:gpu_cache_usage_perc" + e2eReqLatencyMetricName = "vllm:e2e_request_latency_seconds" + reqQueueTimeMetricName = "vllm:request_queue_time_seconds" + reqInferenceTimeMetricName = "vllm:request_inference_time_seconds" + prefillTimeMetricName = "vllm:request_prefill_time_seconds" + decodeTimeMetricName = "vllm:request_decode_time_seconds" + ttftMetricName = "vllm:time_to_first_token_seconds" + tpotMetricName = "vllm:time_per_output_token_seconds" + maxNumGenerationTokensMetricName = "vllm:max_num_generation_tokens" + generationTokensMetricName = "vllm:request_generation_tokens" + paramMaxTokensMetricName = "vllm:request_params_max_tokens" + promptTokensMetricName = "vllm:request_prompt_tokens" + successTotalMetricName = "vllm:request_success_total" + loraRequestsMetricName = "vllm:lora_requests_info" + reqRunningMetricName = "vllm:num_requests_running" + reqWaitingMetricName = "vllm:num_requests_waiting" + gpuCacheUsageMetricName = "vllm:gpu_cache_usage_perc" ) // createAndRegisterPrometheus creates and registers prometheus metrics used by vLLM simulator @@ -232,6 +233,20 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { return err } + s.metrics.maxNumGenerationTokens = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Subsystem: "", + Name: maxNumGenerationTokensMetricName, + Help: "Histogram of maximum number of requested generation tokens.", + Buckets: build125Buckets(s.config.MaxModelLen), + }, + []string{vllmapi.PromLabelModelName}, + ) + if err := s.metrics.registry.Register(s.metrics.maxNumGenerationTokens); err != nil { + s.logger.Error(err, "prometheus max_num_generation_tokens histogram register failed") + return err + } + 
s.metrics.requestGenerationTokens = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Subsystem: "", @@ -304,6 +319,9 @@ func (s *VllmSimulator) setInitialPrometheusMetrics() { if s.config.FakeMetrics.RequestParamsMaxTokens != nil { s.initFakeHistogram(s.metrics.requestGenerationTokens, buckets, s.config.FakeMetrics.RequestParamsMaxTokens) } + if s.config.FakeMetrics.RequestMaxGenerationTokens != nil { + s.initFakeHistogram(s.metrics.maxNumGenerationTokens, buckets, s.config.FakeMetrics.RequestMaxGenerationTokens) + } for reason, requestSuccessTotal := range s.config.FakeMetrics.RequestSuccessTotal { s.metrics.requestSuccessTotal.WithLabelValues(modelName, reason).Add(float64(requestSuccessTotal)) @@ -644,6 +662,7 @@ func (s *VllmSimulator) recordRequestUpdater(ctx context.Context) { s.recordRequestMetricsOnSuccess( event.promptTokens, event.generationTokens, + event.genTokensPerChoice, event.maxTokens, event.finishReason, ) @@ -656,8 +675,12 @@ func (s *VllmSimulator) recordRequestUpdater(ctx context.Context) { type requestSuccessEvent struct { // promptTokens is the number of input (prompt) tokens in the request promptTokens int - // generationTokens is the number of generated (output) tokens in the response + // generationTokens is the number of generated (output) tokens in the response, + // in case of response with multiple choices contains sum of lengths of all choices generationTokens int + // genTokensPerChoice array of generated tokens count per choice, + // sum of all elements in this array should be equal to generationTokens + genTokensPerChoice []int // maxTokens is the maximum number of tokens allowed for generation (if specified in the request) maxTokens *int64 // finishReason indicates why the generation stopped (e.g., "stop", "length", "tool_calls") @@ -666,7 +689,7 @@ type requestSuccessEvent struct { // recordRequestMetricsOnSuccess records metrics for a successfully completed request func (s *VllmSimulator) 
recordRequestMetricsOnSuccess(promptTokens, - generationTokens int, maxTokens *int64, finishReason string) { + generationTokens int, genTokensPerChoice []int, maxTokens *int64, finishReason string) { modelName := s.getDisplayedModelName(s.config.Model) s.metrics.requestPromptTokens.WithLabelValues(modelName).Observe(float64(promptTokens)) s.metrics.requestGenerationTokens.WithLabelValues(modelName).Observe(float64(generationTokens)) @@ -674,6 +697,9 @@ func (s *VllmSimulator) recordRequestMetricsOnSuccess(promptTokens, s.metrics.requestParamsMaxTokens.WithLabelValues(modelName).Observe(float64(*maxTokens)) } s.metrics.requestSuccessTotal.WithLabelValues(modelName, finishReason).Inc() + if maxGenTokens, err := common.MaxIntSlice(genTokensPerChoice); err == nil { + s.metrics.maxNumGenerationTokens.WithLabelValues(modelName).Observe(float64(maxGenTokens)) + } } // build125Buckets generates histogram buckets in powers of 10 scaled by [1,2,5]. diff --git a/pkg/llm-d-inference-sim/metrics_test.go b/pkg/llm-d-inference-sim/metrics_test.go index bafc3a8a..21d9ca2b 100644 --- a/pkg/llm-d-inference-sim/metrics_test.go +++ b/pkg/llm-d-inference-sim/metrics_test.go @@ -617,6 +617,7 @@ var _ = Describe("Simulator metrics", Ordered, func() { `},` + `"request-prompt-tokens":[10,20,30],` + `"request-generation-tokens":[10,20,30],` + + `"request-max-generation-tokens":[10,20,30],` + `"request-params-max-tokens":[10,20,30],` + `"ttft-buckets-values":[1,2,3],` + `"tpot-buckets-values":[0,0,1,2,3],` + @@ -677,6 +678,7 @@ var _ = Describe("Simulator metrics", Ordered, func() { } Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, generationTokensMetricName, boundary, expectedCount))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, maxNumGenerationTokensMetricName, boundary, expectedCount))) Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, boundary, expectedCount))) 
Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, boundary, expectedCount))) diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go index 6fafe9b1..289a6f59 100644 --- a/pkg/llm-d-inference-sim/simulator.go +++ b/pkg/llm-d-inference-sim/simulator.go @@ -134,6 +134,8 @@ type metricsData struct { requestPromptTokens *prometheus.HistogramVec // requestGenerationTokens is prometheus histogram for number of generated tokens in request requestGenerationTokens *prometheus.HistogramVec + // maxNumGenerationTokens is prometheus histogram for the maximum number of generated tokens across a request's choices + maxNumGenerationTokens *prometheus.HistogramVec // requestParamsMaxTokens is prometheus histogram for 'max_tokens' parameter in request requestParamsMaxTokens *prometheus.HistogramVec // requestSuccessTotal is prometheus counter for total number of successful requests diff --git a/pkg/llm-d-inference-sim/worker.go b/pkg/llm-d-inference-sim/worker.go index 5848c871..a256d196 100644 --- a/pkg/llm-d-inference-sim/worker.go +++ b/pkg/llm-d-inference-sim/worker.go @@ -164,8 +164,10 @@ func (s *VllmSimulator) processRequestAsync(reqCtx *openaiserverapi.CompletionRe requestSuccessEvent{ promptTokens: usageData.PromptTokens, generationTokens: usageData.CompletionTokens, - maxTokens: reqCtx.CompletionReq.GetMaxCompletionTokens(), - finishReason: finishReason}, + // currently only responses with a single choice are supported + genTokensPerChoice: []int{usageData.CompletionTokens}, + maxTokens: reqCtx.CompletionReq.GetMaxCompletionTokens(), + finishReason: finishReason}, s.logger, "metrics.requestSuccessChan") }