diff --git a/manifests/config_with_fake.yaml b/manifests/config_with_fake.yaml
index 1c75c115..4f85065d 100644
--- a/manifests/config_with_fake.yaml
+++ b/manifests/config_with_fake.yaml
@@ -7,10 +7,15 @@ time-to-first-token: 2000
 inter-token-latency: 1000
 kv-cache-transfer-latency: 100
 seed: 100100100
-fake-metrics:
+fake-metrics:
   running-requests: 16
   waiting-requests: 3
   kv-cache-usage: 0.3
+  request-success-total:
+    stop: 20
+  request-prompt-tokens: [ 10, 20, 30, 15 ]
+  request-generation-tokens: [ 50, 60, 40 ]
+  request-params-max-tokens: [ 128, 256, 512 ]
   loras:
     - '{"running":"lora1,lora2","waiting":"lora3","timestamp":1257894567}'
     - '{"running":"lora1,lora3","waiting":"","timestamp":1257894569}'
diff --git a/pkg/common/config.go b/pkg/common/config.go
index e4ae22bb..67837801 100644
--- a/pkg/common/config.go
+++ b/pkg/common/config.go
@@ -43,6 +43,27 @@ const (
 	FailureTypeServerError    = "server_error"
 	FailureTypeInvalidRequest = "invalid_request"
 	FailureTypeModelNotFound  = "model_not_found"
+
+	StopFinishReason         = "stop"
+	LengthFinishReason       = "length"
+	ToolsFinishReason        = "tool_calls"
+	RemoteDecodeFinishReason = "remote_decode"
+)
+
+var (
+	requiredFinishReasons = []string{
+		StopFinishReason,
+		LengthFinishReason,
+		ToolsFinishReason,
+		RemoteDecodeFinishReason,
+	}
+
+	validFinishReasons = map[string]struct{}{
+		StopFinishReason:         {},
+		LengthFinishReason:       {},
+		ToolsFinishReason:        {},
+		RemoteDecodeFinishReason: {},
+	}
 )
 
 type Configuration struct {
@@ -223,6 +244,13 @@ type Metrics struct {
 	// 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
 	// 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, +Inf
 	TPOTBucketValues []int `yaml:"tpot-buckets-values" json:"tpot-buckets-values"`
+	// RequestPromptTokens, RequestGenerationTokens and RequestParamsMaxTokens are fake per-bucket
+	// observation counts used to pre-populate the corresponding histograms at start-up (value i is added to bucket i).
+	RequestPromptTokens     []int `yaml:"request-prompt-tokens" json:"request-prompt-tokens"`         // prompt-length bucket counts
+	RequestGenerationTokens []int `yaml:"request-generation-tokens" json:"request-generation-tokens"` // generation-length bucket counts
+	RequestParamsMaxTokens  []int `yaml:"request-params-max-tokens" json:"request-params-max-tokens"` // max_tokens parameter bucket counts
+	// RequestSuccessTotal is the number of successful requests, keyed by finish reason (stop, length, etc.).
+ RequestSuccessTotal map[string]int64 `yaml:"request-success-total" json:"request-success-total"` } type LorasMetrics struct { @@ -521,6 +549,38 @@ func (c *Configuration) validate() error { } } } + if c.FakeMetrics.RequestSuccessTotal != nil { + for reason, count := range c.FakeMetrics.RequestSuccessTotal { + if count < 0 { + return fmt.Errorf("fake metrics request-success-total.%s "+ + "cannot be negative, got %d", reason, count) + } + if _, ok := validFinishReasons[reason]; !ok { + return fmt.Errorf("invalid finish reason in request-success-total: "+ + "%s (valid reasons: %v)", reason, requiredFinishReasons) + } + } + for _, reason := range requiredFinishReasons { + if _, exists := c.FakeMetrics.RequestSuccessTotal[reason]; !exists { + c.FakeMetrics.RequestSuccessTotal[reason] = 0 + } + } + } + for _, v := range c.FakeMetrics.RequestPromptTokens { + if v < 0 { + return errors.New("fake metrics request-prompt-tokens cannot contain negative values") + } + } + for _, v := range c.FakeMetrics.RequestGenerationTokens { + if v < 0 { + return errors.New("fake metrics request-generation-tokens cannot contain negative values") + } + } + for _, v := range c.FakeMetrics.RequestParamsMaxTokens { + if v < 0 { + return errors.New("fake metrics request-params-max-tokens cannot contain negative values") + } + } } if c.DPSize < 1 || c.DPSize > 8 { diff --git a/pkg/common/config_test.go b/pkg/common/config_test.go index 816dd412..d1bc93df 100644 --- a/pkg/common/config_test.go +++ b/pkg/common/config_test.go @@ -203,8 +203,17 @@ var _ = Describe("Simulator configuration", func() { "{\"running\":\"lora1,lora2\",\"waiting\":\"lora3\",\"timestamp\":1257894567}", "{\"running\":\"lora1,lora3\",\"waiting\":\"\",\"timestamp\":1257894569}", }, - TTFTBucketValues: []int{10, 20, 30, 10}, - TPOTBucketValues: []int{0, 0, 10, 20, 30}, + TTFTBucketValues: []int{10, 20, 30, 10}, + TPOTBucketValues: []int{0, 0, 10, 20, 30}, + RequestPromptTokens: []int{10, 20, 30, 15}, + RequestGenerationTokens: []int{50, 60, 40}, + RequestParamsMaxTokens: []int{128, 256, 512}, + RequestSuccessTotal: map[string]int64{ + StopFinishReason: 20, + LengthFinishReason: 0, + ToolsFinishReason: 0, + RemoteDecodeFinishReason: 0, + }, } test = testCase{ name: "config with fake metrics file", diff --git a/pkg/llm-d-inference-sim/metrics.go b/pkg/llm-d-inference-sim/metrics.go index 45c340d3..5e785893 100644 --- a/pkg/llm-d-inference-sim/metrics.go +++ b/pkg/llm-d-inference-sim/metrics.go @@ -20,6 +20,7 @@ package llmdinferencesim import ( "context" + "math" "strconv" "strings" "sync" @@ -65,6 +66,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { return err } + // not supported for now, reports constant value s.waitingRequests = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Subsystem: "", @@ -123,6 +125,61 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { return err } + s.requestPromptTokens = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Subsystem: "", + Name: "vllm:request_prompt_tokens", + Help: "Number of prefill tokens processed.", + Buckets: build125Buckets(s.config.MaxModelLen), + }, + []string{vllmapi.PromLabelModelName}, + ) + if err := s.registry.Register(s.requestPromptTokens); err != nil { + s.logger.Error(err, "Prometheus request_prompt_tokens histogram register failed") + return err + } + + s.requestGenerationTokens = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Subsystem: "", + Name: "vllm:request_generation_tokens", + Help: "Number of generation tokens processed.", + Buckets: 
build125Buckets(s.config.MaxModelLen),
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+	if err := s.registry.Register(s.requestGenerationTokens); err != nil {
+		s.logger.Error(err, "Prometheus request_generation_tokens histogram register failed")
+		return err
+	}
+
+	s.requestParamsMaxTokens = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Subsystem: "",
+			Name:      "vllm:request_params_max_tokens",
+			Help:      "Histogram of the max_tokens request parameter.",
+			Buckets:   build125Buckets(s.config.MaxModelLen),
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+	if err := s.registry.Register(s.requestParamsMaxTokens); err != nil {
+		s.logger.Error(err, "Prometheus request_params_max_tokens histogram register failed")
+		return err
+	}
+
+	s.requestSuccessTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: "",
+			Name:      "vllm:request_success_total",
+			Help:      "Count of successfully processed requests.",
+		},
+		[]string{vllmapi.PromLabelModelName, vllmapi.PromLabelFinishReason},
+	)
+	if err := s.registry.Register(s.requestSuccessTotal); err != nil {
+		s.logger.Error(err, "Prometheus request_success_total counter register failed")
+		return err
+	}
+
 	s.setInitialPrometheusMetrics()
 
 	return nil
@@ -132,11 +189,11 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 // the fake metrics if set
 func (s *VllmSimulator) setInitialPrometheusMetrics() {
 	var nRunningReqs, nWaitingReqs, kvCacheUsage float64
+	modelName := s.getDisplayedModelName(s.config.Model)
 	if s.config.FakeMetrics != nil {
 		nRunningReqs = float64(s.config.FakeMetrics.RunningRequests)
 		nWaitingReqs = float64(s.config.FakeMetrics.WaitingRequests)
 		kvCacheUsage = float64(s.config.FakeMetrics.KVCacheUsagePercentage)
-
 		if s.config.FakeMetrics.TTFTBucketValues != nil {
 			s.initFakeHistogram(s.ttft, common.TTFTBucketsBoundaries, s.config.FakeMetrics.TTFTBucketValues)
 		}
@@ -144,9 +201,22 @@ func (s *VllmSimulator) setInitialPrometheusMetrics() {
 		if s.config.FakeMetrics.TPOTBucketValues != nil {
 			s.initFakeHistogram(s.tpot, common.TPOTBucketsBoundaries, s.config.FakeMetrics.TPOTBucketValues)
 		}
+		buckets := build125Buckets(s.config.MaxModelLen)
+		if s.config.FakeMetrics.RequestPromptTokens != nil {
+			s.initFakeHistogram(s.requestPromptTokens, buckets, s.config.FakeMetrics.RequestPromptTokens)
+		}
+		if s.config.FakeMetrics.RequestGenerationTokens != nil {
+			s.initFakeHistogram(s.requestGenerationTokens, buckets, s.config.FakeMetrics.RequestGenerationTokens)
+		}
+		if s.config.FakeMetrics.RequestParamsMaxTokens != nil {
+			s.initFakeHistogram(s.requestParamsMaxTokens, buckets, s.config.FakeMetrics.RequestParamsMaxTokens)
+		}
+
+		for reason, requestSuccessTotal := range s.config.FakeMetrics.RequestSuccessTotal {
+			s.requestSuccessTotal.WithLabelValues(modelName, reason).Add(float64(requestSuccessTotal))
+		}
 	}
-	modelName := s.getDisplayedModelName(s.config.Model)
 	s.runningRequests.WithLabelValues(modelName).Set(nRunningReqs)
 	s.waitingRequests.WithLabelValues(modelName).Set(nWaitingReqs)
 	s.kvCacheUsagePercentage.WithLabelValues(modelName).Set(kvCacheUsage)
@@ -288,6 +358,7 @@ func (s *VllmSimulator) startMetricsUpdaters(ctx context.Context) {
 	go s.kvCacheUsageUpdater(ctx)
 	go s.ttftUpdater(ctx)
 	go s.tpotUpdater(ctx)
+	go s.recordRequestUpdater(ctx)
 }
 
 // waitingRequestsUpdater updates the waiting requests metric by listening on the relevant channel
@@ -396,3 +467,75 @@ func (s *VllmSimulator) decrementLoraRefCount(lora string, theMap *sync.Map) {
 		s.logger.Error(nil, "Zero model reference", "model", lora)
 	}
 }
+
+// 
recordRequestUpdater listens on requestSuccessChan and drives the Prometheus metric +// for successfully completed requests. +func (s *VllmSimulator) recordRequestUpdater(ctx context.Context) { + for { + select { + case <-ctx.Done(): + return + case event := <-s.requestSuccessChan: + s.recordRequestMetricsOnSuccess( + event.promptTokens, + event.generationTokens, + event.maxTokens, + event.finishReason, + ) + } + } +} + +// requestSuccessEvent represents the data associated with a successfully completed request, +// which is sent through the requestSuccessChan for asynchronous metrics recording. +type requestSuccessEvent struct { + // promptTokens is the number of input (prompt) tokens in the request + promptTokens int + // generationTokens is the number of generated (output) tokens in the response + generationTokens int + // maxTokens is the maximum number of tokens allowed for generation (if specified in the request) + maxTokens *int64 + // finishReason indicates why the generation stopped (e.g., "stop", "length", "tool_calls") + finishReason string +} + +// recordRequestMetricsOnSuccess records metrics for a successfully completed request +func (s *VllmSimulator) recordRequestMetricsOnSuccess(promptTokens, + generationTokens int, maxTokens *int64, finishReason string) { + modelName := s.getDisplayedModelName(s.config.Model) + s.requestPromptTokens.WithLabelValues(modelName).Observe(float64(promptTokens)) + s.requestGenerationTokens.WithLabelValues(modelName).Observe(float64(generationTokens)) + if maxTokens != nil { + s.requestParamsMaxTokens.WithLabelValues(modelName).Observe(float64(*maxTokens)) + } + s.requestSuccessTotal.WithLabelValues(modelName, finishReason).Inc() +} + +// build125Buckets generates histogram buckets in powers of 10 scaled by [1,2,5]. +// This matches vLLM's build_1_2_5_buckets() in metrics.py. 
+//
+// Reference: https://github.com/vllm-project/vllm/blob/main/vllm/engine/metrics.py#L175
+func build125Buckets(maxValue int) []float64 {
+	if maxValue <= 0 {
+		return []float64{}
+	}
+	var buckets []float64
+	exponent := 0
+	mantissa := []int{1, 2, 5}
+
+	for {
+		complete := true
+		for _, m := range mantissa {
+			value := m * int(math.Pow10(exponent))
+			if value <= maxValue {
+				buckets = append(buckets, float64(value))
+				complete = false
+			}
+		}
+		if complete {
+			break
+		}
+		exponent++
+	}
+	return buckets
+}
diff --git a/pkg/llm-d-inference-sim/metrics_test.go b/pkg/llm-d-inference-sim/metrics_test.go
index f0f8bb58..ea12bc59 100644
--- a/pkg/llm-d-inference-sim/metrics_test.go
+++ b/pkg/llm-d-inference-sim/metrics_test.go
@@ -22,11 +22,13 @@ import (
 	"io"
 	"net/http"
 	"os"
+	"reflect"
 	"regexp"
 	"sort"
 	"strconv"
 	"strings"
 	"sync"
+	"testing"
 	"time"
 
 	"github.com/llm-d/llm-d-inference-sim/pkg/common"
@@ -105,6 +107,77 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 		wg.Wait()
 	})
 
+	It("Should record correct prompt and generation token counts", func() {
+		modelName := "testmodel"
+		prompt := strings.Repeat("hello ", 25)
+		maxTokens := 25
+
+		ctx := context.TODO()
+		args := []string{"cmd", "--model", modelName, "--mode", common.ModeRandom,
+			"--time-to-first-token", "100", "--max-num-seqs", "4"}
+
+		client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil)
+		Expect(err).NotTo(HaveOccurred())
+
+		openaiclient := openai.NewClient(
+			option.WithBaseURL(baseURL),
+			option.WithHTTPClient(client))
+
+		params := openai.ChatCompletionNewParams{
+			Messages: []openai.ChatCompletionMessageParamUnion{
+				openai.UserMessage(prompt),
+			},
+			Model:       modelName,
+			MaxTokens:   openai.Int(int64(maxTokens)),
+			Temperature: openai.Float(0.0),
+		}
+
+		_, err = openaiclient.Chat.Completions.New(ctx, params)
+		Expect(err).NotTo(HaveOccurred())
+
+		time.Sleep(500 * time.Millisecond)
+
+		metricsResp, err := client.Get(metricsUrl)
+		Expect(err).NotTo(HaveOccurred())
+		Expect(metricsResp.StatusCode).To(Equal(http.StatusOK))
+
+		data, err := io.ReadAll(metricsResp.Body)
+		Expect(err).NotTo(HaveOccurred())
+		metrics := string(data)
+		// request_prompt_tokens_bucket
+		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="1"} 0`))
+		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="2"} 0`))
+		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="5"} 0`))
+		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="10"} 0`))
+		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="20"} 0`))
+		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="50"} 1`))
+		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="100"} 1`))
+		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="200"} 1`))
+		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="500"} 1`))
+		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="1000"} 1`))
+		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="+Inf"} 1`))
+		// request_params_max_tokens_bucket
+		Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="1"} 0`))
+		
Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="2"} 0`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="5"} 0`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="10"} 0`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="20"} 0`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="50"} 1`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="100"} 1`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="200"} 1`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="500"} 1`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="1000"} 1`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="+Inf"} 1`)) + // request_generation_tokens + // We do not verify the distribution of the number of tokens generated per request, + // as the number of generated tokens is unpredictable in this test. + // Therefore, we only verify the number of requests and the total number of generated tokens, + // and skip the bucket distribution. + Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_count{model_name="testmodel"} 1`)) + // request_success_total + Expect(metrics).To(MatchRegexp(`vllm:request_success_total{finish_reason="(stop|length)",model_name="testmodel"} 1`)) + }) + It("Should send correct lora metrics", func() { ctx := context.TODO() args := []string{"cmd", "--model", model, "--mode", common.ModeRandom, @@ -587,7 +660,34 @@ var _ = Describe("Simulator metrics", Ordered, func() { ctx := context.TODO() args := []string{"cmd", "--model", model, "--mode", common.ModeRandom, "--fake-metrics", - "{\"running-requests\":10,\"waiting-requests\":30,\"kv-cache-usage\":0.4,\"loras\":[{\"running\":\"lora4,lora2\",\"waiting\":\"lora3\",\"timestamp\":1257894567},{\"running\":\"lora4,lora3\",\"waiting\":\"\",\"timestamp\":1257894569}],\"ttft-buckets-values\":[1, 2, 3],\"tpot-buckets-values\": [0, 0, 1, 2, 3]}", + `{` + + `"running-requests":10,` + + `"waiting-requests":30,` + + `"kv-cache-usage":0.4,` + + `"request-success-total":{` + + `"stop":20,` + + `"length":0,` + + `"tool_calls":0,` + + `"remote_decode":0` + + `},` + + `"request-prompt-tokens":[10,20,30],` + + `"request-generation-tokens":[10,20,30],` + + `"request-params-max-tokens":[10,20,30],` + + `"ttft-buckets-values":[1,2,3],` + + `"tpot-buckets-values":[0,0,1,2,3],` + + `"loras":[` + + `{` + + `"running":"lora4,lora2",` + + `"waiting":"lora3",` + + `"timestamp":1257894567` + + `},` + + `{` + + `"running":"lora4,lora3",` + + `"waiting":"",` + + `"timestamp":1257894569` + + `}` + + `]` + + `}`, } client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) @@ -617,6 +717,47 @@ var _ = Describe("Simulator metrics", Ordered, func() { Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.075\"} 3")) Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.1\"} 6")) Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.15\"} 6")) + + 
Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="1"} 10`)) + Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="2"} 30`)) + Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="5"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="10"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="20"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="50"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="100"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="200"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="500"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="1000"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="+Inf"} 60`)) + + Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="1"} 10`)) + Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="2"} 30`)) + Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="5"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="10"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="20"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="50"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="100"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="200"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="500"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="1000"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="+Inf"} 60`)) + + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="1"} 10`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="2"} 30`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="5"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="10"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="20"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="50"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="100"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="200"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="500"} 60`)) + 
Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="1000"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="+Inf"} 60`)) + + Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="length",model_name="my_model"} 0`)) + Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="remote_decode",model_name="my_model"} 0`)) + Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="stop",model_name="my_model"} 20`)) + Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="tool_calls",model_name="my_model"} 0`)) }) }) @@ -771,3 +912,77 @@ func findIntMetric(metrics []string, metricPrefix string) *int { } return &val } + +// TestBuild125Buckets tests the build125Buckets function with various inputs. +func TestBuild125Buckets(t *testing.T) { + tests := []struct { + name string + maxValue int + want []float64 + }{ + { + name: "max_value zero", + maxValue: 0, + want: []float64{}, // no bucket <= 0 + }, + { + name: "max_value one", + maxValue: 1, + want: []float64{1}, + }, + { + name: "max_value five", + maxValue: 5, + want: []float64{1, 2, 5}, + }, + { + name: "max_value ten", + maxValue: 10, + want: []float64{1, 2, 5, 10}, + }, + { + name: "max_value 100", + maxValue: 100, + want: []float64{1, 2, 5, 10, 20, 50, 100}, + }, + { + name: "max_value 999", + maxValue: 999, + want: []float64{1, 2, 5, 10, 20, 50, 100, 200, 500}, + }, + { + name: "max_value 1024", + maxValue: 1024, + want: []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000}, + }, + { + name: "max_value 4096", + maxValue: 4096, + want: []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000}, + }, + { + name: "max_value 32768", + maxValue: 32768, + want: []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000}, + }, + { + name: "max_value just below power of 10", + maxValue: 99, + want: []float64{1, 2, 5, 10, 20, 50}, + }, + { + name: "max_value negative", + maxValue: -1, + want: []float64{}, // no positive bucket <= -1 + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := build125Buckets(tt.maxValue) + if !reflect.DeepEqual(got, tt.want) { + t.Errorf("build125Buckets(%d) = %v, want %v", tt.maxValue, got, tt.want) + } + }) + } +} diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go index 3851c517..30a03148 100644 --- a/pkg/llm-d-inference-sim/simulator.go +++ b/pkg/llm-d-inference-sim/simulator.go @@ -88,6 +88,8 @@ type VllmSimulator struct { nRunningReqs int64 // runReqChan is a channel to update nRunningReqs runReqChan chan int64 + // requestSuccessChan is a channel to update requestSuccessReqs + requestSuccessChan chan requestSuccessEvent // nWaitingReqs is the number of inference requests that are waiting to be processed nWaitingReqs int64 // waitingReqChan is a channel to update nWaitingReqs @@ -112,6 +114,14 @@ type VllmSimulator struct { tpot *prometheus.HistogramVec // kvCacheUsagePercentage is prometheus gauge kvCacheUsagePercentage *prometheus.GaugeVec + // requestPromptTokens is prometheus histogram for number of input (prompt) tokens in request + requestPromptTokens *prometheus.HistogramVec + // requestGenerationTokens is prometheus histogram for number of generated tokens in request + requestGenerationTokens *prometheus.HistogramVec + // requestParamsMaxTokens is prometheus histogram for 'max_tokens' parameter in request + requestParamsMaxTokens 
*prometheus.HistogramVec + // requestSuccessTotal is prometheus counter for total number of successful requests + requestSuccessTotal *prometheus.CounterVec // channel for requeasts to be passed to workers reqChan chan *openaiserverapi.CompletionReqCtx // schema validator for tools parameters @@ -136,18 +146,19 @@ func New(logger logr.Logger) (*VllmSimulator, error) { } return &VllmSimulator{ - logger: logger, - reqChan: make(chan *openaiserverapi.CompletionReqCtx, maxNumberOfRequests), - toolsValidator: toolsValidator, - kvcacheHelper: nil, // kvcache helper will be created only if required after reading configuration - namespace: os.Getenv(podNsEnv), - pod: os.Getenv(podNameEnv), - runReqChan: make(chan int64, maxNumberOfRequests), - waitingReqChan: make(chan int64, maxNumberOfRequests), - ttftChan: make(chan float64, maxNumberOfRequests), - tpotChan: make(chan float64, maxNumberOfRequests), - lorasChan: make(chan loraUsage, maxNumberOfRequests), - kvCacheUsageChan: make(chan float64, maxNumberOfRequests), + logger: logger, + reqChan: make(chan *openaiserverapi.CompletionReqCtx, maxNumberOfRequests), + toolsValidator: toolsValidator, + kvcacheHelper: nil, // kvcache helper will be created only if required after reading configuration + namespace: os.Getenv(podNsEnv), + pod: os.Getenv(podNameEnv), + runReqChan: make(chan int64, maxNumberOfRequests), + waitingReqChan: make(chan int64, maxNumberOfRequests), + ttftChan: make(chan float64, maxNumberOfRequests), + tpotChan: make(chan float64, maxNumberOfRequests), + lorasChan: make(chan loraUsage, maxNumberOfRequests), + kvCacheUsageChan: make(chan float64, maxNumberOfRequests), + requestSuccessChan: make(chan requestSuccessEvent, maxNumberOfRequests), }, nil } @@ -411,9 +422,18 @@ func (s *VllmSimulator) reqProcessingWorker(ctx context.Context, id int) { // in case this is prefill pod processing, return special finish reason finishReason = dataset.RemoteDecodeFinishReason } - s.sendResponse(reqCtx, responseTokens, toolCalls, displayModel, finishReason, &usageData) } + select { + case s.requestSuccessChan <- requestSuccessEvent{ + promptTokens: usageData.PromptTokens, + generationTokens: usageData.CompletionTokens, + maxTokens: reqCtx.CompletionReq.GetMaxCompletionTokens(), + finishReason: finishReason, + }: + default: + s.logger.V(1).Info("requestSuccessChan full, dropping success event") + } } reqCtx.Wg.Done() } diff --git a/pkg/vllm-api/vllm-models.go b/pkg/vllm-api/vllm-models.go index 76c564af..6c83af69 100644 --- a/pkg/vllm-api/vllm-models.go +++ b/pkg/vllm-api/vllm-models.go @@ -25,6 +25,7 @@ const ( PromLabelRunningLoraAdapters = "running_lora_adapters" PromLabelMaxLora = "max_lora" PromLabelModelName = "model_name" + PromLabelFinishReason = "finish_reason" VllmLoraRequestInfo = "vllm:lora_requests_info" VllmNumRequestsRunning = "vllm:num_requests_running"
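
Reviewer note: the bucket layout of the three new histograms comes entirely from build125Buckets, so its output is easy to sanity-check in isolation. Below is a minimal standalone Go sketch that re-declares the helper from metrics.go purely for illustration (the package and main function are mine, not part of this change); for a max model length of 1024 it prints the same boundaries asserted in TestBuild125Buckets.

package main

import (
	"fmt"
	"math"
)

// build125Buckets mirrors the helper added in metrics.go: bucket boundaries
// are 1, 2 and 5 scaled by increasing powers of ten, stopping once all three
// candidates for an exponent exceed maxValue.
func build125Buckets(maxValue int) []float64 {
	if maxValue <= 0 {
		return []float64{}
	}
	var buckets []float64
	for exponent := 0; ; exponent++ {
		complete := true
		for _, m := range []int{1, 2, 5} {
			value := m * int(math.Pow10(exponent))
			if value <= maxValue {
				buckets = append(buckets, float64(value))
				complete = false
			}
		}
		if complete {
			return buckets
		}
	}
}

func main() {
	// For a max model length of 1024 this prints:
	// [1 2 5 10 20 50 100 200 500 1000]
	fmt.Println(build125Buckets(1024))
}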