diff --git a/pkg/llm-d-inference-sim/metrics.go b/pkg/llm-d-inference-sim/metrics.go
index 70d4e062..f4703f8c 100644
--- a/pkg/llm-d-inference-sim/metrics.go
+++ b/pkg/llm-d-inference-sim/metrics.go
@@ -49,6 +49,7 @@ const (
 	reqRunningMetricName    = "vllm:num_requests_running"
 	reqWaitingMetricName    = "vllm:num_requests_waiting"
 	gpuCacheUsageMetricName = "vllm:gpu_cache_usage_perc"
+	cacheConfigName         = "vllm:cache_config_info"
 )
 
 // createAndRegisterPrometheus creates and registers prometheus metrics used by vLLM simulator
@@ -85,7 +86,6 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 		return err
 	}
 
-	// not supported for now, reports constant value
 	s.metrics.waitingRequests = prometheus.NewGaugeVec(
 		prometheus.GaugeOpts{
 			Subsystem: "",
@@ -288,14 +288,27 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 		return err
 	}
 
-	s.setInitialPrometheusMetrics()
+	cacheConfig := prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Subsystem: "",
+			Name:      cacheConfigName,
+			Help:      "Information of the LLMEngine CacheConfig.",
+		},
+		[]string{vllmapi.PromLabelCacheBlockSize, vllmapi.PromLabelCacheNumGPUBlocks},
+	)
+	if err := s.metrics.registry.Register(cacheConfig); err != nil {
+		s.logger.Error(err, "prometheus cache config register failed")
+		return err
+	}
+
+	s.setInitialPrometheusMetrics(cacheConfig)
 
 	return nil
 }
 
 // setInitialPrometheusMetrics sends the default values to prometheus or
 // the fake metrics if set
-func (s *VllmSimulator) setInitialPrometheusMetrics() {
+func (s *VllmSimulator) setInitialPrometheusMetrics(cacheConfig *prometheus.GaugeVec) {
 	var nRunningReqs, nWaitingReqs, kvCacheUsage float64
 	modelName := s.getDisplayedModelName(s.config.Model)
 	if s.config.FakeMetrics != nil {
@@ -352,6 +365,8 @@ func (s *VllmSimulator) setInitialPrometheusMetrics() {
 	s.metrics.waitingRequests.WithLabelValues(modelName).Set(nWaitingReqs)
 	s.metrics.kvCacheUsagePercentage.WithLabelValues(modelName).Set(kvCacheUsage)
 
+	cacheConfig.WithLabelValues(strconv.Itoa(s.config.TokenBlockSize), strconv.Itoa(s.config.KVCacheSize)).Set(1)
+
 	if s.config.FakeMetrics != nil && len(s.config.FakeMetrics.LoraMetrics) != 0 {
 		for _, metrics := range s.config.FakeMetrics.LoraMetrics {
 			s.metrics.loraInfo.WithLabelValues(
diff --git a/pkg/llm-d-inference-sim/metrics_test.go b/pkg/llm-d-inference-sim/metrics_test.go
index 21d9ca2b..e92cc303 100644
--- a/pkg/llm-d-inference-sim/metrics_test.go
+++ b/pkg/llm-d-inference-sim/metrics_test.go
@@ -598,6 +598,24 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 			}()
 			wg.Wait()
 		})
+
+		It("Should send correct kv cache config metrics", func() {
+			ctx := context.TODO()
+			args := []string{"cmd", "--model", qwenModelName, "--mode", common.ModeRandom,
+				"--kv-cache-size", "16", "--block-size", "8"}
+
+			client, err := startServerWithArgs(ctx, args)
+			Expect(err).NotTo(HaveOccurred())
+
+			metricsResp, err := client.Get(metricsUrl)
+			Expect(err).NotTo(HaveOccurred())
+			Expect(metricsResp.StatusCode).To(Equal(http.StatusOK))
+
+			data, err := io.ReadAll(metricsResp.Body)
+			Expect(err).NotTo(HaveOccurred())
+			metrics := string(data)
+			Expect(metrics).To(ContainSubstring("vllm:cache_config_info{block_size=\"8\",num_gpu_blocks=\"16\"} 1"))
+		})
 	})
 
 	Context("fake metrics", func() {
diff --git a/pkg/vllm-api/vllm-models.go b/pkg/vllm-api/vllm-models.go
index 333a8284..9aa5dd64 100644
--- a/pkg/vllm-api/vllm-models.go
+++ b/pkg/vllm-api/vllm-models.go
@@ -26,6 +26,8 @@ const (
 	PromLabelMaxLora      = "max_lora"
 	PromLabelModelName    = "model_name"
 	PromLabelFinishReason = "finish_reason"
+	PromLabelCacheBlockSize    = "block_size"
+	PromLabelCacheNumGPUBlocks = "num_gpu_blocks"
 )
 
 // modelInfo defines data about model returned by /models API