21 changes: 18 additions & 3 deletions pkg/llm-d-inference-sim/metrics.go
@@ -49,6 +49,7 @@ const (
 	reqRunningMetricName = "vllm:num_requests_running"
 	reqWaitingMetricName = "vllm:num_requests_waiting"
 	gpuCacheUsageMetricName = "vllm:gpu_cache_usage_perc"
+	cacheConfigName = "vllm:cache_config_info"
 )
 
 // createAndRegisterPrometheus creates and registers prometheus metrics used by vLLM simulator
@@ -85,7 +86,6 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 		return err
 	}
 
-	// not supported for now, reports constant value
 	s.metrics.waitingRequests = prometheus.NewGaugeVec(
 		prometheus.GaugeOpts{
 			Subsystem: "",
@@ -288,14 +288,27 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 		return err
 	}
 
-	s.setInitialPrometheusMetrics()
+	cacheConfig := prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Subsystem: "",
+			Name: cacheConfigName,
+			Help: "Information of the LLMEngine CacheConfig.",
+		},
+		[]string{vllmapi.PromLabelCacheBlockSize, vllmapi.PromLabelCacheNumGPUBlocks},
+	)
+	if err := s.metrics.registry.Register(cacheConfig); err != nil {
+		s.logger.Error(err, "prometheus cache config register failed")
+		return err
+	}
+
+	s.setInitialPrometheusMetrics(cacheConfig)
 
 	return nil
 }
 
 // setInitialPrometheusMetrics sends the default values to prometheus or
 // the fake metrics if set
-func (s *VllmSimulator) setInitialPrometheusMetrics() {
+func (s *VllmSimulator) setInitialPrometheusMetrics(cacheConfig *prometheus.GaugeVec) {
 	var nRunningReqs, nWaitingReqs, kvCacheUsage float64
 	modelName := s.getDisplayedModelName(s.config.Model)
 	if s.config.FakeMetrics != nil {
@@ -352,6 +365,8 @@ func (s *VllmSimulator) setInitialPrometheusMetrics() {
 	s.metrics.waitingRequests.WithLabelValues(modelName).Set(nWaitingReqs)
 	s.metrics.kvCacheUsagePercentage.WithLabelValues(modelName).Set(kvCacheUsage)
 
+	cacheConfig.WithLabelValues(strconv.Itoa(s.config.TokenBlockSize), strconv.Itoa(s.config.KVCacheSize)).Set(1)
+
 	if s.config.FakeMetrics != nil && len(s.config.FakeMetrics.LoraMetrics) != 0 {
 		for _, metrics := range s.config.FakeMetrics.LoraMetrics {
 			s.metrics.loraInfo.WithLabelValues(
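For context, the pattern this change uses is Prometheus's "info"-style metric: a gauge pinned to the value 1 whose payload travels in its labels, which is also how vLLM itself exposes vllm:cache_config_info. Below is a minimal, standalone sketch of that pattern outside the simulator; the registry setup, port, and example values are assumptions for illustration, not part of this PR.

package main

import (
	"log"
	"net/http"
	"strconv"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/promhttp"
)

func main() {
	reg := prometheus.NewRegistry()

	// Info-style metric: the sample value is always 1; the
	// configuration itself is carried in the label values.
	cacheConfig := prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "vllm:cache_config_info",
			Help: "Information of the LLMEngine CacheConfig.",
		},
		[]string{"block_size", "num_gpu_blocks"},
	)
	reg.MustRegister(cacheConfig)

	// Stand-ins for s.config.TokenBlockSize and s.config.KVCacheSize.
	blockSize, numGPUBlocks := 8, 16
	cacheConfig.WithLabelValues(strconv.Itoa(blockSize), strconv.Itoa(numGPUBlocks)).Set(1)

	http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))
	log.Fatal(http.ListenAndServe(":8080", nil))
}

Setting the gauge once at startup is enough here because the cache config is static for the lifetime of the process; if the config could change, the old label combination would need to be deleted before writing the new one.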
18 changes: 18 additions & 0 deletions pkg/llm-d-inference-sim/metrics_test.go
@@ -598,6 +598,24 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 			}()
 			wg.Wait()
 		})
+
+		It("Should send correct kv cache config metrics", func() {
+			ctx := context.TODO()
+			args := []string{"cmd", "--model", qwenModelName, "--mode", common.ModeRandom,
+				"--kv-cache-size", "16", "--block-size", "8"}
+
+			client, err := startServerWithArgs(ctx, args)
+			Expect(err).NotTo(HaveOccurred())
+
+			metricsResp, err := client.Get(metricsUrl)
+			Expect(err).NotTo(HaveOccurred())
+			Expect(metricsResp.StatusCode).To(Equal(http.StatusOK))
+
+			data, err := io.ReadAll(metricsResp.Body)
+			Expect(err).NotTo(HaveOccurred())
+			metrics := string(data)
+			Expect(metrics).To(ContainSubstring("vllm:cache_config_info{block_size=\"8\",num_gpu_blocks=\"16\"} 1"))
+		})
 	})
 
 	Context("fake metrics", func() {
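For reference, with the flags used in this test the relevant portion of a /metrics scrape would look roughly like the following. The host and port are assumptions; the HELP and TYPE lines follow from the GaugeOpts registered above, and the sample line is exactly what the ContainSubstring assertion checks.

$ curl -s http://localhost:8000/metrics | grep cache_config_info
# HELP vllm:cache_config_info Information of the LLMEngine CacheConfig.
# TYPE vllm:cache_config_info gauge
vllm:cache_config_info{block_size="8",num_gpu_blocks="16"} 1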
2 changes: 2 additions & 0 deletions pkg/vllm-api/vllm-models.go
@@ -26,6 +26,8 @@ const (
 	PromLabelMaxLora = "max_lora"
 	PromLabelModelName = "model_name"
 	PromLabelFinishReason = "finish_reason"
+	PromLabelCacheBlockSize = "block_size"
+	PromLabelCacheNumGPUBlocks = "num_gpu_blocks"
 )
 
 // modelInfo defines data about model returned by /models API
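Once these two labels are exported, the usual info-metric join applies on the Prometheus side: multiplying another series by vllm:cache_config_info (whose value is always 1) copies the config labels onto it without changing the value. A hypothetical PromQL query along those lines, assuming a standard scrape setup that attaches an instance label:

vllm:gpu_cache_usage_perc
  * on (instance) group_left (block_size, num_gpu_blocks)
vllm:cache_config_info

This yields the cache usage percentage annotated with block_size and num_gpu_blocks, so dashboards can show usage alongside the configured capacity.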