Commit cda00c7 (parent c881401)

- Add full list of supported metrics to readme
- Create constants for all metrics
- Define all latency-related fake metrics in config
- Add validation for new fake metrics in config

Signed-off-by: Maya Barnea <[email protected]>

File tree: 7 files changed, +100 −38 lines changed

README.md

Lines changed: 12 additions & 1 deletion
```diff
@@ -26,7 +26,18 @@ In addition, it supports a subset of vLLM's Prometheus metrics. These metrics are
 | vllm:lora_requests_info | Running stats on LoRA requests |
 | vllm:num_requests_running | Number of requests currently running on GPU |
 | vllm:num_requests_waiting | Prometheus metric for the number of queued requests |
-
+| vllm:e2e_request_latency_seconds | Histogram of end to end request latency in seconds |
+| vllm:request_inference_time_seconds | Histogram of time spent in RUNNING phase for request |
+| vllm:request_queue_time_seconds | Histogram of time spent in WAITING phase for request |
+| vllm:request_prefill_time_seconds | Histogram of time spent in PREFILL phase for request |
+| vllm:request_decode_time_seconds | Histogram of time spent in DECODE phase for request |
+| vllm:time_to_first_token_seconds | Histogram of time to first token in seconds |
+| vllm:time_per_output_token_seconds | Histogram of time per output token in seconds |
+| vllm:request_generation_tokens | Number of generation tokens processed |
+| vllm:request_params_max_tokens | Histogram of the max_tokens request parameter |
+| vllm:request_prompt_tokens | Number of prefill tokens processed |
+| vllm:request_success_total | Count of successfully processed requests |
+
 The simulated inference has no connection with the model and LoRA adapters specified in the command line parameters or via the /v1/load_lora_adapter HTTP REST endpoint. The /v1/models endpoint returns simulated results based on those same command line parameters and those loaded via the /v1/load_lora_adapter HTTP REST endpoint.

 The simulator supports two modes of operation:
```
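To make the new table entries concrete, here is a minimal sketch (not part of this commit) that scrapes the simulator's /metrics endpoint and prints only the end-to-end latency histogram series. The base URL and port are assumptions; adjust them to your deployment.

```go
package main

import (
    "bufio"
    "fmt"
    "net/http"
    "strings"
)

func main() {
    // Assumption: the simulator is listening on localhost:8000.
    resp, err := http.Get("http://localhost:8000/metrics")
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()

    scanner := bufio.NewScanner(resp.Body)
    for scanner.Scan() {
        line := scanner.Text()
        // Histogram series appear as _bucket, _sum, and _count lines.
        if strings.HasPrefix(line, "vllm:e2e_request_latency_seconds") {
            fmt.Println(line)
        }
    }
}
```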

pkg/common/config.go

Lines changed: 50 additions & 9 deletions
```diff
@@ -232,16 +232,17 @@ type Metrics struct {
 	WaitingRequests int64 `yaml:"waiting-requests" json:"waiting-requests"`
 	// KVCacheUsagePercentage is the fraction of KV-cache blocks currently in use (from 0 to 1)
 	KVCacheUsagePercentage float32 `yaml:"kv-cache-usage" json:"kv-cache-usage"`
-	// TTFTBuckets is an array of values for time-to-first-token buckets,
-	// each value in this array is a value for the corresponding bucket.
+
+	// Histogram metrics are defined by an array of values.
+	// Each value in this array is the value for the corresponding bucket.
 	// Array may contain less values than number of buckets, all trailing missing values assumed as 0.
+
+	// TTFTBucketValues is an array of values for time-to-first-token buckets.
 	// Buckets upper boundaries in seconds are:
 	// 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
 	// 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0, 2560.0, +Inf
 	TTFTBucketValues []int `yaml:"ttft-buckets-values" json:"ttft-buckets-values"`
-	// TPOTBuckets is an array of values for time-per-output-token buckets,
-	// each value in this array is a value for the corresponding bucket.
-	// Array may contain less values than number of buckets, all trailing missing values assumed as 0.
+	// TPOTBucketValues is an array of values for time-per-output-token buckets.
 	// Buckets upper boundaries in seconds are:
 	// 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
 	// 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, +Inf

@@ -253,13 +254,21 @@ type Metrics struct {
 	RequestParamsMaxTokens []int `yaml:"request-params-max-tokens" json:"request-params-max-tokens"` // max_tokens parameter samples
 	// RequestSuccessTotal is the number of successful requests, key: finish-reason (stop, length, etc.).
 	RequestSuccessTotal map[string]int64 `yaml:"request-success-total" json:"request-success-total"`
-	// E2ERequestLatencyBucketValues is an array of values for e2e request latency buckets,
-	// each value in this array is a value for the corresponding bucket.
-	// Array may contain less values than number of buckets, all trailing missing values assumed as 0.
-	// Buckets upper boundaries in seconds are:
+
+	// Latency histograms share the same bucket upper boundaries in seconds:
 	// 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
 	// 20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0, +Inf
+
+	// E2ERequestLatencyBucketValues is an array of values for e2e request latency buckets.
 	E2ERequestLatencyBucketValues []int `yaml:"e2erl-buckets-values" json:"e2erl-buckets-values"`
+	// ReqQueueTimeBucketValues is an array of values for request queue time buckets.
+	ReqQueueTimeBucketValues []int `yaml:"queue-time-buckets-values" json:"queue-time-buckets-values"`
+	// ReqInfTimeBucketValues is an array of values for request inference time buckets.
+	ReqInfTimeBucketValues []int `yaml:"inf-time-buckets-values" json:"inf-time-buckets-values"`
+	// ReqPrefillTimeBucketValues is an array of values for request prefill time buckets.
+	ReqPrefillTimeBucketValues []int `yaml:"prefill-time-buckets-values" json:"prefill-time-buckets-values"`
+	// ReqDecodeTimeBucketValues is an array of values for request decode time buckets.
+	ReqDecodeTimeBucketValues []int `yaml:"decode-time-buckets-values" json:"decode-time-buckets-values"`
 }

 type LorasMetrics struct {
```
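A minimal sketch of how the new fields might be populated, assuming `FakeMetrics` is backed by this `Metrics` struct (as the validation hunk that follows suggests); all values here are invented:

```go
// A sketch, not part of the commit: per-bucket counts for the new latency
// histograms, declared in the common package's scope. Omitted trailing
// buckets default to 0.
var fake = Metrics{
    WaitingRequests:        1,
    KVCacheUsagePercentage: 0.125,
    // First three TTFT buckets (<=0.001s, <=0.005s, <=0.01s); the remaining 20 stay 0.
    TTFTBucketValues: []int{5, 2, 1},
    // Four requests fell into the <=0.8s bucket; the first two buckets are explicit zeros.
    E2ERequestLatencyBucketValues: []int{0, 0, 4},
    // Only the first queue-time bucket (<=0.3s) is set.
    ReqQueueTimeBucketValues: []int{10},
}
```

validate() below rejects any negative value in these arrays.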
```diff
@@ -595,6 +604,38 @@ func (c *Configuration) validate() error {
 				return errors.New("fake metrics request-params-max-tokens cannot contain negative values")
 			}
 		}
+
+		for _, v := range c.FakeMetrics.RequestParamsMaxTokens {
+			if v < 0 {
+				return errors.New("fake metrics request-params-max-tokens cannot contain negative values")
+			}
+		}
+
+		for _, v := range c.FakeMetrics.E2ERequestLatencyBucketValues {
+			if v < 0 {
+				return errors.New("fake metrics e2erl-buckets-values cannot contain negative values")
+			}
+		}
+		for _, v := range c.FakeMetrics.ReqQueueTimeBucketValues {
+			if v < 0 {
+				return errors.New("fake metrics queue-time-buckets-values cannot contain negative values")
+			}
+		}
+		for _, v := range c.FakeMetrics.ReqInfTimeBucketValues {
+			if v < 0 {
+				return errors.New("fake metrics inf-time-buckets-values cannot contain negative values")
+			}
+		}
+		for _, v := range c.FakeMetrics.ReqPrefillTimeBucketValues {
+			if v < 0 {
+				return errors.New("fake metrics prefill-time-buckets-values cannot contain negative values")
+			}
+		}
+		for _, v := range c.FakeMetrics.ReqDecodeTimeBucketValues {
+			if v < 0 {
+				return errors.New("fake metrics decode-time-buckets-values cannot contain negative values")
+			}
+		}
 	}

 	if c.DPSize < 1 || c.DPSize > 8 {
```
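The six loops added above are structurally identical. A possible follow-up refactor, not part of this commit (the helper name is hypothetical), would collapse them into one function:

```go
// validateNonNegative reports an error if any bucket value is negative.
// key is the YAML key to name in the error message.
func validateNonNegative(values []int, key string) error {
    for _, v := range values {
        if v < 0 {
            return fmt.Errorf("fake metrics %s cannot contain negative values", key)
        }
    }
    return nil
}
```

Each loop in validate() would then become a single call, e.g. `validateNonNegative(c.FakeMetrics.ReqDecodeTimeBucketValues, "decode-time-buckets-values")`.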

pkg/llm-d-inference-sim/metrics.go

Lines changed: 10 additions & 5 deletions
```diff
@@ -43,6 +43,11 @@ const (
 	generationTokensMetricName = "vllm:request_generation_tokens"
 	paramMaxTokensMetricName   = "vllm:request_params_max_tokens"
 	promptTokensMetricName     = "vllm:request_prompt_tokens"
+	successTotalMetricName     = "vllm:request_success_total"
+	loraRequestsMetricName     = "vllm:lora_requests_info"
+	reqRunningMetricName       = "vllm:num_requests_running"
+	reqWaitingMetricName       = "vllm:num_requests_waiting"
+	gpuCacheUsageMetricName    = "vllm:gpu_cache_usage_perc"
 )

@@ -54,7 +59,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 	s.metrics.loraInfo = prometheus.NewGaugeVec(
 		prometheus.GaugeOpts{
 			Subsystem: "",
-			Name:      "vllm:lora_requests_info",
+			Name:      loraRequestsMetricName,
 			Help:      "Running stats on lora requests.",
 		},
 		[]string{vllmapi.PromLabelMaxLora, vllmapi.PromLabelRunningLoraAdapters, vllmapi.PromLabelWaitingLoraAdapters},

@@ -68,7 +73,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 	s.metrics.runningRequests = prometheus.NewGaugeVec(
 		prometheus.GaugeOpts{
 			Subsystem: "",
-			Name:      "vllm:num_requests_running",
+			Name:      reqRunningMetricName,
 			Help:      "Number of requests currently running on GPU.",
 		},
 		[]string{vllmapi.PromLabelModelName},

@@ -83,7 +88,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 	s.metrics.waitingRequests = prometheus.NewGaugeVec(
 		prometheus.GaugeOpts{
 			Subsystem: "",
-			Name:      "vllm:num_requests_waiting",
+			Name:      reqWaitingMetricName,
 			Help:      "Prometheus metric for the number of queued requests.",
 		},
 		[]string{vllmapi.PromLabelModelName},

@@ -202,7 +207,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 	s.metrics.kvCacheUsagePercentage = prometheus.NewGaugeVec(
 		prometheus.GaugeOpts{
 			Subsystem: "",
-			Name:      "vllm:gpu_cache_usage_perc",
+			Name:      gpuCacheUsageMetricName,
 			Help:      "Prometheus metric for the fraction of KV-cache blocks currently in use (from 0 to 1).",
 		},
 		[]string{vllmapi.PromLabelModelName},

@@ -258,7 +263,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 	s.metrics.requestSuccessTotal = prometheus.NewCounterVec(
 		prometheus.CounterOpts{
 			Subsystem: "",
-			Name:      "vllm:request_success_total",
+			Name:      successTotalMetricName,
 			Help:      "Count of successfully processed requests.",
 		},
 		[]string{vllmapi.PromLabelModelName, vllmapi.PromLabelFinishReason},
```
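The hunks above only swap string literals for the new constants in existing gauges and counters. For the latency histograms documented in config.go, registration would follow the same pattern; below is a hedged sketch (not this commit's code — the function, variable, and registry names are hypothetical; only the metric name and bucket boundaries come from this diff):

```go
// Bucket upper boundaries documented in pkg/common/config.go for the
// latency histograms; Prometheus adds the +Inf bucket implicitly.
var latencyBucketBoundaries = []float64{
    0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
    20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0,
}

// registerE2ELatencyHistogram is a hypothetical helper showing how the e2e
// latency histogram could be created and registered.
func registerE2ELatencyHistogram(reg *prometheus.Registry) (*prometheus.HistogramVec, error) {
    h := prometheus.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "vllm:e2e_request_latency_seconds",
            Help:    "Histogram of end to end request latency in seconds.",
            Buckets: latencyBucketBoundaries,
        },
        []string{vllmapi.PromLabelModelName},
    )
    if err := reg.Register(h); err != nil {
        return nil, err
    }
    return h, nil
}
```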

pkg/llm-d-inference-sim/metrics_test.go

Lines changed: 15 additions & 15 deletions
```diff
@@ -107,8 +107,8 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 			data, err := io.ReadAll(metricsResp.Body)
 			Expect(err).NotTo(HaveOccurred())
 			metrics := string(data)
-			Expect(metrics).To(ContainSubstring("vllm:num_requests_running{model_name=\"testmodel\"} 2"))
-			Expect(metrics).To(ContainSubstring("vllm:num_requests_waiting{model_name=\"testmodel\"} 1"))
+			Expect(metrics).To(ContainSubstring(getCountMetricLine(testModel, reqRunningMetricName, 2)))
+			Expect(metrics).To(ContainSubstring(getCountMetricLine(testModel, reqWaitingMetricName, 1)))
 		})

 		It("Should record correct prompt and generation token counts", func() {

@@ -168,7 +168,7 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 			// as the number of generated tokens is unpredictable in this test.
 			// Therefore, we only verify the number of requests and the total number of generated tokens,
 			// and skip the bucket distribution.
-			Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_count{model_name="testmodel"} 1`))
+			Expect(metrics).To(ContainSubstring(getCountMetricLine(testModel, generationTokensMetricName+"_count", 1)))
 			// request_success_total
 			Expect(metrics).To(MatchRegexp(`vllm:request_success_total{finish_reason="(stop|length)",model_name="testmodel"} 1`))
 		})

@@ -512,9 +512,9 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 				Expect(err).NotTo(HaveOccurred())
 				metrics := string(data)
 				// Expect three running requests and two blocks in the kv cache - usage 2/16=0.125
-				Expect(metrics).To(ContainSubstring("vllm:num_requests_running{model_name=\"Qwen/Qwen2-0.5B\"} 3"))
-				Expect(metrics).To(ContainSubstring("vllm:num_requests_waiting{model_name=\"Qwen/Qwen2-0.5B\"} 0"))
-				Expect(metrics).To(ContainSubstring("vllm:gpu_cache_usage_perc{model_name=\"Qwen/Qwen2-0.5B\"} 0.125"))
+				Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, reqRunningMetricName, 3)))
+				Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, reqWaitingMetricName, 0)))
+				Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, gpuCacheUsageMetricName, 0.125)))

 				time.Sleep(4 * time.Second)
 				metricsResp, err = client.Get(metricsUrl)

@@ -525,9 +525,9 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 				Expect(err).NotTo(HaveOccurred())
 				metrics = string(data)
 				// The requests finished running, expect 0 usage
-				Expect(metrics).To(ContainSubstring("vllm:num_requests_running{model_name=\"Qwen/Qwen2-0.5B\"} 0"))
-				Expect(metrics).To(ContainSubstring("vllm:num_requests_waiting{model_name=\"Qwen/Qwen2-0.5B\"} 0"))
-				Expect(metrics).To(ContainSubstring("vllm:gpu_cache_usage_perc{model_name=\"Qwen/Qwen2-0.5B\"} 0"))
+				Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, reqRunningMetricName, 0)))
+				Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, reqWaitingMetricName, 0)))
+				Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, gpuCacheUsageMetricName, 0)))
 			}()
 			wg.Wait()
 		})

@@ -592,9 +592,9 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 				// The requests were sent with 500 millisecond intervals, and the first two should be still running.
 				// The third is waiting, and is still not in the kv-cache.
 				// We expect one block in the kv-cache, usage 1/16=0.0625.
-				Expect(metrics).To(ContainSubstring("vllm:num_requests_running{model_name=\"Qwen/Qwen2-0.5B\"} 2"))
-				Expect(metrics).To(ContainSubstring("vllm:num_requests_waiting{model_name=\"Qwen/Qwen2-0.5B\"} 1"))
-				Expect(metrics).To(ContainSubstring("vllm:gpu_cache_usage_perc{model_name=\"Qwen/Qwen2-0.5B\"} 0.0625"))
+				Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, reqRunningMetricName, 2)))
+				Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, reqWaitingMetricName, 1)))
+				Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, gpuCacheUsageMetricName, 0.0625)))
 			}()
 			wg.Wait()
 		})

@@ -645,9 +645,9 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 			data, err := io.ReadAll(resp.Body)
 			Expect(err).NotTo(HaveOccurred())
 			metrics := string(data)
-			Expect(metrics).To(ContainSubstring("vllm:num_requests_running{model_name=\"testmodel\"} 10"))
-			Expect(metrics).To(ContainSubstring("vllm:num_requests_waiting{model_name=\"testmodel\"} 30"))
-			Expect(metrics).To(ContainSubstring("vllm:gpu_cache_usage_perc{model_name=\"testmodel\"} 0.4"))
+			Expect(metrics).To(ContainSubstring(getCountMetricLine(testModel, reqRunningMetricName, 10)))
+			Expect(metrics).To(ContainSubstring(getCountMetricLine(testModel, reqWaitingMetricName, 30)))
+			Expect(metrics).To(ContainSubstring(getCountMetricLine(testModel, gpuCacheUsageMetricName, 0.4)))
 			Expect(metrics).To(ContainSubstring("vllm:lora_requests_info{max_lora=\"1\",running_lora_adapters=\"lora4,lora2\",waiting_lora_adapters=\"lora3\"} 1.257894567e+09"))
 			Expect(metrics).To(ContainSubstring("vllm:lora_requests_info{max_lora=\"1\",running_lora_adapters=\"lora4,lora3\",waiting_lora_adapters=\"\"} 1.257894569e+09"))
```

pkg/llm-d-inference-sim/test_utils.go

Lines changed: 9 additions & 1 deletion
```diff
@@ -245,7 +245,7 @@ func getLastLoraMetrics(metrics []string) ([]string, error) {
 	lastTimestamp := float64(0)
 	var lastMetrics []string
 	for _, metric := range metrics {
-		if strings.HasPrefix(metric, "vllm:lora_requests_info") {
+		if strings.HasPrefix(metric, loraRequestsMetricName) {
 			timestamp, err := extractTimestamp(metric)
 			if err != nil {
 				return nil, err

@@ -347,6 +347,14 @@ func getFloatBucketMetricLine(model string, metric string, bucketBoundary float64, count int) string {
 	return fmt.Sprintf("%s %d", getFloatBucketMetricPrefix(model, metric, bucketBoundary), count)
 }

+func getCountMetricPrefix(model string, metric string) string {
+	return fmt.Sprintf("%s{model_name=\"%s\"}", metric, model)
+}
+
+func getCountMetricLine(model string, metric string, count float64) string {
+	return fmt.Sprintf("%s %g", getCountMetricPrefix(model, metric), count)
+}
+
 // same as getFloatBucketMetricLine but without the value part
 func getFloatBucketMetricPrefix(model string, metric string, bucketBoundary float64) string {
 	buckerBoundStr := "+Inf"
```
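For reference, a hypothetical use of the two helpers added above, assuming testModel is "testmodel" as in the tests:

```go
line := getCountMetricLine(testModel, reqRunningMetricName, 2)
// line == `vllm:num_requests_running{model_name="testmodel"} 2`

prefix := getCountMetricPrefix(testModel, reqWaitingMetricName)
// prefix == `vllm:num_requests_waiting{model_name="testmodel"}`
```

The %g verb renders 2 as 2 and 0.0625 as 0.0625, matching the exposition format the simulator emits for gauge values, which is why the tests can match whole lines.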

pkg/llm-d-inference-sim/worker_test.go

Lines changed: 4 additions & 4 deletions
```diff
@@ -300,8 +300,8 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() {

 			// max-num-seqs is 12, so number of running requests should be 12
 			// and the number of waiting requests 1000-12=988
-			Expect(metrics).To(ContainSubstring("vllm:num_requests_running{model_name=\"testmodel\"} 12"))
-			Expect(metrics).To(ContainSubstring("vllm:num_requests_waiting{model_name=\"testmodel\"} 988"))
+			Expect(metrics).To(ContainSubstring(getCountMetricLine(testModel, reqRunningMetricName, 12)))
+			Expect(metrics).To(ContainSubstring(getCountMetricLine(testModel, reqWaitingMetricName, 988)))

 			// max-loras is 2, so the last lora metric should be:
 			// running: two loras (doesn't matter which two)

@@ -326,8 +326,8 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() {
 	})

 	It("Should work correctly with many simultaneous requests with many workers", func() {
-		runningMetric := "vllm:num_requests_running{model_name=\"testmodel\"}"
-		waitingMetric := "vllm:num_requests_waiting{model_name=\"testmodel\"}"
+		runningMetric := getCountMetricPrefix(testModel, reqRunningMetricName)
+		waitingMetric := getCountMetricPrefix(testModel, reqWaitingMetricName)
 		ctx := context.TODO()
 		args := []string{"cmd", "--model", testModel, "--mode", common.ModeRandom,
 			"--time-to-first-token", "2000", "--time-to-first-token-std-dev", "600",
```

pkg/vllm-api/vllm-models.go

Lines changed: 0 additions & 3 deletions
```diff
@@ -26,9 +26,6 @@ const (
 	PromLabelMaxLora      = "max_lora"
 	PromLabelModelName    = "model_name"
 	PromLabelFinishReason = "finish_reason"
-
-	VllmLoraRequestInfo    = "vllm:lora_requests_info"
-	VllmNumRequestsRunning = "vllm:num_requests_running"
 )

 // modelInfo defines data about model returned by /models API
```
